From fb6c43a38f2142f12b0dcc13bf279f9af5f7bb25 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Wed, 30 Jan 2013 19:43:48 +0100
Subject: [PATCH] Rewrite scraper to be more modular, and convert the Coursera crawler to the new model

---
 updater/scrapers/__init__.py  |  38 +++++++++++
 updater/scrapers/coursera.py  |  61 +++++++++++++++++
 updater/shared/__init__.py    |  38 +++++++++++
 updater/shared/environment.py |  29 ++++++++
 updater/shared/scraper.py     | 125 ++++++++++++++++++++++++++++++++++
 updater/update.py             |   9 +++
 updater/update_coursera.py    |  47 -------------
 7 files changed, 300 insertions(+), 47 deletions(-)
 create mode 100644 updater/scrapers/__init__.py
 create mode 100644 updater/scrapers/coursera.py
 create mode 100644 updater/shared/__init__.py
 create mode 100644 updater/shared/environment.py
 create mode 100644 updater/shared/scraper.py
 create mode 100644 updater/update.py
 delete mode 100644 updater/update_coursera.py

diff --git a/updater/scrapers/__init__.py b/updater/scrapers/__init__.py
new file mode 100644
--- /dev/null
+++ b/updater/scrapers/__init__.py
@@ -0,0 +1,38 @@
+"""Import every module in this directory and re-export its names."""
+import inspect, os, sys
+
+my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
+
+def _import_module_into_scope(modulename):
+    """Import `modulename` and copy its attributes into this package.
+
+    Names of the form `__x__` are skipped: copying a submodule's
+    `__name__`, `__file__` or `__path__` into globals() would overwrite
+    this package's own module metadata.
+    """
+    module = __import__(modulename)
+
+    for name in vars(module):
+        if name.startswith("__") and name.endswith("__"):
+            continue
+        globals()[name] = getattr(module, name)
+
+# The modules are imported by bare name, so the package directory has to be
+# on sys.path while the loop runs; it is removed again afterwards even if
+# one of the imports raises.
+sys.path.insert(0, my_path)
+
+try:
+    for fname in os.listdir(my_path):
+        fpath = os.path.join(my_path, fname)
+        fbasename, fext = os.path.splitext(fname)
+
+        if os.path.isdir(fpath):
+            if os.path.isfile(os.path.join(fpath, "__init__.py")):
+                # This is a python directory module
+                _import_module_into_scope(fname)
+        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
+            # This is a python file module
+            _import_module_into_scope(fbasename)
+finally:
+    sys.path.remove(my_path)
diff --git a/updater/scrapers/coursera.py b/updater/scrapers/coursera.py
new file mode 100644
--- /dev/null
+++ b/updater/scrapers/coursera.py
@@ -0,0 +1,61 @@
+import datetime, json, sys
+import requests, oursql
+import shared
+
+class Coursera(shared.Scraper):
+    """Scraper that stores Coursera topics and their courses."""
+
+    # Written to the `Provider` column of every row this scraper inserts.
+    provider_id = 2
+
+    def run(self):
+        """Download the Coursera topic list and store every entry."""
+        self.retrieve_dataset()
+        self.parse_dataset()
+
+    def retrieve_dataset(self):
+        """Fetch the full topic list and keep the decoded JSON in `self.dataset`."""
+        self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
+
+    def parse_dataset(self):
+        """Process every topic of the retrieved dataset."""
+        for item in self.dataset:
+            self.process_item(item)
+
+    def process_item(self, item):
+        """Store one topic (skipped if already present) and all of its courses."""
+        inserted, rowid = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
+
+        if inserted:
+            self.env.log("Inserted topic %s" % item["name"])
+        else:
+            self.env.log("Skipped topic %s" % item["name"])
+
+        for course in item["courses"]:
+            self.process_course(course, rowid)
+
+    def process_course(self, course, topicid):
+        """Store one course as an item linked to the topic row `topicid`."""
+        try:
+            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
+        except (TypeError, ValueError):
+            # TypeError: a date part is not an integer (e.g. None);
+            # ValueError: the parts do not form a valid date.
+            start_date = None
+
+        title = self.generate_title(course['name'], start_date)
+
+        inserted, itemid = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
+
+        if inserted:
+            self.env.log("Inserted item %s" % title)
+        else:
+            self.env.log("Skipped item %s" % title)
+
+    def generate_title(self, name, date):
+        """Return the item title: the course name plus its start date if known."""
+        if date is None:
+            return "%s (date undetermined)" % name
+        else:
+            return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
+
diff --git a/updater/shared/__init__.py b/updater/shared/__init__.py
new file mode 100644
--- /dev/null
+++ b/updater/shared/__init__.py
@@ -0,0 +1,38 @@
+"""Import every module in this directory and re-export its names."""
+import inspect, os, sys
+
+my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
+
+def _import_module_into_scope(modulename):
+    """Import `modulename` and copy its attributes into this package.
+
+    Names of the form `__x__` are skipped: copying a submodule's
+    `__name__`, `__file__` or `__path__` into globals() would overwrite
+    this package's own module metadata.
+    """
+    module = __import__(modulename)
+
+    for name in vars(module):
+        if name.startswith("__") and name.endswith("__"):
+            continue
+        globals()[name] = getattr(module, name)
+
+# The modules are imported by bare name, so the package directory has to be
+# on sys.path while the loop runs; it is removed again afterwards even if
+# one of the imports raises.
+sys.path.insert(0, my_path)
+
+try:
+    for fname in os.listdir(my_path):
+        fpath = os.path.join(my_path, fname)
+        fbasename, fext = os.path.splitext(fname)
+
+        if os.path.isdir(fpath):
+            if os.path.isfile(os.path.join(fpath, "__init__.py")):
+                # This is a python directory module
+                _import_module_into_scope(fname)
+        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
+            # This is a python file module
+            _import_module_into_scope(fbasename)
+finally:
+    sys.path.remove(my_path)
diff --git a/updater/shared/environment.py b/updater/shared/environment.py
new file mode 100644
--- /dev/null
+++ b/updater/shared/environment.py
@@ -0,0 +1,29 @@
+import oursql
+
+class Environment(object):
+    """Owns the database connection and creates scrapers bound to it."""
+
+    def __init__(self):
+        # Defined up front so that Scraper() and callers can rely on both
+        # attributes existing before connect() has been called.
+        self.db = None
+        self.connected = False
+
+    def connect(self, host="localhost", username="root", password="", database="learn"):
+        """Open the database connection handed to scrapers created afterwards."""
+        self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
+        self.connected = True
+
+    def log(self, text):
+        """Write a progress message to standard output."""
+        print text
+
+    def Scraper(self, scraper_class):
+        """Instantiate `scraper_class` with this environment's database.
+
+        The instance gets this environment as its `env` attribute. Before
+        connect() is called the scraper receives `None` as its database.
+        """
+        s = scraper_class(self.db)
+        s.env = self
+        return s
diff --git a/updater/shared/scraper.py b/updater/shared/scraper.py
new file mode 100644
--- /dev/null
+++ b/updater/shared/scraper.py
@@ -0,0 +1,125 @@
+class Scraper(object):
+    """Base class for scrapers: item type constants and database insert helpers.
+
+    Subclasses set `provider_id` and implement run().
+    """
+
+    # Item types, passed to insert_item() as `itemtype` (stored in the `Type` column).
+    UNKNOWN = 0
+    TOPIC = 1
+    COURSE = 2
+    VIDEO = 3
+    ARTICLE = 4
+    EXERCISE = 5
+    QUIZ = 6
+    TEST = 7
+    BOOK = 8
+    AUDIOBOOK = 9
+    LECTURE = 10
+
+    # Stored in the `Provider` column of every inserted row.
+    provider_id = 0
+
+    def __init__(self, database=None):
+        """Store the database connection; `can_store` is False without one."""
+        self.db = database
+        self.can_store = database is not None
+
+    def run(self, *args, **kwargs):
+        """Run the scraper; must be implemented by subclasses."""
+        raise NotImplementedError("No run() method was specified for this scraper.")
+
+    def _cursor(self):
+        """Return a new database cursor, failing clearly if there is no database."""
+        if not self.can_store:
+            raise Exception("This scraper has no database connection and cannot store data.")
+
+        # NOTE(review): nothing in this class commits; confirm that the
+        # connection autocommits or that the tables are non-transactional.
+        return self.db.cursor()
+
+    def _apply_defaults(self, options, defaults):
+        """Set every option that is missing or None to its default value."""
+        for key, value in defaults.iteritems():
+            if options.get(key) is None:
+                options[key] = value
+
+    def insert_topic(self, unique_id, title, override=False, **kwargs):
+        """Insert a topic unless this provider already has one with `unique_id`.
+
+        Optional keyword arguments are the keys of `defaults` below; omitted
+        or None values get the default. With `override` set, the existence
+        check is skipped and a new row is always inserted.
+
+        Returns (True, new row id) after an insert and (False, existing row
+        id) when the topic was already stored.
+        """
+        defaults = {
+            "needs_enrollment": False,
+            "creation_date": None,
+            "start_date": None,
+            "end_date": None,
+            "parent_id": 0,
+            "description": "",
+            "provider_name": ""
+        }
+
+        self._apply_defaults(kwargs, defaults)
+
+        c = self._cursor()
+
+        if not override:
+            c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
+            results = c.fetchall()
+
+            if len(results) > 0:
+                return (False, results[0][0])
+
+        c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
+            kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
+
+        return (True, c.lastrowid)
+
+    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
+        """Insert an item unless this provider already has one with `unique_id`.
+
+        `item_url` is stored as the item's URL and is also the default for
+        the `source_url` option. Other optional keyword arguments are the
+        keys of `defaults` below; omitted or None values get the default.
+        With `override` set, the existence check is skipped and a new row is
+        always inserted.
+
+        Returns (True, new row id) after an insert and (False, existing row
+        id) when the item was already stored.
+        """
+        defaults = {
+            "views": None,
+            "has_topic": False,
+            "itemtype": 0,
+            "source_url": item_url,
+            "topic_id": 0,
+            "parent_id": 0,
+            "description": "",
+            "date": None,
+            "start_date": None,
+            "end_date": None,
+            "provider_name": ""
+        }
+
+        self._apply_defaults(kwargs, defaults)
+
+        c = self._cursor()
+
+        if not override:
+            c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
+            results = c.fetchall()
+
+            if len(results) > 0:
+                return (False, results[0][0])
+
+        c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
+            kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
+
+        return (True, c.lastrowid)
diff --git a/updater/update.py b/updater/update.py
new file mode 100644
--- /dev/null
+++ b/updater/update.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+"""Entry point: connect to the database and run the Coursera scraper."""
+import shared, scrapers
+
+env = shared.Environment()
+env.connect(host="localhost", username="root", password="", database="learn")
+
+scraper = env.Scraper(scrapers.Coursera)
+scraper.run()
diff --git a/updater/update_coursera.py b/updater/update_coursera.py
deleted file mode 100644
index e3fc9f6..0000000
--- a/updater/update_coursera.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import requests
-import oursql
-import datetime
-import json
-import lib
-
-class CourseraCrawler(object):
-    def __init__(self):
-        self.db = lib.Database("localhost", "root")
-
-    def retrieve_dataset(self):
-        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
-        self.dataset = json.loads(open("coursera.json", "r").read())
-
-    def parse_dataset(self):
-        for item in self.dataset:
-            self.process_item(item)
-
-    def process_item(self, item):
-        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
-
-        if inserted:
-            print "Inserted %s" % item["name"]
-        else:
-            print "Skipped %s" % item["name"]
-
-        for course in item["courses"]:
-            self.process_course(course, rowid)
-
-    def process_course(self, course, topicid):
-        try:
-            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
-            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
-        except TypeError, e:
-            start_date = None
-            title = "%s (date undetermined)" % (course["name"])
-
-        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
-
-        if inserted:
-            print "\tInserted %s" % title
-        else:
-            print "\tSkipped %s" % title
-
-crawler = CourseraCrawler()
-crawler.retrieve_dataset()
-crawler.parse_dataset()