Rewrite scraper to be more modular, and convert the Coursera crawler to the new model

2013-01-30 19:43:48 +01:00 · 2013-01-30 19:43:48 +01:00 · fb6c43a38f
parent c2a8a66dac
commit fb6c43a38f
7 changed files with 224 additions and 47 deletions
--- a/updater/scrapers/__init__.py
+++ b/updater/scrapers/__init__.py
@ -0,0 +1,26 @@
 import inspect, os, sys
 # Directory containing this file, taken from the currently executing frame;
 # the loader below lists this directory to find the modules to import.
 my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
 def _import_module_into_scope(modulename):
 	"""Import `modulename` and copy its attributes into this module's globals.
 	Dunder attributes (`__name__`, `__file__`, `__doc__`, ...) are skipped: they
 	describe the imported module itself, and copying them would overwrite this
 	module's own identity with that of whichever module was imported last.
 	"""
 	module = __import__(modulename)
 	scope = globals()
 	# Snapshot the attribute table so the copy cannot be disturbed while iterating
 	for name, data in list(vars(module).items()):
 		if name.startswith("__") and name.endswith("__"):
 			continue
 		scope[name] = data
 # Put this directory first on sys.path so the modules in it can be imported by
 # bare name, then copy each one's contents into this namespace.
 sys.path.insert(0, my_path)
 try:
 	# Sorted so that, when two modules define the same name, the winner does not
 	# depend on the filesystem's listing order.
 	for fname in sorted(os.listdir(my_path)):
 		fpath = os.path.join(my_path, fname)
 		fbasename, fext = os.path.splitext(fname)
 		if os.path.isdir(fpath):
 			if os.path.isfile(os.path.join(fpath, "__init__.py")):
 				# This is a python directory module
 				_import_module_into_scope(fname)
 		elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
 			# This is a python file module
 			_import_module_into_scope(fbasename)
 finally:
 	# Undo the sys.path change even when one of the imports fails
 	sys.path.remove(my_path)
--- a/updater/scrapers/coursera.py
+++ b/updater/scrapers/coursera.py
@ -0,0 +1,50 @@
 import datetime, json, sys
 import requests, oursql
 import shared
 class Coursera(shared.Scraper):
 	"""Scraper that stores Coursera topics and the courses listed under each topic."""
 	# Provider number handed to the insert helpers inherited from shared.Scraper
 	provider_id = 2
 	def run(self):
 		"""Download the topic list and store everything it contains."""
 		self.retrieve_dataset()
 		self.parse_dataset()
 	def retrieve_dataset(self):
 		"""Fetch the full topic list over HTTP and keep the decoded JSON in self.dataset."""
 		self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
 	def parse_dataset(self):
 		"""Process every topic entry of the previously retrieved dataset."""
 		for item in self.dataset:
 			self.process_item(item)
 	def process_item(self, item):
 		"""Insert one topic (if not already stored), then process each of its courses.
 		`item` is a dict read for the keys "id", "name", "short_description" and "courses".
 		"""
 		inserted, rowid = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
 		if inserted:
 			self.env.log("Inserted topic %s" % item["name"])
 		else:
 			self.env.log("Skipped topic %s" % item["name"])
 		# rowid is valid in both cases, so courses are linked even to a pre-existing topic
 		for course in item["courses"]:
 			self.process_course(course, rowid)
 	def process_course(self, course, topicid):
 		"""Insert one course as a COURSE item attached to the topic row `topicid`.
 		`course` is a dict read for the keys "id", "name", "home_link",
 		"certificate_description", "start_year", "start_month" and "start_day".
 		"""
 		try:
 			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
 		except (TypeError, ValueError):
 			# TypeError: a date component is not an integer (presumably None when
 			# no start date has been announced - confirm against the API).
 			# ValueError: the components do not form a valid calendar date.
 			start_date = None
 		title = self.generate_title(course['name'], start_date)
 		inserted, itemid = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
 		if inserted:
 			self.env.log("Inserted item %s" % title)
 		else:
 			self.env.log("Skipped item %s" % title)
 	def generate_title(self, name, date):
 		"""Return `name` suffixed with its formatted start date, or a placeholder when `date` is None."""
 		if date is None:
 			return "%s (date undetermined)" % name
 		return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
--- a/updater/shared/__init__.py
+++ b/updater/shared/__init__.py
@ -0,0 +1,26 @@
 import inspect, os, sys
 # Directory containing this file, taken from the currently executing frame;
 # the loader below lists this directory to find the modules to import.
 my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
 def _import_module_into_scope(modulename):
 	"""Import `modulename` and copy its attributes into this module's globals.
 	Dunder attributes (`__name__`, `__file__`, `__doc__`, ...) are skipped: they
 	describe the imported module itself, and copying them would overwrite this
 	module's own identity with that of whichever module was imported last.
 	"""
 	module = __import__(modulename)
 	scope = globals()
 	# Snapshot the attribute table so the copy cannot be disturbed while iterating
 	for name, data in list(vars(module).items()):
 		if name.startswith("__") and name.endswith("__"):
 			continue
 		scope[name] = data
 # Put this directory first on sys.path so the modules in it can be imported by
 # bare name, then copy each one's contents into this namespace.
 sys.path.insert(0, my_path)
 try:
 	# Sorted so that, when two modules define the same name, the winner does not
 	# depend on the filesystem's listing order.
 	for fname in sorted(os.listdir(my_path)):
 		fpath = os.path.join(my_path, fname)
 		fbasename, fext = os.path.splitext(fname)
 		if os.path.isdir(fpath):
 			if os.path.isfile(os.path.join(fpath, "__init__.py")):
 				# This is a python directory module
 				_import_module_into_scope(fname)
 		elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
 			# This is a python file module
 			_import_module_into_scope(fbasename)
 finally:
 	# Undo the sys.path change even when one of the imports fails
 	sys.path.remove(my_path)
--- a/updater/shared/environment.py
+++ b/updater/shared/environment.py
@ -0,0 +1,14 @@
 import oursql
 class Environment(object):
 	"""Runtime context shared by scrapers: a database connection plus a log sink."""
 	# Class-level defaults so both attributes exist before connect() is called;
 	# Scraper() then hands out scrapers with no database instead of failing.
 	db = None
 	connected = False
 	def connect(self, host="localhost", username="root", password="", database="learn"):
 		"""Open an oursql connection with the given credentials and keep it in self.db."""
 		self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
 		self.connected = True
 	def log(self, text):
 		"""Write `text` to standard output."""
 		# The parenthesised form prints the same under Python 2 and Python 3
 		print(text)
 	def Scraper(self, scraper_class):
 		"""Instantiate `scraper_class` with this environment's database and attach self as `env`.
 		The database argument is None when connect() has not been called.
 		"""
 		s = scraper_class(self.db)
 		s.env = self
 		return s
--- a/updater/shared/scraper.py
+++ b/updater/shared/scraper.py
@ -0,0 +1,100 @@
 class Scraper(object):
 	"""Base class for scrapers; provides insert helpers that skip rows already stored.
 	Subclasses set `provider_id` and override run().
 	"""
 	# Item type codes; insert_item() writes one of these to the `Type` column
 	UNKNOWN = 0
 	TOPIC = 1
 	COURSE = 2
 	VIDEO = 3
 	ARTICLE = 4
 	EXERCISE = 5
 	QUIZ = 6
 	TEST = 7
 	BOOK = 8
 	AUDIOBOOK = 9
 	LECTURE = 10
 	# Written to the `Provider` column and used in the duplicate lookups
 	provider_id = 0
 	def __init__(self, database=None):
 		"""Keep the database connection; `can_store` records whether one was supplied."""
 		self.db = database
 		self.can_store = database is not None
 	def run(self, *args, **kwargs):
 		"""Entry point that subclasses must override; this base version always raises."""
 		raise Exception("No run() method was specified for this scraper.")
 	@staticmethod
 	def _fill_defaults(kwargs, defaults):
 		"""Set in `kwargs` every key of `defaults` that is missing or None there."""
 		for key, value in defaults.items():
 			if kwargs.get(key) is None:
 				kwargs[key] = value
 	def insert_topic(self, unique_id, title, override=False, **kwargs):
 		"""Insert a topic row unless this provider already has one with `unique_id`.
 		Optional keyword arguments (a missing or None value falls back to the default):
 		needs_enrollment (False), creation_date (None), start_date (None),
 		end_date (None), parent_id (0), description (""), provider_name ("").
 		With a true `override` the existence check is skipped and a row is always inserted.
 		Returns (True, cursor.lastrowid) after an insert, or (False, existing Id)
 		when a matching row was found.
 		"""
 		self._fill_defaults(kwargs, {
 			"needs_enrollment": False,
 			"creation_date": None,
 			"start_date": None,
 			"end_date": None,
 			"parent_id": 0,
 			"description": "",
 			"provider_name": ""
 		})
 		c = self.db.cursor()
 		if not override:
 			c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
 			results = c.fetchall()
 			if results:
 				return (False, results[0][0])
 		c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
 			  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
 			                                            kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
 		return (True, c.lastrowid)
 	def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
 		"""Insert an item row unless this provider already has one with `unique_id`.
 		Optional keyword arguments (a missing or None value falls back to the default):
 		views (None), has_topic (False), itemtype (0), source_url (item_url),
 		topic_id (0), parent_id (0), description (""), date (None),
 		start_date (None), end_date (None), provider_name ("").
 		With a true `override` the existence check is skipped and a row is always inserted.
 		Returns (True, cursor.lastrowid) after an insert, or (False, existing Id)
 		when a matching row was found.
 		"""
 		self._fill_defaults(kwargs, {
 			"views": None,
 			"has_topic": False,
 			"itemtype": 0,
 			"source_url": item_url,
 			"topic_id": 0,
 			"parent_id": 0,
 			"description": "",
 			"date": None,
 			"start_date": None,
 			"end_date": None,
 			"provider_name": ""
 		})
 		c = self.db.cursor()
 		if not override:
 			c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
 			results = c.fetchall()
 			if results:
 				return (False, results[0][0])
 		c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
 			  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
 								       kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
 		return (True, c.lastrowid)
--- a/updater/update.py
+++ b/updater/update.py
@ -0,0 +1,8 @@
 #!/usr/bin/env python
 import shared, scrapers
 # Build the shared environment and open its database connection
 env = shared.Environment()
 env.connect(
 	host="localhost",
 	username="root",
 	password="",
 	database="learn",
 )
 # Create the Coursera scraper through the environment, then run it
 scraper = env.Scraper(scrapers.Coursera)
 scraper.run()
--- a/updater/update_coursera.py
+++ b/updater/update_coursera.py
@ -1,47 +0,0 @@
 import requests
 import oursql
 import datetime
 import json
 import lib
 class CourseraCrawler(object):
 	"""Standalone crawler that stores Coursera topics and courses through lib.Database."""
 	def __init__(self):
 		"""Create the lib.Database handle used for all inserts."""
 		self.db = lib.Database("localhost", "root")
 	def retrieve_dataset(self):
 		"""Load the topic list from the local file coursera.json into self.dataset."""
 		#self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
 		# The with-block closes the file handle once the JSON has been parsed
 		with open("coursera.json", "r") as dataset_file:
 			self.dataset = json.load(dataset_file)
 	def parse_dataset(self):
 		"""Process every topic entry of the loaded dataset."""
 		for item in self.dataset:
 			self.process_item(item)
 	def process_item(self, item):
 		"""Insert one topic (provider 2) and then each course listed under it.
 		`item` is a dict read for the keys "id", "name", "short_description" and "courses".
 		"""
 		inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
 		if inserted:
 			print("Inserted %s" % item["name"])
 		else:
 			print("Skipped %s" % item["name"])
 		for course in item["courses"]:
 			self.process_course(course, rowid)
 	def process_course(self, course, topicid):
 		"""Insert one course as a COURSE item attached to the topic row `topicid`.
 		The title carries the zero-padded start date, or a placeholder when the
 		start date components cannot be turned into a date.
 		"""
 		try:
 			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
 			title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
 		except (TypeError, ValueError):
 			# TypeError: a date component is not an integer (e.g. None);
 			# ValueError: the components do not form a valid calendar date.
 			start_date = None
 			title = "%s (date undetermined)" % (course["name"])
 		inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
 		if inserted:
 			print("\tInserted %s" % title)
 		else:
 			print("\tSkipped %s" % title)
 # Script body: create the crawler, load the dataset, then store every topic
 # and course found in it.
 crawler = CourseraCrawler()
 crawler.retrieve_dataset()
 crawler.parse_dataset()