Rewrite scraper to be more modular, and convert the Coursera crawler to the new model

2013-01-30 19:43:48 +01:00 · 2013-01-30 19:43:48 +01:00 · fb6c43a38f
parent c2a8a66dac
commit fb6c43a38f
7 changed files with 224 additions and 47 deletions
--- a/updater/scrapers/__init__.py
+++ b/updater/scrapers/__init__.py
@ -0,0 +1,26 @@
+import inspect, os, sys
+
+my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
+
+def _import_module_into_scope(modulename):
+	"""Import `modulename` and copy its attributes into this package's namespace.
+	
+	Every name found at module level in the imported module (its classes and
+	functions, but also anything it imported itself) becomes a global of this
+	package.
+	
+	Double-underscore attributes (`__name__`, `__file__`, `__doc__`, ...) are
+	skipped: copying them would overwrite this package's own module metadata
+	with that of whichever submodule happened to be imported last.
+	"""
+	module = __import__(modulename)
+	
+	for name, data in vars(module).items():
+		if name.startswith("__") and name.endswith("__"):
+			continue
+		
+		globals()[name] = data
+
+# The modules are imported by bare name, so this directory has to be on the
+# import path while the loop below runs.
+sys.path.insert(0, my_path)
+
+try:
+	# Sorted so that, when two modules define the same name, the one that wins
+	# does not depend on the arbitrary order os.listdir() returns.
+	for fname in sorted(os.listdir(my_path)):
+		fpath = os.path.join(my_path, fname)
+		fbasename, fext = os.path.splitext(fname)
+		
+		if os.path.isdir(fpath):
+			if os.path.isfile(os.path.join(fpath, "__init__.py")):
+				# This is a python directory module
+				_import_module_into_scope(fname)
+		elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
+			# This is a python file module
+			_import_module_into_scope(fbasename)
+finally:
+	# Always undo the sys.path change, even when one of the imports fails.
+	sys.path.remove(my_path)
--- a/updater/scrapers/coursera.py
+++ b/updater/scrapers/coursera.py
@ -0,0 +1,50 @@
+import datetime, json, sys
+import requests, oursql
+import shared
+
+class Coursera(shared.Scraper):
+	"""Scraper that imports the Coursera catalogue.
+	
+	Each entry of the downloaded topic list is stored as a topic, and each of
+	that entry's courses is stored as an item of type COURSE linked to it.
+	"""
+	provider_id = 2
+	
+	def run(self):
+		"""Download the topic list and store every topic and course in it."""
+		self.retrieve_dataset()
+		self.parse_dataset()
+	
+	def retrieve_dataset(self):
+		"""Fetch the full topic list and keep the decoded JSON in `self.dataset`."""
+		response = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1")
+		# Fail with a clear HTTP error instead of trying to decode an error page.
+		response.raise_for_status()
+		self.dataset = response.json()
+
+	def parse_dataset(self):
+		"""Process every entry of the previously retrieved dataset."""
+		for item in self.dataset:
+			self.process_item(item)
+		
+	def process_item(self, item):
+		"""Store one topic, log the outcome, then store each of its courses.
+		
+		The courses are processed even when the topic already existed; in that
+		case they are linked to the row id of the existing topic.
+		"""
+		inserted, rowid = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
+		
+		if inserted:
+			self.env.log("Inserted topic %s" % item["name"])
+		else:
+			self.env.log("Skipped topic %s" % item["name"])
+		
+		for course in item["courses"]:
+			self.process_course(course, rowid)
+	
+	def process_course(self, course, topicid):
+		"""Store one course as an item belonging to the topic with id `topicid`."""
+		try:
+			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
+		except (TypeError, ValueError):
+			# TypeError: a date part is not an integer (e.g. None).
+			# ValueError: the parts do not form a valid calendar date.
+			# Either way, store the course without a start date rather than
+			# aborting the whole run.
+			start_date = None
+			
+		title = self.generate_title(course['name'], start_date)
+		
+		inserted, itemid = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
+		
+		if inserted:
+			self.env.log("Inserted item %s" % title)
+		else:
+			self.env.log("Skipped item %s" % title)
+			
+	def generate_title(self, name, date):
+		"""Return the item title: the course name plus its start date, if known."""
+		if date is None:
+			return "%s (date undetermined)" % name
+		
+		return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
+			
--- a/updater/shared/__init__.py
+++ b/updater/shared/__init__.py
@ -0,0 +1,26 @@
+import inspect, os, sys
+
+my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
+
+def _import_module_into_scope(modulename):
+	"""Import `modulename` and copy its attributes into this package's namespace.
+	
+	Every name found at module level in the imported module (its classes and
+	functions, but also anything it imported itself) becomes a global of this
+	package.
+	
+	Double-underscore attributes (`__name__`, `__file__`, `__doc__`, ...) are
+	skipped: copying them would overwrite this package's own module metadata
+	with that of whichever submodule happened to be imported last.
+	"""
+	module = __import__(modulename)
+	
+	for name, data in vars(module).items():
+		if name.startswith("__") and name.endswith("__"):
+			continue
+		
+		globals()[name] = data
+
+# The modules are imported by bare name, so this directory has to be on the
+# import path while the loop below runs.
+sys.path.insert(0, my_path)
+
+try:
+	# Sorted so that, when two modules define the same name, the one that wins
+	# does not depend on the arbitrary order os.listdir() returns.
+	for fname in sorted(os.listdir(my_path)):
+		fpath = os.path.join(my_path, fname)
+		fbasename, fext = os.path.splitext(fname)
+		
+		if os.path.isdir(fpath):
+			if os.path.isfile(os.path.join(fpath, "__init__.py")):
+				# This is a python directory module
+				_import_module_into_scope(fname)
+		elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
+			# This is a python file module
+			_import_module_into_scope(fbasename)
+finally:
+	# Always undo the sys.path change, even when one of the imports fails.
+	sys.path.remove(my_path)
--- a/updater/shared/environment.py
+++ b/updater/shared/environment.py
@ -0,0 +1,14 @@
+import oursql
+
+class Environment(object):
+	"""Shared context handed to scrapers: the database connection and logging."""
+	
+	def __init__(self):
+		# Defined up front so both attributes exist before connect() is called.
+		self.db = None
+		self.connected = False
+	
+	def connect(self, host="localhost", username="root", password="", database="learn"):
+		"""Open the database connection and store it as `self.db`."""
+		self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
+		self.connected = True
+		
+	def log(self, text):
+		"""Write a progress message to standard output."""
+		print(text)
+		
+	def Scraper(self, scraper_class):
+		"""Instantiate `scraper_class` with this environment's database.
+		
+		The new scraper gets an `env` attribute pointing back at this
+		environment. When connect() has not been called, the scraper is
+		constructed with None as its database instead of this method failing.
+		
+		Returns the scraper instance.
+		"""
+		s = scraper_class(self.db)
+		s.env = self
+		return s
--- a/updater/shared/scraper.py
+++ b/updater/shared/scraper.py
@ -0,0 +1,100 @@
+class Scraper(object):
+	"""Base class for scrapers; provides de-duplicating inserts of topics and items.
+	
+	Subclasses set `provider_id` and implement run().
+	"""
+	# Item type codes; insert_item() stores one of these in the `Type` column.
+	UNKNOWN = 0
+	TOPIC = 1
+	COURSE = 2
+	VIDEO = 3
+	ARTICLE = 4
+	EXERCISE = 5
+	QUIZ = 6
+	TEST = 7
+	BOOK = 8
+	AUDIOBOOK = 9
+	LECTURE = 10
+	
+	# Stored in the `Provider` column; together with a row's `ProviderId` it is
+	# what the existence checks below match on.
+	provider_id = 0
+	
+	def __init__(self, database=None):
+		"""Remember the database connection, if one was supplied.
+		
+		`can_store` records whether a connection is available.
+		"""
+		self.db = database
+		self.can_store = database is not None
+			
+	def run(self, *args, **kwargs):
+		"""Entry point of a scraper; must be overridden by subclasses."""
+		raise Exception("No run() method was specified for this scraper.")
+	
+	def _cursor(self):
+		"""Return a new database cursor, or raise RuntimeError without a database."""
+		db = getattr(self, "db", None)
+		
+		if db is None:
+			raise RuntimeError("This scraper has no database connection and cannot store anything.")
+		
+		return db.cursor()
+	
+	def _fill_defaults(self, kwargs, defaults):
+		"""Give every option listed in `defaults` a value in `kwargs`.
+		
+		An option that is absent or explicitly None receives its default.
+		Options not listed in `defaults` are left untouched.
+		"""
+		for key, default in defaults.items():
+			if kwargs.get(key) is None:
+				kwargs[key] = default
+	
+	def insert_topic(self, unique_id, title, override=False, **kwargs):
+		"""Insert a topic unless this provider already has one with `unique_id`.
+		
+		unique_id -- the provider's own identifier for the topic
+		title     -- title of the topic
+		override  -- when true, skip the existence check and always insert
+		
+		Optional keyword arguments (None means "use the default"):
+		needs_enrollment, creation_date, start_date, end_date, parent_id,
+		description, provider_name.
+		
+		Returns (True, new_row_id) after an insert, or (False, existing_row_id)
+		when a matching topic was found. Raises RuntimeError when the scraper
+		has no database.
+		"""
+		self._fill_defaults(kwargs, {
+			"needs_enrollment": False,
+			"creation_date": None,
+			"start_date": None,
+			"end_date": None,
+			"parent_id": 0,
+			"description": "",
+			"provider_name": ""
+		})
+		
+		c = self._cursor()
+		
+		if not override:
+			c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
+			results = c.fetchall()
+			
+			if len(results) > 0:
+				return (False, results[0][0])
+		
+		# NOTE(review): no commit is issued here; confirm the connection
+		# autocommits or that the caller commits.
+		c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
+			  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'], 
+			                                            kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
+		
+		return (True, c.lastrowid)
+			
+	def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
+		"""Insert an item unless this provider already has one with `unique_id`.
+		
+		unique_id -- the provider's own identifier for the item
+		title     -- title of the item
+		item_url  -- URL of the item; also used as `source_url` when none is given
+		override  -- when true, skip the existence check and always insert
+		
+		Optional keyword arguments (None means "use the default"):
+		views, has_topic, itemtype, source_url, topic_id, parent_id,
+		description, date, start_date, end_date, provider_name.
+		
+		Returns (True, new_row_id) after an insert, or (False, existing_row_id)
+		when a matching item was found. Raises RuntimeError when the scraper
+		has no database.
+		"""
+		self._fill_defaults(kwargs, {
+			"views": None,
+			"has_topic": False,
+			"itemtype": 0,
+			"source_url": item_url,
+			"topic_id": 0,
+			"parent_id": 0,
+			"description": "",
+			"date": None,
+			"start_date": None,
+			"end_date": None,
+			"provider_name": ""
+		})
+		
+		c = self._cursor()
+		
+		if not override:
+			c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
+			results = c.fetchall()
+			
+			if len(results) > 0:
+				return (False, results[0][0])
+		
+		# NOTE(review): no commit is issued here; confirm the connection
+		# autocommits or that the caller commits.
+		c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
+			  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"], 
+								       kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
+		
+		return (True, c.lastrowid)
--- a/updater/update.py
+++ b/updater/update.py
@ -0,0 +1,8 @@
+#!/usr/bin/env python
+import shared, scrapers
+
+# Set up the shared environment and open its database connection.
+env = shared.Environment()
+env.connect(
+	host="localhost",
+	username="root",
+	password="",
+	database="learn"
+)
+
+# Scrapers are created through the environment and then run.
+scraper = env.Scraper(scrapers.Coursera)
+scraper.run()
--- a/updater/update_coursera.py
+++ b/updater/update_coursera.py
@ -1,47 +0,0 @@
-import requests
-import oursql
-import datetime
-import json
-import lib
-
-class CourseraCrawler(object):
-	# Standalone crawler that stores Coursera topics and their courses
-	# through lib.Database.
-	def __init__(self):
-		self.db = lib.Database("localhost", "root")
-		
-	def retrieve_dataset(self):
-		# Loads the dataset from a local coursera.json file; the live request
-		# is left commented out below.
-		#self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
-		self.dataset = json.loads(open("coursera.json", "r").read())
-
-	def parse_dataset(self):
-		# Processes every entry of the loaded dataset.
-		for item in self.dataset:
-			self.process_item(item)
-		
-	def process_item(self, item):
-		# Stores one topic (provider id 2), prints the outcome, then stores
-		# each of its courses under the returned row id.
-		inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
-		
-		if inserted:
-			print "Inserted %s" % item["name"]
-		else:
-			print "Skipped %s" % item["name"]
-		
-		for course in item["courses"]:
-			self.process_course(course, rowid)
-	
-	def process_course(self, course, topicid):
-		# Stores one course as an item of the topic `topicid`. A TypeError
-		# while building the start date leaves it as None and marks the
-		# title "(date undetermined)".
-		try:
-			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
-			title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
-		except TypeError, e:
-			start_date = None
-			title = "%s (date undetermined)" % (course["name"])
-		
-		inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
-		
-		if inserted:
-			print "\tInserted %s" % title
-		else:
-			print "\tSkipped %s" % title
-			
-# Script entry: build the crawler, load the dataset, then store it.
-crawler = CourseraCrawler()
-crawler.retrieve_dataset()
-crawler.parse_dataset()