Rewrite Khan Academy crawler

2013-01-30 20:42:46 +01:00 · 2013-01-30 20:42:46 +01:00 · 2c3bcc5418
parent d9034b6215
commit 2c3bcc5418
3 changed files with 198 additions and 132 deletions
--- a/updater/scrapers/khan.py
+++ b/updater/scrapers/khan.py
@ -0,0 +1,197 @@
 import datetime, json, sys
 import requests, oursql
 import shared
 class KhanAcademy(shared.Scraper):
 	provider_id = 1
 	def run(self):
 		self.retrieve_dataset()
 		self.process_item(self.dataset, 0)
 	def retrieve_dataset(self):
 		self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
 	def process_item(self, item, level, parent=None):
 		try:
 			kind = item["kind"]
 		except KeyError, e:
 			return
 		if kind == "Topic":
 			self.process_topic(item, level, parent=parent)
 		elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
 			self.process_object(item, level, parent=parent)
 		elif kind == "Separator":
 			pass  # Ignore separators
 		else:
 			self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)
 		try:
 			children = item["children"]
 		except KeyError, e:
 			return
 		for child in children:
 			self.process_item(child, level + 1, item)
 	def process_topic(self, item, level, parent=None):
 		unique_id = item["id"]
 		try:
 			parent_id = parent["_cl_id"]
 		except TypeError, e:
 			parent_id = 0
 		# Check if a title is set
 		if item["title"] is not None:
 			title = item["title"]
 		else:
 			# No title was set - log this as an error and default to 'Untitled'.
 			self.env.log("No title found for item: %s" % repr(item), True)
 			title = "Untitled"
 		# Check if a description is set, and default to no description if not
 		if item["description"] is not None:
 			description = item["description"]
 		else:
 			description = None
 		# Insert the topic
 		inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)
 		# Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
 		item["_cl_id"] = row_id
 		if inserted:
 			self.env.log("Inserted %s" % title)
 		else:
 			self.env.log("Skipped %s" % title)
 	def process_object(self, item, level, parent=None):
 		unique_id = None
 		# First check for the 'readable_id' property
 		try:
 			unique_id = item["readable_id"]
 		except KeyError, e:
 			pass
 		# If no identifier was found, check for the 'name' property
 		if unique_id is None:
 			try:
 				unique_id = item["name"]
 			except KeyError, e:
 				pass
 		# If still no identifier was found, check for the 'id' property
 		if unique_id is None:
 			try:
 				unique_id = str(item["id"])
 			except KeyError, e:
 				pass
 		# If we *still* do not have an identifier, log the error and bail out
 		if unique_id is None:
 			self.env.log("No suitable identifier found for item: %s" % repr(item), True)
 			return
 		# Determine the object type
 		if item["kind"] == "Video":
 			itemtype = self.VIDEO
 		elif item["kind"] == "Exercise":
 			itemtype = self.EXERCISE
 		elif item["kind"] == "Article":
 			itemtype = self.ARTICLE
 		elif item["kind"] == "Scratchpad":
 			itemtype = self.SANDBOX
 		source_url = None
 		# Determine the source URL via the 'ka_url' property
 		try:
 			source_url = item["ka_url"]
 		except KeyError, e:
 			pass
 		# If no source URL was found, try the 'url' property
 		if source_url is None:			
 			try:
 				source_url = item["url"]
 			except KeyError, e:
 				pass
 		# If still no source URL was found...
 		if source_url is None:
 			if itemtype == self.ARTICLE:
 				# Articles can lack a URL.
 				source_url = None
 			else:
 				# There was no source URL, but this wasn't an article. Log the error and bail out.
 				self.env.log("No source URL found for non-article object: %s" % repr(item), True)
 				return
 		# Determine the (external) item URL
 		try:
 			item_url = item["url"]
 		except KeyError, e:
 			# Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
 			item_url = source_url
 		# If the object is an article, we'll want to use the actual article content as description.
 		if itemtype == self.ARTICLE:
 			description = item["content"]
 		else:
 			# Otherwise, we'll check if there's a 'description' property. If not, leave empty.
 			try:
 				description = item["description"]
 			except KeyError, e:
 				description = None
 		title = None
 		# First check the 'title' property for an object title.
 		try:
 			title = item["title"]
 		except KeyError, e:
 			pass
 		# As second option, check the 'display_name' property.
 		if title is None:
 			try:
 				title = item["display_name"]
 			except KeyError, e:
 				# Apparently it really does not have a title. Log the error and default to 'Untitled'.
 				self.env.log("No object title found for item: %s" % repr(item), True)
 				title = "Untitled"
 		# If a 'views' property is present, include it.
 		try:
 			views = item["views"]
 		except KeyError, e:
 			views = None
 		# If a creation date is present, include it.
 		try:
 			date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
 		except KeyError, e:
 			date = None
 		# Check if there is a parent ID
 		try:
 			parent_id = parent["_cl_id"]
 		except KeyError, e:
 			# No parent ID present - log this as an error and default to 0.
 			self.env.log("No parent ID found for item: %s" % repr(item), True)
 			parent_id = 0
 		# Insert the item
 		inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)
 		# Store the resulting row ID in the item so that the children know the ID of their parent.
 		item["_cl_id"] = row_id
 		if inserted:
 			self.env.log("Inserted %s" % title)
 		else:
 			self.env.log("Skipped %s" % title)
--- a/updater/update.py
+++ b/updater/update.py
@ -4,5 +4,5 @@ import shared, scrapers
 env = shared.Environment()
 env.connect(host="localhost", username="root", password="", database="learn")
-scraper = env.Scraper(scrapers.Coursera)
+scraper = env.Scraper(scrapers.KhanAcademy)
 scraper.run()
--- a/updater/update_khan.py
+++ b/updater/update_khan.py
@ -1,131 +0,0 @@
 import requests
 import oursql
 import datetime
 import json
 import lib
 class KhanUniversityCrawler(object):
 	def __init__(self):
 		self.db = lib.Database("localhost", "root")
 	def retrieve_dataset(self):
 		self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
 		#self.dataset = json.loads(open("data.json", "r").read())
 	def parse_dataset(self):
 		self.process_item(self.dataset, 0)
 	def process_item(self, item, level, parent=None):
 		try:
 			kind = item["kind"]
 		except KeyError, e:
 			return
 		if kind == "Topic":
 			unique_id = item["id"]
 			try:
 				parent_id = parent["_cl_id"]
 			except TypeError, e:
 				parent_id = 0
 			if item["title"] is not None:
 				title = item["title"]
 			else:
 				title = ""
 			inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
 			item["_cl_id"] = rowid
 			if inserted:
 				print "Inserted %s" % title
 			else:
 				print "Skipped %s" % title
 		elif kind in ("Video", "Exercise", "Article"):
 			try:
 				unique_id = item["readable_id"]
 			except KeyError, e:
 				try:
 					unique_id = item["name"]
 				except KeyError, e:
 					try:
 						unique_id = str(item["id"])
 					except KeyError, e:
 						print repr(item)
 						sys.stderr.write("WARNING: No suitable identifier found for item\n")
 						raise
 						return
 			if item["kind"] == "Video":
 				itemtype = self.db.VIDEO
 			elif item["kind"] == "Exercise":
 				itemtype = self.db.EXERCISE
 			elif item["kind"] == "Article":
 				itemtype = self.db.ARTICLE
 			try:
 				source_url = item["ka_url"]
 			except KeyError, e:
 				if itemtype == self.db.ARTICLE:
 					source_url = ""
 				else:
 					return
 			try:
 				item_url = item["url"]
 			except KeyError, e:
 				try:
 					item_url = item["ka_url"]
 				except KeyError, e:
 					item_url = None
 			if itemtype == self.db.ARTICLE:
 				description = item["content"]
 			else:
 				try:
 					description = item["description"]
 				except KeyError, e:
 					description = None
 			try:
 				title = item["title"]
 			except KeyError, e:
 				try:
 					title = item["display_name"]
 				except KeyError, e:
 					title = "Untitled"
 			try:
 				views = item["views"]
 			except KeyError, e:
 				views = None
 			try:
 				date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
 			except KeyError, e:
 				date = None
 			inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
 			item["_cl_id"] = rowid
 			if inserted:
 				print "Inserted %s" % title
 			else:
 				print "Skipped %s" % title
 		elif kind == "Separator":
 			pass  # Ignore separators
 		else:
 			sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
 			sys.stderr.write("%s\n" % (repr(item)))
 		try:
 			children = item["children"]
 		except KeyError, e:
 			pass
 		else:
 			for child in children:
 				self.process_item(child, level + 1, item)
 crawler = KhanUniversityCrawler()
 crawler.retrieve_dataset()
 crawler.parse_dataset()