Rewrite Khan Academy crawler

12 years ago · 2c3bcc5418
parent d9034b6215
commit 2c3bcc5418
3 changed files with 198 additions and 132 deletions
--- a/updater/scrapers/khan.py
+++ b/updater/scrapers/khan.py
@ -0,0 +1,197 @@
+import datetime, json, sys
+import requests, oursql
+import shared
+
+class KhanAcademy(shared.Scraper):
+	"""Scraper that imports the Khan Academy topic tree.
+	
+	The tree is downloaded as a single JSON document and walked recursively. Items of
+	kind 'Topic' are stored through insert_topic; items of kind 'Video', 'Exercise',
+	'Article' and 'Scratchpad' are stored through insert_item. The row ID returned for
+	an item is saved on the item itself under the '_cl_id' key, so that its children
+	can refer to it.
+	
+	NOTE(review): env, insert_topic, insert_item and the VIDEO/EXERCISE/ARTICLE/SANDBOX
+	constants are not defined here - presumably they are provided by shared.Scraper.
+	"""
+	
+	provider_id = 1
+	
+	# Endpoint that returns the complete topic tree.
+	dataset_url = "http://www.khanacademy.org/api/v1/topictree"
+	
+	# Timeout in seconds handed to requests, so that a stalled connection cannot hang the updater forever.
+	request_timeout = 60
+	
+	# Maps the 'kind' of an object to the *name* of its item type constant. The constant
+	# is resolved with getattr only when needed, exactly like the if/elif chain it replaces.
+	_object_type_names = {
+		"Video": "VIDEO",
+		"Exercise": "EXERCISE",
+		"Article": "ARTICLE",
+		"Scratchpad": "SANDBOX",
+	}
+	
+	def run(self):
+		"""Download the topic tree and process it, starting from the root item."""
+		self.retrieve_dataset()
+		self.process_item(self.dataset, 0)
+		
+	def retrieve_dataset(self):
+		"""Fetch the topic tree and store the decoded JSON in self.dataset.
+		
+		Raises requests.exceptions.RequestException if the request fails, times out
+		or is answered with an HTTP error status.
+		"""
+		response = requests.get(self.dataset_url, timeout=self.request_timeout)
+		
+		# Fail loudly on a 4xx/5xx answer instead of trying to crawl an error document.
+		response.raise_for_status()
+		
+		self.dataset = response.json()
+		
+	def process_item(self, item, level, parent=None):
+		"""Process a single node of the topic tree and then all of its children.
+		
+		item   -- the decoded JSON node.
+		level  -- depth of the node in the tree; the root is 0.
+		parent -- the node that contains this one, or None for the root.
+		
+		Nodes without a 'kind' (or that are not dictionaries at all) are ignored,
+		including their children.
+		"""
+		try:
+			kind = item["kind"]
+		except (KeyError, TypeError):
+			return
+		
+		if kind == "Topic":
+			self.process_topic(item, level, parent=parent)
+		elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
+			self.process_object(item, level, parent=parent)
+		elif kind == "Separator":
+			pass  # Ignore separators
+		else:
+			self.env.log("Unrecognized kind: %s" % repr(kind), True)
+		
+		# An absent or null 'children' value is treated as "no children".
+		for child in item.get("children") or ():
+			self.process_item(child, level + 1, item)
+			
+	def process_topic(self, item, level, parent=None):
+		"""Insert a topic and store the resulting row ID on the item as '_cl_id'.
+		
+		level and parent are accepted so that the signature matches process_object,
+		but they are not used: no parent information is passed to insert_topic.
+		"""
+		unique_id = item.get("id")
+		
+		# Without an identifier the topic cannot be stored - log the error and bail out.
+		if unique_id is None:
+			self.env.log("No suitable identifier found for item: %s" % repr(item), True)
+			return
+		
+		# Check if a title is set
+		title = item.get("title")
+		
+		if title is None:
+			# No title was set - log this as an error and default to 'Untitled'.
+			self.env.log("No title found for item: %s" % repr(item), True)
+			title = "Untitled"
+		
+		# A missing description simply means no description.
+		description = item.get("description")
+		
+		# Insert the topic
+		inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)
+		
+		# Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
+		item["_cl_id"] = row_id
+		
+		self._log_result(inserted, title)
+		
+	def process_object(self, item, level, parent=None):
+		"""Insert a video, exercise, article or scratchpad and store its row ID on the item.
+		
+		item   -- the decoded JSON node.
+		level  -- depth of the node in the tree (not used here).
+		parent -- the containing node; its '_cl_id' is used as the topic ID.
+		
+		The object is logged as an error and skipped when it has no usable identifier,
+		an unknown kind, or (for anything but an article) no source URL.
+		"""
+		# Use 'readable_id', then 'name', then 'id' as identifier - whichever is set first.
+		unique_id = self._first_value(item, ("readable_id", "name"))
+		
+		if unique_id is None:
+			raw_id = item.get("id")
+			
+			# Only stringify a real value; str(None) would produce the bogus identifier 'None'.
+			if raw_id is not None:
+				unique_id = str(raw_id)
+		
+		# If we *still* do not have an identifier, log the error and bail out
+		if unique_id is None:
+			self.env.log("No suitable identifier found for item: %s" % repr(item), True)
+			return
+		
+		# Determine the object type
+		kind = item.get("kind")
+		type_name = self._object_type_names.get(kind)
+		
+		if type_name is None:
+			self.env.log("Unrecognized kind: %s" % repr(kind), True)
+			return
+		
+		itemtype = getattr(self, type_name)
+		is_article = (kind == "Article")
+		
+		# Determine the source URL via the 'ka_url' property, falling back to 'url'.
+		source_url = self._first_value(item, ("ka_url", "url"))
+		
+		# Articles can lack a URL, anything else cannot.
+		if source_url is None and not is_article:
+			self.env.log("No source URL found for non-article object: %s" % repr(item), True)
+			return
+		
+		# Determine the (external) item URL
+		item_url = item.get("url")
+		
+		if item_url is None:
+			# Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
+			item_url = source_url
+		
+		# If the object is an article, we'll want to use the actual article content as description.
+		# Otherwise, use the 'description' property if there is one. Either may be absent.
+		if is_article:
+			description = item.get("content")
+		else:
+			description = item.get("description")
+		
+		# Use 'title', then 'display_name' as object title.
+		title = self._first_value(item, ("title", "display_name"))
+		
+		if title is None:
+			# Apparently it really does not have a title. Log the error and default to 'Untitled'.
+			self.env.log("No object title found for item: %s" % repr(item), True)
+			title = "Untitled"
+		
+		# If a 'views' property is present, include it.
+		views = item.get("views")
+		
+		# If a creation date is present, include it.
+		date = self._parse_date(item)
+		
+		# Check if there is a parent ID. The parent can be None (TypeError) or can
+		# have been skipped itself and therefore lack '_cl_id' (KeyError).
+		try:
+			parent_id = parent["_cl_id"]
+		except (KeyError, TypeError):
+			# No parent ID present - log this as an error and default to 0.
+			self.env.log("No parent ID found for item: %s" % repr(item), True)
+			parent_id = 0
+		
+		# Insert the item
+		inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)
+		
+		# Store the resulting row ID in the item so that the children know the ID of their parent.
+		item["_cl_id"] = row_id
+		
+		self._log_result(inserted, title)
+		
+	@staticmethod
+	def _first_value(item, keys):
+		"""Return the value of the first key in keys that is present in item and not None, or None."""
+		for key in keys:
+			value = item.get(key)
+			
+			if value is not None:
+				return value
+		
+		return None
+		
+	def _parse_date(self, item):
+		"""Return the 'date_added' property of item as a datetime, or None if it is absent or malformed."""
+		raw_date = item.get("date_added")
+		
+		if raw_date is None:
+			return None
+		
+		try:
+			return datetime.datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%SZ")
+		except (TypeError, ValueError):
+			# One unparseable date should not abort the whole crawl - log it and store no date.
+			self.env.log("Invalid creation date found for item: %s" % repr(item), True)
+			return None
+		
+	def _log_result(self, inserted, title):
+		"""Log the outcome reported by insert_topic or insert_item for the row with this title."""
+		if inserted:
+			self.env.log("Inserted %s" % title)
+		else:
+			self.env.log("Skipped %s" % title)
--- a/updater/update.py
+++ b/updater/update.py
@ -4,5 +4,5 @@ import shared, scrapers
 env = shared.Environment()
 env.connect(host="localhost", username="root", password="", database="learn")

-scraper = env.Scraper(scrapers.Coursera)
+scraper = env.Scraper(scrapers.KhanAcademy)
 scraper.run()
--- a/updater/update_khan.py
+++ b/updater/update_khan.py
@ -1,131 +0,0 @@
-import requests
-import oursql
-import datetime
-import json
-import lib
-
-class KhanUniversityCrawler(object):
-	def __init__(self):
-		self.db = lib.Database("localhost", "root")
-		
-	def retrieve_dataset(self):
-		self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
-		#self.dataset = json.loads(open("data.json", "r").read())
-
-	def parse_dataset(self):
-		self.process_item(self.dataset, 0)
-		
-	def process_item(self, item, level, parent=None):
-		try:
-			kind = item["kind"]
-		except KeyError, e:
-			return
-		
-		if kind == "Topic":
-			unique_id = item["id"]
-			
-			try:
-				parent_id = parent["_cl_id"]
-			except TypeError, e:
-				parent_id = 0
-				
-			if item["title"] is not None:
-				title = item["title"]
-			else:
-				title = ""
-			
-			inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
-			item["_cl_id"] = rowid
-			
-			if inserted:
-				print "Inserted %s" % title
-			else:
-				print "Skipped %s" % title
-		elif kind in ("Video", "Exercise", "Article"):
-			try:
-				unique_id = item["readable_id"]
-			except KeyError, e:
-				try:
-					unique_id = item["name"]
-				except KeyError, e:
-					try:
-						unique_id = str(item["id"])
-					except KeyError, e:
-						print repr(item)
-						sys.stderr.write("WARNING: No suitable identifier found for item\n")
-						raise
-						return
-					
-			if item["kind"] == "Video":
-				itemtype = self.db.VIDEO
-			elif item["kind"] == "Exercise":
-				itemtype = self.db.EXERCISE
-			elif item["kind"] == "Article":
-				itemtype = self.db.ARTICLE
-				
-			try:
-				source_url = item["ka_url"]
-			except KeyError, e:
-				if itemtype == self.db.ARTICLE:
-					source_url = ""
-				else:
-					return
-				
-			try:
-				item_url = item["url"]
-			except KeyError, e:
-				try:
-					item_url = item["ka_url"]
-				except KeyError, e:
-					item_url = None
-				
-			if itemtype == self.db.ARTICLE:
-				description = item["content"]
-			else:
-				try:
-					description = item["description"]
-				except KeyError, e:
-					description = None
-			
-			try:
-				title = item["title"]
-			except KeyError, e:
-				try:
-					title = item["display_name"]
-				except KeyError, e:
-					title = "Untitled"
-				
-			try:
-				views = item["views"]
-			except KeyError, e:
-				views = None
-				
-			try:
-				date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
-			except KeyError, e:
-				date = None
-			
-			inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
-			item["_cl_id"] = rowid
-			
-			if inserted:
-				print "Inserted %s" % title
-			else:
-				print "Skipped %s" % title
-		elif kind == "Separator":
-			pass  # Ignore separators
-		else:
-			sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
-			sys.stderr.write("%s\n" % (repr(item)))
-		
-		try:
-			children = item["children"]
-		except KeyError, e:
-			pass
-		else:
-			for child in children:
-				self.process_item(child, level + 1, item)
-			
-crawler = KhanUniversityCrawler()
-crawler.retrieve_dataset()
-crawler.parse_dataset()