You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
132 lines
3.1 KiB
Python
132 lines
3.1 KiB
Python
12 years ago
|
import datetime
import json
import sys

import requests
import oursql

import lib
|
|
||
|
class KhanUniversityCrawler(object):
    """Crawls the Khan Academy topic tree API and mirrors every topic,
    video, exercise and article into a local database via ``lib.Database``.
    """

    def __init__(self):
        # lib.Database is a project-local wrapper; presumably connects to a
        # local MySQL server as root -- TODO confirm against lib module.
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Fetch the complete topic tree from the public Khan Academy API
        and store the decoded JSON in ``self.dataset``.
        """
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
        # Offline alternative kept for debugging:
        #self.dataset = json.loads(open("data.json", "r").read())

    def parse_dataset(self):
        """Walk the previously retrieved dataset, starting at the root node
        (recursion level 0, no parent).
        """
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Recursively persist one node of the topic tree.

        item   -- dict for one API node; may contain a "children" list.
        level  -- recursion depth (0 for the root).
        parent -- the already-processed parent dict, which carries the
                  database row id under "_cl_id", or None for the root.

        Nodes without a "kind" key are skipped entirely (children included).
        """
        if "kind" not in item:
            return
        kind = item["kind"]

        if kind == "Topic":
            unique_id = item["id"]

            # Root topics have no parent dict (parent is None), hence the
            # TypeError fallback; 0 acts as the "no parent" sentinel.
            try:
                parent_id = parent["_cl_id"]
            except TypeError:
                parent_id = 0
            # NOTE(review): parent_id is computed but never passed to
            # insert_topic below -- likely a latent bug; confirm the
            # insert_topic signature in lib before wiring it through.

            # "title" may be present but null in the API payload.
            title = item["title"] if item["title"] is not None else ""

            inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
            # Remember our row id so children can reference it as parent.
            item["_cl_id"] = rowid

            if inserted:
                print("Inserted %s" % title)
            else:
                print("Skipped %s" % title)
        elif kind in ("Video", "Exercise", "Article"):
            # Prefer the most human-readable identifier available.
            try:
                unique_id = item["readable_id"]
            except KeyError:
                try:
                    unique_id = item["name"]
                except KeyError:
                    try:
                        unique_id = str(item["id"])
                    except KeyError:
                        # No usable identifier at all: log and re-raise the
                        # KeyError so the crawl aborts loudly.
                        print(repr(item))
                        sys.stderr.write("WARNING: No suitable identifier found for item\n")
                        raise

            if kind == "Video":
                itemtype = self.db.VIDEO
            elif kind == "Exercise":
                itemtype = self.db.EXERCISE
            else:  # kind == "Article", guaranteed by the elif guard above
                itemtype = self.db.ARTICLE

            try:
                source_url = item["ka_url"]
            except KeyError:
                # Articles legitimately lack ka_url; any other kind without
                # one is skipped (children included).
                if itemtype == self.db.ARTICLE:
                    source_url = ""
                else:
                    return

            # Fall back from "url" to "ka_url" to None.
            item_url = item.get("url", item.get("ka_url"))

            if itemtype == self.db.ARTICLE:
                # Articles carry their body in "content" (KeyError here is
                # intentionally not caught -- content is expected).
                description = item["content"]
            else:
                description = item.get("description")

            title = item.get("title", item.get("display_name", "Untitled"))
            views = item.get("views")

            try:
                # Only a missing key is tolerated; a malformed timestamp
                # still raises ValueError, as in the original behavior.
                date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
            except KeyError:
                date = None

            # NOTE(review): unlike the Topic branch, a None parent here
            # raises TypeError on parent["_cl_id"]; items are presumably
            # never the root node -- confirm.
            inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
            item["_cl_id"] = rowid

            if inserted:
                print("Inserted %s" % title)
            else:
                print("Skipped %s" % title)
        elif kind == "Separator":
            pass  # Ignore separators; they carry no data.
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % kind)
            sys.stderr.write("%s\n" % (repr(item)))

        # Recurse into children regardless of whether this node was stored.
        for child in item.get("children", []):
            self.process_item(child, level + 1, item)
|
||
|
|
||
|
if __name__ == "__main__":
    # Entry point: fetch the full Khan Academy topic tree over HTTP, then
    # walk it into the local database. Guarded so importing this module
    # no longer triggers the network/DB crawl as a side effect.
    crawler = KhanUniversityCrawler()
    crawler.retrieve_dataset()
    crawler.parse_dataset()
|