import contextlib
import datetime
import json
import sys

import oursql
import requests

# Written to parse on both Python 2 and Python 3: no ``except X, e`` and
# only single-argument ``print(...)`` calls.

# Module-wide MySQL connection shared by every crawler call.
# NOTE(review): nothing in this file calls commit(); confirm that the
# inserts below are actually persisted with this connection's settings.
database = oursql.connect(host="localhost", user="root", db="learn")


def unicodedammit(input_string):
    """Return *input_string* as text, decoding UTF-8 byte strings.

    Any value that is not a byte string is returned unchanged.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this is the same
    # check as ``isinstance(x, str)`` there, and it also selects the
    # decodable type on Python 3 (where ``str`` has no ``decode``).
    if isinstance(input_string, bytes):
        return input_string.decode('utf-8')
    return input_string


class KhanUniversityCrawler(object):
    """Load a topic tree from ``data.json`` and mirror it into MySQL.

    Topics are written to the ``topics`` table and videos, exercises and
    articles to the ``items`` table.  Each stored node gets its database
    row id recorded under the ``"_cl_id"`` key so that its children can
    refer to it.
    """

    # Item type codes.  VIDEO, ARTICLE and EXERCISE are written to the
    # `Type` column of `items`; the others are not used in this file.
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8

    def __init__(self):
        """Create a crawler; the dataset is loaded by retrieve_dataset()."""
        pass

    def retrieve_dataset(self):
        """Read ``data.json`` from the working directory into self.dataset."""
        # Alternative live source, kept for reference:
        # self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
        # The context manager closes the file handle once it is parsed.
        with open("data.json", "r") as handle:
            self.dataset = json.load(handle)

    def parse_dataset(self):
        """Walk the loaded dataset from its root node and store it."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store *item* and then recurse into its ``"children"``.

        Args:
            item: A node of the dataset (a dict).  Nodes without a
                ``"kind"`` key are ignored together with their children.
            level: Depth of *item* in the tree; only passed along to the
                recursive calls.
            parent: The node that contains *item*, or ``None`` for the
                root.

        Raises:
            KeyError: If a video/exercise/article has none of the
                ``readable_id``, ``name`` or ``id`` keys.
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            # closing() releases the cursor before descending into the
            # children instead of keeping one open per recursion level.
            with contextlib.closing(database.cursor()) as cursor:
                self._store_topic(cursor, item, parent)
        elif kind in ("Video", "Exercise", "Article"):
            with contextlib.closing(database.cursor()) as cursor:
                stored = self._store_content_item(cursor, item, parent)
            if not stored:
                # The item was dropped and has no "_cl_id", so its
                # children are not processed either.
                return
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            print("Unrecognized kind: %s" % kind)
            print(repr(item))

        for child in item.get("children", ()):
            self.process_item(child, level + 1, item)

    def _store_topic(self, cursor, item, parent):
        """Insert a topic row unless one exists; set ``item["_cl_id"]``."""
        unique_id = item["id"]

        # The root node is processed with parent=None, which makes the
        # subscript raise TypeError; it is stored with ParentId 0.
        try:
            parent_id = parent["_cl_id"]
        except TypeError:
            parent_id = 0

        description = item["description"]
        if description is None:
            description = ""

        title = item["title"]
        if title is None:
            title = ""

        cursor.execute("SELECT `Id` FROM topics WHERE `ProviderId` = ? LIMIT 1", (unique_id,))
        results = cursor.fetchall()

        if results:
            print(u"Skipped topic %s" % title)
            item["_cl_id"] = results[0][0]
            return

        cursor.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`)"
                       "VALUES (?, 1, ?, ?, ?, ?, 0)",
                       (parent_id, unique_id, title, description, datetime.datetime.now()))
        print("Inserted topic %s" % title)
        item["_cl_id"] = cursor.lastrowid

    def _provider_id(self, item):
        """Return the identifier used for *item* in the `ProviderId` column.

        Preference order is ``readable_id``, then ``name``, then the
        stringified ``id``.  If none is present the item is dumped, a
        warning is written to stderr and the KeyError propagates.
        """
        if "readable_id" in item:
            return item["readable_id"]
        if "name" in item:
            return item["name"]
        try:
            return str(item["id"])
        except KeyError:
            print(repr(item))
            sys.stderr.write("WARNING: No suitable identifier found for item\n")
            raise

    def _store_content_item(self, cursor, item, parent):
        """Insert a video/exercise/article row unless one exists.

        Sets ``item["_cl_id"]`` to the row id.  Returns ``True`` when the
        item was inserted or already present, and ``False`` when it was
        dropped because a non-article item has no ``ka_url``.
        """
        unique_id = self._provider_id(item)

        itemtype = {
            "Video": self.VIDEO,
            "Exercise": self.EXERCISE,
            "Article": self.ARTICLE,
        }[item["kind"]]

        try:
            source_url = item["ka_url"]
        except KeyError:
            # Only articles are stored without a source URL.
            if itemtype != self.ARTICLE:
                return False
            source_url = ""

        item_url = item.get("url", source_url)

        # Articles take their description from "content"; other kinds
        # use "description" and fall back to an empty string.
        if itemtype == self.ARTICLE:
            description = item["content"]
        else:
            description = item.get("description", "")
        if description is None:
            description = ""

        if "title" in item:
            title = item["title"]
        else:
            title = item.get("display_name", "Untitled")

        views = item.get("views", 0)

        cursor.execute("SELECT `Id` FROM items WHERE `ProviderId` = ? LIMIT 1", (unique_id,))
        results = cursor.fetchall()

        if results:
            print("Skipped item %s" % title)
            item["_cl_id"] = results[0][0]
            return True

        # Built only on the insert path: an item that already exists
        # never needs its parent's "_cl_id".
        params = (itemtype, unique_id, title, description, item_url,
                  source_url, views, parent["_cl_id"])
        try:
            cursor.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`)"
                           "VALUES (1, ?, 1, ?, ?, ?, ?, ?, ?, ?, 0)", params)
        except oursql.ProgrammingError:
            # Dump the offending values before letting the error through.
            print(repr(params))
            print(repr(description))
            raise

        print("Inserted item %s" % title)
        item["_cl_id"] = cursor.lastrowid
        return True


crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()