commit 8152ec8dca045cffee287addc8bbb7f5f459c2f9 Author: Sven Slootweg Date: Sun Jan 27 22:31:57 2013 +0100 First version of update script diff --git a/updater/update.py b/updater/update.py new file mode 100644 index 0000000..ce6d256 --- /dev/null +++ b/updater/update.py @@ -0,0 +1,171 @@ +import requests +import oursql +import datetime +import json + +database = oursql.connect(host="localhost", user="root", db="learn") + +def unicodedammit(input_string): + if isinstance(input_string, str): + return input_string.decode('utf-8') + else: + return input_string + +class KhanUniversityCrawler(object): + TOPIC = 1 + COURSE = 2 + VIDEO = 3 + ARTICLE = 4 + EXERCISE = 5 + QUIZ = 6 + TEST = 7 + BOOK = 8 + + def __init__(self): + pass + + def retrieve_dataset(self): + #self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json() + self.dataset = json.loads(open("data.json", "r").read()) + + def parse_dataset(self): + self.process_item(self.dataset, 0) + + def process_item(self, item, level, parent=None): + global database + + c = database.cursor() + + try: + kind = item["kind"] + except KeyError, e: + return + + if kind == "Topic": + unique_id = item["id"] + + try: + parent_id = parent["_cl_id"] + except TypeError, e: + parent_id = 0 + + if item["description"] is not None: + description = item["description"] + else: + description = "" + + if item["title"] is not None: + title = item["title"] + else: + title = "" + + c.execute("SELECT `Id` FROM topics WHERE `ProviderId` = ? LIMIT 1", (unique_id,)) + results = c.fetchall() + exists = (len(results) > 0) + + if not exists: + c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`)" + "VALUES (?, 1, ?, ?, ?, ?, 0)", (parent_id, unique_id, title, description, datetime.datetime.now())) + + print "Inserted topic %s" % title + + item["_cl_id"] = c.lastrowid + else: + print u"Skipped topic %s" % title + item["_cl_id"] = results[0][0] + elif kind in ("Video", "Exercise", "Article"): + try: + unique_id = item["readable_id"] + except KeyError, e: + try: + unique_id = item["name"] + except KeyError, e: + try: + unique_id = str(item["id"]) + except KeyError, e: + print repr(item) + sys.stderr.write("WARNING: No suitable identifier found for item\n") + raise + return + + if item["kind"] == "Video": + itemtype = self.VIDEO + elif item["kind"] == "Exercise": + itemtype = self.EXERCISE + elif item["kind"] == "Article": + itemtype = self.ARTICLE + + try: + source_url = item["ka_url"] + except KeyError, e: + if itemtype == self.ARTICLE: + source_url = "" + else: + return + + try: + item_url = item["url"] + except KeyError, e: + item_url = source_url + + if itemtype == self.ARTICLE: + description = item["content"] + else: + try: + description = item["description"] + except KeyError, e: + description = "" + + if description is None: + description = "" + + try: + title = item["title"] + except KeyError, e: + try: + title = item["display_name"] + except KeyError, e: + title = "Untitled" + + try: + views = item["views"] + except KeyError, e: + views = 0 + + c.execute("SELECT `Id` FROM items WHERE `ProviderId` = ? LIMIT 1", (unique_id,)) + results = c.fetchall() + exists = (len(results) > 0) + + if not exists: + try: + c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`)" + "VALUES (1, ?, 1, ?, ?, ?, ?, ?, ?, ?, 0)", (itemtype, unique_id, title, description, item_url, source_url, views, parent["_cl_id"])) + except oursql.ProgrammingError, e: + print repr((itemtype, unique_id, title, description, item_url, source_url, views, parent["_cl_id"])) + print repr(description) + raise + + print "Inserted item %s" % title + + item["_cl_id"] = c.lastrowid + else: + print "Skipped item %s" % title + item["_cl_id"] = results[0][0] + elif kind == "Separator": + pass # Ignore separators + else: + print "Unrecognized kind: %s" % item["kind"] + print repr(item) + date = datetime.datetime.strptime("2008-08-12T12:20:30Z", "%Y-%m-%dT%H:%M:%SZ") + + try: + children = item["children"] + except KeyError, e: + pass + else: + for child in children: + self.process_item(child, level + 1, item) + +crawler = KhanUniversityCrawler() +crawler.retrieve_dataset() +crawler.parse_dataset()