From 2c3bcc5418a883fbeb3fd0fb1578106eb22512bd Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Wed, 30 Jan 2013 20:42:46 +0100
Subject: [PATCH] Rewrite Khan Academy crawler

---
 Review notes (this section is not part of the commit message):
 * Line breaks and indentation had been lost and were restored from the
   Python syntax. Leading whitespace, and the position of the blank context
   line in the update.py hunk, are best guesses; check them against the tree
   before applying.
 * khan.py: both parent-ID lookups now catch KeyError and TypeError, so a
   missing parent (None) and a parent without '_cl_id' are handled alike.
 * khan.py: a null 'date_added' no longer aborts the crawl; topic 'title' /
   'description' and article 'content' are read with .get().
 * khan.py: "except X, e" became "except X:" (the bound name was unused) and
   one-line docstrings were added. The blob id on the khan.py index line
   predates these edits.

 updater/scrapers/khan.py | 203 +++++++++++++++++++++++++++++++++++++++
 updater/update.py        |   2 +-
 updater/update_khan.py   | 131 --------------------------
 3 files changed, 204 insertions(+), 132 deletions(-)
 create mode 100644 updater/scrapers/khan.py
 delete mode 100644 updater/update_khan.py

diff --git a/updater/scrapers/khan.py b/updater/scrapers/khan.py
new file mode 100644
index 0000000..c4fa312
--- /dev/null
+++ b/updater/scrapers/khan.py
@@ -0,0 +1,203 @@
+import datetime, json, sys
+import requests, oursql
+import shared
+
+class KhanAcademy(shared.Scraper):
+    """Scraper that imports the Khan Academy topic tree (topics and their objects)."""
+    provider_id = 1
+
+    def run(self):
+        """Download the topic tree and walk it from the root node."""
+        self.retrieve_dataset()
+        self.process_item(self.dataset, 0)
+
+    def retrieve_dataset(self):
+        """Fetch the topic tree JSON from the Khan Academy API into self.dataset."""
+        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
+
+    def process_item(self, item, level, parent=None):
+        """Dispatch one tree node on its 'kind', then recurse into its children."""
+        try:
+            kind = item["kind"]
+        except KeyError:
+            return
+
+        if kind == "Topic":
+            self.process_topic(item, level, parent=parent)
+        elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
+            self.process_object(item, level, parent=parent)
+        elif kind == "Separator":
+            pass # Ignore separators
+        else:
+            self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)
+
+        try:
+            children = item["children"]
+        except KeyError:
+            return
+
+        for child in children:
+            self.process_item(child, level + 1, item)
+
+    def process_topic(self, item, level, parent=None):
+        """Insert a topic node and store the resulting row ID on the node as '_cl_id'."""
+        unique_id = item["id"]
+
+        try:
+            parent_id = parent["_cl_id"]
+        except (KeyError, TypeError):
+            parent_id = 0
+
+        # Check if a title is set
+        if item.get("title") is not None:
+            title = item["title"]
+        else:
+            # No title was set - log this as an error and default to 'Untitled'.
+            self.env.log("No title found for item: %s" % repr(item), True)
+            title = "Untitled"
+
+        # Check if a description is set, and default to no description if not
+        if item.get("description") is not None:
+            description = item["description"]
+        else:
+            description = None
+
+        # Insert the topic
+        inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)
+
+        # Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
+        item["_cl_id"] = row_id
+
+        if inserted:
+            self.env.log("Inserted %s" % title)
+        else:
+            self.env.log("Skipped %s" % title)
+
+    def process_object(self, item, level, parent=None):
+        """Insert a Video, Exercise, Article or Scratchpad node under its parent topic."""
+        unique_id = None
+
+        # First check for the 'readable_id' property
+        try:
+            unique_id = item["readable_id"]
+        except KeyError:
+            pass
+
+        # If no identifier was found, check for the 'name' property
+        if unique_id is None:
+            try:
+                unique_id = item["name"]
+            except KeyError:
+                pass
+
+        # If still no identifier was found, check for the 'id' property
+        if unique_id is None:
+            try:
+                unique_id = str(item["id"])
+            except KeyError:
+                pass
+
+        # If we *still* do not have an identifier, log the error and bail out
+        if unique_id is None:
+            self.env.log("No suitable identifier found for item: %s" % repr(item), True)
+            return
+
+        # Determine the object type
+        if item["kind"] == "Video":
+            itemtype = self.VIDEO
+        elif item["kind"] == "Exercise":
+            itemtype = self.EXERCISE
+        elif item["kind"] == "Article":
+            itemtype = self.ARTICLE
+        elif item["kind"] == "Scratchpad":
+            itemtype = self.SANDBOX
+
+        source_url = None
+
+        # Determine the source URL via the 'ka_url' property
+        try:
+            source_url = item["ka_url"]
+        except KeyError:
+            pass
+
+        # If no source URL was found, try the 'url' property
+        if source_url is None:
+            try:
+                source_url = item["url"]
+            except KeyError:
+                pass
+
+        # If still no source URL was found...
+        if source_url is None:
+            if itemtype == self.ARTICLE:
+                # Articles can lack a URL.
+                source_url = None
+            else:
+                # There was no source URL, but this wasn't an article. Log the error and bail out.
+                self.env.log("No source URL found for non-article object: %s" % repr(item), True)
+                return
+
+        # Determine the (external) item URL
+        try:
+            item_url = item["url"]
+        except KeyError:
+            # Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
+            item_url = source_url
+
+        # If the object is an article, we'll want to use the actual article content as description.
+        if itemtype == self.ARTICLE:
+            description = item.get("content")
+        else:
+            # Otherwise, we'll check if there's a 'description' property. If not, leave empty.
+            try:
+                description = item["description"]
+            except KeyError:
+                description = None
+
+        title = None
+
+        # First check the 'title' property for an object title.
+        try:
+            title = item["title"]
+        except KeyError:
+            pass
+
+        # As second option, check the 'display_name' property.
+        if title is None:
+            try:
+                title = item["display_name"]
+            except KeyError:
+                # Apparently it really does not have a title. Log the error and default to 'Untitled'.
+                self.env.log("No object title found for item: %s" % repr(item), True)
+                title = "Untitled"
+
+        # If a 'views' property is present, include it.
+        try:
+            views = item["views"]
+        except KeyError:
+            views = None
+
+        # If a creation date is present, include it.
+        try:
+            date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
+        except (KeyError, TypeError):
+            date = None
+
+        # Check if there is a parent ID
+        try:
+            parent_id = parent["_cl_id"]
+        except (KeyError, TypeError):
+            # No parent ID present - log this as an error and default to 0.
+            self.env.log("No parent ID found for item: %s" % repr(item), True)
+            parent_id = 0
+
+        # Insert the item
+        inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)
+
+        # Store the resulting row ID in the item so that the children know the ID of their parent.
+        item["_cl_id"] = row_id
+
+        if inserted:
+            self.env.log("Inserted %s" % title)
+        else:
+            self.env.log("Skipped %s" % title)
diff --git a/updater/update.py b/updater/update.py
index 8efbfb6..81c182e 100644
--- a/updater/update.py
+++ b/updater/update.py
@@ -4,5 +4,5 @@ import shared, scrapers
 
 env = shared.Environment()
 env.connect(host="localhost", username="root", password="", database="learn")
-scraper = env.Scraper(scrapers.Coursera)
+scraper = env.Scraper(scrapers.KhanAcademy)
 scraper.run()
diff --git a/updater/update_khan.py b/updater/update_khan.py
deleted file mode 100644
index 8cc5dfe..0000000
--- a/updater/update_khan.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import requests
-import oursql
-import datetime
-import json
-import lib
-
-class KhanUniversityCrawler(object):
-    def __init__(self):
-        self.db = lib.Database("localhost", "root")
-
-    def retrieve_dataset(self):
-        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
-        #self.dataset = json.loads(open("data.json", "r").read())
-
-    def parse_dataset(self):
-        self.process_item(self.dataset, 0)
-
-    def process_item(self, item, level, parent=None):
-        try:
-            kind = item["kind"]
-        except KeyError, e:
-            return
-
-        if kind == "Topic":
-            unique_id = item["id"]
-
-            try:
-                parent_id = parent["_cl_id"]
-            except TypeError, e:
-                parent_id = 0
-
-            if item["title"] is not None:
-                title = item["title"]
-            else:
-                title = ""
-
-            inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
-            item["_cl_id"] = rowid
-
-            if inserted:
-                print "Inserted %s" % title
-            else:
-                print "Skipped %s" % title
-        elif kind in ("Video", "Exercise", "Article"):
-            try:
-                unique_id = item["readable_id"]
-            except KeyError, e:
-                try:
-                    unique_id = item["name"]
-                except KeyError, e:
-                    try:
-                        unique_id = str(item["id"])
-                    except KeyError, e:
-                        print repr(item)
-                        sys.stderr.write("WARNING: No suitable identifier found for item\n")
-                        raise
-                        return
-
-            if item["kind"] == "Video":
-                itemtype = self.db.VIDEO
-            elif item["kind"] == "Exercise":
-                itemtype = self.db.EXERCISE
-            elif item["kind"] == "Article":
-                itemtype = self.db.ARTICLE
-
-            try:
-                source_url = item["ka_url"]
-            except KeyError, e:
-                if itemtype == self.db.ARTICLE:
-                    source_url = ""
-                else:
-                    return
-
-            try:
-                item_url = item["url"]
-            except KeyError, e:
-                try:
-                    item_url = item["ka_url"]
-                except KeyError, e:
-                    item_url = None
-
-            if itemtype == self.db.ARTICLE:
-                description = item["content"]
-            else:
-                try:
-                    description = item["description"]
-                except KeyError, e:
-                    description = None
-
-            try:
-                title = item["title"]
-            except KeyError, e:
-                try:
-                    title = item["display_name"]
-                except KeyError, e:
-                    title = "Untitled"
-
-            try:
-                views = item["views"]
-            except KeyError, e:
-                views = None
-
-            try:
-                date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
-            except KeyError, e:
-                date = None
-
-            inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
-            item["_cl_id"] = rowid
-
-            if inserted:
-                print "Inserted %s" % title
-            else:
-                print "Skipped %s" % title
-        elif kind == "Separator":
-            pass # Ignore separators
-        else:
-            sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
-            sys.stderr.write("%s\n" % (repr(item)))
-
-        try:
-            children = item["children"]
-        except KeyError, e:
-            pass
-        else:
-            for child in children:
-                self.process_item(child, level + 1, item)
-
-crawler = KhanUniversityCrawler()
-crawler.retrieve_dataset()
-crawler.parse_dataset()