Rewrite Khan Academy crawler
parent
d9034b6215
commit
2c3bcc5418
@ -0,0 +1,197 @@
|
|||||||
|
import datetime, json, sys
|
||||||
|
import requests, oursql
|
||||||
|
import shared
|
||||||
|
|
||||||
|
class KhanAcademy(shared.Scraper):
    """Scraper that walks the Khan Academy topic tree and stores its contents.

    ``run`` downloads the complete topic tree as one JSON document and then
    recurses through it: ``Topic`` nodes are passed to ``insert_topic`` and
    ``Video`` / ``Exercise`` / ``Article`` / ``Scratchpad`` nodes are passed
    to ``insert_item``.

    ``insert_topic``, ``insert_item``, ``env`` and the item-type constants
    (``VIDEO``, ``EXERCISE``, ``ARTICLE``, ``SANDBOX``) are not defined in
    this class; presumably they are provided by ``shared.Scraper`` -- confirm
    there.
    """

    # NOTE(review): presumably the database ID of the Khan Academy provider,
    # consumed by the base class -- confirm against shared.Scraper.
    provider_id = 1

    # 'kind' values that are stored as items, mapped to the name of the
    # item-type constant (looked up on the instance) that describes them.
    _ITEM_TYPE_NAMES = {
        "Video": "VIDEO",
        "Exercise": "EXERCISE",
        "Article": "ARTICLE",
        "Scratchpad": "SANDBOX",
    }

    def run(self):
        """Download the topic tree and process it from the root node down."""
        self.retrieve_dataset()
        self.process_item(self.dataset, 0)

    def retrieve_dataset(self):
        """Fetch the full topic tree and keep the decoded JSON in ``self.dataset``.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        response = requests.get("http://www.khanacademy.org/api/v1/topictree")
        # Fail loudly on an HTTP error status instead of trying to decode an
        # error page as JSON.
        response.raise_for_status()
        self.dataset = response.json()

    def process_item(self, item, level, parent=None):
        """Store one node of the topic tree, then recurse into its children.

        Args:
            item: The decoded JSON object (a dict) for this node.
            level: Depth of the node; the root is 0 and every recursion step
                adds 1.
            parent: The dict of the enclosing node, or None for the root.

        Nodes without a 'kind' key are skipped together with their children.
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            self.process_topic(item, level, parent=parent)
        elif kind in self._ITEM_TYPE_NAMES:
            self.process_object(item, level, parent=parent)
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            self.env.log("Unrecognized kind: %s" % repr(kind), True)

        # A missing 'children' key and an explicit null both mean "no children".
        for child in item.get("children") or ():
            self.process_item(child, level + 1, item)

    def process_topic(self, item, level, parent=None):
        """Insert a ``Topic`` node and remember its row ID on the node.

        Args:
            item: The topic's dict; must contain an 'id' key.
            level: Depth of the node (not used here).
            parent: The enclosing node (not used here; the topic insert call
                takes no parent ID).

        The row ID returned by ``insert_topic`` is stored under the
        '_cl_id' key so that ``process_object`` can read it from the parent.
        """
        unique_id = item["id"]

        # Check if a title is set
        title = item.get("title")
        if title is None:
            # No title was set - log this as an error and default to 'Untitled'.
            self.env.log("No title found for item: %s" % repr(item), True)
            title = "Untitled"

        # A missing or null description simply means no description.
        description = item.get("description")

        # Insert the topic
        inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)

        # Set the ID of the newly inserted row so that all objects in this
        # topic know the ID of their topic.
        item["_cl_id"] = row_id

        self._log_outcome(inserted, title)

    def process_object(self, item, level, parent=None):
        """Insert a video, exercise, article or scratchpad node as an item.

        Args:
            item: The object's dict.
            level: Depth of the node (not used here).
            parent: The enclosing node; its '_cl_id' value is used as the
                topic ID. When it is None or carries no '_cl_id', the problem
                is logged and the topic ID defaults to 0.

        The object is logged and skipped when no identifier can be found,
        when its kind has no item type, or when a non-article has no URL.
        """
        # Identifier: prefer 'readable_id', then 'name', then the stringified 'id'.
        unique_id = self._first_value(item, ("readable_id", "name"))
        if unique_id is None and item.get("id") is not None:
            unique_id = str(item["id"])

        # If we *still* do not have an identifier, log the error and bail out
        if unique_id is None:
            self.env.log("No suitable identifier found for item: %s" % repr(item), True)
            return

        # Determine the object type
        type_name = self._ITEM_TYPE_NAMES.get(item.get("kind"))
        if type_name is None:
            # Only reachable when this method is called directly with a kind
            # that process_item would not have routed here.
            self.env.log("Unrecognized kind: %s" % repr(item.get("kind")), True)
            return
        itemtype = getattr(self, type_name)
        is_article = itemtype == self.ARTICLE

        # Source URL: prefer 'ka_url', then 'url'. Articles can lack a URL.
        source_url = self._first_value(item, ("ka_url", "url"))
        if source_url is None and not is_article:
            # There was no source URL, but this wasn't an article. Log the error and bail out.
            self.env.log("No source URL found for non-article object: %s" % repr(item), True)
            return

        # Determine the (external) item URL; without one, the source URL is
        # used as item URL - this will most likely be correct.
        item_url = item.get("url")
        if item_url is None:
            item_url = source_url

        # For an article the actual article content serves as description;
        # anything else uses its 'description' property, if there is one.
        if is_article:
            description = item.get("content")
        else:
            description = item.get("description")

        # Title: prefer 'title', then 'display_name'.
        title = self._first_value(item, ("title", "display_name"))
        if title is None:
            # Apparently it really does not have a title. Log the error and default to 'Untitled'.
            self.env.log("No object title found for item: %s" % repr(item), True)
            title = "Untitled"

        # If a 'views' property is present, include it.
        views = item.get("views")

        # If a creation date is present, include it.
        date = self._parse_date(item)

        # Check if there is a parent ID. TypeError covers parent=None.
        try:
            parent_id = parent["_cl_id"]
        except (KeyError, TypeError):
            # No parent ID present - log this as an error and default to 0.
            self.env.log("No parent ID found for item: %s" % repr(item), True)
            parent_id = 0

        # Insert the item
        inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)

        # Store the resulting row ID in the item so that the children know the ID of their parent.
        item["_cl_id"] = row_id

        self._log_outcome(inserted, title)

    @staticmethod
    def _first_value(item, keys):
        """Return the first value in ``item`` under ``keys`` that is not None, else None."""
        for key in keys:
            value = item.get(key)
            if value is not None:
                return value
        return None

    def _parse_date(self, item):
        """Return the item's 'date_added' as a datetime, or None if absent or unparseable."""
        raw_date = item.get("date_added")
        if raw_date is None:
            return None
        try:
            return datetime.datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%SZ")
        except (TypeError, ValueError):
            # A malformed date should not abort the whole crawl.
            self.env.log("Unparseable creation date for item: %s" % repr(item), True)
            return None

    def _log_outcome(self, inserted, title):
        """Log whether the insert call reported a new row ('Inserted') or not ('Skipped')."""
        if inserted:
            self.env.log("Inserted %s" % title)
        else:
            self.env.log("Skipped %s" % title)
|
@ -1,131 +0,0 @@
|
|||||||
import requests
|
|
||||||
import oursql
|
|
||||||
import datetime
|
|
||||||
import json
|
|
||||||
import lib
|
|
||||||
|
|
||||||
class KhanUniversityCrawler(object):
    """Crawler that stores the Khan Academy topic tree through ``lib.Database``.

    ``retrieve_dataset`` downloads the topic tree as one JSON document and
    ``parse_dataset`` walks it recursively: ``Topic`` nodes are passed to
    ``db.insert_topic`` and ``Video`` / ``Exercise`` / ``Article`` nodes to
    ``db.insert_item``. Progress is printed to stdout and problems are
    written to stderr.
    """

    def __init__(self):
        """Open the database connection used for all inserts."""
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Fetch the full topic tree and keep the decoded JSON in ``self.dataset``."""
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
        # For offline debugging, a saved copy can be loaded instead:
        #self.dataset = json.loads(open("data.json", "r").read())

    def parse_dataset(self):
        """Process the previously retrieved dataset from the root node down."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store one node of the topic tree, then recurse into its children.

        Args:
            item: The decoded JSON object (a dict) for this node.
            level: Depth of the node; the root is 0 and every recursion step
                adds 1.
            parent: The dict of the enclosing node, or None for the root.

        Raises:
            KeyError: if an object node has none of the 'readable_id',
                'name' and 'id' keys.

        Nodes without a 'kind' key, and non-article objects without a
        'ka_url', are skipped together with their children.
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            self._store_topic(item)
        elif kind in ("Video", "Exercise", "Article"):
            if not self._store_object(item, parent):
                return
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            self._warn("Unrecognized kind: %s\n" % item["kind"])
            self._warn("%s\n" % (repr(item)))

        try:
            children = item["children"]
        except KeyError:
            return

        for child in children:
            self.process_item(child, level + 1, item)

    @staticmethod
    def _warn(message):
        """Write ``message`` to stderr unchanged."""
        # Imported here because the module-level imports do not include sys.
        import sys
        sys.stderr.write(message)

    def _store_topic(self, item):
        """Insert a topic node and record the returned row ID under '_cl_id'."""
        unique_id = item["id"]

        title = item["title"]
        if title is None:
            title = ""

        inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
        item["_cl_id"] = rowid

        if inserted:
            print("Inserted %s" % title)
        else:
            print("Skipped %s" % title)

    def _store_object(self, item, parent):
        """Insert a video, exercise or article node; return False if it was skipped."""
        # Identifier: the first of 'readable_id' and 'name' that is present,
        # otherwise the stringified 'id'.
        for key in ("readable_id", "name"):
            if key in item:
                unique_id = item[key]
                break
        else:
            try:
                unique_id = str(item["id"])
            except KeyError:
                print(repr(item))
                self._warn("WARNING: No suitable identifier found for item\n")
                raise

        if item["kind"] == "Video":
            itemtype = self.db.VIDEO
        elif item["kind"] == "Exercise":
            itemtype = self.db.EXERCISE
        elif item["kind"] == "Article":
            itemtype = self.db.ARTICLE

        try:
            source_url = item["ka_url"]
        except KeyError:
            if itemtype == self.db.ARTICLE:
                source_url = ""
            else:
                # Anything but an article is skipped when it has no 'ka_url'.
                return False

        # Item URL: 'url', then 'ka_url', then nothing.
        if "url" in item:
            item_url = item["url"]
        elif "ka_url" in item:
            item_url = item["ka_url"]
        else:
            item_url = None

        # Articles use their content as description.
        if itemtype == self.db.ARTICLE:
            description = item["content"]
        else:
            try:
                description = item["description"]
            except KeyError:
                description = None

        # Title: 'title', then 'display_name', then a fixed placeholder.
        if "title" in item:
            title = item["title"]
        elif "display_name" in item:
            title = item["display_name"]
        else:
            title = "Untitled"

        try:
            views = item["views"]
        except KeyError:
            views = None

        try:
            date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
        except KeyError:
            date = None

        inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
        item["_cl_id"] = rowid

        if inserted:
            print("Inserted %s" % title)
        else:
            print("Skipped %s" % title)
        return True
|
|
||||||
|
|
||||||
# Run the crawl only when this file is executed as a script, so that
# importing the module does not trigger network and database work.
if __name__ == "__main__":
    crawler = KhanUniversityCrawler()
    crawler.retrieve_dataset()
    crawler.parse_dataset()
|
|
Loading…
Reference in New Issue