You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
132 lines
3.1 KiB
Python
132 lines
3.1 KiB
Python
import datetime
import json
import sys

import oursql
import requests

import lib
class KhanUniversityCrawler(object):
    """Download the Khan Academy topic tree and store it via ``lib.Database``.

    Usage: call ``retrieve_dataset()`` to fetch the tree, then
    ``parse_dataset()`` to walk it and insert every topic and content item.
    """

    # Endpoint that returns the complete topic tree as one JSON document.
    TOPIC_TREE_URL = "http://www.khanacademy.org/api/v1/topictree"

    # Layout of the ``date_added`` timestamps found on content items.
    DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self):
        """Open the database handle used for all inserts."""
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self, timeout=60):
        """Fetch the topic tree and keep the decoded JSON in ``self.dataset``.

        :param timeout: seconds to wait on the connection/reads before
            giving up; without it a stalled server would hang the crawler.
        :raises requests.RequestException: on network failure or a non-2xx
            HTTP status.
        """
        # For offline runs, assign a previously saved copy of the tree
        # (e.g. loaded with ``json.load``) to ``self.dataset`` instead.
        response = requests.get(self.TOPIC_TREE_URL, timeout=timeout)
        # Fail with a clear HTTP error instead of a confusing JSON decode
        # error when the server answers with an error page.
        response.raise_for_status()
        self.dataset = response.json()

    def parse_dataset(self):
        """Walk ``self.dataset`` from its root and store everything found."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store one node of the tree, then recurse into its children.

        :param item: dict describing the node; nodes without a ``"kind"``
            key are ignored together with their children.
        :param level: depth of ``item`` in the tree (root is 0); only passed
            along to the recursive calls.
        :param parent: the node whose ``"children"`` list contains ``item``,
            or ``None`` for the root.
        :raises KeyError: if a Video/Exercise/Article has none of
            ``readable_id``, ``name`` or ``id``, if an Article lacks
            ``content``, or if ``parent`` carries no ``"_cl_id"``.
        :raises TypeError: if a Video/Exercise/Article is processed with
            ``parent`` set to ``None``.
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            self._store_topic(item)
        elif kind in ("Video", "Exercise", "Article"):
            if not self._store_content(item, kind, parent):
                # Item was skipped; its children are not visited either.
                return
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % (kind,))
            sys.stderr.write("%s\n" % (repr(item)))

        for child in item.get("children", ()):
            self.process_item(child, level + 1, item)

    def _store_topic(self, item):
        """Insert a Topic node and remember its row id on the node."""
        unique_id = item["id"]

        title = item.get("title")
        if title is None:
            title = ""

        # NOTE(review): the parent's row id is not forwarded to
        # insert_topic, so the topic hierarchy is not recorded here --
        # confirm whether lib.Database is expected to receive it.
        inserted, rowid = self.db.insert_topic(
            1, unique_id, title,
            description=item.get("description"),
            needs_enrollment=False)
        # Children read this back as their topic id.
        item["_cl_id"] = rowid

        self._report(inserted, title)

    def _store_content(self, item, kind, parent):
        """Insert a Video, Exercise or Article node.

        Returns ``True`` when the item was handed to the database and
        ``False`` when it was skipped because it has no ``ka_url`` (Articles
        are exempt and get an empty source URL instead).
        """
        unique_id = self._content_identifier(item)

        itemtype = {
            "Video": self.db.VIDEO,
            "Exercise": self.db.EXERCISE,
            "Article": self.db.ARTICLE,
        }[kind]

        if "ka_url" in item:
            source_url = item["ka_url"]
        elif itemtype == self.db.ARTICLE:
            source_url = ""
        else:
            return False

        item_url = self._first_present(item, ("url", "ka_url"))

        if itemtype == self.db.ARTICLE:
            description = item["content"]
        else:
            description = item.get("description")

        title = self._first_present(
            item, ("title", "display_name"), "Untitled")
        views = item.get("views")
        date = self._parse_date(item)

        # NOTE(review): parent must be a node that already received
        # "_cl_id" (i.e. a stored Topic or content item); a missing or
        # unstored parent raises here.
        inserted, rowid = self.db.insert_item(
            1, unique_id, True, itemtype, title, item_url,
            source_url=source_url, description=description, views=views,
            topic_id=parent["_cl_id"], date=date)
        item["_cl_id"] = rowid

        self._report(inserted, title)
        return True

    @staticmethod
    def _content_identifier(item):
        """Return ``readable_id``, else ``name``, else ``str(id)``.

        Dumps the item and re-raises the ``KeyError`` when none exists.
        """
        for key in ("readable_id", "name"):
            if key in item:
                return item[key]
        try:
            return str(item["id"])
        except KeyError:
            print(repr(item))
            sys.stderr.write(
                "WARNING: No suitable identifier found for item\n")
            raise

    @staticmethod
    def _first_present(item, keys, default=None):
        """Return the value of the first key of ``keys`` present in ``item``."""
        for key in keys:
            if key in item:
                return item[key]
        return default

    @classmethod
    def _parse_date(cls, item):
        """Parse ``item["date_added"]``; ``None`` if absent or unparseable."""
        try:
            return datetime.datetime.strptime(
                item["date_added"], cls.DATE_FORMAT)
        except (KeyError, TypeError, ValueError):
            # Missing key, null value, or unexpected format: treat the date
            # as unknown rather than aborting the whole crawl.
            return None

    @staticmethod
    def _report(inserted, title):
        """Print whether the database inserted or skipped ``title``."""
        if inserted:
            print("Inserted %s" % title)
        else:
            print("Skipped %s" % title)
# Script body: build the crawler, download the topic tree, then walk the
# tree and store every topic and content item in the database.
crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()