You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

132 lines
3.1 KiB
Python

import datetime
import json
import sys

import oursql
import requests

import lib
class KhanUniversityCrawler(object):
    """Download the Khan Academy topic tree and store it via ``lib.Database``.

    The tree is a nested structure of dicts. Every node carries a ``"kind"``;
    topics are stored with ``insert_topic`` and videos, exercises and articles
    with ``insert_item``. The row id returned by the database is written back
    into each stored node under ``"_cl_id"`` so that children can refer to
    their parent.
    """

    # Endpoint that returns the complete topic tree as one JSON document.
    TOPIC_TREE_URL = "http://www.khanacademy.org/api/v1/topictree"
    # Value passed as the first positional argument of every insert call.
    PROVIDER_ID = 1
    # Layout of the "date_added" timestamps.
    DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
    # Node kinds that are stored as items rather than topics.
    ITEM_KINDS = ("Video", "Exercise", "Article")

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Fetch the topic tree and keep the decoded JSON in ``self.dataset``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.get(self.TOPIC_TREE_URL)
        # Fail on HTTP errors instead of trying to decode an error page.
        response.raise_for_status()
        self.dataset = response.json()
        # For offline debugging, a saved copy can be loaded instead:
        #   self.dataset = json.loads(open("data.json", "r").read())

    def parse_dataset(self):
        """Walk the previously retrieved dataset starting at its root node."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store one node of the tree and then recurse into its children.

        Args:
            item: dict describing the node; nodes without a ``"kind"`` key
                are ignored together with their children.
            level: depth of ``item`` in the tree (only passed on, increased
                by one, to the recursive calls).
            parent: the enclosing node, or ``None`` for the root.

        Raises:
            KeyError: if a video/exercise/article has none of the keys
                ``"readable_id"``, ``"name"`` or ``"id"``, or if another
                required key is missing from the node.
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            self._store_topic(item, parent)
        elif kind in self.ITEM_KINDS:
            if not self._store_item(item, kind, parent):
                # The item was skipped; its children are skipped as well.
                return
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
            sys.stderr.write("%s\n" % (repr(item)))

        # A missing or null "children" entry means there is nothing below.
        for child in item.get("children") or ():
            self.process_item(child, level + 1, item)

    def _store_topic(self, item, parent):
        """Insert a ``Topic`` node and record its row id on the node."""
        unique_id = item["id"]

        # NOTE(review): parent_id is resolved but never forwarded to
        # insert_topic. The lookup is kept because it can raise KeyError for
        # a parent without "_cl_id"; confirm whether insert_topic should
        # receive the value.
        try:
            parent_id = parent["_cl_id"]
        except TypeError:
            # parent is None for the root node.
            parent_id = 0

        title = item["title"] if item["title"] is not None else ""

        inserted, rowid = self.db.insert_topic(
            self.PROVIDER_ID, unique_id, title,
            description=item["description"], needs_enrollment=False)
        item["_cl_id"] = rowid
        self._report(inserted, title)

    def _store_item(self, item, kind, parent):
        """Insert a video, exercise or article node.

        Returns:
            True when the database was called, False when the node was
            skipped because it is not an article and has no ``"ka_url"``.
        """
        unique_id = self._resolve_unique_id(item)

        if kind == "Video":
            itemtype = self.db.VIDEO
        elif kind == "Exercise":
            itemtype = self.db.EXERCISE
        else:
            itemtype = self.db.ARTICLE

        if "ka_url" in item:
            source_url = item["ka_url"]
        elif itemtype == self.db.ARTICLE:
            source_url = ""
        else:
            return False

        item_url = self._first_present(item, ("url", "ka_url"), None)

        if itemtype == self.db.ARTICLE:
            description = item["content"]
        else:
            description = item.get("description")

        title = self._first_present(item, ("title", "display_name"), "Untitled")
        views = item.get("views")

        # A missing or null timestamp is stored as "no date".
        raw_date = item.get("date_added")
        if raw_date is None:
            date = None
        else:
            date = datetime.datetime.strptime(raw_date, self.DATE_FORMAT)

        inserted, rowid = self.db.insert_item(
            self.PROVIDER_ID, unique_id, True, itemtype, title, item_url,
            source_url=source_url, description=description, views=views,
            topic_id=parent["_cl_id"], date=date)
        item["_cl_id"] = rowid
        self._report(inserted, title)
        return True

    @staticmethod
    def _resolve_unique_id(item):
        """Return the identifier of an item, preferring readable keys."""
        for key in ("readable_id", "name"):
            if key in item:
                return item[key]
        if "id" in item:
            return str(item["id"])

        print(repr(item))
        sys.stderr.write("WARNING: No suitable identifier found for item\n")
        raise KeyError("id")

    @staticmethod
    def _first_present(item, keys, default=None):
        """Return ``item[key]`` for the first key that exists, else default."""
        for key in keys:
            if key in item:
                return item[key]
        return default

    @staticmethod
    def _report(inserted, title):
        """Print whether the database inserted or skipped the entry."""
        if inserted:
            print("Inserted %s" % title)
        else:
            print("Skipped %s" % title)
# Script entry point: build the crawler, download the topic tree, then walk
# the tree and store every node.
# NOTE(review): there is no __main__ guard, so these statements also run when
# the module is imported.
crawler = KhanUniversityCrawler()
# Fetch the topic tree JSON over HTTP and keep it in crawler.dataset.
crawler.retrieve_dataset()
# Process the downloaded tree recursively, starting at the root with level 0.
crawler.parse_dataset()