You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.5 KiB
Python
48 lines
1.5 KiB
Python
import requests
|
|
import oursql
|
|
import datetime
|
|
import json
|
|
import lib
|
|
|
|
class CourseraCrawler(object):
    """Load a Coursera topic listing and store it through ``lib.Database``.

    Intended call order is ``retrieve_dataset()`` followed by
    ``parse_dataset()``.  Progress is reported on stdout, one line per topic
    and one tab-indented line per course.

    The ``print(...)`` calls use a single parenthesised argument so they
    produce the same output under the Python 2 print statement this file
    was written for.
    """

    def __init__(self):
        """Create the database handle used for every insert."""
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Parse ``coursera.json`` (current directory) into ``self.dataset``."""
        # Live alternative, currently disabled in favour of the local file:
        # self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        # The with-block guarantees the file handle is closed even when the
        # JSON is malformed and parsing raises.
        with open("coursera.json", "r") as dataset_file:
            self.dataset = json.load(dataset_file)

    def parse_dataset(self):
        """Process every entry of ``self.dataset`` in order."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Insert one topic, then each course listed under it.

        ``item`` is read for the keys ``id``, ``name``,
        ``short_description`` and ``courses``.
        """
        # NOTE(review): the leading 2 is presumably the provider/source id
        # for Coursera -- confirm against lib.Database.insert_topic.
        inserted, rowid = self.db.insert_topic(
            2,
            str(item["id"]),
            item["name"],
            description=item["short_description"],
            needs_enrollment=True,
        )

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Insert one course under the topic row identified by ``topicid``.

        ``course`` is read for the keys ``id``, ``name``, ``home_link``,
        ``certificate_description`` and ``start_year`` / ``start_month`` /
        ``start_day``.  When the date parts cannot form a ``datetime``
        (a ``TypeError``, e.g. a part is ``None``) the course is stored
        without a start date and its title is marked as undetermined.
        """
        try:
            start_date = datetime.datetime(
                course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (
                course["name"],
                str(course["start_year"]).zfill(4),
                str(course["start_month"]).zfill(2),
                str(course["start_day"]).zfill(2),
            )
        except TypeError:
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        # NOTE(review): same leading 2 as in process_item; the itemid
        # returned by insert_item is not used here.
        inserted, itemid = self.db.insert_item(
            2,
            str(course["id"]),
            True,
            self.db.COURSE,
            title,
            course["home_link"],
            description=course["certificate_description"],
            start_date=start_date,
            topic_id=topicid,
        )

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
|
|
|
|
def _run_crawl():
    """Build a CourseraCrawler, load its dataset, process it, and return it."""
    instance = CourseraCrawler()
    instance.retrieve_dataset()
    instance.parse_dataset()
    return instance


# Runs at module load, exactly as the original top-level statements did.
crawler = _run_crawl()
|