diff --git a/updater/scrapers/ureddit.py b/updater/scrapers/ureddit.py
new file mode 100644
index 0000000..12ff44e
--- /dev/null
+++ b/updater/scrapers/ureddit.py
@@ -0,0 +1,55 @@
+import datetime, json, simplejson, sys, re
+import requests
+import shared
+
+class UniversityOfReddit(shared.Scraper):
+    provider_id = 3
+
+    def run(self):
+        data = requests.get("http://ureddit.com/api?type=catalog").json()
+
+        for category in data["categories"]:
+            self.parse_category(category['id'], category['value'])
+
+    def parse_category(self, category_id, category_name):
+        try:
+            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
+        except simplejson.decoder.JSONDecodeError, e:
+            return
+
+        for _class in data["classes"]:
+            if not self.topic_exists(_class['id']):
+                self.parse_class(_class['id'], _class['value'], category_name)
+            else:
+                self.env.log("Skipped class %s" % _class['value'])
+
+    def parse_class(self, class_id, class_name, category_name):
+        try:
+            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
+        except simplejson.decoder.JSONDecodeError, e:
+            self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
+            return
+
+        if data["status"] == '1' or data["status"] == '3' or data["status"] == '5':
+            try:
+                creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
+            except ValueError, e:
+                creation_date = None
+
+            class_page = data["url"]
+
+            inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)
+
+            if inserted:
+                self.env.log("Inserted topic %s" % data["name"])
+            else:
+                self.env.log("Skipped topic %s" % data["name"])
+
+            inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])
+
+            if inserted:
+                self.env.log("Inserted item %s" % data["name"])
+            else:
+                self.env.log("Skipped item %s" % data["name"])
+        else:
+            self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))
diff --git a/updater/update.py b/updater/update.py
index 81c182e..4d4d335 100644
--- a/updater/update.py
+++ b/updater/update.py
@@ -4,5 +4,5 @@ import shared, scrapers
 env = shared.Environment()
 env.connect(host="localhost", username="root", password="", database="learn")
 
-scraper = env.Scraper(scrapers.KhanAcademy)
+scraper = env.Scraper(scrapers.UniversityOfReddit)
 scraper.run()
diff --git a/updater/update_ureddit.py b/updater/update_ureddit.py
deleted file mode 100644
index 80a94f9..0000000
--- a/updater/update_ureddit.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import requests
-import oursql
-import datetime
-import json, simplejson
-import lib
-import re
-
-class UredditCrawler(object):
-    def __init__(self):
-        self.db = lib.Database("localhost", "root")
-
-    def parse_catalog(self):
-        data = requests.get("http://ureddit.com/api?type=catalog").json()
-
-        for category in data["categories"]:
-            self.parse_category(category['id'], category['value'])
-
-    def parse_category(self, category_id, category_name):
-        try:
-            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
-        except simplejson.decoder.JSONDecodeError, e:
-            return
-
-        for _class in data["classes"]:
-            if not self.db.topic_exists(3, _class['id']):
-                self.parse_class(_class['id'], _class['value'], category_name)
-            else:
-                print "Skipped class %s" % _class['value']
-
-    def parse_class(self, class_id, class_name, category_name):
-        try:
-            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
-        except simplejson.decoder.JSONDecodeError, e:
-            print "Skipped %s due to JSON formatting error" % class_name
-            return
-
-        try:
-            creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
-        except ValueError, e:
-            creation_date = None
-
-        # Hack to get the class page as this isn't returned by the API
-        html_data = requests.get("http://ureddit.com/show_class.php?id=%s&show=true" % class_id).text
-        matches = re.search('