import requests import oursql import datetime import json, simplejson import lib import re class UredditCrawler(object): def __init__(self): self.db = lib.Database("localhost", "root") def parse_catalog(self): data = requests.get("http://ureddit.com/api?type=catalog").json() for category in data["categories"]: self.parse_category(category['id'], category['value']) def parse_category(self, category_id, category_name): try: data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json() except simplejson.decoder.JSONDecodeError, e: return for _class in data["classes"]: if not self.db.topic_exists(3, _class['id']): self.parse_class(_class['id'], _class['value'], category_name) else: print "Skipped class %s" % _class['value'] def parse_class(self, class_id, class_name, category_name): try: data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json() except simplejson.decoder.JSONDecodeError, e: print "Skipped %s due to JSON formatting error" % class_name return try: creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S') except ValueError, e: creation_date = None # Hack to get the class page as this isn't returned by the API html_data = requests.get("http://ureddit.com/show_class.php?id=%s&show=true" % class_id).text matches = re.search('