From 703a34bfa26e9416c5e2db023dee9499c101b0f2 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 27 Jan 2013 23:06:32 +0100 Subject: [PATCH] Reorganize updater code and add first design idea for frontend --- .gitignore | 1 + frontend/index.html | 25 ++++++ frontend/style.css | 57 ++++++++++++++ updater/lib.py | 85 ++++++++++++++++++++ updater/update.py | 171 ----------------------------------------- updater/update_khan.py | 131 +++++++++++++++++++++++++++++++ 6 files changed, 299 insertions(+), 171 deletions(-) create mode 100644 .gitignore create mode 100644 frontend/index.html create mode 100644 frontend/style.css create mode 100644 updater/lib.py delete mode 100644 updater/update.py create mode 100644 updater/update_khan.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..ee85385 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,25 @@ + + + + learn.cryto.net + + + + + +
+

learn.cryto.net :: Learn something new!

+
+
+
+ I want to learn about . +
+
class Database(object):
    """Small helper around the ``learn`` database used by the updater scripts.

    The integer class constants identify content kinds; callers pass one of
    them as ``itemtype`` to :meth:`insert_item`, which stores it in the
    ``Type`` column of the ``items`` table.
    """

    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9

    def __init__(self, host, user, password=None, database="learn"):
        """Open a connection to ``database`` on ``host`` as ``user``.

        ``password`` is forwarded to the driver only when it is given, so
        password-less logins keep behaving exactly as before.
        """
        # Imported lazily so the class can be defined (and exercised against
        # any DB-API connection assigned to ``self.database``) without the
        # MySQL driver being importable.
        import oursql

        connect_args = {"host": host, "user": user, "db": database}
        if password is not None:
            # Previously the password was accepted but silently dropped.
            connect_args["passwd"] = password

        self.database = oursql.connect(**connect_args)

    @staticmethod
    def _with_defaults(supplied, defaults):
        """Return a copy of ``supplied`` where missing or ``None`` entries
        named in ``defaults`` are replaced by the default value."""
        merged = dict(supplied)
        for key, fallback in defaults.items():
            if merged.get(key) is None:
                merged[key] = fallback
        return merged

    def _insert_unless_present(self, table, provider, unique_id, override,
                               insert_sql, params):
        """Run ``insert_sql`` unless ``table`` already has the provider row.

        ``table`` is always one of the literal names used in this class; it
        is never derived from external input.

        Returns ``(True, new_row_id)`` after an insert, or
        ``(False, existing_row_id)`` when a row with the same ``Provider`` /
        ``ProviderId`` exists and ``override`` is false.
        """
        cursor = self.database.cursor()
        try:
            if not override:
                cursor.execute(
                    "SELECT `Id` FROM " + table +
                    " WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1",
                    (provider, unique_id))
                rows = cursor.fetchall()
                if rows:
                    return (False, rows[0][0])

            cursor.execute(insert_sql, params)
            # NOTE(review): nothing here commits; on a transactional storage
            # engine the caller has to commit on ``self.database`` - confirm.
            return (True, cursor.lastrowid)
        finally:
            cursor.close()

    def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
        """Insert a topic row, skipping it when it is already stored.

        Optional keyword arguments (missing or ``None`` values fall back to
        the default): ``needs_enrollment`` (False), ``creation_date`` (None),
        ``start_date`` (None), ``end_date`` (None), ``parent_id`` (0) and
        ``description`` ("").  Other keyword arguments are ignored.

        With ``override`` set, the existence check is skipped and a new row is
        always inserted.

        Returns ``(inserted, row_id)``.
        """
        fields = self._with_defaults(kwargs, {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
        })

        return self._insert_unless_present(
            "topics", provider, unique_id, override,
            "INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, "
            "`Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (fields["parent_id"], provider, unique_id, title,
             fields["description"], fields["creation_date"],
             fields["needs_enrollment"], fields["start_date"],
             fields["end_date"]))

    def insert_item(self, provider, unique_id, has_topic, itemtype, title,
                    item_url, override=False, **kwargs):
        """Insert an item row, skipping it when it is already stored.

        Optional keyword arguments (missing or ``None`` values fall back to
        the default): ``views`` (None), ``source_url`` (``item_url``),
        ``topic_id`` (0), ``parent_id`` (0), ``description`` ("") and
        ``date`` (None).  Other keyword arguments are ignored.

        With ``override`` set, the existence check is skipped and a new row is
        always inserted.

        Returns ``(inserted, row_id)``.
        """
        fields = self._with_defaults(kwargs, {
            "views": None,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
        })

        return self._insert_unless_present(
            "items", provider, unique_id, override,
            "INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, "
            "`Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, "
            "`ParentId`, `Date`) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (has_topic, itemtype, provider, unique_id, title,
             fields["description"], item_url, fields["source_url"],
             fields["views"], fields["topic_id"], fields["parent_id"],
             fields["date"]))
LIMIT 1", (unique_id,)) - results = c.fetchall() - exists = (len(results) > 0) - - if not exists: - c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`)" - "VALUES (?, 1, ?, ?, ?, ?, 0)", (parent_id, unique_id, title, description, datetime.datetime.now())) - - print "Inserted topic %s" % title - - item["_cl_id"] = c.lastrowid - else: - print u"Skipped topic %s" % title - item["_cl_id"] = results[0][0] - elif kind in ("Video", "Exercise", "Article"): - try: - unique_id = item["readable_id"] - except KeyError, e: - try: - unique_id = item["name"] - except KeyError, e: - try: - unique_id = str(item["id"]) - except KeyError, e: - print repr(item) - sys.stderr.write("WARNING: No suitable identifier found for item\n") - raise - return - - if item["kind"] == "Video": - itemtype = self.VIDEO - elif item["kind"] == "Exercise": - itemtype = self.EXERCISE - elif item["kind"] == "Article": - itemtype = self.ARTICLE - - try: - source_url = item["ka_url"] - except KeyError, e: - if itemtype == self.ARTICLE: - source_url = "" - else: - return - - try: - item_url = item["url"] - except KeyError, e: - item_url = source_url - - if itemtype == self.ARTICLE: - description = item["content"] - else: - try: - description = item["description"] - except KeyError, e: - description = "" - - if description is None: - description = "" - - try: - title = item["title"] - except KeyError, e: - try: - title = item["display_name"] - except KeyError, e: - title = "Untitled" - - try: - views = item["views"] - except KeyError, e: - views = 0 - - c.execute("SELECT `Id` FROM items WHERE `ProviderId` = ? 
import datetime
import sys


class KhanUniversityCrawler(object):
    """Walks the Khan Academy topic tree and stores it through ``lib.Database``.

    Topics go to ``insert_topic``; videos, exercises and articles go to
    ``insert_item``.  Every stored node gets the database row id written back
    into the node dict under ``"_cl_id"`` so children can refer to it.
    """

    # Provider number passed to the database layer for every row stored here.
    PROVIDER_ID = 1
    TOPIC_TREE_URL = "http://www.khanacademy.org/api/v1/topictree"
    DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self):
        # Project-local module; imported here so the class itself can be
        # defined without a database driver being importable.
        import lib

        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Download the full topic tree and keep it in ``self.dataset``."""
        import requests

        self.dataset = requests.get(self.TOPIC_TREE_URL).json()

    def parse_dataset(self):
        """Store the previously retrieved dataset, starting at its root."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store ``item`` and then, recursively, each of its children.

        ``item`` is one node dict of the topic tree, ``level`` its depth and
        ``parent`` the enclosing node dict (``None`` for the root).  Nodes
        without a ``"kind"`` key, and videos/exercises without a ``"ka_url"``,
        are skipped together with their children.
        """
        if "kind" not in item:
            return

        kind = item["kind"]

        if kind == "Topic":
            self._store_topic(item, parent)
        elif kind in ("Video", "Exercise", "Article"):
            if not self._store_item(item, kind, parent):
                return
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
            sys.stderr.write("%s\n" % (repr(item)))

        # A missing (or null) child list simply means a leaf node.
        for child in item.get("children") or ():
            self.process_item(child, level + 1, item)

    @staticmethod
    def _row_id_of(parent):
        """Return the stored row id of ``parent``, or 0 when there is none."""
        if parent is None:
            return 0
        return parent.get("_cl_id", 0)

    @staticmethod
    def _first_present(item, keys, fallback=None):
        """Return ``item[key]`` for the first of ``keys`` present in ``item``."""
        for key in keys:
            if key in item:
                return item[key]
        return fallback

    def _store_topic(self, item, parent):
        """Insert a topic node and record its row id on the node."""
        title = item.get("title")
        if title is None:
            title = ""

        # The parent row id has to be handed over explicitly; otherwise every
        # topic would be stored with the default parent of 0.
        inserted, rowid = self.db.insert_topic(
            self.PROVIDER_ID, item["id"], title,
            description=item.get("description"),
            needs_enrollment=False,
            parent_id=self._row_id_of(parent))
        item["_cl_id"] = rowid

        self._report(inserted, title)

    def _store_item(self, item, kind, parent):
        """Insert a video/exercise/article node.

        Returns ``False`` when the node was skipped because it has no usable
        source URL, ``True`` otherwise.  Raises ``KeyError`` when the node has
        none of ``readable_id``, ``name`` or ``id``.
        """
        unique_id = self._unique_id(item)

        itemtype = {
            "Video": self.db.VIDEO,
            "Exercise": self.db.EXERCISE,
            "Article": self.db.ARTICLE,
        }[kind]
        is_article = (itemtype == self.db.ARTICLE)

        if "ka_url" in item:
            source_url = item["ka_url"]
        elif is_article:
            source_url = ""
        else:
            return False

        item_url = self._first_present(item, ("url", "ka_url"))
        description = item.get("content" if is_article else "description")
        title = self._first_present(item, ("title", "display_name"), "Untitled")

        inserted, rowid = self.db.insert_item(
            self.PROVIDER_ID, unique_id, True, itemtype, title, item_url,
            source_url=source_url,
            description=description,
            views=item.get("views"),
            topic_id=self._row_id_of(parent),
            date=self._date_added(item))
        item["_cl_id"] = rowid

        self._report(inserted, title)
        return True

    def _unique_id(self, item):
        """Pick the provider-side identifier of an item node."""
        for key in ("readable_id", "name"):
            if key in item:
                return item[key]

        if "id" in item:
            return str(item["id"])

        sys.stderr.write("%s\n" % repr(item))
        sys.stderr.write("WARNING: No suitable identifier found for item\n")
        # Same exception the failed lookup raised before; the crawl stops.
        raise KeyError("id")

    def _date_added(self, item):
        """Parse ``item["date_added"]``; ``None`` when absent or malformed."""
        try:
            return datetime.datetime.strptime(item["date_added"], self.DATE_FORMAT)
        except (KeyError, TypeError, ValueError):
            return None

    @staticmethod
    def _report(inserted, title):
        """Print whether a node was newly inserted or already known."""
        if inserted:
            print("Inserted %s" % title)
        else:
            print("Skipped %s" % title)


if __name__ == "__main__":
    crawler = KhanUniversityCrawler()
    crawler.retrieve_dataset()
    crawler.parse_dataset()