Add crawlers for coursera and ureddit, get first quick and dirty version of frontend done, and fix bugs and stuff

develop
Sven Slootweg 11 years ago
parent 703a34bfa2
commit 6ec1a2d90b

@ -0,0 +1,30 @@
{
"database": {
"driver": "mysql",
"pdo": true,
"hostname": "localhost",
"username": "root",
"password": "",
"database": "learn"
},
"locale": {
"path": "locales",
"extension": "lng",
"default_locale": "english",
"default_timezone": "Europe/Amsterdam"
},
"memcache": {
"enabled": true,
"compressed": true,
"hostname": "localhost",
"port": 11211
},
"class_map": {
"item": "Item",
"topic": "Topic"
},
"components": [
"router",
"errorhandler"
]
}

@ -0,0 +1,148 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/*
 * Item - a single learning resource (course, video, article, ...) stored in
 * the `items` table.
 *
 * An item can be attached to a topic (TopicId) and nested under another item
 * (ParentId). On top of the fields listed in $prototype, two computed
 * properties are served by __get(): sTypeName and sProviderName.
 */
class Item extends CPHPDatabaseRecordClass
{
    public $table_name = "items";
    public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";

    /*
     * Field definitions grouped by data type. Each entry appears to map a
     * property name (left) to a column of the items table (right); e.g.
     * CreationDate is backed by the `Date` column.
     * NOTE(review): the 'topic' and 'item' groups presumably resolve to
     * Topic / Item objects through the "class_map" section of the
     * configuration - confirm against CPHP.
     */
    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'Description' => "Description",
            'SourceUrl' => "SourceUrl",
            'ItemUrl' => "ItemUrl"
        ),
        'numeric' => array(
            'Type' => "Type",
            'Provider' => "Provider",
            'Views' => "Views",
            'TopicId' => "TopicId",
            'ParentId' => "ParentId"
        ),
        'boolean' => array(
            'HasTopic' => "HasTopic"
        ),
        'timestamp' => array(
            'CreationDate' => "Date",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Topic' => "TopicId"
        ),
        'item' => array(
            'Parent' => "ParentId"
        )
    );

    /*
     * Serves the computed properties sTypeName and sProviderName; every
     * other property name is delegated to the parent class.
     */
    public function __get($name)
    {
        switch($name)
        {
            case "sTypeName":
                return $this->GetTypeName();
            case "sProviderName":
                return $this->GetProviderName();
            default:
                return parent::__get($name);
        }
    }

    /*
     * Returns the lowercase name of this item's numeric type, or "unknown"
     * for a code that is not listed.
     *
     * The codes have to stay in sync with the constants of the crawler's
     * Database class, which defines TEST = 7, BOOK = 8, AUDIOBOOK = 9 and
     * LECTURE = 10.
     */
    public function GetTypeName()
    {
        switch($this->sType)
        {
            case 1:
                return "topic";
            case 2:
                return "course";
            case 3:
                return "video";
            case 4:
                return "article";
            case 5:
                return "exercise";
            case 6:
                return "quiz";
            case 7:
                return "test";
            case 8:
                return "book";
            case 9:
                return "audiobook";
            case 10:
                // The crawlers can store lectures (LECTURE = 10); without this case they were reported as "unknown".
                return "lecture";
            default:
                return "unknown";
        }
    }

    /*
     * Returns the display name of the provider this item was crawled from,
     * or "Unknown" for an unlisted provider code.
     *
     * NOTE(review): provider 1 is labelled "Khan University"; the
     * organisation is presumably Khan Academy. Topic::GetProviderName()
     * uses the same label, so change both together if this is confirmed.
     */
    public function GetProviderName()
    {
        switch($this->sProvider)
        {
            case 1:
                return "Khan University";
            case 2:
                return "Coursera";
            case 3:
                return "University of Reddit";
            default:
                return "Unknown";
        }
    }

    /*
     * Returns the items whose ParentId equals this item's Id. When the
     * query raises a NotFoundException an empty array is returned instead.
     */
    public function GetChildren()
    {
        try
        {
            return Item::CreateFromQuery("SELECT * FROM items WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            return array();
        }
    }

    /*
     * Exports this item as a plain array (the API modules pass the
     * equivalent Topic export to json_encode).
     *
     * $fetch_children: when false, "children" is left empty. Child items
     * are exported with AsDataset()'s default, i.e. including their own
     * children.
     */
    public function AsDataset($fetch_children = true)
    {
        $child_data = array();

        if($fetch_children == true)
        {
            foreach($this->GetChildren() as $child)
            {
                $child_data[] = $child->AsDataset();
            }
        }

        return array(
            "title" => $this->uTitle,
            "description" => $this->uDescription,
            "url" => $this->uItemUrl,
            "source" => $this->uSourceUrl,
            "created" => $this->sCreationDate,
            "start" => $this->sStartDate,
            "end" => $this->sEndDate,
            "type" => $this->sTypeName,
            "provider" => $this->sProviderName,
            "views" => $this->sViews,
            "children" => $child_data
        );
    }
}

@ -0,0 +1,131 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/*
 * Topic - a subject stored in the `topics` table. Topics can be nested
 * (ParentId) and own the items whose TopicId points at them.
 */
class Topic extends CPHPDatabaseRecordClass
{
    public $table_name = "topics";
    public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";

    /* Field definitions grouped by data type; CreationDate is backed by the "Created" entry. */
    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'ProviderId' => "ProviderId",
            'Description' => "Description"
        ),
        'numeric' => array(
            'ParentId' => "ParentId",
            'Provider' => "Provider"
        ),
        'boolean' => array(
            'NeedsEnrollment' => "NeedsEnrollment"
        ),
        'timestamp' => array(
            'CreationDate' => "Created",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Parent' => "ParentId"
        )
    );

    /* Serves the computed sProviderName property; anything else goes to the parent class. */
    public function __get($name)
    {
        if($name == "sProviderName")
        {
            return $this->GetProviderName();
        }

        return parent::__get($name);
    }

    /* Display name of the provider this topic came from, "Unknown" for unlisted codes. */
    public function GetProviderName()
    {
        $provider_code = $this->sProvider;

        if($provider_code == 1)
        {
            return "Khan University";
        }
        elseif($provider_code == 2)
        {
            return "Coursera";
        }
        elseif($provider_code == 3)
        {
            return "University of Reddit";
        }

        return "Unknown";
    }

    /* Topics whose ParentId is this topic's Id; an empty array when the lookup raises NotFoundException. */
    public function GetChildren()
    {
        try
        {
            $subtopics = Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            $subtopics = array();
        }

        return $subtopics;
    }

    /* Items whose TopicId is this topic's Id; an empty array when the lookup raises NotFoundException. */
    public function GetItems()
    {
        try
        {
            $items = Item::CreateFromQuery("SELECT * FROM items WHERE `TopicId` = :TopicId", array(':TopicId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            $items = array();
        }

        return $items;
    }

    /*
     * Exports the topic as a plain array.
     *
     * $fetch_children: include the datasets of all child topics.
     * $fetch_items:    include the datasets of all items in this topic.
     * Nested topics and items are exported with their own defaults.
     */
    public function AsDataset($fetch_children = true, $fetch_items = true)
    {
        // Child topics are looked up before the items, as the queries are issued in this order.
        $subtopic_sets = array();
        $subtopics = $fetch_children ? $this->GetChildren() : array();

        foreach($subtopics as $subtopic)
        {
            $subtopic_sets[] = $subtopic->AsDataset();
        }

        $item_sets = array();
        $items = $fetch_items ? $this->GetItems() : array();

        foreach($items as $entry)
        {
            $item_sets[] = $entry->AsDataset();
        }

        $dataset = array();
        $dataset["title"] = $this->uTitle;
        $dataset["description"] = $this->uDescription;
        $dataset["created"] = $this->sCreationDate;
        $dataset["start"] = $this->sStartDate;
        $dataset["end"] = $this->sEndDate;
        $dataset["provider"] = $this->sProviderName;
        $dataset["needs_enrollment"] = $this->sNeedsEnrollment;
        $dataset["children"] = $subtopic_sets;
        $dataset["items"] = $item_sets;

        return $dataset;
    }
}

@ -0,0 +1 @@
../../cphp

File diff suppressed because one or more lines are too long

@ -0,0 +1,26 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
// Refuse to run unless the including script has defined $_APP.
if(!isset($_APP)) { die("Unauthorized."); }
// Both variables are set before the framework is loaded.
// NOTE(review): presumably cphp/base.php reads $_CPHP as an "included correctly" marker and $_CPHP_CONFIG as the configuration path - confirm.
$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");
/*
 * Class autoloader: loads classes/<lowercased class name>.php, turning
 * namespace separators into directory separators.
 *
 * Registered through spl_autoload_register() because the magic
 * __autoload() function is deprecated as of PHP 7.2 and no longer
 * supported in PHP 8.0.
 */
function cryto_learn_autoload($class_name)
{
    $class_path = str_replace("\\", "/", strtolower($class_name));
    require_once("classes/{$class_path}.php");
}

spl_autoload_register("cryto_learn_autoload");

@ -0,0 +1,14 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
// This entry script has no logic of its own; everything is delegated to rewrite.php.
require("rewrite.php");

@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S
event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minutes-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago

@ -0,0 +1,49 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }

/*
 * Search API module: looks up the topics whose title contains every word of
 * the POSTed query "q" and stores the JSON-encoded result list in
 * $sPageContents. A missing or blank query is answered with a JSON error
 * object.
 */

// Split on any whitespace and drop empty pieces, so that repeated spaces do
// not add "%%" terms that match every title. A non-string value (q[]=...)
// is treated like a missing query.
$terms = array();

if(isset($_POST['q']) && is_string($_POST['q']))
{
    $terms = preg_split('/\s+/', $_POST['q'], -1, PREG_SPLIT_NO_EMPTY);
}

if(empty($terms))
{
    die(json_encode(array(
        "error" => "No search query specified."
    )));
}

$db_query_terms = array();
$db_query_arguments = array();

foreach($terms as $term)
{
    $db_query_terms[] = "`Title` LIKE ?";
    // The argument list is keyed from 1, exactly as the original code arranged it.
    // NOTE(review): presumably CPHP binds positional placeholders by array key - confirm.
    $db_query_arguments[count($db_query_arguments) + 1] = "%{$term}%";
}

$db_query = implode(" AND ", $db_query_terms);

try
{
    $results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);
}
catch (NotFoundException $e)
{
    // The model classes treat NotFoundException as "no rows"; answer with an empty list rather than an uncaught exception.
    $results_topics = array();
}

$return_objects = array();

foreach($results_topics as $topic)
{
    $return_objects[] = $topic->AsDataset();
}

$sPageContents = json_encode($return_objects);

@ -0,0 +1,18 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
// Refuse to run unless the including script has defined $_APP.
if(!isset($_APP)) { die("Unauthorized."); }
// Render the "ui/index" template with the locale strings and an empty third argument.
// NOTE(review): the third argument is presumably the list of template variables - confirm.
$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());
// NOTE(review): $sPageType is not read anywhere in the visible code; presumably consumed by the including script.
$sPageType = "ui";

@ -0,0 +1,33 @@
<?php
// Mark the request as coming through the application; the included files die() when $_APP is not set.
$_APP = true;
require("includes/base.php");
// The modules assign their output to $sPageContents; it is echoed once routing is done.
$sPageContents = "";
$router = new CPHPRouter();
// NOTE(review): allow_slash / ignore_query are CPHPRouter options; presumably "accept a trailing slash" and "match without the query string" - confirm.
$router->allow_slash = true;
$router->ignore_query = true;
// Regular expression for the request path => module file that handles it.
$router->routes = array(
0 => array(
"^/$" => "modules/ui/index.php",
"^/api/search$" => "modules/api/search.php",
"^/api/dump$" => "modules/api/dump.php"
)
);
$router->RouteRequest();
echo($sPageContents);
// Disabled code: dumps every top-level topic (ParentId = 0) as JSON.
/*
$data = array();
foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
{
$data[] = $topic->AsDataset();
}
echo(json_encode($data));
* */

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

@ -6,6 +6,11 @@ body
font-family: sans-serif;
}
/* Container of the markup templates; hidden, the page script clones its children into the result list. */
#templates
{
display: none;
}
.header
{
background-color: #C9F9DF;
@ -55,3 +60,79 @@ body
font-size: 26px;
width: 180px;
}
/* Loading indicator shown next to the search box. */
.spinner
{
margin-left: 14px;
}
/* Shared box styling for topic rows and item rows in the result list. */
.topic, .item
{
padding: 9px 12px;
margin: 5px 20px;
background-color: #79E1A8;
font-size: 20px;
width: 960px;
}
/* Topic rows are clickable (they toggle their items). */
.topic
{
margin-top: 19px;
cursor: pointer;
}
/* Item rows are indented below their topic and start hidden until the topic is expanded. */
.item
{
margin-left: 34px;
width: 926px;
font-size: 18px;
background-color: #97F3C1;
display: none;
}
/* Item type label, rendered as "type:" through the :after rule below. */
.type
{
font-size: 18px;
color: gray;
}
.type:after
{
content: ":";
}
/* Link to the item itself. */
a.title
{
color: #041F9F;
}
/* Small square "+" / "-" button in front of a topic title. */
.toggler
{
display: block;
float: left;
width: 16px;
height: 16px;
margin-top: 2px;
margin-right: 8px;
font-size: 13px;
text-align: center;
font-weight: bold;
border: 1px solid black;
background-color: #D2ECCF;
}
/* Provider name, wrapped in parentheses by the :before / :after rules below. */
.providername
{
font-size: 18px;
color: gray;
}
.providername:before
{
content: "(";
}
.providername:after
{
content: ")";
}

@ -0,0 +1,140 @@
<!doctype html>
<html>
<head>
<title>learn.cryto.net</title>
<link rel="stylesheet" href="style.css">
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
<script>
/* Handle of the pending (debounced) search timer, or null when no search has been scheduled yet. */
var search_timeout = null;

$(function(){
    /*$("input").val("data");
    runSearch();*/

    /*
     * Run the search 800ms after the last change to the input.
     * "input" and "keyup" are used instead of "keypress": keypress is not
     * fired for Backspace/Delete in several browsers, so shortening the
     * query never refreshed the results.
     */
    $("input").on("input keyup", function(){
        /* typeof never yields "null", so the original check was always true; compare the value itself. */
        if(search_timeout !== null)
        {
            clearTimeout(search_timeout);
        }

        search_timeout = setTimeout(runSearch, 800);
    });
});
/*
 * Posts the current query to /api/search and rebuilds the result list from
 * the JSON response. Topics without items are not shown.
 */
function runSearch()
{
    $(".search-large").removeClass("search-large").addClass("search-top");
    $(".spinner").show();

    var query = $("input#query").val();

    $.post("/api/search", {q: query}, function(response){
        $(".spinner").hide();
        $(".results").html("");

        /* A rejected query is answered with an object ({"error": ...}) rather than a list of topics. */
        if(!$.isArray(response))
        {
            return;
        }

        $.each(response, function(topic_index, topic){
            if(!topic || !$.isArray(topic.items) || topic.items.length == 0)
            {
                return;
            }

            var result_wrapper = instantiateTemplate("result_wrapper");
            var result_block = instantiateTemplate("result_topic");

            /* Titles and descriptions are crawled from third-party sites: insert them as text, never as markup. */
            result_block.children(".title").text(topic.title);
            result_block.children(".providername").text(topic.provider);
            result_block.appendTo(result_wrapper);

            $.each(topic.items, function(item_index, item){
                var item_block = instantiateTemplate("result_item");
                var link = item_block.children(".title");

                link.text(item.title);

                /* Only link to http(s) targets; a missing or unexpected URL leaves the title without a link. */
                if(typeof item.url == "string" && /^(https?:)?\/\//i.test(item.url))
                {
                    link.attr("href", item.url);
                }
                else
                {
                    link.removeAttr("href");
                }

                item_block.children(".type").text(item.type);

                /* Appending keeps the order sent by the API; inserting after the topic block reversed it. */
                item_block.appendTo(result_wrapper);
            });

            result_wrapper.appendTo(".results");
        });

        setHandlers();
    }, "json");
}
/* Binds the expand/collapse click handler to every toggler and topic element currently in the document. */
function setHandlers()
{
    $(".toggler, .topic").click(function(event){
        toggleItems(this, event);
    });
}
/* Returns a copy of the element with id "template_<template_name>", with the id attribute removed from the copy. */
function instantiateTemplate(template_name)
{
    return $("#template_" + template_name).clone().removeAttr("id");
}
/*
 * Click handler for topics and their togglers: shows or hides the items of
 * the wrapper the clicked element (ctx) belongs to, and keeps the click from
 * bubbling to enclosing elements.
 */
function toggleItems(ctx, event)
{
    var clicked = $(ctx);
    var ancestors = clicked.parentsUntil(".wrapper");

    /* No ancestors below the wrapper means the clicked element is a direct child of it. */
    var wrapper = (ancestors.length == 0) ? clicked.parent() : ancestors.parent();

    var toggler = wrapper.find(".toggler");
    var state = toggler.data("toggled");
    var expand = (typeof state == "undefined" || state == false);

    toggler.data("toggled", expand);

    if(expand)
    {
        toggler.html("-");
        wrapper.find(".item").show();
    }
    else
    {
        toggler.html("+");
        wrapper.find(".item").hide();
    }

    event.stopPropagation();
}
</script>
</head>
<body>
<div class="header">
<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
</div>
<div class="main">
<div class="search-large">
I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
</div>
<div class="results">
</div>
</div>
<div id="templates">
<div id="template_result_wrapper" class="wrapper"></div>
<div id="template_result_topic" class="topic">
<span class="toggler">+</span>
<strong>Topic: </strong>
<span class="title"></span>
<span class="providername"></span>
</div>
<div id="template_result_item" class="item">
<span class="type"></span>
<a href="#" class="title"></a>
</div>
</div>
</body>
</html>

File diff suppressed because one or more lines are too long

@ -10,10 +10,21 @@ class Database(object):
TEST = 7
BOOK = 8
AUDIOBOOK = 9
LECTURE = 10
def __init__(self, host, user, password=None, database="learn"):
    """Open the MySQL connection used by all other methods.

    host, user -- server address and account name.
    password   -- account password; left out of the connection parameters
                  when None (it used to be ignored altogether, so
                  password-protected accounts could not connect).
    database   -- schema name, "learn" by default.
    """
    options = {"host": host, "user": user, "db": database}

    if password is not None:
        options["passwd"] = password

    self.database = oursql.connect(**options)
def topic_exists(self, provider, unique_id):
    """Tell whether the topics table has a row for this provider / provider-specific id pair."""
    cursor = self.database.cursor()
    cursor.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
    matches = cursor.fetchall()
    return len(matches) > 0
def item_exists(self, provider, unique_id):
    """Tell whether the items table has a row for this provider / provider-specific id pair."""
    cursor = self.database.cursor()
    cursor.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
    matches = cursor.fetchall()
    return len(matches) > 0
def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
defaults = {
"needs_enrollment": False,
@ -56,7 +67,9 @@ class Database(object):
"topic_id": 0,
"parent_id": 0,
"description": "",
"date": None
"date": None,
"start_date": None,
"end_date": None
}
for kwarg, val in defaults.iteritems():
@ -78,8 +91,8 @@ class Database(object):
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"]))
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"]))
return (True, c.lastrowid)

@ -0,0 +1,47 @@
import requests
import oursql
import datetime
import json
import lib
class CourseraCrawler(object):
    """Imports Coursera topics and their courses into the learn database.

    Everything is stored under provider id 2. The dataset is currently read
    from a local "coursera.json" file; the live API call is commented out.
    """

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Load the topic list from coursera.json into self.dataset."""
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        # The with-block closes the file handle; it used to be left open.
        with open("coursera.json", "r") as dataset_file:
            self.dataset = json.load(dataset_file)

    def parse_dataset(self):
        """Process every topic of the previously loaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic, then all of its courses under the returned topic row id."""
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course session as an item of topic `topicid`.

        The start date becomes part of the title. When the date fields are
        missing (TypeError) or do not form a valid calendar date
        (ValueError), the course is stored without a start date and titled
        "<name> (date undetermined)".
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except (TypeError, ValueError):
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
# Runs as soon as the file is executed: load the dataset, then store every topic and course.
crawler = CourseraCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()

@ -0,0 +1,100 @@
import requests
import oursql
import datetime
import json, simplejson
import lib
import re
class UredditCrawler(object):
    """Imports University of Reddit classes into the learn database.

    The catalog is walked category by category through the ureddit.com API.
    Every class that is not yet known is stored under provider id 3, both as
    a topic and as a course item inside that topic.
    """

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def parse_catalog(self):
        """Fetch the catalog and process each of its categories."""
        data = requests.get("http://ureddit.com/api?type=catalog").json()

        for category in data["categories"]:
            self.parse_category(category['id'], category['value'])

    def parse_category(self, category_id, category_name):
        """Process every class of one category, skipping classes that already exist as a topic."""
        try:
            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
        except ValueError:
            # simplejson's JSONDecodeError is a ValueError subclass, and the
            # stdlib decoder raises ValueError as well, so this covers both.
            print("Skipped category %s due to JSON formatting error" % category_name)
            return

        for _class in data["classes"]:
            if not self.db.topic_exists(3, _class['id']):
                self.parse_class(_class['id'], _class['value'], category_name)
            else:
                print("Skipped class %s" % _class['value'])

    def parse_class(self, class_id, class_name, category_name):
        """Fetch one class and store it as a topic plus a course item.

        class_name is only used in the message printed when the API
        response cannot be decoded; category_name is currently unused.
        """
        try:
            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
        except ValueError:
            print("Skipped %s due to JSON formatting error" % class_name)
            return

        try:
            creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # TypeError: "created" is not a string (e.g. null); ValueError: unexpected format.
            creation_date = None

        # Hack to get the class page as this isn't returned by the API
        html_data = requests.get("http://ureddit.com/show_class.php?id=%s&show=true" % class_id).text
        # Same pattern as before, minus the needless "\/" escapes (invalid escape sequences in a Python string).
        matches = re.search('<a href="([^"]+)"><button class="button">class page</button></a>', html_data)

        if matches is not None:
            class_page = "http://ureddit.com%s" % matches.group(1)
        else:
            class_page = None

        inserted, topic_id = self.db.insert_topic(3, str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)

        if inserted:
            print("Inserted %s" % data["name"])
        else:
            print("Skipped %s" % data["name"])

        inserted, item_id = self.db.insert_item(3, str(class_id), True, self.db.COURSE, data["name"], class_page, topic_id=topic_id, date=creation_date, description=data["description"])

        if inserted:
            print("\tInserted %s" % data["name"])
        else:
            print("\tSkipped %s" % data["name"])

    # NOTE(review): the four methods below match CourseraCrawler's methods of
    # the same names and store under provider id 2. Nothing in this script
    # calls them - presumably copy-paste leftovers. They are kept so the
    # class interface does not change.

    def retrieve_dataset(self):
        """Load the topic list from coursera.json into self.dataset."""
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        with open("coursera.json", "r") as dataset_file:
            self.dataset = json.load(dataset_file)

    def parse_dataset(self):
        """Process every topic of the previously loaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic, then all of its courses under the returned topic row id."""
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course session as an item of topic `topicid`.

        Missing (TypeError) or invalid (ValueError) date fields lead to an
        item without start date, titled "<name> (date undetermined)".
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except (TypeError, ValueError):
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
# Runs as soon as the file is executed: walk the whole ureddit catalog and store every new class.
crawler = UredditCrawler()
crawler.parse_catalog()
Loading…
Cancel
Save