Add crawlers for Coursera and ureddit, get first quick and dirty version of frontend done, and fix bugs and stuff
parent
703a34bfa2
commit
6ec1a2d90b
@ -0,0 +1,30 @@
|
||||
{
|
||||
"database": {
|
||||
"driver": "mysql",
|
||||
"pdo": true,
|
||||
"hostname": "localhost",
|
||||
"username": "root",
|
||||
"password": "",
|
||||
"database": "learn"
|
||||
},
|
||||
"locale": {
|
||||
"path": "locales",
|
||||
"extension": "lng",
|
||||
"default_locale": "english",
|
||||
"default_timezone": "Europe/Amsterdam"
|
||||
},
|
||||
"memcache": {
|
||||
"enabled": true,
|
||||
"compressed": true,
|
||||
"hostname": "localhost",
|
||||
"port": 11211
|
||||
},
|
||||
"class_map": {
|
||||
"item": "Item",
|
||||
"topic": "Topic"
|
||||
},
|
||||
"components": [
|
||||
"router",
|
||||
"errorhandler"
|
||||
]
|
||||
}
|
@ -0,0 +1,148 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
if(!isset($_APP)) { die("Unauthorized."); }
|
||||
|
||||
/* A single learning resource (course, video, article, ...) stored in the
 * `items` table. Items can belong to a topic and can be nested under a
 * parent item through `ParentId`. */
class Item extends CPHPDatabaseRecordClass
{
    public $table_name = "items";
    public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";

    /* Property name => column name, grouped by value type. The 'topic' and
     * 'item' groups refer to the record classes of the same name. */
    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'Description' => "Description",
            'SourceUrl' => "SourceUrl",
            'ItemUrl' => "ItemUrl"
        ),
        'numeric' => array(
            'Type' => "Type",
            'Provider' => "Provider",
            'Views' => "Views",
            'TopicId' => "TopicId",
            'ParentId' => "ParentId"
        ),
        'boolean' => array(
            'HasTopic' => "HasTopic"
        ),
        'timestamp' => array(
            'CreationDate' => "Date",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Topic' => "TopicId"
        ),
        'item' => array(
            'Parent' => "ParentId"
        )
    );

    /* Adds the virtual properties sTypeName and sProviderName; every other
     * property is resolved by the parent class. */
    public function __get($name)
    {
        if($name === "sTypeName")
        {
            return $this->GetTypeName();
        }

        if($name === "sProviderName")
        {
            return $this->GetProviderName();
        }

        return parent::__get($name);
    }

    /* Returns the lowercase label for this item's numeric type, or
     * "unknown" when the type is not one of the known ids. */
    public function GetTypeName()
    {
        $type_names = array(
            1 => "topic",
            2 => "course",
            3 => "video",
            4 => "article",
            5 => "exercise",
            6 => "quiz",
            7 => "test",
            8 => "book",
            9 => "audiobook"
        );

        $type = $this->sType;

        /* Loose comparison on purpose: it matches how a switch compares. */
        foreach($type_names as $type_id => $type_name)
        {
            if($type == $type_id)
            {
                return $type_name;
            }
        }

        return "unknown";
    }

    /* Returns the display name of the provider this item was crawled from,
     * or "Unknown" for an unrecognized provider id.
     * NOTE(review): provider 1 is presumably Khan Academy - confirm whether
     * the "Khan University" label is intended. */
    public function GetProviderName()
    {
        $provider_names = array(
            1 => "Khan University",
            2 => "Coursera",
            3 => "University of Reddit"
        );

        $provider = $this->sProvider;

        /* Loose comparison on purpose: it matches how a switch compares. */
        foreach($provider_names as $provider_id => $provider_name)
        {
            if($provider == $provider_id)
            {
                return $provider_name;
            }
        }

        return "Unknown";
    }

    /* Returns the items whose ParentId is this item's id; an empty array
     * when the lookup raises NotFoundException. */
    public function GetChildren()
    {
        $sql = "SELECT * FROM items WHERE `ParentId` = :ParentId";

        try
        {
            $children = Item::CreateFromQuery($sql, array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            $children = array();
        }

        return $children;
    }

    /* Converts this item into a plain array suitable for json_encode().
     * When $fetch_children is truthy, child items are included recursively
     * under the "children" key; otherwise that key holds an empty array. */
    public function AsDataset($fetch_children = true)
    {
        $children = array();

        if($fetch_children)
        {
            foreach($this->GetChildren() as $child_item)
            {
                $children[] = $child_item->AsDataset();
            }
        }

        $dataset = array(
            "title" => $this->uTitle,
            "description" => $this->uDescription,
            "url" => $this->uItemUrl,
            "source" => $this->uSourceUrl,
            "created" => $this->sCreationDate,
            "start" => $this->sStartDate,
            "end" => $this->sEndDate,
            "type" => $this->sTypeName,
            "provider" => $this->sProviderName,
            "views" => $this->sViews,
            "children" => $children
        );

        return $dataset;
    }
}
|
@ -0,0 +1,131 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
if(!isset($_APP)) { die("Unauthorized."); }
|
||||
|
||||
/* A topic (for example a course series) stored in the `topics` table.
 * Topics can be nested through `ParentId` and own the items whose
 * `TopicId` points at them. */
class Topic extends CPHPDatabaseRecordClass
{
    public $table_name = "topics";
    public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";

    /* Property name => column name, grouped by value type. The 'topic'
     * group refers to the record class of the same name. */
    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'ProviderId' => "ProviderId",
            'Description' => "Description"
        ),
        'numeric' => array(
            'ParentId' => "ParentId",
            'Provider' => "Provider"
        ),
        'boolean' => array(
            'NeedsEnrollment' => "NeedsEnrollment"
        ),
        'timestamp' => array(
            'CreationDate' => "Created",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Parent' => "ParentId"
        )
    );

    /* Adds the virtual property sProviderName; every other property is
     * resolved by the parent class. */
    public function __get($name)
    {
        if($name === "sProviderName")
        {
            return $this->GetProviderName();
        }

        return parent::__get($name);
    }

    /* Returns the display name of the provider this topic was crawled
     * from, or "Unknown" for an unrecognized provider id.
     * NOTE(review): provider 1 is presumably Khan Academy - confirm whether
     * the "Khan University" label is intended. */
    public function GetProviderName()
    {
        $provider_names = array(
            1 => "Khan University",
            2 => "Coursera",
            3 => "University of Reddit"
        );

        $provider = $this->sProvider;

        /* Loose comparison on purpose: it matches how a switch compares. */
        foreach($provider_names as $provider_id => $provider_name)
        {
            if($provider == $provider_id)
            {
                return $provider_name;
            }
        }

        return "Unknown";
    }

    /* Converts this topic into a plain array suitable for json_encode().
     * Child topics are included under "children" when $fetch_children is
     * truthy, and the topic's items under "items" when $fetch_items is
     * truthy; a disabled section is returned as an empty array. */
    public function AsDataset($fetch_children = true, $fetch_items = true)
    {
        $children = array();

        if($fetch_children)
        {
            foreach($this->GetChildren() as $child_topic)
            {
                $children[] = $child_topic->AsDataset();
            }
        }

        $items = array();

        if($fetch_items)
        {
            foreach($this->GetItems() as $topic_item)
            {
                $items[] = $topic_item->AsDataset();
            }
        }

        $dataset = array(
            "title" => $this->uTitle,
            "description" => $this->uDescription,
            "created" => $this->sCreationDate,
            "start" => $this->sStartDate,
            "end" => $this->sEndDate,
            "provider" => $this->sProviderName,
            "needs_enrollment" => $this->sNeedsEnrollment,
            "children" => $children,
            "items" => $items
        );

        return $dataset;
    }

    /* Returns the items whose TopicId is this topic's id; an empty array
     * when the lookup raises NotFoundException. */
    public function GetItems()
    {
        $sql = "SELECT * FROM items WHERE `TopicId` = :TopicId";

        try
        {
            $items = Item::CreateFromQuery($sql, array(':TopicId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            $items = array();
        }

        return $items;
    }

    /* Returns the topics whose ParentId is this topic's id; an empty array
     * when the lookup raises NotFoundException. */
    public function GetChildren()
    {
        $sql = "SELECT * FROM topics WHERE `ParentId` = :ParentId";

        try
        {
            $children = Topic::CreateFromQuery($sql, array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            $children = array();
        }

        return $children;
    }
}
|
@ -0,0 +1 @@
|
||||
../../cphp
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,26 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
if(!isset($_APP)) { die("Unauthorized."); }
|
||||
|
||||
$_CPHP = true;
|
||||
$_CPHP_CONFIG = "../config.json";
|
||||
require("cphp/base.php");
|
||||
|
||||
/* Class autoloader: maps a class name to classes/<lowercased name>.php,
 * turning namespace separators into directory separators.
 *
 * Registered through spl_autoload_register() instead of being declared as
 * the magic __autoload() function: __autoload() is deprecated as of PHP 7.2
 * and removed in PHP 8.0, and it is bypassed as soon as any other code
 * registers an SPL autoloader. */
function learn_autoload($class_name)
{
    $class_name = str_replace("\\", "/", strtolower($class_name));
    require_once("classes/{$class_name}.php");
}

spl_autoload_register("learn_autoload");
|
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
require("rewrite.php");
|
@ -0,0 +1,24 @@
|
||||
_locale; en_US.UTF-8,en_US
|
||||
_datetime_short; %d/%m/%Y %H:%M:%S
|
||||
_datetime_long; %A %B %d, %Y %H:%M:%S
|
||||
_date_short; %d/%m/%Y
|
||||
_date_long; %A %B %d, %Y
|
||||
_time; %H:%M:%S
|
||||
|
||||
event-now; now
|
||||
event-future; in the future
|
||||
event-past; in the past
|
||||
event-1second-ago; 1 second ago
|
||||
event-seconds-ago; %1$d seconds ago
|
||||
event-1minutes-ago; 1 minute ago
|
||||
event-minutes-ago; %1$d minutes ago
|
||||
event-1hour-ago; 1 hour ago
|
||||
event-hours-ago; %1$d hours ago
|
||||
event-1day-ago; 1 day ago
|
||||
event-days-ago; %1$d days ago
|
||||
event-1week-ago; 1 week ago
|
||||
event-weeks-ago; %1$d weeks ago
|
||||
event-1month-ago; 1 month ago
|
||||
event-months-ago; %1$d months ago
|
||||
event-1year-ago; 1 year ago
|
||||
event-years-ago; %1$d years ago
|
@ -0,0 +1,49 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
if(!isset($_APP)) { die("Unauthorized."); }
|
||||
|
||||
/* JSON search endpoint: looks up topics whose title contains every
 * whitespace-separated term of the POSTed `q` parameter and stores the
 * JSON-encoded result list in $sPageContents. */
$query = isset($_POST['q']) ? $_POST['q'] : null;

/* Split on runs of whitespace and drop empty fragments, so that repeated
 * spaces do not produce a `%%` term that matches every title. A non-string
 * `q` (e.g. an array) is treated as "no query". */
$terms = is_string($query) ? preg_split('/\s+/', $query, -1, PREG_SPLIT_NO_EMPTY) : array();

if(empty($terms))
{
    die(json_encode(array(
        "error" => "No search query specified."
    )));
}
else
{
    $db_query_terms = array();
    $db_query_arguments = array();

    foreach($terms as $index => $term)
    {
        $db_query_terms[] = "`Title` LIKE ?";

        /* Argument keys start at 1, exactly as the previous
         * array_unshift()/unset() sequence produced.
         * NOTE(review): presumably the keys are used as 1-based positional
         * parameter indices by CreateFromQuery - confirm in CPHP. */
        $db_query_arguments[$index + 1] = "%{$term}%";
    }

    $db_query = implode(" AND ", $db_query_terms);

    try
    {
        $results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);
    }
    catch (NotFoundException $e)
    {
        /* No matching topic is a normal outcome for a search, not an
         * error; the model classes handle this exception the same way. */
        $results_topics = array();
    }

    $return_objects = array();

    foreach($results_topics as $topic)
    {
        $return_objects[] = $topic->AsDataset();
    }

    $sPageContents = json_encode($return_objects);
}
|
@ -0,0 +1,18 @@
|
||||
<?php
|
||||
/*
|
||||
* Cryto Learn is more free software. It is licensed under the WTFPL, which
|
||||
* allows you to do pretty much anything with it, without having to
|
||||
* ask permission. Commercial use is allowed, and no attribution is
|
||||
* required. We do politely request that you share your modifications
|
||||
* to benefit other developers, but you are under no enforced
|
||||
* obligation to do so :)
|
||||
*
|
||||
* Please read the accompanying LICENSE document for the full WTFPL
|
||||
* licensing text.
|
||||
*/
|
||||
|
||||
if(!isset($_APP)) { die("Unauthorized."); }
|
||||
|
||||
/* Render the "ui/index" template with the locale strings and an empty
 * third argument; the result becomes the page body. */
$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());

/* NOTE(review): $sPageType is not read anywhere in the visible code;
 * presumably it lets the including script tell UI pages from API
 * responses - confirm. */
$sPageType = "ui";
|
@ -0,0 +1,33 @@
|
||||
<?php
|
||||
/* Front controller: sets up the application, routes the request to a
 * module and prints whatever that module stored in $sPageContents. */

/* Included files refuse to run unless $_APP is set. */
$_APP = true;
require("includes/base.php");

/* Filled in by the routed module; stays empty if no module sets it. */
$sPageContents = "";

$router = new CPHPRouter();

/* NOTE(review): presumably these make the router accept a trailing slash
 * and ignore the query string when matching - confirm in CPHP. */
$router->allow_slash = true;
$router->ignore_query = true;

/* Regular expression for the request path => module file to run. */
$router->routes = array(
    0 => array(
        "^/$" => "modules/ui/index.php",
        "^/api/search$" => "modules/api/search.php",
        "^/api/dump$" => "modules/api/dump.php"
    )
);

$router->RouteRequest();

echo($sPageContents);
|
||||
|
||||
/*
|
||||
$data = array();
|
||||
|
||||
foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
|
||||
{
|
||||
$data[] = $topic->AsDataset();
|
||||
}
|
||||
|
||||
echo(json_encode($data));
|
||||
* */
|
Binary file not shown.
After Width: | Height: | Size: 1.8 KiB |
@ -0,0 +1,140 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>learn.cryto.net</title>
|
||||
<link rel="stylesheet" href="style.css">
|
||||
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
|
||||
<script>
|
||||
// Handle of the pending debounce timer, or null before the first edit.
var search_timeout = null;

$(function(){
    // Debounce: every edit restarts an 800 ms timer, so the search only runs
    // once the user pauses typing. "input" is handled in addition to
    // "keypress" because keypress does not fire for backspace, delete or
    // paste; when both fire for one keystroke the timer is simply restarted.
    $("input").on("input keypress", function(){
        // `typeof x !== "null"` was always true (typeof never yields
        // "null"); compare against null directly.
        if(search_timeout !== null)
        {
            clearTimeout(search_timeout);
        }

        search_timeout = setTimeout(runSearch, 800);
    });
});
|
||||
|
||||
// Sends the current query to /api/search and renders one wrapper per topic
// that has at least one item: the topic header followed by its items.
function runSearch()
{
    $(".search-large").removeClass("search-large").addClass("search-top");
    $(".spinner").show();

    var query = $("input#query").val();

    $.post("/api/search", {q: query}, function(response){
        $(".results").html("");

        // A rejected query is answered with an object ({"error": ...})
        // rather than a list of topics; there is nothing to render then.
        if(!$.isArray(response))
        {
            return;
        }

        $.each(response, function(topic_index, topic){
            if(!topic.items || topic.items.length === 0)
            {
                return; // continue with the next topic
            }

            var result_wrapper = instantiateTemplate("result_wrapper");

            // .text() instead of .html(): titles come from crawled
            // third-party data and must not be interpreted as markup.
            var result_block = instantiateTemplate("result_topic");
            result_block.children(".title").text(topic.title);
            result_block.children(".providername").text(topic.provider);
            result_block.appendTo(result_wrapper);

            $.each(topic.items, function(item_index, item){
                var item_block = instantiateTemplate("result_item");
                item_block.children(".title").text(item.title);
                item_block.children(".title").attr("href", item.url);
                item_block.children(".type").text(item.type);

                // Append after the header and earlier items so the items
                // keep the order of the response (insertAfter(result_block)
                // reversed them).
                item_block.appendTo(result_wrapper);
            });

            result_wrapper.appendTo(".results");
        });

        setHandlers();
    }, "json").always(function(){
        // Hide the spinner on failure as well, not only on success.
        $(".spinner").hide();
    });
}
|
||||
|
||||
// Makes the topic headers and their +/- togglers in the result list
// clickable. Only elements inside .results are bound: the result list is
// rebuilt on every search, whereas the hidden template nodes persist and
// would otherwise collect one more click handler per search.
function setHandlers()
{
    $(".results").find(".toggler, .topic").click(function(event){
        toggleItems(this, event);
    });
}
|
||||
|
||||
// Returns a copy of the element with id "template_<template_name>", with
// the id attribute removed from the copy.
function instantiateTemplate(template_name)
{
    var template_selector = "#template_" + template_name;

    return $(template_selector).clone().removeAttr("id");
}
|
||||
|
||||
// Expands or collapses the items of the result wrapper that contains `ctx`
// (a topic header or its toggler) and flips the toggler between "+" and "-".
// The event is stopped so a click on the toggler does not also trigger the
// handler of the surrounding topic header.
function toggleItems(ctx, event)
{
    // closest() finds the enclosing .wrapper for both the header and the
    // toggler, and yields an empty set (a no-op below) when there is none,
    // instead of falling back to unrelated ancestors.
    var wrapper = $(ctx).closest(".wrapper");
    var toggler = wrapper.find(".toggler");

    // No stored state yet counts as collapsed.
    var expand = !toggler.data("toggled");

    toggler.data("toggled", expand);
    toggler.html(expand ? "-" : "+");
    wrapper.find(".item").toggle(expand);

    event.stopPropagation();
}
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
|
||||
</div>
|
||||
<div class="main">
|
||||
<div class="search-large">
|
||||
I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
|
||||
</div>
|
||||
<div class="results">
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<div id="templates">
|
||||
<div id="template_result_wrapper" class="wrapper"></div>
|
||||
<div id="template_result_topic" class="topic">
|
||||
<span class="toggler">+</span>
|
||||
<strong>Topic: </strong>
|
||||
<span class="title"></span>
|
||||
<span class="providername"></span>
|
||||
</div>
|
||||
<div id="template_result_item" class="item">
|
||||
<span class="type"></span>
|
||||
<a href="#" class="title"></a>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,47 @@
|
||||
import requests
|
||||
import oursql
|
||||
import datetime
|
||||
import json
|
||||
import lib
|
||||
|
||||
class CourseraCrawler(object):
    """Import Coursera topics and their courses into the database.

    Usage: call ``retrieve_dataset()`` to load the topic list, then
    ``parse_dataset()`` to insert every topic and its courses. A line is
    printed for each topic and course, saying whether it was inserted or
    skipped.
    """

    # Provider id under which every Coursera topic and item is stored.
    PROVIDER_ID = 2

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Load the topic list from ``coursera.json`` into ``self.dataset``."""
        # The live endpoint is kept for reference; a saved copy is read instead.
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        with open("coursera.json", "r") as dump:
            self.dataset = json.load(dump)

    def parse_dataset(self):
        """Process every topic of the loaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Insert one topic, then each of its courses under that topic."""
        inserted, rowid = self.db.insert_topic(
            self.PROVIDER_ID,
            str(item["id"]),
            item["name"],
            description=item["short_description"],
            needs_enrollment=True,
        )

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Insert one course as an item of the topic with id ``topicid``.

        The start date is appended to the title as ``YYYY-MM-DD``. When the
        date fields are missing (``None``) or do not form a valid date, the
        course is stored without a start date and titled
        "<name> (date undetermined)".
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except (TypeError, ValueError):
            # TypeError: a date part is None; ValueError: out-of-range part.
            start_date = None
            title = "%s (date undetermined)" % (course["name"])
        else:
            title = "%s: %04d-%02d-%02d" % (course["name"], start_date.year, start_date.month, start_date.day)

        inserted, itemid = self.db.insert_item(
            self.PROVIDER_ID,
            str(course["id"]),
            True,
            self.db.COURSE,
            title,
            course["home_link"],
            description=course["certificate_description"],
            start_date=start_date,
            topic_id=topicid,
        )

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
|
||||
|
||||
# Run the import as soon as this file is executed: load the dataset, then
# insert every topic and course it contains.
crawler = CourseraCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()
|
@ -0,0 +1,100 @@
|
||||
import requests
|
||||
import oursql
|
||||
import datetime
|
||||
import json, simplejson
|
||||
import lib
|
||||
import re
|
||||
|
||||
class UredditCrawler(object):
    """Import University of Reddit classes into the database.

    ``parse_catalog()`` is the entry point: it walks every category of the
    ureddit catalog and stores each class that is not known yet as a topic
    plus one course item. A line is printed for each class, saying whether
    it was inserted or skipped.
    """

    # Provider id under which every ureddit topic and item is stored.
    PROVIDER_ID = 3

    # Link wrapped around the "class page" button on a class's HTML page.
    _CLASS_PAGE_PATTERN = re.compile(r'<a href="([^"]+)"><button class="button">class page</button></a>')

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def parse_catalog(self):
        """Fetch the catalog and process each of its categories."""
        data = requests.get("http://ureddit.com/api?type=catalog").json()

        for category in data["categories"]:
            self.parse_category(category['id'], category['value'])

    def parse_category(self, category_id, category_name):
        """Process every class of one category that is not stored yet."""
        try:
            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses regardless of
            # which JSON library requests ends up using.
            print("Skipped category %s due to JSON formatting error" % category_name)
            return

        for _class in data["classes"]:
            if not self.db.topic_exists(self.PROVIDER_ID, _class['id']):
                self.parse_class(_class['id'], _class['value'], category_name)
            else:
                print("Skipped class %s" % _class['value'])

    @staticmethod
    def _find_class_page(html_data):
        """Return the absolute class page URL found in ``html_data``, or None."""
        matches = UredditCrawler._CLASS_PAGE_PATTERN.search(html_data)

        if matches is None:
            return None

        return "http://ureddit.com%s" % matches.group(1)

    def parse_class(self, class_id, class_name, category_name):
        """Fetch one class and store it as a topic with a single course item.

        ``class_name`` is only used in the message printed when the class
        data cannot be decoded; ``category_name`` is currently unused.
        """
        try:
            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
        except ValueError:
            print("Skipped %s due to JSON formatting error" % class_name)
            return

        try:
            creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # TypeError: the value is not a string (e.g. null in the JSON);
            # ValueError: it does not match the expected format.
            creation_date = None

        # Hack to get the class page as this isn't returned by the API
        html_data = requests.get("http://ureddit.com/show_class.php?id=%s&show=true" % class_id).text
        class_page = self._find_class_page(html_data)

        inserted, topic_id = self.db.insert_topic(
            self.PROVIDER_ID,
            str(class_id),
            data["name"],
            needs_enrollment=True,
            description=data["description"],
            creation_date=creation_date,
        )

        if inserted:
            print("Inserted %s" % data["name"])
        else:
            print("Skipped %s" % data["name"])

        inserted, item_id = self.db.insert_item(
            self.PROVIDER_ID,
            str(class_id),
            True,
            self.db.COURSE,
            data["name"],
            class_page,
            topic_id=topic_id,
            date=creation_date,
            description=data["description"],
        )

        if inserted:
            print("\tInserted %s" % data["name"])
        else:
            print("\tSkipped %s" % data["name"])

    # NOTE(review): the four methods below read coursera.json and insert rows
    # under provider id 2; they look like leftovers copied from the Coursera
    # crawler and are not used by parse_catalog(). Kept so existing callers
    # do not break - confirm whether they can be removed.

    def retrieve_dataset(self):
        """Load the topic list from ``coursera.json`` into ``self.dataset``."""
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        with open("coursera.json", "r") as dump:
            self.dataset = json.load(dump)

    def parse_dataset(self):
        """Process every topic of the loaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Insert one topic, then each of its courses under that topic."""
        inserted, rowid = self.db.insert_topic(
            2,
            str(item["id"]),
            item["name"],
            description=item["short_description"],
            needs_enrollment=True,
        )

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Insert one course as an item of the topic with id ``topicid``.

        When the date fields are missing (``None``) or do not form a valid
        date, the course is stored without a start date and titled
        "<name> (date undetermined)".
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except (TypeError, ValueError):
            start_date = None
            title = "%s (date undetermined)" % (course["name"])
        else:
            title = "%s: %04d-%02d-%02d" % (course["name"], start_date.year, start_date.month, start_date.day)

        inserted, itemid = self.db.insert_item(
            2,
            str(course["id"]),
            True,
            self.db.COURSE,
            title,
            course["home_link"],
            description=course["certificate_description"],
            start_date=start_date,
            topic_id=topicid,
        )

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
|
||||
|
||||
# Run the import as soon as this file is executed: walk the whole ureddit
# catalog and store every class that is not in the database yet.
crawler = UredditCrawler()
crawler.parse_catalog()
|
Loading…
Reference in New Issue