Compare commits

...

31 Commits

Author SHA1 Message Date
Sven Slootweg d98ee113bc Rewrite generic OCW parser, BeautifulSoup fix to allow exclusion of comments for string retrieval, and fix BS4 bug 12 years ago
Sven Slootweg 98340b38a0 Rewrite University of Reddit crawler - now with less hacks! 12 years ago
Sven Slootweg 8bbffb9429 Add topic_exists and item_exists methods to Scraper class 12 years ago
Sven Slootweg 0e4df4549f No need to import oursql from within the scrapers 12 years ago
Sven Slootweg 2c3bcc5418 Rewrite Khan Academy crawler 12 years ago
Sven Slootweg d9034b6215 Consistently use row_id, and not itemid or rowid 12 years ago
Sven Slootweg 8c0033074b Support both output logging and error logging in the Environment.log() method 12 years ago
Sven Slootweg b3edd35ecf Add support for lectures and sandboxes 12 years ago
Sven Slootweg d6d8eb70b9 Fix typo - it should be Khan Academy, not Khan University. 12 years ago
Sven Slootweg fb6c43a38f Rewrite scraper to be more modular, and convert the Coursera crawler to the new model 12 years ago
Sven Slootweg c2a8a66dac Update README to fix dependencies list 12 years ago
Sven Slootweg a690cb2c8f Add rudimentary first version of the OCW scraper 12 years ago
Sven Slootweg f188d443d1 Add README 12 years ago
Sven Slootweg 43c700ac2b Add list of various OCW sources for parser development 12 years ago
Sven Slootweg 26b68952fa Add table structure updates for new version of updater 12 years ago
Sven Slootweg a4e744f892 Add list of sources for book data 12 years ago
Sven Slootweg d3bd59f813 Add modified version of BeautifulSoup4 (nth-of-type pseudoselector and full-featured direct descendant support) 12 years ago
Sven Slootweg 8e951f6b27 Add simple script for searching from a terminal 12 years ago
Sven Slootweg d387541822 Support custom provider names 12 years ago
Sven Slootweg a6e350c0d9 Add dumping script 12 years ago
Sven Slootweg 0f5cade812 Simple dumper 12 years ago
Sven Slootweg fa74d394a7 Filter _ search terms 12 years ago
Sven Slootweg a9d2576eaf Add donation link 12 years ago
Sven Slootweg f57d45fa53 Add header message 12 years ago
Sven Slootweg 1503c1f75f Add 404 page 12 years ago
Sven Slootweg bfbfd821b5 Include a small preview in the search results 12 years ago
Sven Slootweg efeef5f70e Change search term requirements 12 years ago
Sven Slootweg 3f02174ba3 Implement some very basic methods to prevent overloading 12 years ago
Sven Slootweg 1fbb21e6d8 Properly use the password when connecting the crawlers 12 years ago
Sven Slootweg dd4c62bc4e Very basic error handling 12 years ago
Sven Slootweg 6ec1a2d90b Add crawlers for coursera and ureddit, get first quick and dirty version of frontend done, and fix buigs and stuff 12 years ago

@ -0,0 +1,9 @@
# Cryto Learn
This is the source code for http://learn.cryto.net/. It consists of the following:
* The updating script, a few very rudimentary scrapers for various educational sources. Requires Python 2. Dependencies are [oursql](http://packages.python.org/oursql/), [requests](http://docs.python-requests.org/en/latest/) and BeautifulSoup 4 (custom version included). Located in `updater/`.
* The frontend, a fairly hacky and messy PHP-based search interface. Needs cleaning up, but not an immediate priority. Requires PHP 5.3+ and uses [CPHP](http://github.com/joepie91/cphp). Located in `frontend/`.
* A simple shell search script, using the Cryto Learn API to search for the specified string and print results to stdout. Requires Python 2. Also very rudimentary.
Licensed under the WTFPL. It may or may not work on your system, use at your own risk, etc. etc.

@ -0,0 +1,7 @@
API:
http://www.goodreads.com/api
https://developers.google.com/books/docs/getting-started#books_api_v1
Dumps:
http://openlibrary.org/data/ol_dump_latest.txt.gz
http://www.librarything.com/feeds/

@ -0,0 +1,30 @@
{
"database": {
"driver": "mysql",
"pdo": true,
"hostname": "localhost",
"username": "root",
"password": "",
"database": "learn"
},
"locale": {
"path": "locales",
"extension": "lng",
"default_locale": "english",
"default_timezone": "Europe/Amsterdam"
},
"memcache": {
"enabled": true,
"compressed": true,
"hostname": "localhost",
"port": 11211
},
"class_map": {
"item": "Item",
"topic": "Topic"
},
"components": [
"router",
"errorhandler"
]
}

@ -0,0 +1,152 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/**
 * A single learning resource (course, video, article, ...) from the `items` table.
 *
 * On top of what the parent class resolves, two computed properties are
 * available through __get(): sTypeName and sProviderName.
 */
class Item extends CPHPDatabaseRecordClass
{
	public $table_name = "items";

	public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
	public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";

	/* Field map handed to CPHP, grouped by value type
	 * (presumably property name => column name; confirm against CPHP). */
	public $prototype = array(
		'string' => array(
			'Title'		=> "Title",
			'Description'	=> "Description",
			'SourceUrl'	=> "SourceUrl",
			'ItemUrl'	=> "ItemUrl"
		),
		'numeric' => array(
			'Type'		=> "Type",
			'Provider'	=> "Provider",
			'Views'		=> "Views",
			'TopicId'	=> "TopicId",
			'ParentId'	=> "ParentId"
		),
		'boolean' => array(
			'HasTopic'	=> "HasTopic"
		),
		'timestamp' => array(
			'CreationDate'	=> "Date",
			'StartDate'	=> "StartDate",
			'EndDate'	=> "EndDate"
		),
		'topic' => array(
			'Topic'		=> "TopicId"
		),
		'item' => array(
			'Parent'	=> "ParentId"
		)
	);

	/**
	 * Resolves the computed properties; everything else is delegated to the parent.
	 */
	public function __get($name)
	{
		if($name === "sTypeName")
		{
			return $this->GetTypeName();
		}

		if($name === "sProviderName")
		{
			return $this->GetProviderName();
		}

		return parent::__get($name);
	}

	/**
	 * Returns the lowercase label for this item's numeric type, or "unknown".
	 */
	public function GetTypeName()
	{
		$labels = array(
			1	=> "topic",
			2	=> "course",
			3	=> "video",
			4	=> "article",
			5	=> "exercise",
			6	=> "quiz",
			7	=> "test",
			8	=> "book",
			9	=> "audiobook",
			10	=> "lecture",
			11	=> "sandbox"
		);

		$type = $this->sType;

		foreach($labels as $id => $label)
		{
			/* Loose comparison: the stored value may be a numeric string rather than an integer. */
			if($type == $id)
			{
				return $label;
			}
		}

		return "unknown";
	}

	/**
	 * Returns the display name for this item's numeric provider, or "Unknown".
	 */
	public function GetProviderName()
	{
		$providers = array(
			1	=> "Khan Academy",
			2	=> "Coursera",
			3	=> "University of Reddit"
		);

		$provider = $this->sProvider;

		foreach($providers as $id => $label)
		{
			/* Loose comparison: the stored value may be a numeric string rather than an integer. */
			if($provider == $id)
			{
				return $label;
			}
		}

		return "Unknown";
	}

	/**
	 * Returns the items whose ParentId is this item's Id, or an empty array when there are none.
	 */
	public function GetChildren()
	{
		$sql = "SELECT * FROM items WHERE `ParentId` = :ParentId";
		$parameters = array(':ParentId' => $this->sId);

		try
		{
			$children = Item::CreateFromQuery($sql, $parameters);
		}
		catch (NotFoundException $e)
		{
			$children = array();
		}

		return $children;
	}

	/**
	 * Returns this item as a plain array suitable for json_encode().
	 *
	 * When $fetch_children is truthy, the "children" key holds the datasets of
	 * all child items (recursively); otherwise it is an empty array.
	 */
	public function AsDataset($fetch_children = true)
	{
		$children = array();

		if($fetch_children)
		{
			foreach($this->GetChildren() as $child_item)
			{
				$children[] = $child_item->AsDataset();
			}
		}

		$dataset = array();
		$dataset["title"] = $this->uTitle;
		$dataset["description"] = $this->uDescription;
		$dataset["url"] = $this->uItemUrl;
		$dataset["source"] = $this->uSourceUrl;
		$dataset["created"] = $this->sCreationDate;
		$dataset["start"] = $this->sStartDate;
		$dataset["end"] = $this->sEndDate;
		$dataset["type"] = $this->sTypeName;
		$dataset["provider"] = $this->sProviderName;
		$dataset["views"] = $this->sViews;
		$dataset["children"] = $children;

		return $dataset;
	}
}

@ -0,0 +1,131 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/**
 * A topic from the `topics` table; topics can nest (ParentId) and own items.
 *
 * On top of what the parent class resolves, the computed property
 * sProviderName is available through __get().
 */
class Topic extends CPHPDatabaseRecordClass
{
	public $table_name = "topics";

	public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
	public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";

	/* Field map handed to CPHP, grouped by value type
	 * (presumably property name => column name; confirm against CPHP). */
	public $prototype = array(
		'string' => array(
			'Title'		=> "Title",
			'ProviderId'	=> "ProviderId",
			'Description'	=> "Description"
		),
		'numeric' => array(
			'ParentId'	=> "ParentId",
			'Provider'	=> "Provider"
		),
		'boolean' => array(
			'NeedsEnrollment'	=> "NeedsEnrollment"
		),
		'timestamp' => array(
			'CreationDate'	=> "Created",
			'StartDate'	=> "StartDate",
			'EndDate'	=> "EndDate"
		),
		'topic' => array(
			'Parent'	=> "ParentId"
		)
	);

	/**
	 * Resolves the computed sProviderName property; everything else is delegated to the parent.
	 */
	public function __get($name)
	{
		if($name === "sProviderName")
		{
			return $this->GetProviderName();
		}

		return parent::__get($name);
	}

	/**
	 * Returns the display name for this topic's numeric provider, or "Unknown".
	 */
	public function GetProviderName()
	{
		$providers = array(
			1	=> "Khan Academy",
			2	=> "Coursera",
			3	=> "University of Reddit"
		);

		$provider = $this->sProvider;

		foreach($providers as $id => $label)
		{
			/* Loose comparison: the stored value may be a numeric string rather than an integer. */
			if($provider == $id)
			{
				return $label;
			}
		}

		return "Unknown";
	}

	/**
	 * Returns this topic as a plain array suitable for json_encode().
	 *
	 * "children" holds the datasets of the sub-topics when $fetch_children is
	 * truthy, and "items" the datasets of the topic's items when $fetch_items
	 * is truthy; each is an empty array otherwise. Nested datasets are built
	 * with their own default arguments.
	 */
	public function AsDataset($fetch_children = true, $fetch_items = true)
	{
		$subtopics = array();

		if($fetch_children)
		{
			foreach($this->GetChildren() as $subtopic)
			{
				$subtopics[] = $subtopic->AsDataset();
			}
		}

		$items = array();

		if($fetch_items)
		{
			foreach($this->GetItems() as $topic_item)
			{
				$items[] = $topic_item->AsDataset();
			}
		}

		$dataset = array();
		$dataset["title"] = $this->uTitle;
		$dataset["description"] = $this->uDescription;
		$dataset["created"] = $this->sCreationDate;
		$dataset["start"] = $this->sStartDate;
		$dataset["end"] = $this->sEndDate;
		$dataset["provider"] = $this->sProviderName;
		$dataset["needs_enrollment"] = $this->sNeedsEnrollment;
		$dataset["children"] = $subtopics;
		$dataset["items"] = $items;

		return $dataset;
	}

	/**
	 * Returns the items whose TopicId is this topic's Id, or an empty array when there are none.
	 */
	public function GetItems()
	{
		$sql = "SELECT * FROM items WHERE `TopicId` = :TopicId";
		$parameters = array(':TopicId' => $this->sId);

		try
		{
			$items = Item::CreateFromQuery($sql, $parameters);
		}
		catch (NotFoundException $e)
		{
			$items = array();
		}

		return $items;
	}

	/**
	 * Returns the topics whose ParentId is this topic's Id, or an empty array when there are none.
	 */
	public function GetChildren()
	{
		$sql = "SELECT * FROM topics WHERE `ParentId` = :ParentId";
		$parameters = array(':ParentId' => $this->sId);

		try
		{
			$subtopics = Topic::CreateFromQuery($sql, $parameters);
		}
		catch (NotFoundException $e)
		{
			$subtopics = array();
		}

		return $subtopics;
	}
}

@ -0,0 +1 @@
../../cphp

File diff suppressed because one or more lines are too long

@ -0,0 +1,26 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");
/*
 * Class autoloader: maps a class name to classes/<lowercased name>.php,
 * turning namespace separators into directory separators.
 *
 * Registered through spl_autoload_register() rather than defined as the magic
 * __autoload() function: __autoload() is deprecated as of PHP 7.2, removed in
 * PHP 8.0, and is ignored as soon as any other code registers an SPL
 * autoloader.
 */
spl_autoload_register(function($class_name)
{
	$class_path = str_replace("\\", "/", strtolower($class_name));
	require_once("classes/{$class_path}.php");
});

@ -0,0 +1,14 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
require("rewrite.php");

@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S
event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minutes-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago

@ -0,0 +1,28 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/*
 * Dumps every top-level topic (ParentId = 0), including its nested datasets,
 * as one JSON array. The request must carry the expected `key` query
 * parameter; anything else ends the request with an empty response.
 */
if(!isset($_GET['key']) || $_GET['key'] !== "derp")
{
	/* isset() first, so a request without any key does not raise an undefined-index notice. */
	die();
}

$data = array();

try
{
	foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
	{
		$data[] = $topic->AsDataset();
	}
}
catch (NotFoundException $e)
{
	/* No top-level topics at all: fall through and emit an empty JSON array,
	 * the same way the model classes treat an empty result set. */
}

echo(json_encode($data));

@ -0,0 +1,69 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/*
 * Topic search. Expects the search string in POST field `q`, splits it on
 * spaces and requires every term to occur in the topic title. The JSON
 * response is left in $sPageContents; request errors end the script with a
 * JSON object that has an "error" key.
 */
if(empty($_POST['q']) || !is_string($_POST['q']))
{
	/* The is_string() check rejects array input such as q[]=..., which explode() cannot handle. */
	die(json_encode(array(
		"error" => "No search query specified."
	)));
}

$query = $_POST['q'];

$db_query_terms = array();
$db_query_arguments = array();
$valid_term = false;

foreach(explode(" ", $query) as $term)
{
	if($term === "")
	{
		/* Produced by consecutive spaces; it would only add a match-everything LIKE '%%' clause. */
		continue;
	}

	/* Judge the length of what the user typed, before escaping: escaping turns
	 * a lone "%" or "_" into two characters, which must not make it count as a
	 * valid term. */
	$valid_term = $valid_term || (strlen($term) >= 2);

	$db_query_terms[] = "`Title` LIKE ?";

	/* Escape the LIKE wildcards and the escape character itself, so that all
	 * three are matched literally. The argument list is keyed from 1, as
	 * before - presumably because CPHP binds positional parameters by their
	 * 1-based position; confirm against CPHP. */
	$db_query_arguments[count($db_query_terms)] = "%" . addcslashes($term, "\\%_") . "%";
}

if(!$valid_term)
{
	die(json_encode(array(
		"error" => "No valid search query specified."
	)));
}

$db_query = implode(" AND ", $db_query_terms);

try
{
	$return_objects = array();

	foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments) as $topic)
	{
		$return_objects[] = $topic->AsDataset();
	}

	$sPageContents = json_encode($return_objects);
}
catch (NotFoundException $e)
{
	$sPageContents = json_encode(array("error" => "No results found for the specified query.", "query" => $query));
}

@ -0,0 +1,18 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
/* Render the "ui/index" template with the locale strings and no extra
 * variables; the result is placed in $sPageContents for the including
 * script to output. */
$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());
/* Marks this response as a UI page. NOTE(review): no reader of $sPageType is
 * visible alongside this module - confirm it is still used. */
$sPageType = "ui";

@ -0,0 +1,34 @@
<?php
/* Front controller: every request enters here and is routed to a module. */

/* Included files refuse to run ("Unauthorized.") unless $_APP is set, so it
 * has to be defined before anything is required. */
$_APP = true;
require("includes/base.php");
/* Modules place their output in $sPageContents; it is echoed at the end. */
$sPageContents = "";
$router = new CPHPRouter();
/* Presumably: accept a trailing slash and leave the query string out of route
 * matching - confirm against CPHPRouter. */
$router->allow_slash = true;
$router->ignore_query = true;
/* Regular expression on the request path => module file to run. */
$router->routes = array(
0 => array(
"^/$" => "modules/ui/index.php",
"^/api/search$" => "modules/api/search.php",
"^/api/dump$" => "modules/api/dump.php"
)
);
try
{
$router->RouteRequest();
}
catch (RouterException $e)
{
/* NOTE(review): http_status_code() is not a PHP built-in (the built-in is
 * http_response_code(), PHP 5.4+). Confirm that CPHP defines it; otherwise
 * this 404 branch ends in a fatal "undefined function" error. */
http_status_code(404);
$sPageContents = "404 not found";
}
echo($sPageContents);
/*
* */

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

@ -6,6 +6,11 @@ body
font-family: sans-serif; font-family: sans-serif;
} }
#templates
{
display: none;
}
.header .header
{ {
background-color: #C9F9DF; background-color: #C9F9DF;
@ -19,6 +24,12 @@ body
font-weight: normal; font-weight: normal;
} }
.header h2
{
margin: 2px;
font-size: 17px;
}
.search-large .search-large
{ {
color: #006824; color: #006824;
@ -55,3 +66,95 @@ body
font-size: 26px; font-size: 26px;
width: 180px; width: 180px;
} }
.spinner
{
margin-left: 14px;
}
.topic, .item
{
padding: 9px 12px;
margin: 5px 20px;
background-color: #79E1A8;
font-size: 20px;
width: 960px;
}
.topic
{
margin-top: 19px;
cursor: pointer;
}
.item
{
margin-left: 34px;
width: 926px;
font-size: 18px;
background-color: #97F3C1;
display: none;
}
.type
{
font-size: 18px;
color: gray;
}
.type:after
{
content: ":";
}
a.title
{
color: #041F9F;
}
.toggler
{
display: block;
float: left;
width: 16px;
height: 16px;
margin-top: 2px;
margin-right: 8px;
font-size: 13px;
text-align: center;
font-weight: bold;
border: 1px solid black;
background-color: #D2ECCF;
}
.providername
{
font-size: 18px;
color: gray;
}
.providername:before
{
content: "(";
}
.providername:after
{
content: ")";
}
.error
{
margin: 8px 16px;
font-size: 19px;
}
.description
{
margin-top: 4px;
font-size: 13px;
max-height: 15px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}

@ -0,0 +1,160 @@
<!doctype html>
<html>
<head>
<title>learn.cryto.net</title>
<link rel="stylesheet" href="style.css">
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
<script>
/* Handle of the pending search timer, or null while no search is scheduled. */
var search_timeout = null;

$(function(){
	/* Debounce typing: every keystroke cancels the pending search and
	 * schedules a new one 800 ms later. */
	$("input").keyup(function(){
		/* Compare against null directly: typeof never yields "null"
		 * (typeof null is "object"), so a typeof test is always true. */
		if(search_timeout !== null)
		{
			clearTimeout(search_timeout);
		}

		search_timeout = setTimeout(runSearch, 800);
	});
});
/**
 * Runs a search for the current contents of the query box and renders the
 * matching topics, each followed by its (initially hidden) items, into
 * .results. Queries shorter than 3 characters only produce a hint.
 */
function runSearch()
{
	$(".search-large").removeClass("search-large").addClass("search-top");
	$(".spinner").show();

	var query = $("input#query").val();

	if(query.length < 3)
	{
		$(".spinner").hide();
		$(".results").html("<div class='error'>Enter at least 3 characters.</div>");
		return;
	}

	$.post("/api/search", {q: query}, function(response){
		$(".spinner").hide();
		$(".results").html("");

		if(typeof response.error == "undefined")
		{
			/* $.each with callback parameters keeps the loop variables local;
			 * undeclared loop variables would leak into the global scope. */
			$.each(response, function(topic_index, topic){
				/* Topics without any items are not shown. */
				if(topic.items.length > 0)
				{
					var result_wrapper = instantiateTemplate("result_wrapper");
					var result_block = instantiateTemplate("result_topic");

					/* Titles and descriptions originate from scraped third-party
					 * pages, so they are inserted as text and never parsed as markup. */
					result_block.children(".title").text(topic.title);
					result_block.children(".description").text(topic.description);
					result_block.children(".providername").text(topic.provider);
					result_block.appendTo(result_wrapper);

					$.each(topic.items, function(item_index, item){
						var item_block = instantiateTemplate("result_item");

						item_block.children(".title").text(item.title);
						item_block.children(".title").attr("href", item.url);
						item_block.children(".type").text(item.type);

						/* Each item goes directly after the topic block, so the items
						 * end up in reverse response order.
						 * NOTE(review): confirm whether that order is intended. */
						item_block.insertAfter(result_block);
					});

					result_wrapper.appendTo(".results");
				}
			});
		}
		else
		{
			$(".results").html("<div class='error'>No results.</div>");
		}

		setHandlers();
	}, "json");
}
/**
 * Attaches the expand/collapse click handler to every toggler and topic
 * element currently in the document.
 */
function setHandlers()
{
	var onToggleClick = function(event){
		toggleItems(this, event);
	};

	$(".toggler, .topic").each(function(){
		$(this).click(onToggleClick);
	});
}
/**
 * Returns a fresh copy of the element with id "template_<template_name>",
 * with the id attribute removed from the copy.
 */
function instantiateTemplate(template_name)
{
	var template = $("#template_" + template_name);
	var copy = template.clone();

	copy.removeAttr("id");

	return copy;
}
/**
 * Click handler body: shows or hides the items of the result wrapper that
 * contains ctx, updates the toggler's "+"/"-" label and stored state, and
 * stops the event from bubbling further.
 */
function toggleItems(ctx, event)
{
	var source = $(ctx);

	/* Everything between ctx and its .wrapper; empty when ctx is a direct
	 * child of the wrapper. */
	var ancestors = source.parentsUntil(".wrapper");
	var wrapper = (ancestors.length == 0) ? source.parent() : ancestors.parent();

	var toggler = wrapper.find(".toggler");
	var state = toggler.data("toggled");
	var expand = (typeof state == "undefined" || state == false);

	toggler.data("toggled", expand);
	toggler.html(expand ? "-" : "+");

	var items = wrapper.find(".item");

	if(expand)
	{
		items.show();
	}
	else
	{
		items.hide();
	}

	event.stopPropagation();
}
</script>
</head>
<body>
<div class="header">
<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
<h2>Currently searching Coursera, Khan Academy, University of Reddit. Comments? <a href="mailto:learn@cryto.net">learn@cryto.net</a> or
<a href="irc://irc.cryto.net/crytocc">irc.cryto.net #crytocc</a></h2>
<h2>Like the service and wish to donate? <a href="http://cryto.net/~joepie91/donate.html">You can do that here :)</a></h2>
</div>
<div class="main">
<div class="search-large">
I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
</div>
<div class="results">
</div>
</div>
<div id="templates">
<div id="template_result_wrapper" class="wrapper"></div>
<div id="template_result_topic" class="topic">
<span class="toggler">+</span>
<strong>Topic: </strong>
<span class="title"></span>
<span class="providername"></span>
<div class="description"></div>
</div>
<div id="template_result_item" class="item">
<span class="type"></span>
<a href="#" class="title"></a>
</div>
</div>
</body>
</html>

@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co3": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.e": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba

@ -0,0 +1,116 @@
# AGH University of Science and Technology
http://open.agh.edu.pl/course/view.php?id=97
# Fundação Getulio Vargas - FGV Online
http://www5.fgv.br/fgvonline/CursosGratuitosFormulario.aspx?id_curso=OCWAJUEAD_00_01/2011_1
# Gunadarma University
http://ocw.gunadarma.ac.id/course/about
# Johns Hopkins Bloomberg School of Public Health
http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss
# Kaplan University Online & Campus Learning
http://ocw.kaplan.edu/arts-and-sciences/academic-strategies
# Korea University
http://ocw.korea.edu/ocw/college-of-science/general-physics-i
# Kyoto Seika University
http://www.kyotomm.jp/event/exh/kyotomagic2012.php
# Kyushu University
http://ocw.kyushu-u.ac.jp/90901/0007/index.html
# Massachusetts Institute of Technology
http://ocw.mit.edu/courses/civil-and-environmental-engineering/1-00-introduction-to-computers-and-engineering-problem-solving-fall-2005
# MOSCOW ARCHITECTURAL INSTITUTE
http://www.open-marhi.ru/courses/detail/index.php?ID=6631
# National Cheng Kung University
http://yctrtrc.ncku.edu.tw/site2/newocwcourse/OCW_MAIN.php?cid=141
# National Chiao Tung University
http://ocw.nctu.edu.tw/riki_detail.php?pgid=335
# National Dong Hwa University
http://opencourse.ndhu.edu.tw/moodle/mod/forum/discuss.php?d=3
# New Jersey Institute of Technology
http://ocw.njit.edu/ocw/som/acct/acct-615-anandarajan/index.php
# Paris Tech
http://graduateschool.paristech.fr/cours.php?id=309132
# People's Open Access Education Initiative
http://www.peoples-uni.org/node/236
# Shahid Beheshti University
http://ocw.sbu.ac.ir/Default.aspx?tabid=5352&language=fa-IR
# Students Circle Network
http://studentscircle.net/live/2011/07/a-guide-before-learning-a-new-javascript-framework/
# Taipei Medical University
http://ocw.tmu.edu.tw:8080/eduCommons/general-education/53f28a1882076b7753f24eba72698a556790-shih-chi-analysis-on-historical-figures
# The Open University
http://openlearn.open.ac.uk/course/view.php?name=DD208_3
# The Open University of Israel
http://peer-news.blogspot.com/2011/12/2-10934.html
# Tokyo Institute of Technology
http://www.ocw.titech.ac.jp/index.php?module=General&Nendo=2012&action=T0300&GakubuCD=223&GakkaCD=224710&KougiCD=70030&Gakki=1&lang=EN
# TU Delft
http://feedproxy.google.com/~r/tudelft/OCW/~3/0sA6qPQKcOg/bachelor-civiele-techniek
# Tufts University
http://ocw.tufts.edu/Course/39
# UNISUL - Universidade do Sul de Santa Catarina
http://labspace.open.ac.uk
# United Nations University
http://ocw.unu.edu/international-institute-for-software-technology/building-a-community-of-practice-for-electronic-governance
# Universidad Carlos III de Madrid
http://ocw.uc3m.es/ingenieria-electrica/accionamientos-electricos
# Universidad de Alicante
http://ocw.ua.es/Ciencias_Sociales_y_Juridicas/actividades-deportivas-medio-ambiente
# Universidad de Cantabria
http://ocw.unican.es/ciencias-de-la-salud/actuacion-en-situaciones-especiales
# Universidad de Granada
http://ocw.ugr.es/course/view.php?id=23&topic=1
# Universidad de Monterrey
http://ocw.udem.edu.mx/cursos-de-profesional/administracion-de-tecnologias-de-informacion
# Universidad de Murcia
http://ocw.um.es/cc.-sociales/actividad-fisica-en-el-envejecimiento
# Universidad de Oviedo
http://ocw.uniovi.es/course/view.php?id=28&ocw=1
# Universidad de Salamanca
http://ocw.usal.es/ciencias-sociales-1/curso-cero-matematicas-para-ciencias-sociales-nivelacion-de-conocimientos
# Universidad de Sevilla
http://ocwus.us.es/matematica-aplicada/pp-3
# Universidad de Zaragoza
http://ocw.unizar.es/ocw/ciencias-de-la-salud-1/actividades-fisicas-y-deportivas-aereas
# Universidad del Valle - Colombia
http://ocw.univalle.edu.co/ocw/ingenieria-electronica-telecomunicaciones-y-afines/arquitectura-de-procesos-industriales
# Universidad Estatal a Distancia
http://ocw.uned.ac.cr/eduCommons/ciencias-de-la-administracion/compras-y-almacenamiento
# Universidad Icesi
http://www.icesi.edu.co/ocw/tic/administracion_plataformas_y_seguridad
# Universidad Nacional de Educacion a Distancia
http://ocw.innova.uned.es/ocwuniversia/psicologia/analisis-de-datos-en-Psico-I
# Universidad Politécnica de Valencia
http://www.upv.es/ocwasi/2010/6842
# Universidad Politécnica de Madrid
http://ocw.upm.es/ingenieria-cartografica-geodesica-y-fotogrametria/3d-scanning-and-modeling
# UNIVERSIDAD TECNICA PARTICULAR DE LOJA
http://ocw.utpl.edu.ec/economia
# Universitat Autònoma de Barcelona
http://ocw.uab.cat/enginyeries/apunts-de-calcul-matricial-i-resolucio-de-sistemes
# Universitat de Barcelona
http://ocw.ub.edu/admistracio-i-direccio-dempreses
# Universitat de les Illes Balears
http://ocw.uib.es/ocw/infermeria/atencion-de-enfermeria-frente-situaciones-de
# Universitat de Lleida
http://ocw.udl.cat/arts-i-humanitats
# Universitat de València
http://ocw.uv.es/ciencias-sociales-y-juridicas/2-2
# Universitat Jaume I
http://e-ujier.uji.es/pls/www/!gri_www.euji22101?p_id=15&p_tipo=A&p_curso=IG23&p_idioma=CA
# Universitat Oberta de Catalunya
http://ocw.uoc.edu/informatica-tecnologia-i-multimedia/administracio-avancada-del-sistema-operatiu-gnu-linux
# Universiti Teknologi Malaysia
http://ocw.utm.my/course/view.php?id=90
# University of California, Irvine
http://ocw.uci.edu/courses/course.aspx?id=113
# University of Cape Town
http://opencontent.uct.ac.za/Centre-for-Higher-Education-Development/Centre-for-Open-Learning/A-developmental-state-The-challenge-ahead
# University of Massachusetts Boston
http://ocw.umb.edu:8080/eduCommons/about
# University of Michigan
http://open.umich.edu/education/med/oernetwork/med/em/aetc-redirect/2009
# University of Notre Dame
http://ocw.nd.edu/history/african-american-history-ii
# University of Sumatera Utara
http://ocw.usu.ac.id/course/detail/teknik-sipil-s1/4110000007-struktur-bangunan-sipil-i.html
# University of Tsukuba
http://ocw.tsukuba.ac.jp/6570740672698cea79d15b6678147a7679d130fb65705b665c02653b/66f87c4d7d394ecb

@ -0,0 +1,22 @@
#!/usr/bin/env python
import requests, sys, re
query = sys.argv[1]
results = requests.post("http://learn.cryto.net/api/search", {"q": query}).json()
for result in results:
name = result["title"].rstrip()
description = result["description"].strip().replace("\n", " ")
if len(description) > 200:
description = re.match("^(.{0,300})\W", description).group(1) + "..."
print "## %s\n%s" % (name, description)
for item in result["items"]:
name = item["title"].ljust(70)
print "\t[%s] %s\t%s" % (item["type"], name, item["url"])
print ""

@ -0,0 +1,2 @@
ALTER TABLE `items` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
ALTER TABLE `topics` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;

@ -0,0 +1,361 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or an object with a read() method.
        :param features: A feature name or a list of feature names used to
            look up a tree builder class when `builder` is not given.
        :param builder: A ready-made tree builder instance. When given,
            `features` is ignored.
        :param parse_only: Strainer object consulted by endData() and
            handle_starttag() to reject top-level text and tags.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param kwargs: Only legacy Beautiful Soup 3 argument names are
            accepted here; each one triggers a warning.
        :raises TypeError: For any other unexpected keyword argument.
        :raises FeatureNotFound: If no registered builder matches
            `features`.
        """
        if 'convertEntities' in kwargs:
            # Remove the ignored argument, like the other legacy names
            # below. Leaving it in kwargs would make the catch-all check
            # further down raise TypeError right after this warning.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed legacy keyword out of kwargs, warning about it.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # Anything still left is an argument we do not know about.
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the builder over self.markup and close whatever is left open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)

        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag and clear parse state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag.

        When the stack becomes empty, currentTag is left unchanged.
        """
        self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it current."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the buffered text into a `containerClass` node.

        Does nothing when no text has been buffered.
        """
        if self.currentData:
            currentData = u''.join(self.currentData)
            # A run made up only of ASCII whitespace collapses to a single
            # newline or space, unless an open tag preserves whitespace.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set(tag.name for tag in self.tagStack).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # At the top level a parse_only strainer may reject the text.
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, previous_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = previous_element or self.previous_element
        o.setup(parent, previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call, or None when nothing
        was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        # Search from the top of the stack down; index 0 (the root) is
        # never a candidate.
        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for _ in range(numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending text, then pop the stack through the named tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a piece of text until the next endData() call."""
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, then delegate
        to the BeautifulSoup constructor with the caller's arguments."""
        kwargs['features'] = 'xml'
        deprecation_message = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_message)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
    """Exception swallowed by the BeautifulSoup constructor while feeding
    markup, so raising it ends parsing without an error."""
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder offers the requested features."""
# When run as a script, pretty-print the HTML read from standard input.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())

@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
# Builder classes list these strings in their `features` attribute, and
# TreeBuilderRegistry.lookup() matches requested feature names against them.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and finds one by feature name."""

    def __init__(self):
        # feature name -> builder classes, most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for advertised in treebuilder_class.features:
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best builder class for the requested features.

        With no features, the most recently registered builder wins.
        Otherwise the result is the first builder listed for the first
        recognised feature that also offers every other recognised
        feature. Feature names that no builder advertises are ignored.
        Returns None when there is no match or nothing is registered.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        ranked = None      # priority order, taken from the first known feature
        surviving = None   # builders that match every known feature so far
        for wanted in features:
            providers = self.builders_for_feature.get(wanted, [])
            if not providers:
                continue
            if ranked is None:
                ranked = providers
                surviving = set(providers)
            else:
                # Eliminate any candidates that don't have this feature.
                surviving = surviving.intersection(set(providers))

        if surviving is None:
            # None of the requested features is known to any builder.
            return None

        # Walk the ranked list and return the first builder that survived.
        for builder_class in ranked:
            if builder_class in surviving:
                return builder_class
        return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
# This is the single module-level registry instance; register_treebuilders_from()
# adds builder classes to it.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Feature names this builder advertises to the registry.
    features = []

    is_xml = False
    # Names of tags inside which whitespace-only text is kept as-is.
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        # The soup object currently being built; set by the soup itself.
        self.soup = None

    def reset(self):
        """Hook called before parsing starts; the base class does nothing."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse `markup`; concrete builders must override this."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return a 4-tuple starting with the markup to feed.

        The base implementation returns the markup untouched, followed by
        None, None and False.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for attribute substitutions; the base class returns False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and returns it.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            # Snapshot the attribute names once. The loop below only
            # replaces values, so the set of names never changes and there
            # is no need to rebuild this on every iteration.
            present = dict(attrs)
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in present:
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[cdata_list_attr] = values
        return attrs
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Not implemented: subclasses must drive the SAX parser."""
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start-element event to the soup.

        Attribute keys may be plain names or (namespace, name) tuples.
        Only the local name is kept.
        """
        attrs = dict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in attrs.items())
        #print "Start %s, %r" % (name, attrs)
        # handle_starttag() takes (name, namespace, nsprefix, attrs);
        # namespaces are thrown away here, so pass None for both.
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name):
        """Forward an end-element event to the soup."""
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace a <meta> tag's encoding attribute with a stand-in object.

        Returns True only when the tag is a <meta> tag carrying a
        `charset` attribute; returns False for every other tag.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): meta_encoding stays None on this path, so the
            # method returns False even though a substitution was set up.
            # Left as-is; confirm whether callers rely on that.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder subclass
    is set as an attribute of the 'bs4.builder' module, appended to that
    module's __all__, and registered in its builder_registry.
    """
    # I'm fairly sure this is not the best way to do this.
    target_module = sys.modules['bs4.builder']
    for exported_name in module.__all__:
        exported = getattr(module, exported_name)
        if not issubclass(exported, TreeBuilder):
            continue
        setattr(target_module, exported_name, exported)
        target_module.__all__.append(exported_name)
        # Register the builder while we're at it.
        target_module.builder_registry.register(exported)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The registry puts the most recently registered builder first, so the
# module imported last here (lxml, when available) wins a lookup.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass

@ -0,0 +1,221 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Remember the requested encoding and return the markup untouched.

        The signature mirrors TreeBuilder.prepare_markup(), so callers can
        treat every builder the same way. `document_declared_encoding` is
        accepted for that reason only and is not used.

        :return: The 4-tuple (markup, None, None, False).
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree in self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory handed to html5lib; builds and remembers the adapter."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that writes into a Beautiful Soup object."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and wrap it as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Add a Doctype built from the token's name and ids to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and wrap it for html5lib."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap a new Comment for html5lib."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with an empty one named as a fragment."""
        # Imported here because this module does not import BeautifulSoup
        # at the top; without this the name lookup below fails.
        # NOTE(review): presumably kept out of the module-level imports to
        # avoid a circular import with the bs4 package -- confirm.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
    """Read view over a snapshot of an element's attributes.

    Reads come from a copy of `element.attrs` taken at construction time.
    Writes go straight to the element and do not update the copy.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not over names.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
class Element(html5lib.treebuilders._base.Node):
    """html5lib node that wraps a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node`, merging adjacent plain text nodes into one."""
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.soup.object_was_parsed(node.element, parent=self.element)

    def getAttributes(self):
        """Return an AttrList snapshot of the wrapped element's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Does nothing for None or an empty mapping. Tuple names are
        replaced by NamespacedAttribute objects in `attributes` itself.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy: the mapping is modified in the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` or at the end."""
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` before `refNode`, merging with preceding text."""
        index = self.element.index(refNode.element)
        # Only look at the previous sibling when there is one. With
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and the text would be glued onto the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        """Move every child of this element, in order, to `newParent`."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers use its truthiness.
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
class TextNode(Element):
    """html5lib node that wraps a Beautiful Soup object that is not a tag,
    such as a string or a comment."""

    def __init__(self, element, soup):
        # Text nodes have no name, so the html5lib node is created with None.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning a text node is not supported."""
        raise NotImplementedError

@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
#
# The first three fields of sys.version_info; "release" is the third one.
major, minor, release = sys.version_info[:3]
# True for interpreter versions 3.2.3 and later. HTMLParserTreeBuilder
# passes strict=False to the parser only when this flag is set.
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
# Feature name advertised by HTMLParserTreeBuilder in its `features` list.
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to `self.soup`.

    The `soup` attribute is assigned by the tree builder before feed()
    is called.
    """

    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into text.

        `name` is the reference without its leading "&#" and trailing ";".
        Code points that cannot be converted become U+FFFD.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith(('x', 'X')):
            # The hexadecimal marker may be written in either case.
            real_name = int(name.lstrip('xX'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Replace a known named entity; leave an unknown one as written."""
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Add `data` to the soup as a Comment node."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Add a declaration as a Doctype node, minus a "DOCTYPE " prefix."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Add a CDATA section as CData, anything else as a Declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Add `data` to the soup as a ProcessingInstruction node."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's HTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Store the arguments used later to construct the parser.

        When CONSTRUCTOR_TAKES_STRICT is set, strict=False is forced into
        the keyword arguments.
        """
        super(HTMLParserTreeBuilder, self).__init__()
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse `markup` with a fresh parser bound to self.soup.

        :raises HTMLParseError: Re-raised after a RuntimeWarning has been
            issued.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # A bare raise keeps the original traceback.
            raise
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Lenient attribute matcher, used by parse_starttag() below when the
    # parser is not in strict mode.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at offset `i` of self.rawdata.

        Returns the offset just past the tag, or the negative value from
        check_for_whole_start_tag() when the tag is incomplete.
        """
        # NOTE(review): this function is defined outside a class body, so
        # the double-underscore name below is presumably not name-mangled
        # the way it is inside HTMLParser itself -- confirm that nothing
        # relies on the mangled attribute being updated.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without a value.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno and offset are computed here but not used afterwards
            # in this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Treat the malformed tag as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Make the parser look only for the closing tag of `elem`."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch in place the builder may pass strict=False after all.
    CONSTRUCTOR_TAKES_STRICT = True

@ -0,0 +1,196 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
# Feature name advertised by LXMLTreeBuilderForXML in its `features` list.
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property
def default_parser(self):
    """Parser used when the constructor is not given one.

    This can either return a parser object or a class, which
    will be instantiated with default arguments.
    """
    xml_parser = etree.XMLParser(
        target=self, strip_cdata=False, recover=True)
    return xml_parser
def __init__(self, parser=None, empty_element_tags=None):
    """Set up the builder.

    :param parser: A parser object, or a callable that is invoked with
        target=self and strip_cdata=False to create one. When omitted,
        `self.default_parser` is used.
    :param empty_element_tags: Optional iterable of tag names, stored as
        a set on this instance.
    """
    if empty_element_tags is not None:
        self.empty_element_tags = set(empty_element_tags)

    # Fall back to the default parser when none was supplied.
    chosen = self.default_parser if parser is None else parser
    if isinstance(chosen, collections.Callable):
        # Instantiate the parser with default arguments
        chosen = chosen(target=self, strip_cdata=False)

    self.parser = chosen
    self.soup = None
    self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
                   document_declared_encoding=None):
    """
    :return: A 4-tuple (markup, original encoding, encoding
    declared within markup, whether any characters had to be
    replaced with REPLACEMENT CHARACTER).
    """
    if not isinstance(markup, unicode):
        # Byte strings are converted, trying the caller's encodings first.
        candidate_encodings = [user_specified_encoding,
                               document_declared_encoding]
        converted = UnicodeDammit(markup, candidate_encodings, is_html=True)
        return (converted.markup, converted.original_encoding,
                converted.declared_html_encoding,
                converted.contains_replacement_characters)
    # Unicode input is passed through unchanged.
    return markup, None, None, False
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
return None
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML counterpart of LXMLTreeBuilderForXML, using lxml's HTML parser."""

    is_xml = False
    features = [LXML, HTML, FAST, PERMISSIVE]

    @property
    def default_parser(self):
        """Return the lxml HTML parser class (a class, not an instance)."""
        return etree.HTMLParser

    def feed(self, markup):
        """Pass the whole document to the parser in one call, then close it."""
        html_parser = self.parser
        html_parser.feed(markup)
        html_parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment

@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import logging
# Pick an encoding auto-detection backend. Whichever import succeeds
# first decides which chardet_dammit() implementation is defined.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stand-in used when no detection library is installed."""
            return None
    else:
        def chardet_dammit(s):
            """Return the encoding name chardet guesses for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
else:
    def chardet_dammit(s):
        """Return the encoding name cchardet guesses for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
# Matches an XML declaration at the start of a document and captures
# the value of its encoding attribute. Raw strings keep the regex
# escapes (\? and \s) from being read as string escapes.
xml_encoding_re = re.compile(
    r'''^<\?.*encoding=['"](.*?)['"].*\?>'''.encode(), re.I)
# Matches a <meta> tag carrying a charset and captures the charset value.
html_meta_re = re.compile(
    r'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]'''.encode(), re.I)
class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables stored as class attributes.

        Returns a 3-tuple: a dict mapping characters to entity names, a
        dict mapping entity names to characters, and a compiled regular
        expression matching any one character from the first dict.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not begin a
    # numeric, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the named HTML
        entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Detect the encoding of `markup` and convert it to Unicode.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encoding names to try before anything
         declared in or sniffed from the document. Entries may be None.
        :param smart_quotes_to: If not None, Windows smart-quote bytes
         are replaced before decoding: 'ascii' uses MS_CHARS_TO_ASCII,
         'xml' uses numeric entities, anything else named entities.
        :param is_html: If True, also look for a <meta> charset
         declaration.

        The result is left in ``unicode_markup`` (None if every attempt
        failed) and the encoding that worked in ``original_encoding``.
        """
        # Work on a private list: accepts None or any sequence, and
        # avoids a shared mutable default argument.
        override_encodings = list(override_encodings or [])
        self.declared_html_encoding = None
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False

        if markup == '' or isinstance(markup, unicode):
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        new_markup, document_encoding, sniffed_encoding = \
            self._detectEncoding(markup, is_html)
        self.markup = new_markup

        u = None
        if new_markup != markup:
            # _detectEncoding modified the markup, then converted it to
            # Unicode and then to UTF-8. So convert it from UTF-8.
            u = self._convert_from("utf8")
            self.original_encoding = sniffed_encoding

        if not u:
            for proposed_encoding in (
                    override_encodings + [document_encoding, sniffed_encoding]):
                if proposed_encoding is not None:
                    u = self._convert_from(proposed_encoding)
                    if u:
                        break

        # If no luck and we have auto-detection library, try that:
        if not u and not isinstance(self.markup, unicode):
            u = self._convert_from(chardet_dammit(self.markup))

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convert_from(proposed_encoding)
                if u:
                    break

        # As an absolute last resort, try the encodings again with
        # character replacement.
        if not u:
            for proposed_encoding in (
                    override_encodings + [
                        document_encoding, sniffed_encoding,
                        "utf-8", "windows-1252"]):
                if proposed_encoding != "ascii":
                    u = self._convert_from(proposed_encoding, "replace")
                if u is not None:
                    logging.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # We could at this point force it to ASCII, but that would
        # destroy so much data that I think giving up is better
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character."""
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode ``self.markup`` using the proposed encoding.

        Returns the decoded markup on success. Returns None if the
        encoding is empty, was already tried with the same error
        handling, or fails to decode the markup.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately best-effort: any failure just means this
            # encoding is not the right one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases"""

        # strip Byte Order Mark (if present); a BOM overrides the
        # encoding that was passed in.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding, errors)
        return newdata

    def _detectEncoding(self, xml_data, is_html=False):
        """Given a document, tries to detect its XML encoding.

        Returns a 3-tuple (markup, declared encoding, sniffed encoding).
        The markup is re-encoded as UTF-8 when a UTF-16/UTF-32
        signature was recognised and decoding succeeded.
        """
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                     and (xml_data[2:4] != b'\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                     (xml_data[2:4] != b'\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == b'\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best-effort: if the data does not decode as
            # its signature suggested, carry on with the raw bytes.
            xml_encoding_match = None
        xml_encoding_match = xml_encoding_re.match(xml_data)
        if not xml_encoding_match and is_html:
            xml_encoding_match = html_meta_re.search(xml_data)
        if xml_encoding_match is not None:
            # 'replace' keeps a declaration containing non-ASCII bytes
            # from raising; such a name simply fails codec lookup later.
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii', 'replace').lower()
            if is_html:
                self.declared_html_encoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        """Map a charset name to a codec name Python knows.

        Tries the CHARSET_ALIASES entry (or the name itself), then the
        name with hyphens removed, then with hyphens turned into
        underscores. Falls back to returning `charset` unchanged.
        """
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A false `charset` is returned as-is.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table built on first use by _ebcdic_to_ascii().
    EBCDIC_TO_ASCII_MAP = None

    def _ebcdic_to_ascii(self, s):
        """Translate the bytestring `s` through the EBCDIC table below."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hexadecimal code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code here produced the invalid "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # Plain string like every other entry, so .encode() works on it.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    # (first lead byte, last lead byte, total length of the character)
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        Raises NotImplementedError for any other combination of
        encodings.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add its UTF-8 bytes as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,554 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
# Builder class that SoupTest.default_builder instantiates.
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
    """Base test case with helpers for parsing markup and comparing output."""

    @property
    def default_builder(self):
        """Return a new instance of the module-level default builder class."""
        builder_class = default_builder
        return builder_class()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        chosen_builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=chosen_builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        current_builder = self.default_builder
        return current_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Parse `to_parse` and check its decoded form.

        The expected output is the document built from
        `compare_parsed_to`, or from `to_parse` itself when the former
        is None.
        """
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        expected_fragment = to_parse
        if compare_parsed_to is not None:
            expected_fragment = compare_parsed_to
        self.assertEqual(parsed.decode(), self.document_for(expected_fragment))
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
    """Assert that a given doctype string is handled correctly."""
    declaration, parsed = self._document_with_doctype(doctype_fragment)

    # The first node in the tree must be a Doctype object that compares
    # equal to the fragment, and the serialized document must start
    # with the declaration.
    first_node = parsed.contents[0]
    self.assertEqual(first_node.__class__, Doctype)
    self.assertEqual(first_node, doctype_fragment)
    serialized_prefix = str(parsed)[:len(declaration)]
    self.assertEqual(serialized_prefix, declaration)

    # The doctype must not have kept the rest of the document from
    # being parsed into the tree.
    self.assertEqual(parsed.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
    """Make sure normal, everyday HTML doctypes are handled correctly."""
    everyday_doctypes = (
        "html",
        'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
    )
    for fragment in everyday_doctypes:
        self.assertDoctypeHandled(fragment)
def test_public_doctype_with_url(self):
    """A PUBLIC doctype that also carries a DTD URL is handled."""
    public_id = '"-//W3C//DTD XHTML 1.0 Transitional//EN"'
    dtd_url = '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
    self.assertDoctypeHandled('html PUBLIC ' + public_id + ' ' + dtd_url)
def test_system_doctype(self):
    """A doctype declared with a SYSTEM identifier is handled."""
    system_doctype = 'foo SYSTEM "http://www.example.com/"'
    self.assertDoctypeHandled(system_doctype)
def test_namespaced_system_doctype(self):
    """We can handle a namespaced doctype with a system ID."""
    namespaced_doctype = 'xsl:stylesheet SYSTEM "htmlent.dtd"'
    self.assertDoctypeHandled(namespaced_doctype)
def test_namespaced_public_doctype(self):
    """We can handle a namespaced doctype with a public ID."""
    namespaced_doctype = 'xsl:stylesheet PUBLIC "htmlent.dtd"'
    self.assertDoctypeHandled(namespaced_doctype)
def test_real_xhtml_document(self):
    """A real XHTML document should come out more or less the same as it went in."""
    markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
    # Newlines are stripped from both sides before comparing.
    round_tripped = self.soup(markup).encode("utf-8").replace(b"\n", b"")
    original = markup.replace(b"\n", b"")
    self.assertEqual(round_tripped, original)
def test_deepcopy(self):
    """Make sure you can copy the tree builder.

    This is important because the builder is part of a
    BeautifulSoup object, and we want to be able to copy that.
    """
    builder = self.default_builder
    # The test passes as long as the copy does not raise.
    copy.deepcopy(builder)
def test_p_tag_is_never_empty_element(self):
    """A <p> tag is never designated as an empty-element tag.

    Even if the markup shows it as an empty-element tag, it
    shouldn't be presented that way.
    """
    parsed = self.soup("<p/>")
    paragraph = parsed.p
    self.assertFalse(paragraph.is_empty_element)
    self.assertEqual(str(parsed.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
    """A tag that's not closed by the end of the document should be closed.

    This applies to all tags except empty-element tags.
    """
    unclosed_and_expected = (
        ("<p>", "<p></p>"),
        ("<b>", "<b></b>"),
        ("<br>", "<br/>"),
    )
    for unclosed, expected in unclosed_and_expected:
        self.assertSoupEquals(unclosed, expected)
def test_br_is_always_empty_element_tag(self):
    """A <br> tag is designated as an empty-element tag.

    Some parsers treat <br></br> as one <br/> tag, some parsers as
    two tags, but it should always be an empty-element tag.
    """
    parsed = self.soup("<br></br>")
    line_break = parsed.br
    self.assertTrue(line_break.is_empty_element)
    self.assertEqual(str(parsed.br), "<br/>")
def test_nested_formatting_elements(self):
    """A formatting element nested inside itself comes out unchanged."""
    nested_em = "<em><em></em></em>"
    self.assertSoupEquals(nested_em)
def test_comment(self):
    """Comments are represented as Comment objects inside the tree."""
    markup = "<p>foo<!--foobar-->baz</p>"
    self.assertSoupEquals(markup)

    soup = self.soup(markup)
    comment = soup.find(text="foobar")
    self.assertEqual(comment.__class__, Comment)

    # The comment is properly integrated into the tree: it sits
    # between the two text nodes that surround it in the markup.
    foo = soup.find(text="foo")
    self.assertEqual(comment, foo.next_element)
    baz = soup.find(text="baz")
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
    """Whitespace must be preserved in <pre> and <textarea> tags."""
    whitespace_sensitive = ("<pre>   </pre>", "<textarea> woo  </textarea>")
    for markup in whitespace_sensitive:
        self.assertSoupEquals(markup)
def test_nested_inline_elements(self):
    """Inline elements can be nested indefinitely."""
    b_tag = "<b>Inside a B tag</b>"
    self.assertSoupEquals(b_tag)

    nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
    self.assertSoupEquals(nested_b_tag)

    # Check the doubly nested markup itself; it used to be built and
    # then ignored while nested_b_tag was checked a second time.
    double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
    self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
    """Block elements can be nested."""
    parsed = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
    outer = parsed.blockquote
    # The <b> tag is reachable both through <p> and directly.
    for bold in (outer.p.b, outer.b):
        self.assertEqual(bold.string, 'Foo')
def test_correctly_nested_tables(self):
    """One table can go inside another one."""
    inner_table = '<table id="2"><tr><td>foo</td></tr></table>'
    # The outer <tr> and <table> are deliberately left unclosed here.
    markup = ('<table id="1">'
              '<tr>'
              "<td>Here's another table:"
              + inner_table +
              '</td>')
    expected = ('<table id="1"><tr><td>Here\'s another table:'
                + inner_table +
                '</td></tr></table>')
    self.assertSoupEquals(markup, expected)

    sectioned_table = (
        "<table><thead><tr><td>Foo</td></tr></thead>"
        "<tbody><tr><td>Bar</td></tr></tbody>"
        "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
    self.assertSoupEquals(sectioned_table)
def test_deeply_nested_multivalued_attribute(self):
    # html5lib may set the attributes of one tag several times while it
    # rearranges the tree, which has caused trouble for multivalued
    # attributes in the past.
    soup = self.soup('<table><div><div class="css"></div></div></table>')
    inner_div = soup.div.div
    self.assertEqual(["css"], inner_div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
    # Raw < and > inside an attribute value are expected back as
    # &lt; and &gt;.
    raw = '<a b="<a>"></a>'
    escaped = '<a b="&lt;a&gt;"></a>'
    self.assertSoupEquals(raw, escaped)
def test_entities_in_attributes_converted_to_unicode(self):
    # Decimal, hexadecimal and named references to n-with-tilde inside an
    # attribute value should all yield the same Unicode markup.
    expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
    for markup in ('<p id="pi&#241;ata"></p>',
                   '<p id="pi&#xf1;ata"></p>',
                   '<p id="pi&ntilde;ata"></p>'):
        self.assertSoupEquals(markup, expect)
def test_entities_in_text_converted_to_unicode(self):
    # Decimal, hexadecimal and named references to n-with-tilde inside
    # text should all yield the same Unicode markup.
    expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
    for markup in ("<p>pi&#241;ata</p>",
                   "<p>pi&#xf1;ata</p>",
                   "<p>pi&ntilde;ata</p>"):
        self.assertSoupEquals(markup, expect)
def test_quot_entity_converted_to_quotation_mark(self):
    # &quot; in text is expected back as a plain double quote.
    with_entities = "<p>I said &quot;good day!&quot;</p>"
    with_quotes = '<p>I said "good day!"</p>'
    self.assertSoupEquals(with_entities, with_quotes)
def test_out_of_range_entity(self):
    # Numeric character references far outside the valid range are
    # expected to become U+FFFD REPLACEMENT CHARACTER.
    expect = u"\N{REPLACEMENT CHARACTER}"
    for markup in ("&#10000000000000;",
                   "&#x10000000000000;",
                   "&#1000000000;"):
        self.assertSoupEquals(markup, expect)
def test_basic_namespaces(self):
    """Parsers don't need to *understand* namespaces, but at the
    very least they should not choke on namespaces or lose
    data."""
    markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
    soup = self.soup(markup)

    # The document encodes back to exactly the bytes that were parsed.
    self.assertEqual(markup, soup.encode())

    # All three namespace declarations are still readable as attributes
    # of the <html> tag. The tag is looked up once and reused.
    html = soup.html
    self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
    self.assertEqual(
        'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
    self.assertEqual(
        'http://www.w3.org/2000/svg', html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
    # A space-separated class attribute is exposed as a list of values.
    anchor = self.soup(b'<a class="foo bar">').a
    self.assertEqual(['foo', 'bar'], anchor['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
    """Parsers should be able to work with SoupStrainers."""
    # Parsing with a strainer for <b> is expected to leave only that tag.
    only_bold = SoupStrainer("b")
    markup = "A <b>bold</b> <meta/> <i>statement</i>"
    soup = self.soup(markup, parse_only=only_bold)
    self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
    # A single-quoted attribute value is expected back double-quoted.
    single_quoted = "<foo attr='bar'></foo>"
    double_quoted = '<foo attr="bar"></foo>'
    self.assertSoupEquals(single_quoted, double_quoted)
def test_attribute_values_with_nested_quotes_are_left_alone(self):
    # A single-quoted value that contains double quotes is checked with
    # no separate expected value.
    markup = """<foo attr='bar "brawls" happen'>a</foo>"""
    self.assertSoupEquals(markup)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
    soup = self.soup("""<foo attr='bar "brawls" happen'>a</foo>""")
    # Replace the value with one that holds both kinds of quote.
    soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
    # The decoded tag is itself fed to assertSoupEquals; the expected
    # form has the double quotes written as &quot;.
    expected = """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>"""
    self.assertSoupEquals(soup.foo.decode(), expected)
def test_ampersand_in_attribute_value_gets_escaped(self):
    # (input, expected) pairs: a bare & inside an attribute value is
    # expected back as &amp;.
    cases = (
        ('<this is="really messed up & stuff"></this>',
         '<this is="really messed up &amp; stuff"></this>'),
        ('<a href="http://example.org?a=1&b=2;3">foo</a>',
         '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'),
    )
    for markup, expected in cases:
        self.assertSoupEquals(markup, expected)
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
    # An &amp; that is already escaped is checked with no separate
    # expected value.
    markup = '<a href="http://example.org?a=1&amp;b=2;3"></a>'
    self.assertSoupEquals(markup)
def test_entities_in_strings_converted_during_parsing(self):
    # XML and HTML entities alike are turned into Unicode characters
    # while parsing. In the expected output the angle brackets are
    # written as entities again.
    markup = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
    expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
    self.assertSoupEquals(markup, expected)
def test_smart_quotes_converted_on_the_way_in(self):
    # Microsoft smart quotes (bytes \x91 and \x92) become the matching
    # Unicode quotation marks during parsing.
    paragraph = self.soup(b"<p>\x91Foo\x92</p>").p
    expected = u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}"
    self.assertEqual(paragraph.string, expected)
def test_non_breaking_spaces_converted_on_the_way_in(self):
    # Two &nbsp; entities parse into two NO-BREAK SPACE characters.
    anchor = self.soup("<a>&nbsp;&nbsp;</a>").a
    two_nbsp = u"\N{NO-BREAK SPACE}" * 2
    self.assertEqual(anchor.string, two_nbsp)
def test_entities_converted_on_the_way_out(self):
    # Encoding the parsed <p> to UTF-8 gives the e-acute as a real
    # character and the angle brackets as entities.
    markup = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
    expected_text = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
    expected = expected_text.encode("utf-8")
    paragraph = self.soup(markup).p
    self.assertEqual(paragraph.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
    # Smoke test of several related features on a small, readable
    # document.

    # The document as Unicode text. Its <meta> tag declares ISO-Latin-1
    # because that is the encoding it is about to be put into.
    unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
    iso_latin_html = unicode_html.encode("iso-8859-1")

    # Parse the ISO-Latin-1 bytes, then serialize the tree as UTF-8.
    result = self.soup(iso_latin_html).encode("utf-8")

    # The expected output is the same document, except that the <meta>
    # tag now names utf-8 and the whole thing is UTF-8 bytes.
    expected = unicode_html.replace("ISO-Latin-1", "utf-8").encode("utf-8")
    self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
    # Smoke test: a document written in Shift-JIS must not make the
    # parser choke.
    shift_jis_html = (
        b'<html><head></head><body><pre>'
        b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
        b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
        b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
        b'</pre></body></html>')
    # The bytes are decoded first, so the parser receives Unicode text.
    unicode_html = shift_jis_html.decode("shift-jis")
    soup = self.soup(unicode_html)

    # The tree must serialize correctly to more than one encoding.
    for encoding in ("utf-8", "euc_jp"):
        self.assertEqual(
            soup.encode(encoding), unicode_html.encode(encoding))
def test_real_hebrew_document(self):
    # A real-world check that ISO-8859-8 (a Hebrew encoding) can be
    # converted to UTF-8 when the encoding is given explicitly.
    hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
    soup = self.soup(hebrew_document, from_encoding="iso8859-8")

    # The encoding that was passed in is reported back as the original.
    self.assertEqual(soup.original_encoding, 'iso8859-8')

    expected = hebrew_document.decode("iso8859-8").encode("utf-8")
    self.assertEqual(soup.encode('utf-8'), expected)
def test_meta_tag_reflects_current_encoding(self):
    # A <meta> tag claiming the document is encoded in Shift-JIS...
    meta_tag = ('<meta content="text/html; charset=x-sjis" '
                'http-equiv="Content-type"/>')

    # ...placed in the <head> of a small document.
    template = ('<html><head>\n%s\n'
                '<meta http-equiv="Content-language" content="ja"/>'
                '</head><body>Shift-JIS markup goes here.')
    shift_jis_html = template % meta_tag
    soup = self.soup(shift_jis_html)

    # After parsing, the content attribute still reads as it was written.
    parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
    content = parsed_meta['content']
    self.assertEqual('text/html; charset=x-sjis', content)

    # The value is, however, a ContentMetaAttributeValue...
    self.assertTrue(isinstance(content, ContentMetaAttributeValue))

    # ...and encoding it produces a value naming the target encoding.
    self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

    # The rest of the story is covered by TestSubstitutions in
    # test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
    # An HTML5-style <meta charset> tag claiming the document is encoded
    # in Shift-JIS...
    meta_tag = ('<meta id="encoding" charset="x-sjis" />')

    # ...placed in the <head> of a small document.
    template = ('<html><head>\n%s\n'
                '<meta http-equiv="Content-language" content="ja"/>'
                '</head><body>Shift-JIS markup goes here.')
    shift_jis_html = template % meta_tag
    soup = self.soup(shift_jis_html)

    # After parsing, the charset attribute still reads as it was written.
    parsed_meta = soup.find('meta', id="encoding")
    charset = parsed_meta['charset']
    self.assertEqual('x-sjis', charset)

    # The value is, however, a CharsetMetaAttributeValue...
    self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

    # ...and encoding it produces the name of the target encoding.
    self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
    # A tag parsed without attributes accepts a new one by item
    # assignment, and the attribute shows up when the tag is decoded.
    anchor = self.soup("<a>text</a>").a
    anchor['foo'] = 'bar'
    self.assertEqual('<a foo="bar">text</a>', anchor.decode())
class XMLTreeBuilderSmokeTest(object):
    # Mixin of smoke tests for XML tree builders. The methods rely on
    # self.soup, self.assertSoupEquals and the unittest assertions, which
    # the class it is combined with must provide.

    def test_docstring_generated(self):
        # Encoding a parsed document puts an XML declaration in front.
        expected = b'<?xml version="1.0" encoding="utf-8"?>\n<root/>'
        soup = self.soup("<root/>")
        self.assertEqual(soup.encode(), expected)

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_popping_namespaced_tag(self):
        # Several namespaced siblings followed by a plain tag must
        # serialize back to the original markup.
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        rss = self.soup(markup).rss
        self.assertEqual(unicode(rss), markup)

    def test_docstring_includes_correct_encoding(self):
        # The declaration names the encoding that was asked for.
        expected = b'<?xml version="1.0" encoding="latin1"?>\n<root/>'
        soup = self.soup("<root/>")
        self.assertEqual(soup.encode("latin1"), expected)

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        filler = b'0' * (2**12)
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + filler
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        # A tag without content is expected in empty-element form; a tag
        # with content is checked with no separate expected value.
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root = self.soup(markup).root
        # Both namespace declarations stay readable on the root tag.
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        paragraph = self.soup(markup).p
        self.assertEqual(unicode(paragraph), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        foo = self.soup(markup).foo
        self.assertEqual(unicode(foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        # The xml: prefix is used here without any declaration.
        markup = '<foo xml:lang="fr">bar</foo>'
        foo = self.soup(markup).foo
        self.assertEqual(unicode(foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # XHTML is not HTML5, so HTML5 parsers are not held to any
        # particular handling of XHTML documents; the inherited test is
        # replaced with a no-op.
        pass

    def test_html_tags_have_namespace(self):
        anchor = self.soup("<a>").a
        self.assertEqual("http://www.w3.org/1999/xhtml", anchor.namespace)

    def test_svg_tags_have_namespace(self):
        # The <svg> tag and the <circle> inside it both report the SVG
        # namespace.
        svg_namespace = "http://www.w3.org/2000/svg"
        soup = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_namespace, soup.svg.namespace)
        self.assertEqual(svg_namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        # The <math> tag and the <msqrt> inside it both report the MathML
        # namespace.
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        soup = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_namespace, soup.math.namespace)
        self.assertEqual(mathml_namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        soup = self.soup('<?xml version="1.0" encoding="utf-8"?><html></html>')
        first = soup.contents[0]
        # The XML declaration ends up as a Comment holding its text...
        self.assertTrue(isinstance(first, Comment))
        self.assertEqual(first, '?xml version="1.0" encoding="utf-8"?')
        # ...and the element that follows it is the <html> tag.
        self.assertEqual("html", first.next_element.name)
def skipIf(condition, reason):
    # Build a decorator that swaps the decorated object for a do-nothing
    # function when `condition` is true and returns it untouched
    # otherwise. `reason` is accepted but not used by this implementation.
    def nothing(test, *args, **kwargs):
        # Stand-in for a skipped test: accepts any call, returns None.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator

@ -0,0 +1 @@
"The beautifulsoup tests."

@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        # Look builders up by a pair of features. The lxml and html5lib
        # expectations apply only when those libraries could be imported.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)

        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)

        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        # Look builders up by markup type alone. The expected builder
        # depends on which optional libraries are installed.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
            return

        # Without lxml nothing handles XML.
        self.assertEqual(registry.lookup('xml'), None)
        if HTML5LIB_PRESENT:
            expected_html_builder = HTML5TreeBuilder
        else:
            expected_html_builder = HTMLParserTreeBuilder
        self.assertEqual(registry.lookup('html'), expected_html_builder)

    def test_named_library(self):
        # Look builders up by the name of the library behind them.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)

        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        # `features` may be a single string...
        BeautifulSoup("", features="html")
        # ...or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # When no builder matches, the constructor raises ValueError.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")
class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        # Every test starts from an empty registry of its own.
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        # Create a throwaway class whose `features` attribute is the given
        # tuple, register it with the test registry and hand it back.
        class_name = 'Builder_' + '_'.join(feature_list)
        cls = type(class_name, (object,), {'features' : feature_list})
        self.registry.register(cls)
        return cls

    def test_register_with_no_features(self):
        featureless = self.builder_for_features()

        # A builder that advertises no features cannot be found by
        # feature.
        self.assertEqual(self.registry.lookup('foo'), None)

        # A lookup without features does find it, as it happens to be the
        # only builder registered.
        self.assertEqual(self.registry.lookup(), featureless)

    def test_register_with_features_makes_lookup_succeed(self):
        builder = self.builder_for_features('foo', 'bar')
        for feature in ('foo', 'bar'):
            self.assertEqual(self.registry.lookup(feature), builder)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        self.builder_for_features('foo')
        newest = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), newest)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(self.registry.lookup(), None)

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        # Registration order matters here, so the six registrations stay
        # in this exact sequence. Only two of the classes are needed
        # afterwards.
        self.builder_for_features('foo')
        self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        self.builder_for_features('bar')
        self.builder_for_features('foo')

        # Two builders offer both 'foo' and 'bar'; the one that also has
        # 'quux' was registered later of the two.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # Exactly one builder offers 'foo', 'bar' and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        # 'bar' and 'baz' each exist, but never on the same builder.
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
# Python 2: make classes defined in this module new-style by default.
__metaclass__ = type
# Public names of this module.
# NOTE(review): additional_tests() is commented out further down in this
# module, so the name listed here is not defined and a star-import of the
# module would fail -- confirm whether the function should be restored or
# the entry dropped.
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
# Option flags for doctest runs, combined into one bitmask: ELLIPSIS,
# NORMALIZE_WHITESPACE and REPORT_NDIFF.
DOCTEST_FLAGS = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
DOCTEST_FLAGS |= doctest.REPORT_NDIFF
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

@ -0,0 +1,72 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
# Probe for the optional html5lib tree builder. HTML5LIB_PRESENT records
# whether the import worked, and the test class below is skipped when it
# did not.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # The exception object is not needed, so it is not bound. This form
    # is valid on Python 2 and Python 3 alike.
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh html5lib tree builder on every access.
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers: the
        # whole document is parsed anyway and a warning says so.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        # Markup that already has explicit table sections is checked with
        # no separate expected value.
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is
        # connected. encode() produces bytes, so the expected value is a
        # bytes literal, and assertEqual replaces the deprecated
        # assertEquals alias.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    # Runs the shared HTML smoke tests against the html.parser builder.

    @property
    def default_builder(self):
        # A fresh html.parser tree builder on every access.
        return HTMLParserTreeBuilder()

    # html.parser cannot handle namespaced doctypes, so the two inherited
    # tests for them are replaced with no-ops.

    def test_namespaced_system_doctype(self):
        pass

    def test_namespaced_public_doctype(self):
        pass

@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
# Probe for the optional lxml tree builders. LXML_PRESENT records whether
# the import worked, and the test classes below are skipped when it did
# not.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # The exception object is not needed, so it is not bound. This form
    # is valid on Python 2 and Python 3 alike.
    LXML_PRESENT = False
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh lxml HTML tree builder on every access.
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # For this builder the expected result is that an out-of-range
        # numeric character reference disappears from the text.
        for markup in ("<p>foo&#10000000000000;bar</p>",
                       "<p>foo&#x10000000000000;bar</p>",
                       "<p>foo&#1000000000;bar</p>"):
            self.assertSoupEquals(markup, "<p>foobar</p>")

    def test_beautifulstonesoup_is_xml_parser(self):
        # The deprecated BeautifulStoneSoup class should use an XML
        # builder when one is installed: <b /> comes back in
        # empty-element form.
        with warnings.catch_warnings(record=False):
            soup = BeautifulStoneSoup("<b />")
            self.assertEqual(u"<b/>", unicode(soup.b))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Compare with newlines removed on both sides and the XML
        # declaration removed from the expected value.
        actual = soup.encode("utf-8").replace(b"\n", b'')
        expected = markup.replace(b'\n', b'').replace(
            b'<?xml version="1.0" encoding="utf-8"?>', b'')
        self.assertEqual(actual, expected)
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh lxml XML tree builder on every access.
        return LXMLTreeBuilderForXML()

@ -0,0 +1,378 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import logging
import unittest
import sys
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
# Probe for the optional lxml tree builders. LXML_PRESENT records whether
# the import worked, and is used below to skip the BeautifulStoneSoup
# test.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # The exception object is not needed, so it is not bound. This form
    # is valid on Python 2 and Python 3 alike.
    LXML_PRESENT = False
# Interpreter-version flags, used below to skip a test on older
# interpreters: anything before 2.7, and the 3.x releases before 3.2.
_major_minor = sys.version_info[:2]
PYTHON_2_PRE_2_7 = _major_minor < (2, 7)
PYTHON_3_PRE_3_2 = (3, 0) <= _major_minor < (3, 2)
class TestDeprecatedConstructorArguments(SoupTest):
    # Old-style constructor arguments and the old BeautifulStoneSoup class
    # must keep working while issuing a warning.

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        # The warning mentions both the old and the new argument name.
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        # The old argument still takes effect: only the <b> tag is kept.
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        # The warning mentions both the old and the new argument name.
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        # The old argument still takes effect.
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    @skipIf(
        not LXML_PRESENT,
        "lxml not present, not testing BeautifulStoneSoup.")
    def test_beautifulstonesoup(self):
        with warnings.catch_warnings(record=True) as caught:
            # Ask for every warning to be recorded so that the default
            # once-per-location filter does not hide the one we want.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<markup>")
        self.assertTrue(isinstance(soup, BeautifulSoup))
        # Inspect the recorded warnings themselves. Asserting on the bare
        # string literal would always pass, whether or not a warning was
        # issued.
        messages = [str(warning.message) for warning in caught]
        self.assertTrue(
            any("BeautifulStoneSoup class is deprecated" in message
                for message in messages))
class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        # With a strainer for <b>, only the <b> tags and what is inside
        # them are expected in the output.
        only_bold = SoupStrainer("b")
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        soup = self.soup(markup, parse_only=only_bold)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The class itself, not an instance, is what the tests call.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Characters that have a named HTML entity are substituted;
        # others, such as the snowman, are left as they are.
        text = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        expected = u"foo&forall;\N{SNOWMAN}&otilde;bar"
        self.assertEqual(self.sub.substitute_html(text), expected)

    def test_smart_quote_substitution(self):
        # MS smart quotes are a frequent source of trouble, so they get a
        # test of their own.
        dammit = UnicodeDammit(b"\x91\x92foo\x93\x94")
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        text = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(text, False), text)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        # (value, expected quoted form) pairs.
        cases = (
            ("Welcome", '"Welcome"'),
            ("Bob's Bar", '"Bob\'s Bar"'),
        )
        for value, quoted in cases:
            self.assertEqual(self.sub.substitute_xml(value, True), quoted)

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        text = 'Welcome to "my bar"'
        expected = "'Welcome to \"my bar\"'"
        self.assertEqual(self.sub.substitute_xml(text, True), expected)

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        text = 'Welcome to "Bob\'s Bar"'
        expected = '"Welcome to &quot;Bob\'s Bar&quot;"'
        self.assertEqual(self.sub.substitute_xml(text, True), expected)

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        text = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(text), text)

    def test_xml_quoting_handles_angle_brackets(self):
        escaped = self.sub.substitute_xml("foo<bar>")
        self.assertEqual(escaped, "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        escaped = self.sub.substitute_xml("AT&T")
        self.assertEqual(escaped, "AT&amp;T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        # The & that opens &Aacute; is kept; the bare one is escaped.
        escaped = self.sub.substitute_xml("&Aacute;T&T")
        self.assertEqual(escaped, "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
    # Tests of Beautiful Soup's ability to decode from and encode to
    # various encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        # The same small document as Unicode text and as UTF-8 bytes.
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Spell the bytes out once so the reader can see them.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII bytes in, Unicode out, with original_encoding set.
        ascii_markup = b"<foo>a</foo>"
        soup = self.soup(ascii_markup)
        decoded = soup.decode()
        self.assertTrue(isinstance(decoded, unicode))
        self.assertEqual(decoded, self.document_for(ascii_markup.decode()))
        self.assertEqual(soup.original_encoding.lower(), "ascii")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone and original_encoding stays unset.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 bytes in, Unicode out.
        soup = self.soup(self.utf8_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # A tree built from Unicode text can be encoded as UTF-8.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # An attribute whose name is a snowman must survive parsing and
        # encoding to UTF-8.
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        div = self.soup(markup).div
        self.assertEqual(div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
def test_smart_quotes_to_unicode(self):
    # With no smart_quotes_to argument, the four Windows smart-quote
    # bytes become the matching Unicode quotation marks.
    converted = UnicodeDammit(b"<foo>\x91\x92\x93\x94</foo>")
    expected = u"<foo>\u2018\u2019\u201c\u201d</foo>"
    self.assertEqual(converted.unicode_markup, expected)
def test_smart_quotes_to_xml_entities(self):
    # smart_quotes_to="xml" turns the smart quotes into numeric
    # character references.
    converted = UnicodeDammit(b"<foo>\x91\x92\x93\x94</foo>",
                              smart_quotes_to="xml")
    expected = "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
    self.assertEqual(converted.unicode_markup, expected)
def test_smart_quotes_to_html_entities(self):
    # smart_quotes_to="html" turns the smart quotes into named HTML
    # entities.
    converted = UnicodeDammit(b"<foo>\x91\x92\x93\x94</foo>",
                              smart_quotes_to="html")
    expected = "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
    self.assertEqual(converted.unicode_markup, expected)
def test_smart_quotes_to_ascii(self):
    # smart_quotes_to="ascii" turns the smart quotes into plain ASCII
    # quote characters.
    converted = UnicodeDammit(b"<foo>\x91\x92\x93\x94</foo>",
                              smart_quotes_to="ascii")
    expected = """<foo>''""</foo>"""
    self.assertEqual(converted.unicode_markup, expected)
def test_detect_utf8(self):
    # Two bytes that form a UTF-8 e-acute are decoded as such and the
    # detected encoding is reported as utf-8.
    detected = UnicodeDammit(b"\xc3\xa9")
    self.assertEqual(detected.unicode_markup, u'\xe9')
    self.assertEqual(detected.original_encoding.lower(), 'utf-8')
def test_convert_hebrew(self):
    # With iso-8859-8 suggested, the four bytes decode to four Hebrew
    # letters and that encoding is reported as the original.
    hebrew_bytes = b"\xed\xe5\xec\xf9"
    converted = UnicodeDammit(hebrew_bytes, ["iso-8859-8"])
    self.assertEqual(converted.original_encoding.lower(), 'iso-8859-8')
    self.assertEqual(converted.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
    # These bytes must be recognised as UTF-8 and must round-trip back
    # to UTF-8 unchanged.
    original = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
    detected = UnicodeDammit(original)
    self.assertEqual(detected.original_encoding.lower(), 'utf-8')
    self.assertEqual(detected.unicode_markup.encode("utf-8"), original)
def test_ignore_inappropriate_codecs(self):
    # A suggested encoding that does not fit the data is passed over:
    # UTF-8 is still what gets detected.
    utf8_bytes = u"Räksmörgås".encode("utf-8")
    detected = UnicodeDammit(utf8_bytes, ["iso-8859-8"])
    self.assertEqual(detected.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
    # Suggested encoding names that are not real codecs are passed over:
    # UTF-8 is still what gets detected, for each bad name in turn.
    utf8_bytes = u"Räksmörgås".encode("utf-8")
    bogus_names = ['.utf8', '...', 'utF---16.!']
    for bogus in bogus_names:
        detected = UnicodeDammit(utf8_bytes, [bogus])
        self.assertEqual(detected.original_encoding.lower(), 'utf-8')
def test_detect_html5_style_meta_tag(self):
    # The charset in an HTML5-style <meta> tag is picked up whether the
    # value is double-quoted, single-quoted or unquoted, and with or
    # without a space before the closing slash.
    variants = (
        b'<html><meta charset="euc-jp" /></html>',
        b"<html><meta charset='euc-jp' /></html>",
        b"<html><meta charset=euc-jp /></html>",
        b"<html><meta charset=euc-jp/></html>",
    )
    for data in variants:
        detected = UnicodeDammit(data, is_html=True)
        self.assertEqual("euc-jp", detected.original_encoding)
def test_last_ditch_entity_replacement(self):
    # The document below claims to be UTF-8 but contains byte strings
    # that are not valid UTF-8 (they come from another encoding).
    #
    # As no single encoding fits, Unicode, Dammit is expected to fall
    # back to UTF-8 in the end and put REPLACEMENT CHARACTER where the
    # incompatible bytes were.
    #
    # With chardet installed, the document would be judged convertible
    # to ISO-8859-1 without errors. That is the wrong encoding, but a
    # consistent one, so the code under test would never run. chardet
    # detection is therefore switched off for the duration of the test.
    doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
    original_detector = bs4.dammit.chardet_dammit
    logging.disable(logging.WARNING)
    try:
        def noop(markup):
            # Stand-in detector that never proposes an encoding.
            return None
        bs4.dammit.chardet_dammit = noop

        dammit = UnicodeDammit(doc)
        self.assertEqual(True, dammit.contains_replacement_characters)
        self.assertTrue(u"\ufffd" in dammit.unicode_markup)

        soup = BeautifulSoup(doc, "html.parser")
        self.assertTrue(soup.contains_replacement_characters)
    finally:
        # Turn logging back on and restore the real detector, even when
        # an assertion above failed.
        logging.disable(logging.NOTSET)
        bs4.dammit.chardet_dammit = original_detector
def test_sniffed_xml_encoding(self):
    # UTF-16LE input goes through a separate code path that sniffs the
    # byte order mark at the start of the data.
    utf16le_bytes = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
    detector = UnicodeDammit(utf16le_bytes)
    self.assertEqual(u"<a>áé</a>", detector.unicode_markup)
    self.assertEqual("utf-16le", detector.original_encoding)
def test_detwingle(self):
    # Three snowmen, encoded as UTF-8.
    snowmen = (u"\N{SNOWMAN}" * 3).encode("utf8")

    # A curly-quoted sentence, encoded as Windows-1252.
    quoted = (
        u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
        u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

    # Glue the two encodings together into one byte string.
    mixed = snowmen + quoted + snowmen

    # As a whole the result is not valid UTF-8 ...
    self.assertRaises(UnicodeDecodeError, mixed.decode, "utf8")

    # ... but after detwingle() the Windows-1252 part has been
    # re-encoded, so the whole thing decodes as UTF-8.
    repaired = UnicodeDammit.detwingle(mixed)
    self.assertEqual(
        u"☃☃☃“Hi, I like Windows!”☃☃☃", repaired.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
    # Each of these characters has a UTF-8 representation ending
    # in \x93. \x93 is a smart quote if interpreted as
    # Windows-1252. But our code knows to skip over multibyte
    # UTF-8 characters, so they'll survive the process unscathed.
    for tricky_unicode_char in (
        u"\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
        u"\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
        # 4-byte char '\xf0\x90\x90\x93' (U+10413). The previous literal
        # u"\xf0\x90\x90\x93" was four separate Latin-1 code points, so
        # the 4-byte path was never actually tested.
        u"\U00010413",
    ):
        utf8_bytes = tricky_unicode_char.encode("utf8")
        self.assertTrue(utf8_bytes.endswith(b'\x93'))
        output = UnicodeDammit.detwingle(utf8_bytes)
        self.assertEqual(output, utf8_bytes)
class TestNamedspacedAttribute(SoupTest):
    # Equality behaviour of NamespacedAttribute values.

    def test_name_may_be_none(self):
        # Without a name, the attribute compares equal to the bare prefix.
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual(attr, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        twin = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, twin)

        # The namespace itself plays no part in the comparison.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, no_namespace)

        # A different name or a different prefix makes them unequal.
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, other_name)

        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, other_prefix)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    # Attribute values that substitute the target charset when encoded.

    # This method used to share its name with the one below, so the
    # second definition replaced it and this test never ran.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

@ -10,9 +10,20 @@ class Database(object):
TEST = 7 TEST = 7
BOOK = 8 BOOK = 8
AUDIOBOOK = 9 AUDIOBOOK = 9
LECTURE = 10
def __init__(self, host, user, password=None, database="learn"): def __init__(self, host, user, password=None, database="learn"):
self.database = oursql.connect(host=host, user=user, db=database) self.database = oursql.connect(host=host, user=user, passwd=password, db=database)
def topic_exists(self, provider, unique_id):
    """Return True if a topic row exists for this provider / provider-id pair."""
    cursor = self.database.cursor()
    query = "SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1"
    cursor.execute(query, (provider, unique_id))
    rows = cursor.fetchall()
    return len(rows) > 0
def item_exists(self, provider, unique_id):
    """Return True if an item row exists for this provider / provider-id pair."""
    cursor = self.database.cursor()
    query = "SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1"
    cursor.execute(query, (provider, unique_id))
    rows = cursor.fetchall()
    return len(rows) > 0
def insert_topic(self, provider, unique_id, title, override=False, **kwargs): def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
defaults = { defaults = {
@ -21,7 +32,8 @@ class Database(object):
"start_date": None, "start_date": None,
"end_date": None, "end_date": None,
"parent_id": 0, "parent_id": 0,
"description": "" "description": "",
"provider_name": ""
} }
for kwarg, val in defaults.iteritems(): for kwarg, val in defaults.iteritems():
@ -43,9 +55,9 @@ class Database(object):
if exists == True: if exists == True:
return (False, results[0][0]) return (False, results[0][0])
else: else:
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`)" c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'], "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'])) kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
return (True, c.lastrowid) return (True, c.lastrowid)
@ -56,7 +68,10 @@ class Database(object):
"topic_id": 0, "topic_id": 0,
"parent_id": 0, "parent_id": 0,
"description": "", "description": "",
"date": None "date": None,
"start_date": None,
"end_date": None,
"provider_name": ""
} }
for kwarg, val in defaults.iteritems(): for kwarg, val in defaults.iteritems():
@ -78,8 +93,8 @@ class Database(object):
if exists == True: if exists == True:
return (False, results[0][0]) return (False, results[0][0])
else: else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`)" c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"], "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"])) kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid) return (True, c.lastrowid)

@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
    """Import *modulename* and copy its names into this module's globals.

    Double-underscore module metadata (``__name__``, ``__file__``,
    ``__doc__``, ``__builtins__``, ...) is deliberately not copied:
    overwriting it would make this module report the identity of
    whichever module happened to be imported last.
    """
    module = __import__(modulename)
    target = globals()
    for name, value in vars(module).items():
        if name.startswith("__") and name.endswith("__"):
            continue
        target[name] = value
# Put this directory first on sys.path so that sibling modules can be
# imported by their bare name, then pull every module found here into
# the package namespace.
sys.path.insert(0, my_path)
try:
    for fname in os.listdir(my_path):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)

        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Undo the path change even when one of the imports raised, so a
    # broken module does not leave sys.path modified for the process.
    sys.path.remove(my_path)

@ -0,0 +1,50 @@
import datetime, json, sys
import requests
import shared
class Coursera(shared.Scraper):
    """Scraper that stores Coursera topics and their courses."""

    provider_id = 2

    def run(self):
        """Download the topic list, then store everything in it."""
        self.retrieve_dataset()
        self.parse_dataset()

    def retrieve_dataset(self):
        """Fetch the full topic list and keep the decoded JSON in ``self.dataset``."""
        response = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1")
        self.dataset = response.json()

    def parse_dataset(self):
        """Process every entry of the downloaded dataset."""
        for entry in self.dataset:
            self.process_item(entry)

    def process_item(self, item):
        """Store one topic and each of the courses listed under it."""
        inserted, row_id = self.insert_topic(
            str(item["id"]), item["name"],
            description=item["short_description"], needs_enrollment=True)

        if inserted:
            message = "Inserted topic %s"
        else:
            message = "Skipped topic %s"
        self.env.log(message % item["name"])

        for course in item["courses"]:
            self.process_course(course, row_id)

    def process_course(self, course, topicid):
        """Store one course as an item belonging to the topic row *topicid*."""
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except TypeError:
            # A date component that is not a number (e.g. null in the
            # API response) means no start date can be built.
            start_date = None

        title = self.generate_title(course['name'], start_date)

        inserted, row_id = self.insert_item(
            str(course["id"]), title, course["home_link"],
            has_topic=True, itemtype=self.COURSE,
            description=course["certificate_description"],
            start_date=start_date, topic_id=topicid)

        if inserted:
            message = "Inserted item %s"
        else:
            message = "Skipped item %s"
        self.env.log(message % title)

    def generate_title(self, name, date):
        """Return the course name with its start date (or a placeholder) appended."""
        if date is not None:
            return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
        return "%s (date undetermined)" % name

@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared
from bs4 import BeautifulSoup
import bs4
# Shared HTTP session; every request made through it identifies the scraper.
rsess = requests.Session()
rsess.headers.update({
    'User-Agent': 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)',
})
class OpenCourseWare(shared.Scraper):
    """Scraper for the course listings on ocwconsortium.org.

    For every course it collects the metadata shown on the consortium's
    own course page and, for hosts with a dedicated parser, the metadata
    published on the provider's site. The merged result is currently
    only logged; nothing is inserted yet (see the TODO in
    ``parse_course``).
    """

    def run(self):
        """Process every source linked from the browse-by-source page."""
        overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
        soup = BeautifulSoup(overview)

        for element in soup.find(id="pagecontent")("a"):
            # The numeric source id is the last path segment of the link.
            self.process_source(int(element["href"].split("/")[-1]), element.string)

    def process_source(self, source_id, source_name):
        """Parse the courses listed for one source.

        :param source_id: numeric id of the source on ocwconsortium.org
        :param source_name: display name of the source
        """
        data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
        soup = BeautifulSoup(data)

        courses = soup.select("table#cfResultsTable tr")

        # NOTE(review): only the first three rows are processed; this looks
        # like a development limit - confirm before relying on full coverage.
        for course in courses[:3]:
            links = course("a")

            # Both links are needed (external course page and details
            # page). Rows with fewer links used to raise IndexError.
            if len(links) < 2:
                continue

            external = links[0]
            details = links[1]
            self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)

    def parse_course(self, course_name, course_url, course_id, source_name):
        """Collect and log the metadata for a single course.

        Courses hosted on a site without a provider parser are skipped
        after the consortium metadata has been fetched.
        """
        self.env.log("Parsing %s" % course_url)

        # First fetch metadata from ocwconsortium.org
        ocw_data = self._metadata_ocw(course_id)
        ocw_data["providername"] = source_name
        ocw_data["url"] = course_url

        # Now fetch metadata from the particular course provider
        provider_data = self._metadata_provider(course_url)

        if provider_data is not False:
            # Provider values take precedence over the consortium's.
            data = ocw_data.copy()
            data.update(provider_data)

            # TODO: insert data
            self.env.log(repr(data))

    def _metadata_ocw(self, course_id):
        """Return the metadata dict from the consortium's page for *course_id*.

        Returns an empty dict when the page has no ``dl.coursepage``
        element (this used to raise IndexError).
        """
        soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
        matches = soup.select("dl.coursepage")

        if not matches:
            # No metadata provided by ocwconsortium.
            return {}

        metadata = matches[0]
        return self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))

    def _parse_ocw_dl(self, dd, dt):
        """Turn parallel lists of label and value elements into a dict.

        :param dd: elements whose text is the label (trailing colon removed)
        :param dt: elements whose text is the matching value
        :returns: dict using the keys ``tags``, ``providername``,
            ``language``, ``author``, ``license`` and ``creation_date``
        """
        data = {}

        # zip() stops at the shorter list, so an unpaired trailing label
        # can no longer raise IndexError.
        for label_element, value_element in zip(dd, dt):
            label = label_element.string.strip().rstrip(":")
            value = value_element.string

            if value is not None:
                value = value.strip()

            if label == "Tags":
                if value is None:
                    data["tags"] = []
                else:
                    data["tags"] = [x.strip() for x in value.split(",")]
            elif label == "Source":
                data["providername"] = value
            elif label == "Language":
                data["language"] = value
            elif label == "Link":
                # We can ignore this, we already have it anyway
                pass
            elif label == "Author":
                data["author"] = value
            elif label == "License":
                data["license"] = value
            elif label == "Date Published":
                if value is None:
                    # An empty date used to crash strptime with TypeError.
                    data["creation_date"] = None
                else:
                    data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
            else:
                self.env.log("UNKNOWN: %s => %s" % (label, value), True)

        return data

    def _metadata_provider(self, url):
        """Dispatch *url* to the parser registered for its host.

        :returns: the parser's metadata dict, or ``False`` when no parser
            is registered for the host
        """
        providers = {
            "oer.avu.org": self._metadata_avu,
            "ocw.capilanou.ca": self._metadata_capilano,
            "ocw.hokudai.ac.jp": self._metadata_hokkaido,
            "ocw.ie.edu": self._metadata_ie,
            "ocw.jhsph.edu": self._metadata_hopkins,
        }

        # Third "/"-separated piece of "scheme://host/path" is the host.
        host = url.split("/")[2]

        for provider, func in providers.items():
            if host.endswith(provider):
                return func(url)

        return False

    def _metadata_avu(self, url):
        """Parse the full-record table of an African Virtual University item."""
        # African Virtual University
        soup = BeautifulSoup(rsess.get(url + "?show=full").text)
        table = soup.select("table.ds-includeSet-table")[0]
        data = {"providername": "African Virtual University"}

        for row in table("tr"):
            cells = row("td")

            # Rows without a label cell and a value cell carry no metadata.
            if len(cells) < 2:
                continue

            label = cells[0].string
            value = cells[1].string

            if label == "dc.identifier.uri":
                data["identifier_uri"] = value
            elif label == "dc.type":
                data["object_type"] = value
            elif label == "dc.date.accessioned":
                data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.date.issued":
                data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
            elif label == "dc.date.available":
                data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.language.iso":
                data["language"] = value
            elif label == "dc.description.abstract":
                data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
            elif label == "dc.contributor.author":
                data["author"] = value
            elif label == "dc.title":
                data["title"] = value
            else:
                self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)

        return data

    def _metadata_capilano(self, url):
        """Parse title and description from a Capilano University course page."""
        # Capilano University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Capilano University"}

        data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
        data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()

        return data

    def _metadata_hokkaido(self, url):
        """Parse title and description from a Hokkaido University course page."""
        # Hokkaido University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Hokkaido University"}

        data["title"] = soup.select("#MAIN h1")[0].string.strip()
        data["description"] = soup.select("#MAIN p")[0].string.strip()

        return data

    def _metadata_ie(self, url):
        """Parse title, description and author from an IE University course page."""
        # IE University
        # The course id is whatever follows the first "=" in the URL.
        course_id = url.split("=")[1]
        soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
        data = {"providername": "IE University"}

        data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
        data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
        data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()

        return data

    def _metadata_hopkins(self, url):
        """Parse title, author and description from a Johns Hopkins course page."""
        # Johns Hopkins Bloomberg School of Public Health
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}

        data["title"] = self.soup_to_text(soup.select("h1")[-1])
        data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
        data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))

        return data

@ -0,0 +1,197 @@
import datetime, json, sys
import requests
import shared
class KhanAcademy(shared.Scraper):
    """Scraper that walks the Khan Academy topic tree and stores its contents."""

    provider_id = 1

    def run(self):
        """Download the topic tree and process it from the root down."""
        self.retrieve_dataset()
        self.process_item(self.dataset, 0)

    def retrieve_dataset(self):
        """Fetch the topic tree and keep the decoded JSON in ``self.dataset``."""
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()

    def process_item(self, item, level, parent=None):
        """Store *item* according to its kind, then recurse into its children.

        :param item: one node of the topic tree (a dict)
        :param level: depth of the node; the root is 0
        :param parent: the parent node, or None for the root
        """
        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            self.process_topic(item, level, parent=parent)
        elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
            self.process_object(item, level, parent=parent)
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)

        try:
            children = item["children"]
        except KeyError:
            return

        for child in children:
            self.process_item(child, level + 1, item)

    @staticmethod
    def _parent_row_id(parent):
        """Return the row id stored on *parent*, or None if there is none.

        Covers both a missing parent (None) and a parent that was never
        stored and therefore carries no ``_cl_id`` key.
        """
        try:
            return parent["_cl_id"]
        except (KeyError, TypeError):
            return None

    def process_topic(self, item, level, parent=None):
        """Store a topic node and remember its row id on the node."""
        unique_id = item["id"]

        # NOTE(review): the parent's row id is not passed to insert_topic,
        # so every topic is stored with the default parent_id. The previous
        # code computed it (catching only TypeError) but never used it -
        # confirm whether the hierarchy is meant to be stored.

        # Check if a title is set
        if item["title"] is not None:
            title = item["title"]
        else:
            # No title was set - log this as an error and default to 'Untitled'.
            self.env.log("No title found for item: %s" % repr(item), True)
            title = "Untitled"

        # A missing (None) description is passed through unchanged.
        description = item["description"]

        # Insert the topic
        inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)

        # Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
        item["_cl_id"] = row_id

        if inserted:
            self.env.log("Inserted %s" % title)
        else:
            self.env.log("Skipped %s" % title)

    def process_object(self, item, level, parent=None):
        """Store a video, exercise, article or scratchpad node.

        The node is skipped (with an error logged) when no identifier
        can be found, or when a non-article has no URL at all.
        """
        # Identifier: prefer 'readable_id', then 'name', then 'id'.
        unique_id = item.get("readable_id")

        if unique_id is None:
            unique_id = item.get("name")

        if unique_id is None and "id" in item:
            unique_id = str(item["id"])

        # If we *still* do not have an identifier, log the error and bail out
        if unique_id is None:
            self.env.log("No suitable identifier found for item: %s" % repr(item), True)
            return

        # Determine the object type
        kind_to_type = {
            "Video": self.VIDEO,
            "Exercise": self.EXERCISE,
            "Article": self.ARTICLE,
            "Scratchpad": self.SANDBOX,
        }
        itemtype = kind_to_type[item["kind"]]

        # Source URL: prefer 'ka_url', fall back to 'url'.
        source_url = item.get("ka_url")

        if source_url is None:
            source_url = item.get("url")

        # Articles can lack a URL; anything else without one is an error.
        if source_url is None and itemtype != self.ARTICLE:
            self.env.log("No source URL found for non-article object: %s" % repr(item), True)
            return

        # Determine the (external) item URL
        if "url" in item:
            item_url = item["url"]
        else:
            # Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
            item_url = source_url

        # If the object is an article, we'll want to use the actual article content as description.
        if itemtype == self.ARTICLE:
            description = item["content"]
        else:
            # Otherwise use the 'description' property if there is one.
            description = item.get("description")

        # Title: prefer 'title', then 'display_name'.
        title = item.get("title")

        if title is None:
            if "display_name" in item:
                title = item["display_name"]
            else:
                # Apparently it really does not have a title. Log the error and default to 'Untitled'.
                self.env.log("No object title found for item: %s" % repr(item), True)
                title = "Untitled"

        # If a 'views' property is present, include it.
        views = item.get("views")

        # If a creation date is present, include it.
        try:
            date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
        except (KeyError, TypeError):
            # Key missing, or present but null (strptime rejects None).
            date = None

        # Check if there is a parent ID
        parent_id = self._parent_row_id(parent)

        if parent_id is None:
            # No parent ID present - log this as an error and default to 0.
            # Previously a missing parent (None) raised an uncaught TypeError.
            self.env.log("No parent ID found for item: %s" % repr(item), True)
            parent_id = 0

        # Insert the item
        inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)

        # Store the resulting row ID in the item so that the children know the ID of their parent.
        item["_cl_id"] = row_id

        if inserted:
            self.env.log("Inserted %s" % title)
        else:
            self.env.log("Skipped %s" % title)

@ -0,0 +1,55 @@
import datetime, json, simplejson, sys, re
import requests
import shared
class UniversityOfReddit(shared.Scraper):
    """Scraper that stores University of Reddit classes as topic + course item."""

    provider_id = 3

    def run(self):
        """Fetch the catalog and process every category in it."""
        data = requests.get("http://ureddit.com/api?type=catalog").json()

        for category in data["categories"]:
            self.parse_category(category['id'], category['value'])

    def parse_category(self, category_id, category_name):
        """Process every class of one category that is not stored as a topic yet."""
        try:
            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
        except simplejson.decoder.JSONDecodeError:
            # Previously skipped without a trace; log it like parse_class does.
            self.env.log("Skipped category %s due to JSON formatting error" % category_name, True)
            return

        for _class in data["classes"]:
            if not self.topic_exists(_class['id']):
                self.parse_class(_class['id'], _class['value'], category_name)
            else:
                self.env.log("Skipped class %s" % _class['value'])

    def parse_class(self, class_id, class_name, category_name):
        """Fetch one class and store it, provided its status is 1, 3 or 5."""
        try:
            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
        except simplejson.decoder.JSONDecodeError:
            self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
            return

        if data["status"] not in ('1', '3', '5'):
            self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))
            return

        try:
            creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Timestamp not in the expected format; store no creation date.
            creation_date = None

        class_page = data["url"]

        inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)

        if inserted:
            self.env.log("Inserted topic %s" % data["name"])
        else:
            self.env.log("Skipped topic %s" % data["name"])

        # The class is also stored as a course item under its own topic.
        inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])

        if inserted:
            self.env.log("Inserted item %s" % data["name"])
        else:
            self.env.log("Skipped item %s" % data["name"])

@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
    """Import *modulename* and copy its names into this module's globals.

    Double-underscore module metadata (``__name__``, ``__file__``,
    ``__doc__``, ``__builtins__``, ...) is deliberately not copied:
    overwriting it would make this module report the identity of
    whichever module happened to be imported last.
    """
    module = __import__(modulename)
    target = globals()
    for name, value in vars(module).items():
        if name.startswith("__") and name.endswith("__"):
            continue
        target[name] = value
# Put this directory first on sys.path so that sibling modules can be
# imported by their bare name, then pull every module found here into
# the package namespace.
sys.path.insert(0, my_path)
try:
    for fname in os.listdir(my_path):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)

        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Undo the path change even when one of the imports raised, so a
    # broken module does not leave sys.path modified for the process.
    sys.path.remove(my_path)

@ -0,0 +1,17 @@
import oursql, sys
class Environment(object):
    """Shared runtime context for scrapers: database connection and logging."""

    def connect(self, host="localhost", username="root", password="", database="learn"):
        """Open the database connection and keep it on ``self.db``."""
        connection = oursql.connect(host=host, user=username, passwd=password, db=database)
        self.db = connection
        self.connected = True

    def log(self, text, is_error=False):
        """Write *text* plus a newline to stdout, or to stderr for errors."""
        stream = sys.stdout if is_error == False else sys.stderr
        stream.write(text + "\n")

    def Scraper(self, scraper_class):
        """Instantiate *scraper_class* with the connection and attach this environment."""
        instance = scraper_class(self.db)
        instance.env = self
        return instance

@ -0,0 +1,122 @@
class Scraper(object):
    """Base class for provider scrapers.

    Subclasses set ``provider_id`` and implement ``run()``. Storage goes
    through ``self.db``, a connection object offering ``cursor()`` and
    accepting ``?`` placeholders.
    """

    # Item type codes stored in the `Type` column of `items`.
    UNKNOWN = 0
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9
    LECTURE = 10
    SANDBOX = 11

    provider_id = 0

    _TOPIC_LOOKUP = "SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1"
    _ITEM_LOOKUP = "SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1"

    def __init__(self, database=None):
        """Remember *database*; ``can_store`` tells whether one was given."""
        self.can_store = database is not None
        if self.can_store:
            self.db = database

    def run(self, *args, **kwargs):
        """Entry point of a scraper; subclasses must override it."""
        raise Exception("No run() method was specified for this scraper.")

    def _lookup(self, query, unique_id):
        """Run a lookup *query* for this provider and return the matching rows."""
        cursor = self.db.cursor()
        cursor.execute(query, (self.provider_id, unique_id))
        return cursor.fetchall()

    @staticmethod
    def _fill_defaults(kwargs, defaults):
        """Set every key of *defaults* that is missing or None in *kwargs*."""
        for key, fallback in defaults.items():
            if kwargs.get(key) is None:
                kwargs[key] = fallback

    def topic_exists(self, unique_id):
        """Return True if this provider already has a topic with *unique_id*."""
        return len(self._lookup(self._TOPIC_LOOKUP, unique_id)) > 0

    def item_exists(self, unique_id):
        """Return True if this provider already has an item with *unique_id*."""
        return len(self._lookup(self._ITEM_LOOKUP, unique_id)) > 0

    def insert_topic(self, unique_id, title, override=False, **kwargs):
        """Insert a topic unless it already exists.

        :param unique_id: the provider's own identifier for the topic
        :param title: topic title
        :param override: when true, skip the existence check and always insert
        :param kwargs: optional ``needs_enrollment``, ``creation_date``,
            ``start_date``, ``end_date``, ``parent_id``, ``description``,
            ``provider_name``; missing or None values get a default
        :returns: ``(True, new_row_id)`` after an insert, or
            ``(False, existing_row_id)`` when the topic was already stored
        """
        self._fill_defaults(kwargs, {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
            "provider_name": "",
        })

        if not override:
            matches = self._lookup(self._TOPIC_LOOKUP, unique_id)
            if matches:
                return (False, matches[0][0])

        cursor = self.db.cursor()
        cursor.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
            kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))

        return (True, cursor.lastrowid)

    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
        """Insert an item unless it already exists.

        :param unique_id: the provider's own identifier for the item
        :param title: item title
        :param item_url: URL of the item; also the default ``source_url``
        :param override: when true, skip the existence check and always insert
        :param kwargs: optional ``views``, ``has_topic``, ``itemtype``,
            ``source_url``, ``topic_id``, ``parent_id``, ``description``,
            ``date``, ``start_date``, ``end_date``, ``provider_name``;
            missing or None values get a default
        :returns: ``(True, new_row_id)`` after an insert, or
            ``(False, existing_row_id)`` when the item was already stored
        """
        self._fill_defaults(kwargs, {
            "views": None,
            "has_topic": False,
            "itemtype": 0,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
            "start_date": None,
            "end_date": None,
            "provider_name": "",
        })

        if not override:
            matches = self._lookup(self._ITEM_LOOKUP, unique_id)
            if matches:
                return (False, matches[0][0])

        cursor = self.db.cursor()
        cursor.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
            kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))

        return (True, cursor.lastrowid)

    def soup_to_text(self, soup):
        """Join the strings of *soup* with single spaces.

        *soup* is first treated as a collection of elements, each asked
        for its strings via ``_all_strings(True, True)``. If an element
        lacks that method, the strings are taken from *soup* itself.
        """
        strings = []

        try:
            for el in soup:
                strings += el._all_strings(True, True)
        except AttributeError:
            # Anything gathered so far is dropped in favour of the
            # strings of the whole object.
            strings = soup._all_strings(True, True)

        return " ".join(strings)

@ -0,0 +1,4 @@
import update_ocw
# Manual check: print what the crawler's get_provider_data() returns for
# a single Johns Hopkins course URL.
c = update_ocw.OpenCourseWareCrawler()
print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")

@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers
# Connect to the local "learn" database, then build the OpenCourseWare
# scraper through the environment (which hands it the connection and
# attaches itself as `env`) and run it.
env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")
scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()

@ -1,131 +0,0 @@
import requests
import oursql
import datetime
import json
import lib
class KhanUniversityCrawler(object):
    """Crawler that stores the Khan Academy topic tree through ``lib.Database``."""

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Fetch the topic tree and keep the decoded JSON in ``self.dataset``."""
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()

    def parse_dataset(self):
        """Process the downloaded tree starting at its root."""
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        """Store *item* according to its kind, then recurse into its children.

        :param item: one node of the topic tree (a dict)
        :param level: depth of the node; the root is 0
        :param parent: the parent node, or None for the root
        :raises KeyError: when an object has none of the identifier keys
        """
        # Imported here because the file's import block does not provide
        # `sys`; the stderr writes below used to fail with NameError.
        import sys

        try:
            kind = item["kind"]
        except KeyError:
            return

        if kind == "Topic":
            unique_id = item["id"]

            try:
                parent_id = parent["_cl_id"]
            except TypeError:
                parent_id = 0

            if item["title"] is not None:
                title = item["title"]
            else:
                title = ""

            inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
            # Remember the row id so children can refer to their parent.
            item["_cl_id"] = rowid

            if inserted:
                print("Inserted %s" % title)
            else:
                print("Skipped %s" % title)
        elif kind in ("Video", "Exercise", "Article"):
            # Identifier: 'readable_id', then 'name', then 'id'.
            if "readable_id" in item:
                unique_id = item["readable_id"]
            elif "name" in item:
                unique_id = item["name"]
            else:
                try:
                    unique_id = str(item["id"])
                except KeyError:
                    print(repr(item))
                    sys.stderr.write("WARNING: No suitable identifier found for item\n")
                    # Re-raise; the `return` that followed was unreachable.
                    raise

            if item["kind"] == "Video":
                itemtype = self.db.VIDEO
            elif item["kind"] == "Exercise":
                itemtype = self.db.EXERCISE
            elif item["kind"] == "Article":
                itemtype = self.db.ARTICLE

            try:
                source_url = item["ka_url"]
            except KeyError:
                if itemtype == self.db.ARTICLE:
                    source_url = ""
                else:
                    # Non-articles without a source URL are dropped,
                    # together with any children they may have.
                    return

            if "url" in item:
                item_url = item["url"]
            elif "ka_url" in item:
                item_url = item["ka_url"]
            else:
                item_url = None

            if itemtype == self.db.ARTICLE:
                description = item["content"]
            else:
                try:
                    description = item["description"]
                except KeyError:
                    description = None

            if "title" in item:
                title = item["title"]
            elif "display_name" in item:
                title = item["display_name"]
            else:
                title = "Untitled"

            try:
                views = item["views"]
            except KeyError:
                views = None

            try:
                date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
            except KeyError:
                date = None

            inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
            item["_cl_id"] = rowid

            if inserted:
                print("Inserted %s" % title)
            else:
                print("Skipped %s" % title)
        elif kind == "Separator":
            pass  # Ignore separators
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
            sys.stderr.write("%s\n" % (repr(item)))

        try:
            children = item["children"]
        except KeyError:
            pass
        else:
            for child in children:
                self.process_item(child, level + 1, item)
# Script entry point: download the Khan Academy topic tree, then walk it
# and store its topics and objects in the database.
crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()

@ -0,0 +1,288 @@
import requests
import oursql
import datetime
import json
import lib
from bs4 import BeautifulSoup
import bs4
def combine_dict(a, b):
    """Return a copy of *a* updated with *b*; neither input is modified."""
    merged = a.copy()
    merged.update(b)
    return merged
# Shared HTTP session; every request made through it identifies the scraper.
rsess = requests.Session()
rsess.headers.update({
    'User-Agent': 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)',
})
class OpenCourseWareCrawler(object):
def __init__(self):
    """Open the database handle used by this crawler."""
    database = lib.Database("localhost", "root", password="")
    self.db = database
def parse_catalog(self):
    """Parse every source linked from the browse-by-source page."""
    response = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource")
    soup = BeautifulSoup(response.text)

    for anchor in soup.find(id="pagecontent")("a"):
        # The numeric source id is the last path segment of the link.
        source_id = int(anchor["href"].split("/")[-1])
        self.parse_source(source_id, anchor.string)
def parse_source(self, source_id, source_name):
    """Parse the courses listed for one source.

    :param source_id: numeric id of the source on ocwconsortium.org
    :param source_name: display name of the source (printed as a heading)
    """
    data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
    soup = BeautifulSoup(data)

    courses = soup.select("table#cfResultsTable tr")

    print("# " + source_name)

    # NOTE(review): only the first two rows are processed; this looks
    # like a development limit - confirm before relying on full coverage.
    for course in courses[:2]:
        links = course("a")

        # Both links are needed (external course page and details page).
        # Rows with a single link used to raise IndexError.
        if len(links) < 2:
            continue

        external = links[0]
        details = links[1]
        self.parse_course(external.string, external["href"], details["href"].split("/")[-1])
def parse_course(self, course_name, course_url, course_id):
    """Fetch consortium and provider metadata for one course.

    Prints the course URL and, when the provider lookup returned
    something other than an empty dict, its repr.
    """
    # First fetch metadata from ocwconsortium.org
    print(course_url)

    metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
    matches = metadata_soup.select("dl.coursepage")

    if matches:
        metadata = matches[0]
        data = self.parse_dl(metadata.select("dd"), metadata.select("dt"))
    else:
        # No metadata provided by ocwconsortium. (Indexing the empty
        # result used to raise IndexError before this branch was reached.)
        data = {}

    # NOTE(review): `data` is not used any further here.

    # Now fetch metadata from the particular course provider
    provider_data = self.get_provider_data(course_url)

    if provider_data != {}:
        print(repr(provider_data))
def parse_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["source"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
print "UNKNOWN: %s => %s" % (label, value)
return data
def get_provider_data(self, url):
providers = {
"oer.avu.org": self._data_avu,
"ocw.capilanou.ca": self._data_capilano,
"ocw.hokudai.ac.jp": self._data_hokkaido,
"ocw.ie.edu": self._data_ie,
"ocw.jhsph.edu": self._data_hopkins,
}
""",
"ocw.kaplan.edu": self._data_kaplan,
"ocw.korea.edu": self._data_korea,
"kyotomm.jp": self._data_kyoto,
"ocw.kyushu-u.ac.jp": self._data_kyushu,
"open-marhi.ru": self._data_moscow,
"yctrtrc.ncku.edu.tw": self._data_chengkung,
"ocw.nctu.edu.tw": self._data_chiaotung,
"opencourse.ndhu.edu.tw": self._data_donghwa,
"ocw.njit.edu": self._data_njit,
"graduateschool.paristech.fr": self._data_paris,
"peoples-uni.org": self._data_oaei,
"ocw.sbu.ac.ir": self._data_shahid,
"studentscircle.net": self._data_studentscircle,
"ocw.tmu.edu.tw:8080": self._data_taipei,
"openlearn.open.ac.uk": self._data_openuni,
"www.ocw.titech.ac.jp": self._data_tokyo,
"feedproxy.google.com": self._data_tudelft,
"ocw.tufts.edu": self._data_tufts,
"ocw.unu.edu": self._data_un,
"ocw.uc3m.es": self._data_madrid,
"ocw.ua.es": self._data_alicante,
"ocw.unican.es": self._data_cantabria,
"ocw.ugr.es": self._data_granada,
"ocw.udem.edu.mx": self._data_monterrey,
"ocw.um.es": self._data_murcia,
"ocw.uniovi.es": self._data_oviedo,
"ocw.usal.es": self._data_salamanca,
"ocwus.us.es": self._data_sevilla,
"ocw.unizar.es": self._data_zaragoza,
"ocw.univalle.edu.co3": self._data_colombia,
"ocw.uned.ac.cr": self._data_distancia,
"www.icesi.edu.co": self._data_icesi,
"ocw.innova.uned.es": self._data_innova,
"upv.es": self._data_valencia,
"ocw.upm.es": self._data_upm,
"ocw.utpl.edu.ec": self._data_utpl,
"ocw.uab.cat": self._data_uab,
"ocw.ub.edu": self._data_ub,
"ocw.uib.es": self._data_uib,
"ocw.udl.cat": self._data_udl,
"ocw.uv.es": self._data_uv,
"e-ujier.uji.e": self._data_uji,
"ocw.uoc.edu": self._data_uoc,
"ocw.utm.my": self._data_utm,
"ocw.uci.edu": self._data_uci,
"opencontent.uct.ac.za": self._data_uct,
"ocw.umb.edu:8080": self._data_boston,
"open.umich.edu": self._data_michigan,
"ocw.nd.edu": self._data_notredame,
"ocw.usu.ac.id": self._data_usu,
"ocw.tsukuba.ac.jp": self._data_tsukaba"""
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
data = func(url)
return data
def _data_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
print "UNKNOWN KEY: %s => %s" % (label, value)
return data
def _data_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _data_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _data_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _data_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()
return data
def parse_dataset(self):
for item in self.dataset:
self.process_item(item)
def process_item(self, item):
inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
if inserted:
print "Inserted %s" % item["name"]
else:
print "Skipped %s" % item["name"]
for course in item["courses"]:
self.process_course(course, rowid)
def process_course(self, course, topicid):
try:
start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
except TypeError, e:
start_date = None
title = "%s (date undetermined)" % (course["name"])
inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
if inserted:
print "\tInserted %s" % title
else:
print "\tSkipped %s" % title
#crawler = OpenCourseWareCrawler()
#crawler.parse_catalog()
Loading…
Cancel
Save