Compare commits
31 commits

d98ee113bc
98340b38a0
8bbffb9429
0e4df4549f
2c3bcc5418
d9034b6215
8c0033074b
b3edd35ecf
d6d8eb70b9
fb6c43a38f
c2a8a66dac
a690cb2c8f
f188d443d1
43c700ac2b
26b68952fa
a4e744f892
d3bd59f813
8e951f6b27
d387541822
a6e350c0d9
0f5cade812
fa74d394a7
a9d2576eaf
f57d45fa53
1503c1f75f
bfbfd821b5
efeef5f70e
3f02174ba3
1fbb21e6d8
dd4c62bc4e
6ec1a2d90b
README.md | 9 lines | Normal file
@@ -0,0 +1,9 @@
# Cryto Learn

This is the source code for http://learn.cryto.net/. It consists of the following:

* The updating script, a few very rudimentary scrapers for various educational sources. Requires Python 2. Dependencies are [oursql](http://packages.python.org/oursql/), [requests](http://docs.python-requests.org/en/latest/) and BeautifulSoup 4 (custom version included). Located in `updater/`.
* The frontend, a fairly hacky and messy PHP-based search interface. Needs cleaning up, but not an immediate priority. Requires PHP 5.3+ and uses [CPHP](http://github.com/joepie91/cphp). Located in `frontend/`.
* A simple shell search script, using the Cryto Learn API to search for the specified string and print results to stdout. Requires Python 2. Also very rudimentary.

Licensed under the WTFPL. It may or may not work on your system, use at your own risk, etc. etc.
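As a quick illustration of the API that the shell search script wraps (the query string here is an example; shellsearch/search.py later in this diff is the real implementation):

    import requests

    # POST a search query to the /api/search endpoint and print each matching topic.
    results = requests.post("http://learn.cryto.net/api/search", {"q": "calculus"}).json()
    for topic in results:
        print topic["title"]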
book_data_sources.txt | 7 lines | Normal file
@@ -0,0 +1,7 @@
API:
http://www.goodreads.com/api
https://developers.google.com/books/docs/getting-started#books_api_v1

Dumps:
http://openlibrary.org/data/ol_dump_latest.txt.gz
http://www.librarything.com/feeds/
config.json | 30 lines | Normal file
@@ -0,0 +1,30 @@
{
    "database": {
        "driver": "mysql",
        "pdo": true,
        "hostname": "localhost",
        "username": "root",
        "password": "",
        "database": "learn"
    },
    "locale": {
        "path": "locales",
        "extension": "lng",
        "default_locale": "english",
        "default_timezone": "Europe/Amsterdam"
    },
    "memcache": {
        "enabled": true,
        "compressed": true,
        "hostname": "localhost",
        "port": 11211
    },
    "class_map": {
        "item": "Item",
        "topic": "Topic"
    },
    "components": [
        "router",
        "errorhandler"
    ]
}
frontend/classes/item.php | 152 lines | Normal file
@@ -0,0 +1,152 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

class Item extends CPHPDatabaseRecordClass
{
    public $table_name = "items";
    public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";

    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'Description' => "Description",
            'SourceUrl' => "SourceUrl",
            'ItemUrl' => "ItemUrl"
        ),
        'numeric' => array(
            'Type' => "Type",
            'Provider' => "Provider",
            'Views' => "Views",
            'TopicId' => "TopicId",
            'ParentId' => "ParentId"
        ),
        'boolean' => array(
            'HasTopic' => "HasTopic"
        ),
        'timestamp' => array(
            'CreationDate' => "Date",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Topic' => "TopicId"
        ),
        'item' => array(
            'Parent' => "ParentId"
        )
    );

    public function __get($name)
    {
        switch($name)
        {
            case "sTypeName":
                return $this->GetTypeName();
                break;
            case "sProviderName":
                return $this->GetProviderName();
                break;
            default:
                return parent::__get($name);
                break;
        }
    }

    public function GetTypeName()
    {
        switch($this->sType)
        {
            case 1:
                return "topic";
            case 2:
                return "course";
            case 3:
                return "video";
            case 4:
                return "article";
            case 5:
                return "exercise";
            case 6:
                return "quiz";
            case 7:
                return "test";
            case 8:
                return "book";
            case 9:
                return "audiobook";
            case 10:
                return "lecture";
            case 11:
                return "sandbox";
            default:
                return "unknown";
        }
    }

    public function GetProviderName()
    {
        switch($this->sProvider)
        {
            case 1:
                return "Khan Academy";
            case 2:
                return "Coursera";
            case 3:
                return "University of Reddit";
            default:
                return "Unknown";
        }
    }

    public function GetChildren()
    {
        try
        {
            return Item::CreateFromQuery("SELECT * FROM items WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            return array();
        }
    }

    public function AsDataset($fetch_children = true)
    {
        $child_data = array();

        if($fetch_children == true)
        {
            foreach($this->GetChildren() as $child)
            {
                $child_data[] = $child->AsDataset();
            }
        }

        return array(
            "title" => $this->uTitle,
            "description" => $this->uDescription,
            "url" => $this->uItemUrl,
            "source" => $this->uSourceUrl,
            "created" => $this->sCreationDate,
            "start" => $this->sStartDate,
            "end" => $this->sEndDate,
            "type" => $this->sTypeName,
            "provider" => $this->sProviderName,
            "views" => $this->sViews,
            "children" => $child_data
        );
    }
}
frontend/classes/topic.php | 131 lines | Normal file
@@ -0,0 +1,131 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

class Topic extends CPHPDatabaseRecordClass
{
    public $table_name = "topics";
    public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
    public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";

    public $prototype = array(
        'string' => array(
            'Title' => "Title",
            'ProviderId' => "ProviderId",
            'Description' => "Description"
        ),
        'numeric' => array(
            'ParentId' => "ParentId",
            'Provider' => "Provider"
        ),
        'boolean' => array(
            'NeedsEnrollment' => "NeedsEnrollment"
        ),
        'timestamp' => array(
            'CreationDate' => "Created",
            'StartDate' => "StartDate",
            'EndDate' => "EndDate"
        ),
        'topic' => array(
            'Parent' => "ParentId"
        )
    );

    public function __get($name)
    {
        switch($name)
        {
            case "sProviderName":
                return $this->GetProviderName();
                break;
            default:
                return parent::__get($name);
                break;
        }
    }

    public function GetProviderName()
    {
        switch($this->sProvider)
        {
            case 1:
                return "Khan Academy";
            case 2:
                return "Coursera";
            case 3:
                return "University of Reddit";
            default:
                return "Unknown";
        }
    }

    public function AsDataset($fetch_children = true, $fetch_items = true)
    {
        $child_data = array();

        if($fetch_children == true)
        {
            foreach($this->GetChildren() as $child)
            {
                $child_data[] = $child->AsDataset();
            }
        }

        $item_data = array();

        if($fetch_items == true)
        {
            foreach($this->GetItems() as $item)
            {
                $item_data[] = $item->AsDataset();
            }
        }

        return array(
            "title" => $this->uTitle,
            "description" => $this->uDescription,
            "created" => $this->sCreationDate,
            "start" => $this->sStartDate,
            "end" => $this->sEndDate,
            "provider" => $this->sProviderName,
            "needs_enrollment" => $this->sNeedsEnrollment,
            "children" => $child_data,
            "items" => $item_data
        );
    }

    public function GetItems()
    {
        try
        {
            return Item::CreateFromQuery("SELECT * FROM items WHERE `TopicId` = :TopicId", array(':TopicId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            return array();
        }
    }

    public function GetChildren()
    {
        try
        {
            return Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
        }
        catch (NotFoundException $e)
        {
            return array();
        }
    }
}
frontend/cphp | 1 line | Symbolic link
@@ -0,0 +1 @@
../../cphp
frontend/dump.json | 1 line | Normal file
File diff suppressed because one or more lines are too long
frontend/includes/base.php | 26 lines | Normal file
@@ -0,0 +1,26 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");

function __autoload($class_name)
{
    global $_APP;

    $class_name = str_replace("\\", "/", strtolower($class_name));
    require_once("classes/{$class_name}.php");
}
frontend/index.php | 14 lines | Normal file
@@ -0,0 +1,14 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

require("rewrite.php");
frontend/locales/english.lng | 24 lines | Normal file
@@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S

event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minutes-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago
frontend/modules/api/dump.php | 28 lines | Normal file
@@ -0,0 +1,28 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

if($_GET['key'] !== "derp")
{
    die();
}

$data = array();

foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
{
    $data[] = $topic->AsDataset();
}

echo(json_encode($data));
frontend/modules/api/search.php | 69 lines | Normal file
@@ -0,0 +1,69 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

if(empty($_POST['q']))
{
    die(json_encode(array(
        "error" => "No search query specified."
    )));
}
else
{
    $query = $_POST['q'];
    $terms = explode(" ", $query);

    $db_query_terms = array();
    $db_query_arguments = array();
    $valid_term = false;

    foreach($terms as $term)
    {
        $db_query_terms[] = "`Title` LIKE ?";
        $term = str_replace("%", "\%", $term);
        $term = str_replace("_", "\_", $term);
        $valid_term = $valid_term || (strlen($term) >= 2);
        $db_query_arguments[] = "%{$term}%";
    }

    if($valid_term)
    {
        $db_query = implode(" AND ", $db_query_terms);
        /* Re-key the argument array to start at 1 rather than 0, matching
         * the 1-based positional placeholders in the query. */
        array_unshift($db_query_arguments, '');
        unset($db_query_arguments[0]);

        try
        {
            $results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);

            $return_objects = array();

            foreach($results_topics as $topic)
            {
                $return_objects[] = $topic->AsDataset();
            }

            $sPageContents = json_encode($return_objects);
        }
        catch (NotFoundException $e)
        {
            $sPageContents = json_encode(array("error" => "No results found for the specified query.", "query" => $query));
        }
    }
    else
    {
        die(json_encode(array(
            "error" => "No valid search query specified."
        )));
    }
}
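For reference, a successful response from this endpoint is a JSON array of Topic::AsDataset() structures; a minimal sketch of the shape (field values are illustrative):

    [
      {
        "title": "...", "description": "...", "provider": "Coursera",
        "created": 0, "start": 0, "end": 0, "needs_enrollment": false,
        "children": [],
        "items": [
          {"title": "...", "type": "video", "url": "...", "source": "...", "views": 0}
        ]
      }
    ]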
frontend/modules/ui/index.php | 18 lines | Normal file
@@ -0,0 +1,18 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());

$sPageType = "ui";
frontend/rewrite.php | 34 lines | Normal file
@@ -0,0 +1,34 @@
<?php
$_APP = true;
require("includes/base.php");

$sPageContents = "";

$router = new CPHPRouter();

$router->allow_slash = true;
$router->ignore_query = true;

$router->routes = array(
    0 => array(
        "^/$" => "modules/ui/index.php",
        "^/api/search$" => "modules/api/search.php",
        "^/api/dump$" => "modules/api/dump.php"
    )
);

try
{
    $router->RouteRequest();
}
catch (RouterException $e)
{
    http_status_code(404);
    $sPageContents = "404 not found";
}

echo($sPageContents);

/*

* */
frontend/static/spinner.gif | BIN | Normal file
Binary file not shown. Size: 1.8 KiB
@@ -6,6 +6,11 @@ body
    font-family: sans-serif;
}

#templates
{
    display: none;
}

.header
{
    background-color: #C9F9DF;
@@ -19,6 +24,12 @@ body
    font-weight: normal;
}

.header h2
{
    margin: 2px;
    font-size: 17px;
}

.search-large
{
    color: #006824;
@@ -55,3 +66,95 @@ body
    font-size: 26px;
    width: 180px;
}

.spinner
{
    margin-left: 14px;
}

.topic, .item
{
    padding: 9px 12px;
    margin: 5px 20px;
    background-color: #79E1A8;
    font-size: 20px;
    width: 960px;
}

.topic
{
    margin-top: 19px;
    cursor: pointer;
}

.item
{
    margin-left: 34px;
    width: 926px;
    font-size: 18px;
    background-color: #97F3C1;
    display: none;
}

.type
{
    font-size: 18px;
    color: gray;
}

.type:after
{
    content: ":";
}

a.title
{
    color: #041F9F;
}

.toggler
{
    display: block;
    float: left;
    width: 16px;
    height: 16px;
    margin-top: 2px;
    margin-right: 8px;
    font-size: 13px;
    text-align: center;
    font-weight: bold;
    border: 1px solid black;
    background-color: #D2ECCF;
}

.providername
{
    font-size: 18px;
    color: gray;
}

.providername:before
{
    content: "(";
}

.providername:after
{
    content: ")";
}

.error
{
    margin: 8px 16px;
    font-size: 19px;
}

.description
{
    margin-top: 4px;
    font-size: 13px;
    max-height: 15px;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}
frontend/templates/ui/index.tpl | 160 lines | Normal file
@@ -0,0 +1,160 @@
<!doctype html>
<html>
    <head>
        <title>learn.cryto.net</title>
        <link rel="stylesheet" href="style.css">
        <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
        <script>
            var search_timeout = null;

            $(function(){
                /*$("input").val("data");
                runSearch();*/

                $("input").keyup(function(){
                    if(search_timeout !== null)
                    {
                        clearTimeout(search_timeout);
                    }

                    search_timeout = setTimeout(runSearch, 800);
                });
            });

            function runSearch()
            {
                $(".search-large").removeClass("search-large").addClass("search-top");
                $(".spinner").show();
                var query = $("input#query").val();

                if(query.length >= 3)
                {
                    $.post("/api/search", {q: query}, function(response){
                        $(".spinner").hide();
                        $(".results").html("");

                        if(typeof response.error == "undefined")
                        {
                            for(var i in response)
                            {
                                if(response[i].items.length > 0)
                                {
                                    var result_wrapper = instantiateTemplate("result_wrapper");

                                    var result_block = instantiateTemplate("result_topic");
                                    result_block.children(".title").html(response[i].title);
                                    result_block.children(".description").html(response[i].description);
                                    result_block.children(".providername").html(response[i].provider);
                                    result_block.appendTo(result_wrapper);

                                    for(var x in response[i].items)
                                    {
                                        var item = response[i].items[x];

                                        var item_block = instantiateTemplate("result_item");
                                        item_block.children(".title").html(item.title);
                                        item_block.children(".title").attr("href", item.url);
                                        item_block.children(".type").html(item.type);
                                        item_block.insertAfter(result_block);
                                    }

                                    result_wrapper.appendTo(".results");
                                }
                            }
                        }
                        else
                        {
                            $(".results").html("<div class='error'>No results.</div>");
                        }

                        setHandlers();
                    }, "json");
                }
                else
                {
                    $(".spinner").hide();
                    $(".results").html("<div class='error'>Enter at least 3 characters.</div>");
                }
            }

            function setHandlers()
            {
                $(".toggler, .topic").each(
                    function(){
                        $(this).click(function(event){
                            toggleItems(this, event);
                        });
                    }
                );
            }

            function instantiateTemplate(template_name)
            {
                var instance = $("#template_" + template_name).clone();
                instance.removeAttr("id");
                return instance;
            }

            function toggleItems(ctx, event)
            {
                var parent = $(ctx).parentsUntil(".wrapper");

                if(parent.length == 0)
                {
                    var wrapper = $(ctx).parent();
                }
                else
                {
                    var wrapper = parent.parent();
                }

                var toggler = wrapper.find(".toggler");

                if(typeof toggler.data("toggled") == "undefined" || toggler.data("toggled") == false)
                {
                    toggler.data("toggled", true);
                    toggler.html("-");
                    wrapper.find(".item").show();
                }
                else
                {
                    toggler.data("toggled", false);
                    toggler.html("+");
                    wrapper.find(".item").hide();
                }

                event.stopPropagation();
            }
        </script>
    </head>
    <body>
        <div class="header">
            <h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
            <h2>Currently searching Coursera, Khan Academy, University of Reddit. Comments? <a href="mailto:learn@cryto.net">learn@cryto.net</a> or
            <a href="irc://irc.cryto.net/crytocc">irc.cryto.net #crytocc</a></h2>
            <h2>Like the service and wish to donate? <a href="http://cryto.net/~joepie91/donate.html">You can do that here :)</a></h2>
        </div>
        <div class="main">
            <div class="search-large">
                I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
            </div>
            <div class="results">

            </div>
        </div>
        <div id="templates">
            <div id="template_result_wrapper" class="wrapper"></div>
            <div id="template_result_topic" class="topic">
                <span class="toggler">+</span>
                <strong>Topic: </strong>
                <span class="title"></span>
                <span class="providername"></span>
                <div class="description"></div>
            </div>
            <div id="template_result_item" class="item">
                <span class="type"></span>
                <a href="#" class="title"></a>
            </div>
        </div>
    </body>
</html>
ocw_functions.txt | 51 lines | Normal file
@@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.es": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba
ocw_sources.txt | 116 lines | Normal file
@@ -0,0 +1,116 @@
# AGH University of Science and Technology
http://open.agh.edu.pl/course/view.php?id=97
# Fundação Getulio Vargas - FGV Online
http://www5.fgv.br/fgvonline/CursosGratuitosFormulario.aspx?id_curso=OCWAJUEAD_00_01/2011_1
# Gunadarma University
http://ocw.gunadarma.ac.id/course/about
# Johns Hopkins Bloomberg School of Public Health
http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss
# Kaplan University Online & Campus Learning
http://ocw.kaplan.edu/arts-and-sciences/academic-strategies
# Korea University
http://ocw.korea.edu/ocw/college-of-science/general-physics-i
# Kyoto Seika University
http://www.kyotomm.jp/event/exh/kyotomagic2012.php
# Kyushu University
http://ocw.kyushu-u.ac.jp/90901/0007/index.html
# Massachusetts Institute of Technology
http://ocw.mit.edu/courses/civil-and-environmental-engineering/1-00-introduction-to-computers-and-engineering-problem-solving-fall-2005
# MOSCOW ARCHITECTURAL INSTITUTE
http://www.open-marhi.ru/courses/detail/index.php?ID=6631
# National Cheng Kung University
http://yctrtrc.ncku.edu.tw/site2/newocwcourse/OCW_MAIN.php?cid=141
# National Chiao Tung University
http://ocw.nctu.edu.tw/riki_detail.php?pgid=335
# National Dong Hwa University
http://opencourse.ndhu.edu.tw/moodle/mod/forum/discuss.php?d=3
# New Jersey Institute of Technology
http://ocw.njit.edu/ocw/som/acct/acct-615-anandarajan/index.php
# Paris Tech
http://graduateschool.paristech.fr/cours.php?id=309132
# People's Open Access Education Initiative
http://www.peoples-uni.org/node/236
# Shahid Beheshti University
http://ocw.sbu.ac.ir/Default.aspx?tabid=5352&language=fa-IR
# Students Circle Network
http://studentscircle.net/live/2011/07/a-guide-before-learning-a-new-javascript-framework/
# Taipei Medical University
http://ocw.tmu.edu.tw:8080/eduCommons/general-education/53f28a1882076b7753f24eba72698a556790-shih-chi-analysis-on-historical-figures
# The Open University
http://openlearn.open.ac.uk/course/view.php?name=DD208_3
# The Open University of Israel
http://peer-news.blogspot.com/2011/12/2-10934.html
# Tokyo Institute of Technology
http://www.ocw.titech.ac.jp/index.php?module=General&Nendo=2012&action=T0300&GakubuCD=223&GakkaCD=224710&KougiCD=70030&Gakki=1&lang=EN
# TU Delft
http://feedproxy.google.com/~r/tudelft/OCW/~3/0sA6qPQKcOg/bachelor-civiele-techniek
# Tufts University
http://ocw.tufts.edu/Course/39
# UNISUL - Universidade do Sul de Santa Catarina
http://labspace.open.ac.uk
# United Nations University
http://ocw.unu.edu/international-institute-for-software-technology/building-a-community-of-practice-for-electronic-governance
# Universidad Carlos III de Madrid
http://ocw.uc3m.es/ingenieria-electrica/accionamientos-electricos
# Universidad de Alicante
http://ocw.ua.es/Ciencias_Sociales_y_Juridicas/actividades-deportivas-medio-ambiente
# Universidad de Cantabria
http://ocw.unican.es/ciencias-de-la-salud/actuacion-en-situaciones-especiales
# Universidad de Granada
http://ocw.ugr.es/course/view.php?id=23&topic=1
# Universidad de Monterrey
http://ocw.udem.edu.mx/cursos-de-profesional/administracion-de-tecnologias-de-informacion
# Universidad de Murcia
http://ocw.um.es/cc.-sociales/actividad-fisica-en-el-envejecimiento
# Universidad de Oviedo
http://ocw.uniovi.es/course/view.php?id=28&ocw=1
# Universidad de Salamanca
http://ocw.usal.es/ciencias-sociales-1/curso-cero-matematicas-para-ciencias-sociales-nivelacion-de-conocimientos
# Universidad de Sevilla
http://ocwus.us.es/matematica-aplicada/pp-3
# Universidad de Zaragoza
http://ocw.unizar.es/ocw/ciencias-de-la-salud-1/actividades-fisicas-y-deportivas-aereas
# Universidad del Valle - Colombia
http://ocw.univalle.edu.co/ocw/ingenieria-electronica-telecomunicaciones-y-afines/arquitectura-de-procesos-industriales
# Universidad Estatal a Distancia
http://ocw.uned.ac.cr/eduCommons/ciencias-de-la-administracion/compras-y-almacenamiento
# Universidad Icesi
http://www.icesi.edu.co/ocw/tic/administracion_plataformas_y_seguridad
# Universidad Nacional de Educación a Distancia
http://ocw.innova.uned.es/ocwuniversia/psicologia/analisis-de-datos-en-Psico-I
# Universidad Politécnica de Valencia
http://www.upv.es/ocwasi/2010/6842
# Universidad Politécnica de Madrid
http://ocw.upm.es/ingenieria-cartografica-geodesica-y-fotogrametria/3d-scanning-and-modeling
# UNIVERSIDAD TECNICA PARTICULAR DE LOJA
http://ocw.utpl.edu.ec/economia
# Universitat Autònoma de Barcelona
http://ocw.uab.cat/enginyeries/apunts-de-calcul-matricial-i-resolucio-de-sistemes
# Universitat de Barcelona
http://ocw.ub.edu/admistracio-i-direccio-dempreses
# Universitat de les Illes Balears
http://ocw.uib.es/ocw/infermeria/atencion-de-enfermeria-frente-situaciones-de
# Universitat de Lleida
http://ocw.udl.cat/arts-i-humanitats
# Universitat de València
http://ocw.uv.es/ciencias-sociales-y-juridicas/2-2
# Universitat Jaume I
http://e-ujier.uji.es/pls/www/!gri_www.euji22101?p_id=15&p_tipo=A&p_curso=IG23&p_idioma=CA
# Universitat Oberta de Catalunya
http://ocw.uoc.edu/informatica-tecnologia-i-multimedia/administracio-avancada-del-sistema-operatiu-gnu-linux
# Universiti Teknologi Malaysia
http://ocw.utm.my/course/view.php?id=90
# University of California, Irvine
http://ocw.uci.edu/courses/course.aspx?id=113
# University of Cape Town
http://opencontent.uct.ac.za/Centre-for-Higher-Education-Development/Centre-for-Open-Learning/A-developmental-state-The-challenge-ahead
# University of Massachusetts Boston
http://ocw.umb.edu:8080/eduCommons/about
# University of Michigan
http://open.umich.edu/education/med/oernetwork/med/em/aetc-redirect/2009
# University of Notre Dame
http://ocw.nd.edu/history/african-american-history-ii
# University of Sumatera Utara
http://ocw.usu.ac.id/course/detail/teknik-sipil-s1/4110000007-struktur-bangunan-sipil-i.html
# University of Tsukuba
http://ocw.tsukuba.ac.jp/6570740672698cea79d15b6678147a7679d130fb65705b665c02653b/66f87c4d7d394ecb
shellsearch/search.py | 22 lines | Normal file
@@ -0,0 +1,22 @@
#!/usr/bin/env python

import requests, sys, re

query = sys.argv[1]

results = requests.post("http://learn.cryto.net/api/search", {"q": query}).json()

for result in results:
    name = result["title"].rstrip()
    description = result["description"].strip().replace("\n", " ")

    if len(description) > 200:
        description = re.match("^(.{0,300})\W", description).group(1) + "..."

    print "## %s\n%s" % (name, description)

    for item in result["items"]:
        name = item["title"].ljust(70)
        print "\t[%s] %s\t%s" % (item["type"], name, item["url"])

    print ""
update.sql | 2 lines | Normal file
@@ -0,0 +1,2 @@
ALTER TABLE `items` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
ALTER TABLE `topics` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
updater/bs4/__init__.py | 361 lines | Normal file
@@ -0,0 +1,361 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, previous_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = previous_element or self.previous_element
        o.setup(parent, previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


class FeatureNotFound(ValueError):
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
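For orientation, a minimal sketch of how the updater's scrapers would use this vendored copy (the HTML snippet is illustrative; the scrapers themselves are not part of this diff):

    from bs4 import BeautifulSoup

    # Parse a fragment and collect the link targets.
    soup = BeautifulSoup("<html><body><a href='/course/1'>Course</a></body></html>")
    print [a["href"] for a in soup.find_all("a")]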
316
updater/bs4/builder/__init__.py
Normal file
316
updater/bs4/builder/__init__.py
Normal file
|
@ -0,0 +1,316 @@
|
|||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
whitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
|
||||
def __init__(self):
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class):
|
||||
"""Register a treebuilder based on its advertised features."""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features):
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
features = list(features)
|
||||
features.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(features) > 0:
|
||||
feature = features.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
else:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(
|
||||
set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
|
||||
# to look up builders in this registry.
|
||||
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
preserve_whitespace_tags = set()
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
cdata_list_attributes = {}
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.soup = None
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name):
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p />".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no contents.
|
||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
return markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of tests.
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
Modifies its input in place.
|
||||
"""
|
||||
if self.cdata_list_attributes:
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), [])
|
||||
for cdata_list_attr in itertools.chain(universal, tag_specific):
|
||||
if cdata_list_attr in dict(attrs):
|
||||
# Basically, we have a "class" attribute whose
|
||||
# value is a whitespace-separated list of CSS
|
||||
# classes. Split it into a list.
|
||||
value = attrs[cdata_list_attr]
|
||||
if isinstance(value, basestring):
|
||||
values = whitespace_re.split(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
# for the same tag when rearranging the parse
|
||||
# tree. On the second call the attribute value
|
||||
# here is already a list. If this happens,
|
||||
# leave the value alone rather than trying to
|
||||
# split it again.
|
||||
values = value
|
||||
attrs[cdata_list_attr] = values
|
||||
return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||
#print "Start %s, %r" % (name, attrs)
|
||||
self.soup.handle_starttag(name, attrs)
|
||||
|
||||
def endElement(self, name):
|
||||
#print "End %s" % name
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple, nodeName):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
#handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix, nodeValue):
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix):
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self):
|
||||
pass
|
||||
|
||||
def endDocument(self):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
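
# A minimal, hypothetical sketch of the substitution above (not part of
# upstream Beautiful Soup). Parsing an HTML 5-style <meta> tag replaces its
# charset value with a standin, so re-encoding the document rewrites the
# declared charset to match the output encoding.
def _demo_meta_charset_standin():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<meta charset="utf8">')
    declared = soup.meta['charset']   # still prints as 'utf8'
    # When the tag is encoded to another codec, the standin substitutes
    # the new encoding name into the output markup.
    output = soup.meta.encode('shift-jis')
    return declared, output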

def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
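
# A hypothetical sketch (not part of upstream Beautiful Soup) of what the
# registration order above buys you: the registry returns the best available
# builder for a set of features.
def _demo_builder_lookup():
    from bs4.builder import builder_registry
    # Returns LXMLTreeBuilder when lxml is installed, otherwise the
    # html5lib builder, otherwise HTMLParserTreeBuilder.
    return builder_registry.lookup('html')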

221  updater/bs4/builder/_html5lib.py  Normal file
@@ -0,0 +1,221 @@

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # Import here to avoid a circular import at module load time.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
    def keys(self):
        return list(self.attrs.keys())
    def __len__(self):
        return len(self.attrs)
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in list(self.attrs.keys())


class Element(html5lib.treebuilders._base.Node):
    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.soup.object_was_parsed(node.element, parent=self.element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:

            converted_attributes = []
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

class TextNode(Element):
    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
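
# A hypothetical usage sketch (not part of upstream Beautiful Soup). With
# html5lib installed, requesting the 'html5lib' feature selects the builder
# defined above; html5lib repairs markup the way browsers do.
def _demo_html5lib_builder():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<p>unclosed <b>markup', 'html5lib')
    # html5lib inserts the missing <html>/<head>/<body> structure and
    # closes the dangling <p> and <b> tags.
    return soup.prettify()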

244  updater/bs4/builder/_htmlparser.py  Normal file
@@ -0,0 +1,244 @@

"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
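
# A hypothetical sketch (not part of upstream Beautiful Soup) of the
# handle_charref workaround above: decimal and hexadecimal character
# references both come out as the referenced character.
def _demo_charref_handling():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<p>&#233; &#xe9;</p>', 'html.parser')
    return soup.p.string  # u'\xe9 \xe9'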


class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
        )
      )?
    )
  )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
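
# A hypothetical sketch (not part of upstream Beautiful Soup) showing the
# last-resort builder in use; 'html.parser' needs no third-party packages
# but is the least forgiving of bad markup.
def _demo_htmlparser_builder():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<a class="foo bar" href="/x">link</a>', 'html.parser')
    # cdata-list handling splits class; ordinary attributes stay strings.
    return soup.a['class'], soup.a['href']  # (['foo', 'bar'], '/x')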

196  updater/bs4/builder/_lxml.py  Normal file
@@ -0,0 +1,196 @@

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'

class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
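
# A hypothetical sketch (not part of upstream Beautiful Soup) of the two
# lxml builders above: 'lxml' parses HTML, while 'xml' parses XML and keeps
# namespace prefixes as part of tag names.
def _demo_lxml_builders():
    from bs4 import BeautifulSoup
    html_soup = BeautifulSoup('<b>bold</b>', 'lxml')
    xml_soup = BeautifulSoup(
        '<doc xmlns:dc="http://purl.org/dc/elements/1.1/">'
        '<dc:title>T</dc:title></doc>', 'xml')
    # The namespaced tag is addressable by its prefixed name.
    return html_soup.b.string, xml_soup.find('dc:title')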

802  updater/bs4/dammit.py  Normal file
@@ -0,0 +1,802 @@

# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""

import codecs
from htmlentitydefs import codepoint2name
import re
import logging

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    # PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            return None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

xml_encoding_re = re.compile(
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           ")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

            Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

            Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

            Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
          quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)

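
# A minimal sketch (hypothetical helper, not part of upstream Beautiful
# Soup) of the two substitution directions provided by EntitySubstitution.
def _demo_entity_substitution():
    as_xml = EntitySubstitution.substitute_xml(
        'AT&T <head>', make_quoted_attribute=True)
    # as_xml is '"AT&amp;T &lt;head&gt;"'
    as_html = EntitySubstitution.substitute_html(u'caf\xe9')
    # as_html is u'caf&eacute;'
    return as_xml, as_html
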
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False):
        self.declared_html_encoding = None
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False

        if markup == '' or isinstance(markup, unicode):
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        new_markup, document_encoding, sniffed_encoding = \
            self._detectEncoding(markup, is_html)
        self.markup = new_markup

        u = None
        if new_markup != markup:
            # _detectEncoding modified the markup, then converted it to
            # Unicode and then to UTF-8. So convert it from UTF-8.
            u = self._convert_from("utf8")
            self.original_encoding = sniffed_encoding

        if not u:
            for proposed_encoding in (
                override_encodings + [document_encoding, sniffed_encoding]):
                if proposed_encoding is not None:
                    u = self._convert_from(proposed_encoding)
                    if u:
                        break

        # If no luck and we have an auto-detection library, try that:
        if not u and not isinstance(self.markup, unicode):
            u = self._convert_from(chardet_dammit(self.markup))

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convert_from(proposed_encoding)
                if u:
                    break

        # As an absolute last resort, try the encodings again with
        # character replacement.
        if not u:
            for proposed_encoding in (
                override_encodings + [
                    document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
                if proposed_encoding != "ascii":
                    u = self._convert_from(proposed_encoding, "replace")
                if u is not None:
                    logging.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # We could at this point force it to ASCII, but that would
        # destroy so much data that I think giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character."""
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if type(sub) == tuple:
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            #print "Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors)
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception as e:
            #print "That didn't work!"
            #print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # Strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding, errors)
        return newdata

    def _detectEncoding(self, xml_data, is_html=False):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                     and (xml_data[2:4] != b'\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                     (xml_data[2:4] != b'\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == b'\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except:
            xml_encoding_match = None
        xml_encoding_match = xml_encoding_re.match(xml_data)
        if not xml_encoding_match and is_html:
            xml_encoding_match = html_meta_re.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if is_html:
                self.declared_html_encoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    EBCDIC_TO_ASCII_MAP = None

    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
                ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                b'\x9f': ('Yuml', ''),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', # This approximation is especially parochial--this is
                       # the generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (non-breaking space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á (the original bundled copy had the
                                # truncated value b'\xa1' here, a bug)
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
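
# A minimal sketch (hypothetical helper, not part of upstream Beautiful
# Soup) of the two main entry points above.
def _demo_unicode_dammit():
    # Guess the encoding of a bytestring and decode it.
    dammit = UnicodeDammit('Sacr\xe9 bleu!', ["latin-1"])
    decoded = dammit.unicode_markup      # u'Sacr\xe9 bleu!'
    encoding = dammit.original_encoding  # 'latin-1'
    # Fix Windows-1252 bytes embedded in an otherwise UTF-8 bytestring.
    snowmen = u'\N{SNOWMAN}' * 3
    quote = u'\N{LEFT DOUBLE QUOTATION MARK}Hi\N{RIGHT DOUBLE QUOTATION MARK}'
    doc = snowmen.encode('utf8') + quote.encode('windows_1252')
    fixed = UnicodeDammit.detwingle(doc)  # now a pure UTF-8 bytestring
    return decoded, encoding, fixed.decode('utf8')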

1386  updater/bs4/element.py  Normal file
File diff suppressed because it is too large

554  updater/bs4/testing.py  Normal file
@@ -0,0 +1,554 @@

"""Helper classes for tests."""
|
||||
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder()
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder.test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
|
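
# A hypothetical sketch (not part of upstream Beautiful Soup) of how a
# concrete test case can lean on the SoupTest helpers above.
class _DemoSoupTest(SoupTest):
    def test_paragraph_roundtrip(self):
        # assertSoupEquals parses the fragment and compares the output
        # against the fragment wrapped in a full document.
        self.assertSoupEquals('<p>foo</p>')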


class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public ID.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
b_tag = "<b>Inside a B tag</b>"
|
||||
self.assertSoupEquals(b_tag)
|
||||
|
||||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
def test_nested_block_level_elements(self):
|
||||
"""Block elements can be nested."""
|
||||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||
blockquote = soup.blockquote
|
||||
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||
self.assertEqual(blockquote.b.string, 'Foo')
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""One table can go inside another one."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||
'</td></tr></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_deeply_nested_multivalued_attribute(self):
|
||||
# html5lib can set the attributes of the same tag many times
|
||||
# as it rearranges the tree. This has caused problems with
|
||||
# multivalued attributes.
|
||||
markup = '<table><div><div class="css"></div></div></table>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["css"], soup.div.div['class'])
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
data."""
|
||||
|
||||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode())
|
||||
html = soup.html
|
||||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||
|
||||
def test_multivalued_attribute_value_becomes_list(self):
|
||||
markup = b'<a class="foo bar">'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||
|
||||
#
|
||||
# Generally speaking, tests below this point are more tests of
|
||||
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||
# weird, so we run these tests separately for every tree builder
|
||||
# to detect any differences between them.
|
||||
#
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||
parse_only=strainer)
|
||||
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||
|
||||
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||
'<foo attr="bar"></foo>')
|
||||
|
||||
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
self.assertSoupEquals(text)
|
||||
|
||||
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
soup = self.soup(text)
|
||||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||
self.assertSoupEquals(
|
||||
soup.foo.decode(),
|
||||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||
|
||||
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||
'<this is="really messed up & stuff"></this>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||
|
||||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
|
||||
|
||||
def test_entities_in_strings_converted_during_parsing(self):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
# Microsoft smart quotes are converted to Unicode characters during
|
||||
# parsing.
|
||||
quote = b"<p>\x91Foo\x92</p>"
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
def test_real_iso_latin_document(self):
|
||||
# Smoke test of interrelated functionality, using an
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||
|
||||
# Parse the ISO-Latin-1 HTML.
|
||||
soup = self.soup(iso_latin_html)
|
||||
# Encode it to UTF-8.
|
||||
result = soup.encode("utf-8")
|
||||
|
||||
# What do we expect the result to look like? Well, it would
|
||||
# look like unicode_html, except that the META tag would say
|
||||
# UTF-8 instead of ISO-Latin-1.
|
||||
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||
|
||||
# And, of course, it would be in UTF-8, not Unicode.
|
||||
expected = expected.encode("utf-8")
|
||||
|
||||
# Ta-da!
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_real_shift_jis_document(self):
|
||||
# Smoke test to make sure the parser can handle a document in
|
||||
# Shift-JIS encoding, without choking.
|
||||
shift_jis_html = (
|
||||
b'<html><head></head><body><pre>'
|
||||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||
b'</pre></body></html>')
|
||||
unicode_html = shift_jis_html.decode("shift-jis")
|
||||
soup = self.soup(unicode_html)
|
||||
|
||||
# Make sure the parse tree is correctly encoded to various
|
||||
# encodings.
|
||||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||
|
||||
def test_real_hebrew_document(self):
|
||||
# A real-world test to make sure we can convert ISO-8859-9 (a
|
||||
# Hebrew encoding) to UTF-8.
|
||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||
soup = self.soup(
|
||||
hebrew_document, from_encoding="iso8859-8")
|
||||
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
||||
self.assertEqual(
|
||||
soup.encode('utf-8'),
|
||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||
|
||||
def test_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||
'http-equiv="Content-type"/>')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||
content = parsed_meta['content']
|
||||
self.assertEqual('text/html; charset=x-sjis', content)
|
||||
|
||||
# But that value is actually a ContentMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||
|
||||
# For the rest of the story, see TestSubstitutions in
|
||||
# test_tree.py.
|
||||
|
||||
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', id="encoding")
|
||||
charset = parsed_meta['charset']
|
||||
self.assertEqual('x-sjis', charset)
|
||||
|
||||
# But that value is actually a CharsetMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('utf8', charset.encode("utf8"))
|
||||
|
||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||
data = self.soup("<a>text</a>")
|
||||
data.a['foo'] = 'bar'
|
||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
unicode(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
def test_closing_namespaced_tag(self):
|
||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.p), markup)
|
||||
|
||||
def test_namespaced_attributes(self):
|
||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||
# XHTML documents in any particular way.
|
||||
pass
|
||||
|
||||
def test_html_tags_have_namespace(self):
|
||||
markup = "<a>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||
|
||||
def test_svg_tags_have_namespace(self):
|
||||
markup = '<svg><circle/></svg>'
|
||||
soup = self.soup(markup)
|
||||
namespace = "http://www.w3.org/2000/svg"
|
||||
self.assertEqual(namespace, soup.svg.namespace)
|
||||
self.assertEqual(namespace, soup.circle.namespace)
|
||||
|
||||
|
||||
def test_mathml_tags_have_namespace(self):
|
||||
markup = '<math><msqrt>5</msqrt></math>'
|
||||
soup = self.soup(markup)
|
||||
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
def test_xml_declaration_becomes_comment(self):
|
||||
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
return None
|
||||
|
||||
def decorator(test_item):
|
||||
if condition:
|
||||
return nothing
|
||||
else:
|
||||
return test_item
|
||||
|
||||
return decorator
|
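The module-level skipIf above is a small stand-in for unittest.skipIf, presumably because that decorator is missing on older Pythons this codebase supports: when the condition holds, the decorated test class or function is swapped for a no-op. A minimal sketch of how the test modules below use it (the flag and class names are illustrative):

# Sketch: disabling a test class with the skipIf helper above.
# FEATURE_PRESENT and OptionalFeatureTest are illustrative names.
from bs4.testing import SoupTest, skipIf

FEATURE_PRESENT = False

@skipIf(
    not FEATURE_PRESENT,
    "feature seems not to be present, not testing it.")
class OptionalFeatureTest(SoupTest):
    def test_anything(self):
        pass  # never collected while FEATURE_PRESENT is False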
1
updater/bs4/tests/__init__.py
Normal file

@@ -0,0 +1 @@
"The beautifulsoup tests."

141
updater/bs4/tests/test_builder_registry.py
Normal file

@@ -0,0 +1,141 @@
"""Tests of the builder registry."""
|
||||
|
||||
import unittest
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.builder import (
|
||||
builder_registry as registry,
|
||||
HTMLParserTreeBuilder,
|
||||
TreeBuilderRegistry,
|
||||
)
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError:
|
||||
HTML5LIB_PRESENT = False
|
||||
|
||||
try:
|
||||
from bs4.builder import (
|
||||
LXMLTreeBuilderForXML,
|
||||
LXMLTreeBuilder,
|
||||
)
|
||||
LXML_PRESENT = True
|
||||
except ImportError:
|
||||
LXML_PRESENT = False
|
||||
|
||||
|
||||
class BuiltInRegistryTest(unittest.TestCase):
|
||||
"""Test the built-in registry with the default builders registered."""
|
||||
|
||||
def test_combination(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('fast', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('strict', 'html'),
|
||||
HTMLParserTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
def test_lookup_by_markup_type(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('xml'), None)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||
|
||||
def test_named_library(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
self.assertEqual(registry.lookup('html.parser'),
|
||||
HTMLParserTreeBuilder)
|
||||
|
||||
def test_beautifulsoup_constructor_does_lookup(self):
|
||||
# You can pass in a string.
|
||||
BeautifulSoup("", features="html")
|
||||
# Or a list of strings.
|
||||
BeautifulSoup("", features=["html", "fast"])
|
||||
|
||||
# You'll get an exception if BS can't find an appropriate
|
||||
# builder.
|
||||
self.assertRaises(ValueError, BeautifulSoup,
|
||||
"", features="no-such-feature")
|
||||
|
||||
class RegistryTest(unittest.TestCase):
|
||||
"""Test the TreeBuilderRegistry class in general."""
|
||||
|
||||
def setUp(self):
|
||||
self.registry = TreeBuilderRegistry()
|
||||
|
||||
def builder_for_features(self, *feature_list):
|
||||
cls = type('Builder_' + '_'.join(feature_list),
|
||||
(object,), {'features' : feature_list})
|
||||
|
||||
self.registry.register(cls)
|
||||
return cls
|
||||
|
||||
def test_register_with_no_features(self):
|
||||
builder = self.builder_for_features()
|
||||
|
||||
# Since the builder advertises no features, you can't find it
|
||||
# by looking up features.
|
||||
self.assertEqual(self.registry.lookup('foo'), None)
|
||||
|
||||
# But you can find it by doing a lookup with no features, if
|
||||
# this happens to be the only registered builder.
|
||||
self.assertEqual(self.registry.lookup(), builder)
|
||||
|
||||
def test_register_with_features_makes_lookup_succeed(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||
|
||||
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('baz'), None)
|
||||
|
||||
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||
builder1 = self.builder_for_features('foo')
|
||||
builder2 = self.builder_for_features('bar')
|
||||
self.assertEqual(self.registry.lookup(), builder2)
|
||||
|
||||
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||
self.assertEqual(self.registry.lookup(), None)
|
||||
|
||||
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||
has_one = self.builder_for_features('foo')
|
||||
has_the_other = self.builder_for_features('bar')
|
||||
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||
lacks_one = self.builder_for_features('bar')
|
||||
has_the_other = self.builder_for_features('foo')
|
||||
|
||||
# There are two builders featuring 'foo' and 'bar', but
|
||||
# the one that also features 'quux' was registered later.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||
has_both_late)
|
||||
|
||||
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||
has_both_early)
|
||||
|
||||
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||
builder1 = self.builder_for_features('foo', 'bar')
|
||||
builder2 = self.builder_for_features('foo', 'baz')
|
||||
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
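The registry these tests exercise is the same mechanism behind the features argument checked in test_beautifulsoup_constructor_does_lookup above. A minimal sketch of the lookup flow, using only names imported at the top of this file:

# Sketch: manual builder lookup, mirroring what the BeautifulSoup
# constructor does internally with its features argument.
from bs4 import BeautifulSoup
from bs4.builder import builder_registry

builder_class = builder_registry.lookup('html.parser')
if builder_class is None:
    raise ValueError("no builder found")  # the constructor fails the same way
soup = BeautifulSoup("<p>hi</p>", builder=builder_class())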
36
updater/bs4/tests/test_docs.py
Normal file

@@ -0,0 +1,36 @@
"Test harness for doctests."
|
||||
|
||||
# pylint: disable-msg=E0611,W0142
|
||||
|
||||
__metaclass__ = type
|
||||
__all__ = [
|
||||
'additional_tests',
|
||||
]
|
||||
|
||||
import atexit
|
||||
import doctest
|
||||
import os
|
||||
#from pkg_resources import (
|
||||
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||
import unittest
|
||||
|
||||
DOCTEST_FLAGS = (
|
||||
doctest.ELLIPSIS |
|
||||
doctest.NORMALIZE_WHITESPACE |
|
||||
doctest.REPORT_NDIFF)
|
||||
|
||||
|
||||
# def additional_tests():
|
||||
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||
# doctest_files = [
|
||||
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||
# if resource_exists('bs4', 'docs'):
|
||||
# for name in resource_listdir('bs4', 'docs'):
|
||||
# if name.endswith('.txt'):
|
||||
# doctest_files.append(
|
||||
# os.path.abspath(
|
||||
# resource_filename('bs4', 'docs/%s' % name)))
|
||||
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||
# atexit.register(cleanup_resources)
|
||||
# return unittest.TestSuite((
|
||||
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
72
updater/bs4/tests/test_html5lib.py
Normal file

@@ -0,0 +1,72 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError, e:
|
||||
HTML5LIB_PRESENT = False
|
||||
from bs4.element import SoupStrainer
|
||||
from bs4.testing import (
|
||||
HTML5TreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not HTML5LIB_PRESENT,
|
||||
"html5lib seems not to be present, not testing its tree builder.")
|
||||
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTML5TreeBuilder()
|
||||
|
||||
def test_soupstrainer(self):
|
||||
# The html5lib tree builder does not support SoupStrainers.
|
||||
strainer = SoupStrainer("b")
|
||||
markup = "<p>A <b>bold</b> statement.</p>"
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(
|
||||
soup.decode(), self.document_for(markup))
|
||||
|
||||
self.assertTrue(
|
||||
"the html5lib tree builder doesn't support parse_only" in
|
||||
str(w[0].message))
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||
'</td></tr></tbody></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_xml_declaration_followed_by_doctype(self):
|
||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
</body>
|
||||
</html>'''
|
||||
soup = self.soup(markup)
|
||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||
self.assertEquals("<p>foo</p>", soup.p.encode())
|
19
updater/bs4/tests/test_htmlparser.py
Normal file

@@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
|
||||
trees."""
|
||||
|
||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
|
||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTMLParserTreeBuilder()
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
75
updater/bs4/tests/test_lxml.py
Normal file

@@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
LXML_PRESENT = True
|
||||
except ImportError, e:
|
||||
LXML_PRESENT = False
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import Comment, Doctype, SoupStrainer
|
||||
from bs4.testing import skipIf
|
||||
from bs4.tests import test_htmlparser
|
||||
from bs4.testing import (
|
||||
HTMLTreeBuilderSmokeTest,
|
||||
XMLTreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its tree builder.")
|
||||
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilder()
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
|
||||
def test_beautifulstonesoup_is_xml_parser(self):
|
||||
# Make sure that the deprecated BSS class uses an xml builder
|
||||
# if one is installed.
|
||||
with warnings.catch_warnings(record=False) as w:
|
||||
soup = BeautifulStoneSoup("<b />")
|
||||
self.assertEqual(u"<b/>", unicode(soup.b))
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""lxml strips the XML definition from an XHTML doc, which is fine."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8").replace(b"\n", b''),
|
||||
markup.replace(b'\n', b'').replace(
|
||||
b'<?xml version="1.0" encoding="utf-8"?>', b''))
|
||||
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its XML tree builder.")
|
||||
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilderForXML()
|
||||
|
378
updater/bs4/tests/test_soup.py
Normal file

@@ -0,0 +1,378 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import logging
import unittest
import sys
from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError, e:
    LXML_PRESENT = False

PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))

class TestDeprecatedConstructorArguments(SoupTest):

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    @skipIf(
        not LXML_PRESENT,
        "lxml not present, not testing BeautifulStoneSoup.")
    def test_beautifulstonesoup(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<markup>")
            self.assertTrue(isinstance(soup, BeautifulSoup))
            self.assertTrue("BeautifulStoneSoup class is deprecated")

class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entites
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set.
        ascii = b"<foo>a</foo>"
        soup_from_ascii = self.soup(ascii)
        unicode_output = soup_from_ascii.decode()
        self.assertTrue(isinstance(unicode_output, unicode))
        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
        self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of Unicode, Dammit."""

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_detect_html5_style_meta_tag(self):

        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_sniffed_xml_encoding(self):
        # A document written in UTF-16LE will be converted by a different
        # code path that sniffs the byte order markers.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    def test_content_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))


    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
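Taken together, the TestEntitySubstitution cases above pin down a small public surface. A minimal recap of those entry points, using only values the tests themselves assert:

# Sketch: the EntitySubstitution entry points covered above.
from bs4.dammit import EntitySubstitution

EntitySubstitution.substitute_html(u"caf\xe9")      # -> u"caf&eacute;"
EntitySubstitution.substitute_xml("AT&T")           # -> "AT&amp;T"
EntitySubstitution.substitute_xml("Welcome", True)  # -> '"Welcome"' (quoted)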
1726
updater/bs4/tests/test_tree.py
Normal file
File diff suppressed because it is too large

1
updater/coursera.json
Normal file
File diff suppressed because one or more lines are too long
@@ -10,9 +10,20 @@ class Database(object):
 	TEST = 7
 	BOOK = 8
 	AUDIOBOOK = 9
+	LECTURE = 10
 	
 	def __init__(self, host, user, password=None, database="learn"):
-		self.database = oursql.connect(host=host, user=user, db=database)
+		self.database = oursql.connect(host=host, user=user, passwd=password, db=database)
 	
+	def topic_exists(self, provider, unique_id):
+		c = self.database.cursor()
+		c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
+		return (len(c.fetchall()) > 0)
+	
+	def item_exists(self, provider, unique_id):
+		c = self.database.cursor()
+		c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
+		return (len(c.fetchall()) > 0)
+	
 	def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
 		defaults = {
@@ -21,7 +32,8 @@ class Database(object):
 			"start_date": None,
 			"end_date": None,
 			"parent_id": 0,
-			"description": ""
+			"description": "",
+			"provider_name": ""
 		}
 		
 		for kwarg, val in defaults.iteritems():
@@ -43,9 +55,9 @@ class Database(object):
 		if exists == True:
 			return (False, results[0][0])
 		else:
-			c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`)"
-				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
-				kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date']))
+			c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
+				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
+				kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
 			
 			return (True, c.lastrowid)
 		
@@ -56,7 +68,10 @@ class Database(object):
 			"topic_id": 0,
 			"parent_id": 0,
 			"description": "",
-			"date": None
+			"date": None,
+			"start_date": None,
+			"end_date": None,
+			"provider_name": ""
 		}
 		
 		for kwarg, val in defaults.iteritems():
@@ -78,8 +93,8 @@ class Database(object):
 		if exists == True:
 			return (False, results[0][0])
 		else:
-			c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`)"
-				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
-				kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"]))
+			c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
+				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
+				kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
 		
 		return (True, c.lastrowid)
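The net effect of this diff is that both insert_topic and insert_item now accept scheduling metadata and a human-readable source label (stored as `CustomProviderName`). A minimal sketch of a call against the class as diffed above (the module path, credentials and IDs are assumptions):

# Sketch: inserting a topic with the new provider_name kwarg.
# Module path, credentials and IDs are illustrative.
import database

db = database.Database("localhost", "root", password="")
inserted, topic_id = db.insert_topic(
	2, "ml-101", "Machine Learning",
	description="An example topic.",
	provider_name="Example University")
if inserted:
	print "Inserted topic row %d" % topic_id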
26
updater/scrapers/__init__.py
Normal file

@@ -0,0 +1,26 @@
import inspect, os, sys

my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
	module = __import__(modulename)
	
	for name in vars(module):
		data = getattr(module, name)
		globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
	fpath = os.path.join(my_path, fname)
	fbasename, fext = os.path.splitext(fname)
	
	if os.path.isdir(fpath):
		if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
			# This is a python directory module
			_import_module_into_scope(fname)
	elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
		# This is a python file module
		_import_module_into_scope(fbasename)

sys.path.remove(my_path)
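This __init__.py makes the scrapers package self-discovering: any .py module or subpackage dropped into the directory is imported, and its public names are copied into the package namespace. The payoff for callers is sketched below (instantiation details depend on the shared.Scraper base class, which this diff does not show):

# Sketch: thanks to the auto-import above, a scraper class defined in
# scrapers/coursera.py is reachable directly on the package.
import scrapers

scraper_class = scrapers.Coursera   # hoisted from coursera.py
# How it is constructed and run depends on shared.Scraper, not shown here.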
50
updater/scrapers/coursera.py
Normal file

@@ -0,0 +1,50 @@
import datetime, json, sys
import requests
import shared

class Coursera(shared.Scraper):
	provider_id = 2
	
	def run(self):
		self.retrieve_dataset()
		self.parse_dataset()
	
	def retrieve_dataset(self):
		self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
	
	def parse_dataset(self):
		for item in self.dataset:
			self.process_item(item)
	
	def process_item(self, item):
		inserted, row_id = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
		
		if inserted:
			self.env.log("Inserted topic %s" % item["name"])
		else:
			self.env.log("Skipped topic %s" % item["name"])
		
		for course in item["courses"]:
			self.process_course(course, row_id)
	
	def process_course(self, course, topicid):
		try:
			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
		except TypeError, e:
			start_date = None
		
		title = self.generate_title(course['name'], start_date)
		
		inserted, row_id = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
		
		if inserted:
			self.env.log("Inserted item %s" % title)
		else:
			self.env.log("Skipped item %s" % title)
	
	def generate_title(self, name, date):
		if date is None:
			return "%s (date undetermined)" % name
		else:
			return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
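process_item and process_course assume a specific shape for each element of the topic-list response. A sketch of that shape, with every key taken from the code above and all values illustrative:

# Sketch: the JSON shape consumed by process_item()/process_course().
# Keys come from the code above; values are made up.
example_topic = {
	"id": 42,
	"name": "Machine Learning",
	"short_description": "An example topic.",
	"courses": [{
		"id": 1001,
		"name": "Machine Learning",
		"home_link": "https://www.coursera.org/course/ml",
		"certificate_description": "An example course run.",
		"start_year": 2013,
		"start_month": 1,
		"start_day": 7,
	}],
}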
201
updater/scrapers/genericocw.py
Normal file

@@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared

from bs4 import BeautifulSoup
import bs4

rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'

class OpenCourseWare(shared.Scraper):
	def run(self):
		overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
		soup = BeautifulSoup(overview)
		
		for element in soup.find(id="pagecontent")("a"):
			#if "Hopkins" not in element.string:
			#	continue
			self.process_source(int(element["href"].split("/")[-1]), element.string)
	
	def process_source(self, source_id, source_name):
		data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
		soup = BeautifulSoup(data)
		
		courses = soup.select("table#cfResultsTable tr")
		
		for course in courses[:3]:
			links = course("a")
			
			if len(links) > 0:
				external = links[0]
				details = links[1]
				
				self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
	
	def parse_course(self, course_name, course_url, course_id, source_name):
		self.env.log("Parsing %s" % course_url)
		
		# First fetch metadata from ocwconsortium.org
		ocw_data = self._metadata_ocw(course_id)
		ocw_data["providername"] = source_name
		ocw_data["url"] = course_url
		
		# Now fetch metadata from the particular course provider
		provider_data = self._metadata_provider(course_url)
		
		if provider_data != False:
			data = ocw_data.copy()
			data.update(provider_data)
			
			# TODO: insert data
			self.env.log(repr(data))
	
	def _metadata_ocw(self, course_id):
		soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
		metadata = soup.select("dl.coursepage")[0]
		
		if len(metadata) > 0:
			data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
		else:
			# No metadata provided by ocwconsortium.
			data = {}
		
		return data
	
	def _parse_ocw_dl(self, dd, dt):
		data = {}
		
		for i in xrange(0, len(dd)):
			label = dd[i].string.strip().rstrip(":")
			value = dt[i].string
			
			if value is not None:
				value = value.strip()
			
			if label == "Tags":
				if value == None:
					data["tags"] = []
				else:
					data["tags"] = [x.strip() for x in value.split(",")]
			elif label == "Source":
				data["providername"] = value
			elif label == "Language":
				data["language"] = value
			elif label == "Link":
				# We can ignore this, we already have it anyway
				pass
			elif label == "Author":
				if value == None:
					data["author"] = None
				else:
					data["author"] = value
			elif label == "License":
				if value == None:
					data["license"] = None
				else:
					data["license"] = value
			elif label == "Date Published":
				data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
			else:
				self.env.log("UNKNOWN: %s => %s" % (label, value), True)
		
		return data
	
	def _metadata_provider(self, url):
		providers = {
			"oer.avu.org": self._metadata_avu,
			"ocw.capilanou.ca": self._metadata_capilano,
			"ocw.hokudai.ac.jp": self._metadata_hokkaido,
			"ocw.ie.edu": self._metadata_ie,
			"ocw.jhsph.edu": self._metadata_hopkins,
		}
		
		host = url.split("/")[2]
		data = {}
		
		for provider, func in providers.iteritems():
			if host.endswith(provider):
				return func(url)
		
		return False
	
	def _metadata_avu(self, url):
		# African Virtual University
		soup = BeautifulSoup(rsess.get(url + "?show=full").text)
		table = soup.select("table.ds-includeSet-table")[0]
		data = {"providername": "African Virtual University"}
		
		for row in table("tr"):
			cells = row("td")
			label = cells[0].string
			value = cells[1].string
			
			if label == "dc.identifier.uri":
				data["identifier_uri"] = value
			elif label == "dc.type":
				data["object_type"] = value
			elif label == "dc.date.accessioned":
				data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
			elif label == "dc.date.issued":
				data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
			elif label == "dc.date.available":
				data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
			elif label == "dc.language.iso":
|
||||
data["language"] = value
|
||||
elif label == "dc.description.abstract":
|
||||
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
|
||||
elif label == "dc.contributor.author":
|
||||
data["author"] = value
|
||||
elif label == "dc.title":
|
||||
data["title"] = value
|
||||
else:
|
||||
self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
|
||||
|
||||
return data
|
||||
|
||||
def _metadata_capilano(self, url):
|
||||
# Capilano University
|
||||
soup = BeautifulSoup(rsess.get(url).text)
|
||||
data = {"providername": "Capilano University"}
|
||||
|
||||
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
|
||||
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
|
||||
|
||||
return data
|
||||
|
||||
def _metadata_hokkaido(self, url):
|
||||
# Hokkaido University
|
||||
soup = BeautifulSoup(rsess.get(url).text)
|
||||
data = {"providername": "Hokkaido University"}
|
||||
|
||||
data["title"] = soup.select("#MAIN h1")[0].string.strip()
|
||||
data["description"] = soup.select("#MAIN p")[0].string.strip()
|
||||
|
||||
return data
|
||||
|
||||
def _metadata_ie(self, url):
|
||||
# IE University
|
||||
course_id = url.split("=")[1]
|
||||
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
|
||||
data = {"providername": "IE University"}
|
||||
|
||||
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
|
||||
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
|
||||
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
|
||||
|
||||
return data
|
||||
|
||||
def _metadata_hopkins(self, url):
|
||||
# Johns Hopkins Bloomberg School of Public Health
|
||||
soup = BeautifulSoup(rsess.get(url).text)
|
||||
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
|
||||
|
||||
data["title"] = self.soup_to_text(soup.select("h1")[-1])
|
||||
data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
|
||||
data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
|
||||
|
||||
return data
|
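`_metadata_provider` dispatches on the raw host part of the URL (`url.split("/")[2]`), matching by suffix so subdomains still reach the right handler. As a hedged aside (not part of the changeset), the same host extraction via the Python 2 standard library would be:

	import urlparse

	# netloc is the host[:port] part; equivalent to url.split("/")[2] for
	# well-formed absolute URLs, but more tolerant of odd inputs.
	host = urlparse.urlparse("http://ocw.jhsph.edu/courses/AdolHealthDev/").netloc
	print host  # -> ocw.jhsph.edu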
197	updater/scrapers/khan.py	Normal file

@ -0,0 +1,197 @@
import datetime, json, sys
import requests
import shared

class KhanAcademy(shared.Scraper):
	provider_id = 1

	def run(self):
		self.retrieve_dataset()
		self.process_item(self.dataset, 0)

	def retrieve_dataset(self):
		self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()

	def process_item(self, item, level, parent=None):
		try:
			kind = item["kind"]
		except KeyError, e:
			return

		if kind == "Topic":
			self.process_topic(item, level, parent=parent)
		elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
			self.process_object(item, level, parent=parent)
		elif kind == "Separator":
			pass # Ignore separators
		else:
			self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)

		try:
			children = item["children"]
		except KeyError, e:
			return

		for child in children:
			self.process_item(child, level + 1, item)

	def process_topic(self, item, level, parent=None):
		unique_id = item["id"]

		try:
			parent_id = parent["_cl_id"]
		except TypeError, e:
			parent_id = 0

		# Check if a title is set
		if item["title"] is not None:
			title = item["title"]
		else:
			# No title was set - log this as an error and default to 'Untitled'.
			self.env.log("No title found for item: %s" % repr(item), True)
			title = "Untitled"

		# Check if a description is set, and default to no description if not
		if item["description"] is not None:
			description = item["description"]
		else:
			description = None

		# Insert the topic
		inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)

		# Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
		item["_cl_id"] = row_id

		if inserted:
			self.env.log("Inserted %s" % title)
		else:
			self.env.log("Skipped %s" % title)

	def process_object(self, item, level, parent=None):
		unique_id = None

		# First check for the 'readable_id' property
		try:
			unique_id = item["readable_id"]
		except KeyError, e:
			pass

		# If no identifier was found, check for the 'name' property
		if unique_id is None:
			try:
				unique_id = item["name"]
			except KeyError, e:
				pass

		# If still no identifier was found, check for the 'id' property
		if unique_id is None:
			try:
				unique_id = str(item["id"])
			except KeyError, e:
				pass

		# If we *still* do not have an identifier, log the error and bail out
		if unique_id is None:
			self.env.log("No suitable identifier found for item: %s" % repr(item), True)
			return

		# Determine the object type
		if item["kind"] == "Video":
			itemtype = self.VIDEO
		elif item["kind"] == "Exercise":
			itemtype = self.EXERCISE
		elif item["kind"] == "Article":
			itemtype = self.ARTICLE
		elif item["kind"] == "Scratchpad":
			itemtype = self.SANDBOX

		source_url = None

		# Determine the source URL via the 'ka_url' property
		try:
			source_url = item["ka_url"]
		except KeyError, e:
			pass

		# If no source URL was found, try the 'url' property
		if source_url is None:
			try:
				source_url = item["url"]
			except KeyError, e:
				pass

		# If still no source URL was found...
		if source_url is None:
			if itemtype == self.ARTICLE:
				# Articles can lack a URL.
				source_url = None
			else:
				# There was no source URL, but this wasn't an article. Log the error and bail out.
				self.env.log("No source URL found for non-article object: %s" % repr(item), True)
				return

		# Determine the (external) item URL
		try:
			item_url = item["url"]
		except KeyError, e:
			# Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
			item_url = source_url

		# If the object is an article, we'll want to use the actual article content as description.
		if itemtype == self.ARTICLE:
			description = item["content"]
		else:
			# Otherwise, we'll check if there's a 'description' property. If not, leave empty.
			try:
				description = item["description"]
			except KeyError, e:
				description = None

		title = None

		# First check the 'title' property for an object title.
		try:
			title = item["title"]
		except KeyError, e:
			pass

		# As second option, check the 'display_name' property.
		if title is None:
			try:
				title = item["display_name"]
			except KeyError, e:
				# Apparently it really does not have a title. Log the error and default to 'Untitled'.
				self.env.log("No object title found for item: %s" % repr(item), True)
				title = "Untitled"

		# If a 'views' property is present, include it.
		try:
			views = item["views"]
		except KeyError, e:
			views = None

		# If a creation date is present, include it.
		try:
			date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
		except KeyError, e:
			date = None

		# Check if there is a parent ID
		try:
			parent_id = parent["_cl_id"]
		except KeyError, e:
			# No parent ID present - log this as an error and default to 0.
			self.env.log("No parent ID found for item: %s" % repr(item), True)
			parent_id = 0

		# Insert the item
		inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)

		# Store the resulting row ID in the item so that the children know the ID of their parent.
		item["_cl_id"] = row_id

		if inserted:
			self.env.log("Inserted %s" % title)
		else:
			self.env.log("Skipped %s" % title)
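The lookups for identifier, source URL, and title above all use the same try-a-key-then-fall-through pattern. A compact equivalent, shown only as a sketch; the `first_key` helper is hypothetical and not part of this codebase:

	def first_key(item, keys, default=None):
		# Return the value of the first key present in item, scanning in order.
		for key in keys:
			if key in item:
				return item[key]
		return default

	# e.g. the identifier chain above, minus the str() coercion on "id":
	print first_key({"name": "intro-to-algebra"}, ("readable_id", "name", "id"))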
55	updater/scrapers/ureddit.py	Normal file

@ -0,0 +1,55 @@
import datetime, json, simplejson, sys, re
import requests
import shared

class UniversityOfReddit(shared.Scraper):
	provider_id = 3

	def run(self):
		data = requests.get("http://ureddit.com/api?type=catalog").json()

		for category in data["categories"]:
			self.parse_category(category['id'], category['value'])

	def parse_category(self, category_id, category_name):
		try:
			data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
		except simplejson.decoder.JSONDecodeError, e:
			return

		for _class in data["classes"]:
			if not self.topic_exists(_class['id']):
				self.parse_class(_class['id'], _class['value'], category_name)
			else:
				self.env.log("Skipped class %s" % _class['value'])

	def parse_class(self, class_id, class_name, category_name):
		try:
			data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
		except simplejson.decoder.JSONDecodeError, e:
			self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
			return

		if data["status"] == '1' or data["status"] == '3' or data["status"] == '5':
			try:
				creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
			except ValueError, e:
				creation_date = None

			class_page = data["url"]

			inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)

			if inserted:
				self.env.log("Inserted topic %s" % data["name"])
			else:
				self.env.log("Skipped topic %s" % data["name"])

			inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])

			if inserted:
				self.env.log("Inserted item %s" % data["name"])
			else:
				self.env.log("Skipped item %s" % data["name"])
		else:
			self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))
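A small aside on `parse_class`: the chained `or` comparison on `data["status"]` can also be written as a membership test, with identical behaviour for these string statuses; a sketch:

	data = {"status": "3"}  # example payload fragment, not a real API response

	if data["status"] in ("1", "3", "5"):
		print "class is in an accepted status"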
26	updater/shared/__init__.py	Normal file

@ -0,0 +1,26 @@
import inspect, os, sys

my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
	module = __import__(modulename)

	for name in vars(module):
		data = getattr(module, name)
		globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
	fpath = os.path.join(my_path, fname)
	fbasename, fext = os.path.splitext(fname)

	if os.path.isdir(fpath):
		if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
			# This is a python directory module
			_import_module_into_scope(fname)
	elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
		# This is a python file module
		_import_module_into_scope(fbasename)

sys.path.remove(my_path)
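The net effect of this `__init__.py` is a star-import of every module in the package into the package namespace, so callers address classes directly on the package. A sketch of the intended effect, assuming the files in this changeset:

	import shared

	# Environment (from environment.py) and Scraper (from scraper.py) are
	# lifted into the package scope by the loop above:
	print shared.Environment, shared.Scraper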
17	updater/shared/environment.py	Normal file

@ -0,0 +1,17 @@
import oursql, sys

class Environment(object):
	def connect(self, host="localhost", username="root", password="", database="learn"):
		self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
		self.connected = True

	def log(self, text, is_error=False):
		if is_error == False:
			sys.stdout.write(text + "\n")
		else:
			sys.stderr.write(text + "\n")

	def Scraper(self, scraper_class):
		s = scraper_class(self.db)
		s.env = self
		return s
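A brief usage sketch, assuming a reachable local MySQL server with the `learn` database (the `connect()` defaults mirror `config.json`):

	import shared

	env = shared.Environment()
	env.connect()                           # localhost / root / learn by default
	env.log("routine progress message")     # written to stdout
	env.log("something went wrong", True)   # is_error=True routes to stderr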
122	updater/shared/scraper.py	Normal file

@ -0,0 +1,122 @@
class Scraper(object):
	UNKNOWN = 0
	TOPIC = 1
	COURSE = 2
	VIDEO = 3
	ARTICLE = 4
	EXERCISE = 5
	QUIZ = 6
	TEST = 7
	BOOK = 8
	AUDIOBOOK = 9
	LECTURE = 10
	SANDBOX = 11

	provider_id = 0

	def __init__(self, database=None):
		if database is not None:
			self.db = database
			self.can_store = True
		else:
			self.can_store = False

	def run(self, *args, **kwargs):
		raise Exception("No run() method was specified for this scraper.")

	def topic_exists(self, unique_id):
		c = self.db.cursor()
		c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
		return (len(c.fetchall()) > 0)

	def item_exists(self, unique_id):
		c = self.db.cursor()
		c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
		return (len(c.fetchall()) > 0)

	def insert_topic(self, unique_id, title, override=False, **kwargs):
		defaults = {
			"needs_enrollment": False,
			"creation_date": None,
			"start_date": None,
			"end_date": None,
			"parent_id": 0,
			"description": "",
			"provider_name": ""
		}

		for kwarg, val in defaults.iteritems():
			try:
				if kwargs[kwarg] == None:
					kwargs[kwarg] = defaults[kwarg]
			except KeyError, e:
				kwargs[kwarg] = defaults[kwarg]

		c = self.db.cursor()

		if override == True:
			exists = False
		else:
			c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
			results = c.fetchall()
			exists = (len(results) > 0)

		if exists == True:
			return (False, results[0][0])
		else:
			c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
				kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))

			return (True, c.lastrowid)

	def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
		defaults = {
			"views": None,
			"has_topic": False,
			"itemtype": 0,
			"source_url": item_url,
			"topic_id": 0,
			"parent_id": 0,
			"description": "",
			"date": None,
			"start_date": None,
			"end_date": None,
			"provider_name": ""
		}

		for kwarg, val in defaults.iteritems():
			try:
				if kwargs[kwarg] == None:
					kwargs[kwarg] = defaults[kwarg]
			except KeyError, e:
				kwargs[kwarg] = defaults[kwarg]

		c = self.db.cursor()

		if override == True:
			exists = False
		else:
			c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
			results = c.fetchall()
			exists = (len(results) > 0)

		if exists == True:
			return (False, results[0][0])
		else:
			c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
				"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
				kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))

			return (True, c.lastrowid)

	def soup_to_text(self, soup):
		strings = []

		try:
			for el in soup:
				strings += el._all_strings(True, True)
		except AttributeError, e:
			strings = soup._all_strings(True, True)

		return " ".join(strings)
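A minimal custom scraper sketch on top of this base class; the `provider_id` of 99 and the topic data are made-up example values:

	import shared

	class ExampleScraper(shared.Scraper):
		provider_id = 99  # hypothetical provider ID

		def run(self):
			# insert_topic returns (True, new_row_id) on insert, and
			# (False, existing_row_id) when the (Provider, ProviderId)
			# pair already exists.
			inserted, row_id = self.insert_topic("example-1", "Example Topic")

			if inserted:
				self.env.log("Inserted example topic as row %d" % row_id)
			else:
				self.env.log("Example topic already existed as row %d" % row_id)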
4	updater/test_ocw.py	Normal file

@ -0,0 +1,4 @@
import update_ocw

c = update_ocw.OpenCourseWareCrawler()
print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")
8	updater/update.py	Normal file

@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers

env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")

scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()
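`update.py` currently wires up only the OpenCourseWare scraper. A sketch of driving every scraper in this changeset in one pass, using the same wiring:

	#!/usr/bin/env python
	import shared, scrapers

	env = shared.Environment()
	env.connect(host="localhost", username="root", password="", database="learn")

	for scraper_class in (scrapers.KhanAcademy, scrapers.Coursera, scrapers.UniversityOfReddit, scrapers.OpenCourseWare):
		env.Scraper(scraper_class).run()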
@ -1,131 +0,0 @@
import requests
import oursql
import datetime
import json
import lib

class KhanUniversityCrawler(object):
	def __init__(self):
		self.db = lib.Database("localhost", "root")

	def retrieve_dataset(self):
		self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
		#self.dataset = json.loads(open("data.json", "r").read())

	def parse_dataset(self):
		self.process_item(self.dataset, 0)

	def process_item(self, item, level, parent=None):
		try:
			kind = item["kind"]
		except KeyError, e:
			return

		if kind == "Topic":
			unique_id = item["id"]

			try:
				parent_id = parent["_cl_id"]
			except TypeError, e:
				parent_id = 0

			if item["title"] is not None:
				title = item["title"]
			else:
				title = ""

			inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
			item["_cl_id"] = rowid

			if inserted:
				print "Inserted %s" % title
			else:
				print "Skipped %s" % title
		elif kind in ("Video", "Exercise", "Article"):
			try:
				unique_id = item["readable_id"]
			except KeyError, e:
				try:
					unique_id = item["name"]
				except KeyError, e:
					try:
						unique_id = str(item["id"])
					except KeyError, e:
						print repr(item)
						sys.stderr.write("WARNING: No suitable identifier found for item\n")
						raise
						return

			if item["kind"] == "Video":
				itemtype = self.db.VIDEO
			elif item["kind"] == "Exercise":
				itemtype = self.db.EXERCISE
			elif item["kind"] == "Article":
				itemtype = self.db.ARTICLE

			try:
				source_url = item["ka_url"]
			except KeyError, e:
				if itemtype == self.db.ARTICLE:
					source_url = ""
				else:
					return

			try:
				item_url = item["url"]
			except KeyError, e:
				try:
					item_url = item["ka_url"]
				except KeyError, e:
					item_url = None

			if itemtype == self.db.ARTICLE:
				description = item["content"]
			else:
				try:
					description = item["description"]
				except KeyError, e:
					description = None

			try:
				title = item["title"]
			except KeyError, e:
				try:
					title = item["display_name"]
				except KeyError, e:
					title = "Untitled"

			try:
				views = item["views"]
			except KeyError, e:
				views = None

			try:
				date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
			except KeyError, e:
				date = None

			inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
			item["_cl_id"] = rowid

			if inserted:
				print "Inserted %s" % title
			else:
				print "Skipped %s" % title
		elif kind == "Separator":
			pass # Ignore separators
		else:
			sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
			sys.stderr.write("%s\n" % (repr(item)))

		try:
			children = item["children"]
		except KeyError, e:
			pass
		else:
			for child in children:
				self.process_item(child, level + 1, item)

crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()
288	updater/update_ocw.py	Normal file

@ -0,0 +1,288 @@
import requests
import oursql
import datetime
import json
import lib
from bs4 import BeautifulSoup
import bs4

def combine_dict(a, b):
	c = a.copy()
	c.update(b)
	return c

rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'

class OpenCourseWareCrawler(object):
	def __init__(self):
		self.db = lib.Database("localhost", "root", password="")

	def parse_catalog(self):
		overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
		soup = BeautifulSoup(overview)

		for element in soup.find(id="pagecontent")("a"):
			self.parse_source(int(element["href"].split("/")[-1]), element.string)

	def parse_source(self, source_id, source_name):
		data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
		soup = BeautifulSoup(data)

		courses = soup.select("table#cfResultsTable tr")

		print "# " + source_name

		for course in courses[:2]:
			links = course("a")

			if len(links) > 0:
				external = links[0]
				details = links[1]

				self.parse_course(external.string, external["href"], details["href"].split("/")[-1])

	def parse_course(self, course_name, course_url, course_id):
		# First fetch metadata from ocwconsortium.org

		print course_url

		metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)

		metadata = metadata_soup.select("dl.coursepage")[0]

		if len(metadata) > 0:
			data = self.parse_dl(metadata.select("dd"), metadata.select("dt"))
		else:
			# No metadata provided by ocwconsortium.
			data = {}

		# Now fetch metadata from the particular course provider
		provider_data = self.get_provider_data(course_url)

		if provider_data != {}:
			print repr(provider_data)

	def parse_dl(self, dd, dt):
		data = {}

		for i in xrange(0, len(dd)):
			label = dd[i].string.strip().rstrip(":")
			value = dt[i].string

			if value is not None:
				value = value.strip()

			if label == "Tags":
				if value == None:
					data["tags"] = []
				else:
					data["tags"] = [x.strip() for x in value.split(",")]
			elif label == "Source":
				data["source"] = value
			elif label == "Language":
				data["language"] = value
			elif label == "Link":
				# We can ignore this, we already have it anyway
				pass
			elif label == "Author":
				if value == None:
					data["author"] = None
				else:
					data["author"] = value
			elif label == "License":
				if value == None:
					data["license"] = None
				else:
					data["license"] = value
			elif label == "Date Published":
				data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
			else:
				print "UNKNOWN: %s => %s" % (label, value)

		return data

	def get_provider_data(self, url):
		providers = {
			"oer.avu.org": self._data_avu,
			"ocw.capilanou.ca": self._data_capilano,
			"ocw.hokudai.ac.jp": self._data_hokkaido,
			"ocw.ie.edu": self._data_ie,
			"ocw.jhsph.edu": self._data_hopkins,
		}

		# The mapping below is parked in a bare string literal (effectively
		# commented out); the handler methods it references do not exist yet.
		""",
		"ocw.kaplan.edu": self._data_kaplan,
		"ocw.korea.edu": self._data_korea,
		"kyotomm.jp": self._data_kyoto,
		"ocw.kyushu-u.ac.jp": self._data_kyushu,

		"open-marhi.ru": self._data_moscow,
		"yctrtrc.ncku.edu.tw": self._data_chengkung,
		"ocw.nctu.edu.tw": self._data_chiaotung,
		"opencourse.ndhu.edu.tw": self._data_donghwa,
		"ocw.njit.edu": self._data_njit,
		"graduateschool.paristech.fr": self._data_paris,
		"peoples-uni.org": self._data_oaei,
		"ocw.sbu.ac.ir": self._data_shahid,
		"studentscircle.net": self._data_studentscircle,
		"ocw.tmu.edu.tw:8080": self._data_taipei,
		"openlearn.open.ac.uk": self._data_openuni,
		"www.ocw.titech.ac.jp": self._data_tokyo,
		"feedproxy.google.com": self._data_tudelft,
		"ocw.tufts.edu": self._data_tufts,
		"ocw.unu.edu": self._data_un,
		"ocw.uc3m.es": self._data_madrid,
		"ocw.ua.es": self._data_alicante,
		"ocw.unican.es": self._data_cantabria,
		"ocw.ugr.es": self._data_granada,
		"ocw.udem.edu.mx": self._data_monterrey,
		"ocw.um.es": self._data_murcia,
		"ocw.uniovi.es": self._data_oviedo,
		"ocw.usal.es": self._data_salamanca,
		"ocwus.us.es": self._data_sevilla,
		"ocw.unizar.es": self._data_zaragoza,
		"ocw.univalle.edu.co3": self._data_colombia,
		"ocw.uned.ac.cr": self._data_distancia,
		"www.icesi.edu.co": self._data_icesi,
		"ocw.innova.uned.es": self._data_innova,
		"upv.es": self._data_valencia,
		"ocw.upm.es": self._data_upm,
		"ocw.utpl.edu.ec": self._data_utpl,
		"ocw.uab.cat": self._data_uab,
		"ocw.ub.edu": self._data_ub,
		"ocw.uib.es": self._data_uib,
		"ocw.udl.cat": self._data_udl,
		"ocw.uv.es": self._data_uv,
		"e-ujier.uji.e": self._data_uji,
		"ocw.uoc.edu": self._data_uoc,
		"ocw.utm.my": self._data_utm,
		"ocw.uci.edu": self._data_uci,
		"opencontent.uct.ac.za": self._data_uct,
		"ocw.umb.edu:8080": self._data_boston,
		"open.umich.edu": self._data_michigan,
		"ocw.nd.edu": self._data_notredame,
		"ocw.usu.ac.id": self._data_usu,
		"ocw.tsukuba.ac.jp": self._data_tsukaba"""

		host = url.split("/")[2]
		data = {}

		for provider, func in providers.iteritems():
			if host.endswith(provider):
				data = func(url)

		return data

	def _data_avu(self, url):
		# African Virtual University
		soup = BeautifulSoup(rsess.get(url + "?show=full").text)
		table = soup.select("table.ds-includeSet-table")[0]
		data = {"providername": "African Virtual University"}

		for row in table("tr"):
			cells = row("td")
			label = cells[0].string
			value = cells[1].string

			if label == "dc.identifier.uri":
				data["identifier_uri"] = value
			elif label == "dc.type":
				data["object_type"] = value
			elif label == "dc.date.accessioned":
				data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
			elif label == "dc.date.issued":
				data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
			elif label == "dc.date.available":
				data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
			elif label == "dc.language.iso":
				data["language"] = value
			elif label == "dc.description.abstract":
				data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
			elif label == "dc.contributor.author":
				data["author"] = value
			elif label == "dc.title":
				data["title"] = value
			else:
				print "UNKNOWN KEY: %s => %s" % (label, value)

		return data

	def _data_capilano(self, url):
		# Capilano University
		soup = BeautifulSoup(rsess.get(url).text)
		data = {"providername": "Capilano University"}

		data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
		data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()

		return data

	def _data_hokkaido(self, url):
		# Hokkaido University
		soup = BeautifulSoup(rsess.get(url).text)
		data = {"providername": "Hokkaido University"}

		data["title"] = soup.select("#MAIN h1")[0].string.strip()
		data["description"] = soup.select("#MAIN p")[0].string.strip()

		return data

	def _data_ie(self, url):
		# IE University
		course_id = url.split("=")[1]
		soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
		data = {"providername": "IE University"}

		data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
		data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
		data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()

		return data

	def _data_hopkins(self, url):
		# Johns Hopkins Bloomberg School of Public Health
		soup = BeautifulSoup(rsess.get(url).text)
		data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}

		data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
		data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
		data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()

		return data

	def parse_dataset(self):
		for item in self.dataset:
			self.process_item(item)

	def process_item(self, item):
		inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

		if inserted:
			print "Inserted %s" % item["name"]
		else:
			print "Skipped %s" % item["name"]

		for course in item["courses"]:
			self.process_course(course, rowid)

	def process_course(self, course, topicid):
		try:
			start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
			title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
		except TypeError, e:
			start_date = None
			title = "%s (date undetermined)" % (course["name"])

		inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

		if inserted:
			print "\tInserted %s" % title
		else:
			print "\tSkipped %s" % title

#crawler = OpenCourseWareCrawler()
#crawler.parse_catalog()