Compare commits
31 Commits
Author | SHA1 | Date |
---|---|---|
Sven Slootweg | d98ee113bc | 12 years ago |
Sven Slootweg | 98340b38a0 | 12 years ago |
Sven Slootweg | 8bbffb9429 | 12 years ago |
Sven Slootweg | 0e4df4549f | 12 years ago |
Sven Slootweg | 2c3bcc5418 | 12 years ago |
Sven Slootweg | d9034b6215 | 12 years ago |
Sven Slootweg | 8c0033074b | 12 years ago |
Sven Slootweg | b3edd35ecf | 12 years ago |
Sven Slootweg | d6d8eb70b9 | 12 years ago |
Sven Slootweg | fb6c43a38f | 12 years ago |
Sven Slootweg | c2a8a66dac | 12 years ago |
Sven Slootweg | a690cb2c8f | 12 years ago |
Sven Slootweg | f188d443d1 | 12 years ago |
Sven Slootweg | 43c700ac2b | 12 years ago |
Sven Slootweg | 26b68952fa | 12 years ago |
Sven Slootweg | a4e744f892 | 12 years ago |
Sven Slootweg | d3bd59f813 | 12 years ago |
Sven Slootweg | 8e951f6b27 | 12 years ago |
Sven Slootweg | d387541822 | 12 years ago |
Sven Slootweg | a6e350c0d9 | 12 years ago |
Sven Slootweg | 0f5cade812 | 12 years ago |
Sven Slootweg | fa74d394a7 | 12 years ago |
Sven Slootweg | a9d2576eaf | 12 years ago |
Sven Slootweg | f57d45fa53 | 12 years ago |
Sven Slootweg | 1503c1f75f | 12 years ago |
Sven Slootweg | bfbfd821b5 | 12 years ago |
Sven Slootweg | efeef5f70e | 12 years ago |
Sven Slootweg | 3f02174ba3 | 12 years ago |
Sven Slootweg | 1fbb21e6d8 | 12 years ago |
Sven Slootweg | dd4c62bc4e | 12 years ago |
Sven Slootweg | 6ec1a2d90b | 12 years ago |
@@ -0,0 +1,9 @@
# Cryto Learn

This is the source code for http://learn.cryto.net/. It consists of the following:

* The updater script: a set of very rudimentary scrapers for various educational sources. Requires Python 2. Dependencies are [oursql](http://packages.python.org/oursql/), [requests](http://docs.python-requests.org/en/latest/), and BeautifulSoup 4 (a custom version is included). Located in `updater/`.
* The frontend: a fairly hacky and messy PHP-based search interface. It needs cleaning up, but that is not an immediate priority. Requires PHP 5.3+ and uses [CPHP](http://github.com/joepie91/cphp). Located in `frontend/`.
* A simple shell search script that uses the Cryto Learn API to search for a specified string and print the results to stdout. Requires Python 2. Also very rudimentary.

Licensed under the WTFPL. It may or may not work on your system; use at your own risk, etc.
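For reference, a minimal sketch of querying the search API that the frontend and shell script use. The endpoint and response shape are taken from the bundled shell search script and the `search.php` module later in this diff; the public base URL and the exact field names should be treated as assumptions if you run your own instance:

```python
# Minimal sketch (Python 2, matching the rest of this repo) of querying
# the /api/search endpoint. The base URL is assumed to be the public instance.
import requests

results = requests.post("http://learn.cryto.net/api/search", {"q": "mathematics"}).json()

# On failure the API returns an object with an "error" key instead of a list.
if isinstance(results, dict) and "error" in results:
	print results["error"]
else:
	for topic in results:
		print "%s (%s)" % (topic["title"], topic["provider"])
		for item in topic["items"]:
			# Each item carries a type ("course", "video", ...) and a direct URL.
			print "\t[%s] %s" % (item["type"], item["url"])
```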
@@ -0,0 +1,7 @@
API:
http://www.goodreads.com/api
https://developers.google.com/books/docs/getting-started#books_api_v1

Dumps:
http://openlibrary.org/data/ol_dump_latest.txt.gz
http://www.librarything.com/feeds/
@@ -0,0 +1,30 @@
{
	"database": {
		"driver": "mysql",
		"pdo": true,
		"hostname": "localhost",
		"username": "root",
		"password": "",
		"database": "learn"
	},
	"locale": {
		"path": "locales",
		"extension": "lng",
		"default_locale": "english",
		"default_timezone": "Europe/Amsterdam"
	},
	"memcache": {
		"enabled": true,
		"compressed": true,
		"hostname": "localhost",
		"port": 11211
	},
	"class_map": {
		"item": "Item",
		"topic": "Topic"
	},
	"components": [
		"router",
		"errorhandler"
	]
}
@@ -0,0 +1,152 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

class Item extends CPHPDatabaseRecordClass
{
	public $table_name = "items";
	public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
	public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";

	public $prototype = array(
		'string' => array(
			'Title' => "Title",
			'Description' => "Description",
			'SourceUrl' => "SourceUrl",
			'ItemUrl' => "ItemUrl"
		),
		'numeric' => array(
			'Type' => "Type",
			'Provider' => "Provider",
			'Views' => "Views",
			'TopicId' => "TopicId",
			'ParentId' => "ParentId"
		),
		'boolean' => array(
			'HasTopic' => "HasTopic"
		),
		'timestamp' => array(
			'CreationDate' => "Date",
			'StartDate' => "StartDate",
			'EndDate' => "EndDate"
		),
		'topic' => array(
			'Topic' => "TopicId"
		),
		'item' => array(
			'Parent' => "ParentId"
		)
	);

	public function __get($name)
	{
		switch($name)
		{
			case "sTypeName":
				return $this->GetTypeName();
				break;
			case "sProviderName":
				return $this->GetProviderName();
				break;
			default:
				return parent::__get($name);
				break;
		}
	}

	public function GetTypeName()
	{
		switch($this->sType)
		{
			case 1:
				return "topic";
			case 2:
				return "course";
			case 3:
				return "video";
			case 4:
				return "article";
			case 5:
				return "exercise";
			case 6:
				return "quiz";
			case 7:
				return "test";
			case 8:
				return "book";
			case 9:
				return "audiobook";
			case 10:
				return "lecture";
			case 11:
				return "sandbox";
			default:
				return "unknown";
		}
	}

	public function GetProviderName()
	{
		switch($this->sProvider)
		{
			case 1:
				return "Khan Academy";
			case 2:
				return "Coursera";
			case 3:
				return "University of Reddit";
			default:
				return "Unknown";
		}
	}

	public function GetChildren()
	{
		try
		{
			return Item::CreateFromQuery("SELECT * FROM items WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
		}
		catch (NotFoundException $e)
		{
			return array();
		}
	}

	public function AsDataset($fetch_children = true)
	{
		$child_data = array();

		if($fetch_children == true)
		{
			foreach($this->GetChildren() as $child)
			{
				$child_data[] = $child->AsDataset();
			}
		}

		return array(
			"title" => $this->uTitle,
			"description" => $this->uDescription,
			"url" => $this->uItemUrl,
			"source" => $this->uSourceUrl,
			"created" => $this->sCreationDate,
			"start" => $this->sStartDate,
			"end" => $this->sEndDate,
			"type" => $this->sTypeName,
			"provider" => $this->sProviderName,
			"views" => $this->sViews,
			"children" => $child_data
		);
	}
}
@@ -0,0 +1,131 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

class Topic extends CPHPDatabaseRecordClass
{
	public $table_name = "topics";
	public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
	public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";

	public $prototype = array(
		'string' => array(
			'Title' => "Title",
			'ProviderId' => "ProviderId",
			'Description' => "Description"
		),
		'numeric' => array(
			'ParentId' => "ParentId",
			'Provider' => "Provider"
		),
		'boolean' => array(
			'NeedsEnrollment' => "NeedsEnrollment"
		),
		'timestamp' => array(
			'CreationDate' => "Created",
			'StartDate' => "StartDate",
			'EndDate' => "EndDate"
		),
		'topic' => array(
			'Parent' => "ParentId"
		)
	);

	public function __get($name)
	{
		switch($name)
		{
			case "sProviderName":
				return $this->GetProviderName();
				break;
			default:
				return parent::__get($name);
				break;
		}
	}

	public function GetProviderName()
	{
		switch($this->sProvider)
		{
			case 1:
				return "Khan Academy";
			case 2:
				return "Coursera";
			case 3:
				return "University of Reddit";
			default:
				return "Unknown";
		}
	}

	public function AsDataset($fetch_children = true, $fetch_items = true)
	{
		$child_data = array();

		if($fetch_children == true)
		{
			foreach($this->GetChildren() as $child)
			{
				$child_data[] = $child->AsDataset();
			}
		}

		$item_data = array();

		if($fetch_items == true)
		{
			foreach($this->GetItems() as $item)
			{
				$item_data[] = $item->AsDataset();
			}
		}

		return array(
			"title" => $this->uTitle,
			"description" => $this->uDescription,
			"created" => $this->sCreationDate,
			"start" => $this->sStartDate,
			"end" => $this->sEndDate,
			"provider" => $this->sProviderName,
			"needs_enrollment" => $this->sNeedsEnrollment,
			"children" => $child_data,
			"items" => $item_data
		);
	}

	public function GetItems()
	{
		try
		{
			return Item::CreateFromQuery("SELECT * FROM items WHERE `TopicId` = :TopicId", array(':TopicId' => $this->sId));
		}
		catch (NotFoundException $e)
		{
			return array();
		}
	}

	public function GetChildren()
	{
		try
		{
			return Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
		}
		catch (NotFoundException $e)
		{
			return array();
		}
	}
}
@@ -0,0 +1 @@
../../cphp
File diff suppressed because one or more lines are too long
@@ -0,0 +1,26 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");

function __autoload($class_name)
{
	global $_APP;

	$class_name = str_replace("\\", "/", strtolower($class_name));
	require_once("classes/{$class_name}.php");
}
@@ -0,0 +1,14 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

require("rewrite.php");
@@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S

event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minutes-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago
@@ -0,0 +1,28 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

if($_GET['key'] !== "derp")
{
	die();
}

$data = array();

foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
{
	$data[] = $topic->AsDataset();
}

echo(json_encode($data));
@@ -0,0 +1,69 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

if(empty($_POST['q']))
{
	die(json_encode(array(
		"error" => "No search query specified."
	)));
}
else
{
	$query = $_POST['q'];
	$terms = explode(" ", $query);

	$db_query_terms = array();
	$db_query_arguments = array();
	$valid_term = false;

	foreach($terms as $term)
	{
		$db_query_terms[] = "`Title` LIKE ?";
		$term = str_replace("%", "\%", $term);
		$term = str_replace("_", "\_", $term);
		$valid_term = $valid_term || (strlen($term) >= 2);
		$db_query_arguments[] = "%{$term}%";
	}

	if($valid_term)
	{
		$db_query = implode(" AND ", $db_query_terms);
		// Shift the argument array so that it is 1-indexed, as expected by the database layer.
		array_unshift($db_query_arguments, '');
		unset($db_query_arguments[0]);

		try
		{
			$results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);

			$return_objects = array();

			foreach($results_topics as $topic)
			{
				$return_objects[] = $topic->AsDataset();
			}

			$sPageContents = json_encode($return_objects);
		}
		catch (NotFoundException $e)
		{
			$sPageContents = json_encode(array("error" => "No results found for the specified query.", "query" => $query));
		}
	}
	else
	{
		die(json_encode(array(
			"error" => "No valid search query specified."
		)));
	}
}
@@ -0,0 +1,18 @@
<?php
/*
 * Cryto Learn is more free software. It is licensed under the WTFPL, which
 * allows you to do pretty much anything with it, without having to
 * ask permission. Commercial use is allowed, and no attribution is
 * required. We do politely request that you share your modifications
 * to benefit other developers, but you are under no enforced
 * obligation to do so :)
 *
 * Please read the accompanying LICENSE document for the full WTFPL
 * licensing text.
 */

if(!isset($_APP)) { die("Unauthorized."); }

$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());

$sPageType = "ui";
@@ -0,0 +1,34 @@
<?php
$_APP = true;
require("includes/base.php");

$sPageContents = "";

$router = new CPHPRouter();

$router->allow_slash = true;
$router->ignore_query = true;

$router->routes = array(
	0 => array(
		"^/$" => "modules/ui/index.php",
		"^/api/search$" => "modules/api/search.php",
		"^/api/dump$" => "modules/api/dump.php"
	)
);

try
{
	$router->RouteRequest();
}
catch (RouterException $e)
{
	http_status_code(404);
	$sPageContents = "404 not found";
}

echo($sPageContents);
Binary file not shown (image added, 1.8 KiB).
@@ -0,0 +1,160 @@
<!doctype html>
<html>
	<head>
		<title>learn.cryto.net</title>
		<link rel="stylesheet" href="style.css">
		<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
		<script>
			var search_timeout = null;

			$(function(){
				/*$("input").val("data");
				runSearch();*/

				$("input").keyup(function(){
					if(search_timeout !== null)
					{
						clearTimeout(search_timeout);
					}

					search_timeout = setTimeout(runSearch, 800);
				});
			});

			function runSearch()
			{
				$(".search-large").removeClass("search-large").addClass("search-top");
				$(".spinner").show();
				var query = $("input#query").val();

				if(query.length >= 3)
				{
					$.post("/api/search", {q: query}, function(response){
						$(".spinner").hide();
						$(".results").html("");

						if(typeof response.error == "undefined")
						{
							for(var i in response)
							{
								if(response[i].items.length > 0)
								{
									var result_wrapper = instantiateTemplate("result_wrapper");

									var result_block = instantiateTemplate("result_topic");
									result_block.children(".title").html(response[i].title);
									result_block.children(".description").html(response[i].description);
									result_block.children(".providername").html(response[i].provider);
									result_block.appendTo(result_wrapper);

									for(var x in response[i].items)
									{
										var item = response[i].items[x];

										var item_block = instantiateTemplate("result_item");
										item_block.children(".title").html(item.title);
										item_block.children(".title").attr("href", item.url);
										item_block.children(".type").html(item.type);
										item_block.insertAfter(result_block);
									}

									result_wrapper.appendTo(".results");
								}
							}
						}
						else
						{
							$(".results").html("<div class='error'>No results.</div>");
						}

						setHandlers();
					}, "json");
				}
				else
				{
					$(".spinner").hide();
					$(".results").html("<div class='error'>Enter at least 3 characters.</div>");
				}
			}

			function setHandlers()
			{
				$(".toggler, .topic").each(
					function(){
						$(this).click(function(event){
							toggleItems(this, event);
						});
					}
				);
			}

			function instantiateTemplate(template_name)
			{
				var instance = $("#template_" + template_name).clone();
				instance.removeAttr("id");
				return instance;
			}

			function toggleItems(ctx, event)
			{
				var parent = $(ctx).parentsUntil(".wrapper");
				var wrapper;

				if(parent.length == 0)
				{
					wrapper = $(ctx).parent();
				}
				else
				{
					wrapper = parent.parent();
				}

				var toggler = wrapper.find(".toggler");

				if(typeof toggler.data("toggled") == "undefined" || toggler.data("toggled") == false)
				{
					toggler.data("toggled", true);
					toggler.html("-");
					wrapper.find(".item").show();
				}
				else
				{
					toggler.data("toggled", false);
					toggler.html("+");
					wrapper.find(".item").hide();
				}

				event.stopPropagation();
			}
		</script>
	</head>
	<body>
		<div class="header">
			<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
			<h2>Currently searching Coursera, Khan Academy, University of Reddit. Comments? <a href="mailto:learn@cryto.net">learn@cryto.net</a> or
			<a href="irc://irc.cryto.net/crytocc">irc.cryto.net #crytocc</a></h2>
			<h2>Like the service and wish to donate? <a href="http://cryto.net/~joepie91/donate.html">You can do that here :)</a></h2>
		</div>
		<div class="main">
			<div class="search-large">
				I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
			</div>
			<div class="results">
			</div>
		</div>
		<div id="templates">
			<div id="template_result_wrapper" class="wrapper"></div>
			<div id="template_result_topic" class="topic">
				<span class="toggler">+</span>
				<strong>Topic: </strong>
				<span class="title"></span>
				<span class="providername"></span>
				<div class="description"></div>
			</div>
			<div id="template_result_item" class="item">
				<span class="type"></span>
				<a href="#" class="title"></a>
			</div>
		</div>
	</body>
</html>
@@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.es": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba
@@ -0,0 +1,116 @@
# AGH University of Science and Technology
http://open.agh.edu.pl/course/view.php?id=97
# Fundação Getulio Vargas - FGV Online
http://www5.fgv.br/fgvonline/CursosGratuitosFormulario.aspx?id_curso=OCWAJUEAD_00_01/2011_1
# Gunadarma University
http://ocw.gunadarma.ac.id/course/about
# Johns Hopkins Bloomberg School of Public Health
http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss
# Kaplan University Online & Campus Learning
http://ocw.kaplan.edu/arts-and-sciences/academic-strategies
# Korea University
http://ocw.korea.edu/ocw/college-of-science/general-physics-i
# Kyoto Seika University
http://www.kyotomm.jp/event/exh/kyotomagic2012.php
# Kyushu University
http://ocw.kyushu-u.ac.jp/90901/0007/index.html
# Massachusetts Institute of Technology
http://ocw.mit.edu/courses/civil-and-environmental-engineering/1-00-introduction-to-computers-and-engineering-problem-solving-fall-2005
# MOSCOW ARCHITECTURAL INSTITUTE
http://www.open-marhi.ru/courses/detail/index.php?ID=6631
# National Cheng Kung University
http://yctrtrc.ncku.edu.tw/site2/newocwcourse/OCW_MAIN.php?cid=141
# National Chiao Tung University
http://ocw.nctu.edu.tw/riki_detail.php?pgid=335
# National Dong Hwa University
http://opencourse.ndhu.edu.tw/moodle/mod/forum/discuss.php?d=3
# New Jersey Institute of Technology
http://ocw.njit.edu/ocw/som/acct/acct-615-anandarajan/index.php
# Paris Tech
http://graduateschool.paristech.fr/cours.php?id=309132
# People's Open Access Education Initiative
http://www.peoples-uni.org/node/236
# Shahid Beheshti University
http://ocw.sbu.ac.ir/Default.aspx?tabid=5352&language=fa-IR
# Students Circle Network
http://studentscircle.net/live/2011/07/a-guide-before-learning-a-new-javascript-framework/
# Taipei Medical University
http://ocw.tmu.edu.tw:8080/eduCommons/general-education/53f28a1882076b7753f24eba72698a556790-shih-chi-analysis-on-historical-figures
# The Open University
http://openlearn.open.ac.uk/course/view.php?name=DD208_3
# The Open University of Israel
http://peer-news.blogspot.com/2011/12/2-10934.html
# Tokyo Institute of Technology
http://www.ocw.titech.ac.jp/index.php?module=General&Nendo=2012&action=T0300&GakubuCD=223&GakkaCD=224710&KougiCD=70030&Gakki=1&lang=EN
# TU Delft
http://feedproxy.google.com/~r/tudelft/OCW/~3/0sA6qPQKcOg/bachelor-civiele-techniek
# Tufts University
http://ocw.tufts.edu/Course/39
# UNISUL - Universidade do Sul de Santa Catarina
http://labspace.open.ac.uk
# United Nations University
http://ocw.unu.edu/international-institute-for-software-technology/building-a-community-of-practice-for-electronic-governance
# Universidad Carlos III de Madrid
http://ocw.uc3m.es/ingenieria-electrica/accionamientos-electricos
# Universidad de Alicante
http://ocw.ua.es/Ciencias_Sociales_y_Juridicas/actividades-deportivas-medio-ambiente
# Universidad de Cantabria
http://ocw.unican.es/ciencias-de-la-salud/actuacion-en-situaciones-especiales
# Universidad de Granada
http://ocw.ugr.es/course/view.php?id=23&topic=1
# Universidad de Monterrey
http://ocw.udem.edu.mx/cursos-de-profesional/administracion-de-tecnologias-de-informacion
# Universidad de Murcia
http://ocw.um.es/cc.-sociales/actividad-fisica-en-el-envejecimiento
# Universidad de Oviedo
http://ocw.uniovi.es/course/view.php?id=28&ocw=1
# Universidad de Salamanca
http://ocw.usal.es/ciencias-sociales-1/curso-cero-matematicas-para-ciencias-sociales-nivelacion-de-conocimientos
# Universidad de Sevilla
http://ocwus.us.es/matematica-aplicada/pp-3
# Universidad de Zaragoza
http://ocw.unizar.es/ocw/ciencias-de-la-salud-1/actividades-fisicas-y-deportivas-aereas
# Universidad del Valle - Colombia
http://ocw.univalle.edu.co/ocw/ingenieria-electronica-telecomunicaciones-y-afines/arquitectura-de-procesos-industriales
# Universidad Estatal a Distancia
http://ocw.uned.ac.cr/eduCommons/ciencias-de-la-administracion/compras-y-almacenamiento
# Universidad Icesi
http://www.icesi.edu.co/ocw/tic/administracion_plataformas_y_seguridad
# Universidad Nacional de Educación a Distancia
http://ocw.innova.uned.es/ocwuniversia/psicologia/analisis-de-datos-en-Psico-I
# Universidad Politécnica de Valencia
http://www.upv.es/ocwasi/2010/6842
# Universidad Politécnica de Madrid
http://ocw.upm.es/ingenieria-cartografica-geodesica-y-fotogrametria/3d-scanning-and-modeling
# UNIVERSIDAD TECNICA PARTICULAR DE LOJA
http://ocw.utpl.edu.ec/economia
# Universitat Autònoma de Barcelona
http://ocw.uab.cat/enginyeries/apunts-de-calcul-matricial-i-resolucio-de-sistemes
# Universitat de Barcelona
http://ocw.ub.edu/admistracio-i-direccio-dempreses
# Universitat de les Illes Balears
http://ocw.uib.es/ocw/infermeria/atencion-de-enfermeria-frente-situaciones-de
# Universitat de Lleida
http://ocw.udl.cat/arts-i-humanitats
# Universitat de València
http://ocw.uv.es/ciencias-sociales-y-juridicas/2-2
# Universitat Jaume I
http://e-ujier.uji.es/pls/www/!gri_www.euji22101?p_id=15&p_tipo=A&p_curso=IG23&p_idioma=CA
# Universitat Oberta de Catalunya
http://ocw.uoc.edu/informatica-tecnologia-i-multimedia/administracio-avancada-del-sistema-operatiu-gnu-linux
# Universiti Teknologi Malaysia
http://ocw.utm.my/course/view.php?id=90
# University of California, Irvine
http://ocw.uci.edu/courses/course.aspx?id=113
# University of Cape Town
http://opencontent.uct.ac.za/Centre-for-Higher-Education-Development/Centre-for-Open-Learning/A-developmental-state-The-challenge-ahead
# University of Massachusetts Boston
http://ocw.umb.edu:8080/eduCommons/about
# University of Michigan
http://open.umich.edu/education/med/oernetwork/med/em/aetc-redirect/2009
# University of Notre Dame
http://ocw.nd.edu/history/african-american-history-ii
# University of Sumatera Utara
http://ocw.usu.ac.id/course/detail/teknik-sipil-s1/4110000007-struktur-bangunan-sipil-i.html
# University of Tsukuba
http://ocw.tsukuba.ac.jp/6570740672698cea79d15b6678147a7679d130fb65705b665c02653b/66f87c4d7d394ecb
@@ -0,0 +1,22 @@
#!/usr/bin/env python

import requests, sys, re

query = sys.argv[1]

results = requests.post("http://learn.cryto.net/api/search", {"q": query}).json()

for result in results:
	name = result["title"].rstrip()
	description = result["description"].strip().replace("\n", " ")

	if len(description) > 200:
		description = re.match("^(.{0,300})\W", description).group(1) + "..."

	print "## %s\n%s" % (name, description)

	for item in result["items"]:
		name = item["title"].ljust(70)
		print "\t[%s] %s\t%s" % (item["type"], name, item["url"])

	print ""
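A hypothetical invocation of the script above (the diff does not show file paths, so the `search.py` name is an assumption):

```
python search.py "linear algebra"
```

Each matching topic is printed as a `##` heading with its truncated description, followed by one tab-indented line per item showing the item's type, padded title, and URL.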
@@ -0,0 +1,2 @@
ALTER TABLE `items` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
ALTER TABLE `topics` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
@@ -0,0 +1,361 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):  # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, previous_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = previous_element or self.previous_element
        o.setup(parent, previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


class FeatureNotFound(ValueError):
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
@@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):

    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        features = list(features)
        features.reverse()
        candidates = None
        candidate_set = None
        while len(features) > 0:
            feature = features.pop()
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(
                        set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        self.soup = None

    def reset(self):
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in dict(attrs):
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[cdata_list_attr] = values
        return attrs

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
|
||||||
|
# document was originally in. This means HTML 5-style <meta>
|
||||||
|
# tags that provide the "charset" attribute. It also means
|
||||||
|
# HTML 4-style <meta> tags that provide the "content"
|
||||||
|
# attribute and have "http-equiv" set to "content-type".
|
||||||
|
#
|
||||||
|
# In both cases we will replace the value of the appropriate
|
||||||
|
# attribute with a standin object that can take on any
|
||||||
|
# encoding.
|
||||||
|
meta_encoding = None
|
||||||
|
if charset is not None:
|
||||||
|
# HTML 5 style:
|
||||||
|
# <meta charset="utf8">
|
||||||
|
meta_encoding = charset
|
||||||
|
tag['charset'] = CharsetMetaAttributeValue(charset)
|
||||||
|
|
||||||
|
elif (content is not None and http_equiv is not None
|
||||||
|
and http_equiv.lower() == 'content-type'):
|
||||||
|
# HTML 4 style:
|
||||||
|
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||||
|
tag['content'] = ContentMetaAttributeValue(content)
|
||||||
|
|
||||||
|
return (meta_encoding is not None)
|
||||||
|
|
||||||
|
def register_treebuilders_from(module):
|
||||||
|
"""Copy TreeBuilders from the given module into this module."""
|
||||||
|
# I'm fairly sure this is not the best way to do this.
|
||||||
|
this_module = sys.modules['bs4.builder']
|
||||||
|
for name in module.__all__:
|
||||||
|
obj = getattr(module, name)
|
||||||
|
|
||||||
|
if issubclass(obj, TreeBuilder):
|
||||||
|
setattr(this_module, name, obj)
|
||||||
|
this_module.__all__.append(name)
|
||||||
|
# Register the builder while we're at it.
|
||||||
|
this_module.builder_registry.register(obj)
|
||||||
|
|
||||||
|
# Builders are registered in reverse order of priority, so that custom
|
||||||
|
# builder registrations will take precedence. In general, we want lxml
|
||||||
|
# to take precedence over html5lib, because it's faster. And we only
|
||||||
|
# want to use HTMLParser as a last result.
|
||||||
|
from . import _htmlparser
|
||||||
|
register_treebuilders_from(_htmlparser)
|
||||||
|
try:
|
||||||
|
from . import _html5lib
|
||||||
|
register_treebuilders_from(_html5lib)
|
||||||
|
except ImportError:
|
||||||
|
# They don't have html5lib installed.
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
from . import _lxml
|
||||||
|
register_treebuilders_from(_lxml)
|
||||||
|
except ImportError:
|
||||||
|
# They don't have lxml installed.
|
||||||
|
pass
|
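
The two hooks above, `can_be_empty_element` and `_replace_cdata_list_attribute_values`, are easiest to see in a quick interactive check. A minimal sketch, not part of the vendored file, assuming bs4 is importable and its module-level `whitespace_re` helper exists as upstream Beautiful Soup defines it:

# Minimal sketch (not part of this commit): multi-valued attribute
# splitting and empty-element lookup on the HTML builder.
from bs4.builder import HTMLTreeBuilder

builder = HTMLTreeBuilder()
# 'class' is listed under '*' in cdata_list_attributes, so its value is
# split into a list; 'href' is not multi-valued and stays a string.
attrs = builder._replace_cdata_list_attribute_values(
    'a', {'class': 'foo bar', 'href': '/x'})
print attrs   # {'class': ['foo', 'bar'], 'href': '/x'}

# can_be_empty_element consults empty_element_tags.
print builder.can_be_empty_element('br')   # True
print builder.can_be_empty_element('p')    # False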
@ -0,0 +1,221 @@
__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element


class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
    def keys(self):
        return list(self.attrs.keys())
    def __len__(self):
        return len(self.attrs)
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in list(self.attrs.keys())


class Element(html5lib.treebuilders._base.Node):
    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.soup.object_was_parsed(node.element, parent=self.element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:
            # Replace any namespaced attribute names (tuples) with
            # NamespacedAttribute objects.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
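
As the `features` list above indicates, this builder is selected by asking for the 'html5lib' feature. A minimal usage sketch, not part of this commit, assuming the html5lib package is installed:

# Minimal sketch (not part of this commit): picking the html5lib
# builder through the features argument confirmed by the test suite.
from bs4 import BeautifulSoup

# html5lib repairs broken markup the way browsers do, wrapping the
# fragment in a full <html><head></head><body>...</body></html> shell.
soup = BeautifulSoup("<p>unclosed", features="html5lib")
print soup.body.p.string   # u'unclosed'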
@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
        )
      )?
    )
  )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
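
The builder above registers under the 'html.parser' feature (the HTMLPARSER constant). A minimal sketch, not part of this commit; it needs no third-party parser since it rides on the standard library:

# Minimal sketch (not part of this commit): the stdlib-based builder
# also advertises STRICT, and it is the fallback when neither lxml nor
# html5lib is installed.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<a href='/x'>link</a>", features="html.parser")
print soup.a['href']   # u'/x'
print soup.a.string    # u'link'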
@ -0,0 +1,196 @@
__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'

class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
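
One lxml backend drives both builders above: LXMLTreeBuilderForXML for XML and LXMLTreeBuilder for HTML. A minimal sketch, not part of this commit, assuming the lxml package is installed:

# Minimal sketch (not part of this commit): the same lxml backend,
# two rule sets.
from bs4 import BeautifulSoup

# The XML builder emits an XML declaration on output and keeps empty
# elements self-closed.
print BeautifulSoup("<root><child/></root>", features="xml")
# <?xml version="1.0" encoding="utf-8"?>
# <root><child/></root>

# The HTML builder applies HTML rules instead: <p> is not an
# empty-element tag, so it is closed rather than self-closed.
print BeautifulSoup("<p>", features="lxml").p
# <p></p>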
File diff suppressed because it is too large
@ -0,0 +1,554 @@
"""Helper classes for tests."""

import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
    CharsetMetaAttributeValue,
    Comment,
    ContentMetaAttributeValue,
    Doctype,
    SoupStrainer,
    )

from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder


class SoupTest(unittest.TestCase):

    @property
    def default_builder(self):
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))


class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())


class XMLTreeBuilderSmokeTest(object):

    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)


class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertTrue(isinstance(soup.contents[0], Comment))
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)


def skipIf(condition, reason):
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        if condition:
            return nothing
        else:
            return test_item

    return decorator
@ -0,0 +1 @@
"The beautifulsoup tests."
@ -0,0 +1,141 @@
"""Tests of the builder registry."""

import unittest

from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry as registry,
    HTMLParserTreeBuilder,
    TreeBuilderRegistry,
)

try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    HTML5LIB_PRESENT = False

try:
    from bs4.builder import (
        LXMLTreeBuilderForXML,
        LXMLTreeBuilder,
    )
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)

        if LXML_PRESENT:
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)
        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
        else:
            self.assertEqual(registry.lookup('xml'), None)
            if HTML5LIB_PRESENT:
                self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
            else:
                self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)

    def test_named_library(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")


class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        cls = type('Builder_' + '_'.join(feature_list),
                   (object,), {'features' : feature_list})

        self.registry.register(cls)
        return cls

    def test_register_with_no_features(self):
        builder = self.builder_for_features()

        # Since the builder advertises no features, you can't find it
        # by looking up features.
        self.assertEqual(self.registry.lookup('foo'), None)

        # But you can find it by doing a lookup with no features, if
        # this happens to be the only registered builder.
        self.assertEqual(self.registry.lookup(), builder)

    def test_register_with_features_makes_lookup_succeed(self):
        builder = self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('foo'), builder)
        self.assertEqual(self.registry.lookup('bar'), builder)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        builder = self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        builder1 = self.builder_for_features('foo')
        builder2 = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), builder2)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(self.registry.lookup(), None)

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        has_one = self.builder_for_features('foo')
        has_the_other = self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        lacks_one = self.builder_for_features('bar')
        has_the_other = self.builder_for_features('foo')

        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        builder1 = self.builder_for_features('foo', 'bar')
        builder2 = self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
@ -0,0 +1,36 @@
"Test harness for doctests."

# pylint: disable-msg=E0611,W0142

__metaclass__ = type
__all__ = [
    'additional_tests',
    ]

import atexit
import doctest
import os
#from pkg_resources import (
#    resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest

DOCTEST_FLAGS = (
    doctest.ELLIPSIS |
    doctest.NORMALIZE_WHITESPACE |
    doctest.REPORT_NDIFF)


# def additional_tests():
#     "Run the doc tests (README.txt and docs/*, if any exist)"
#     doctest_files = [
#         os.path.abspath(resource_filename('bs4', 'README.txt'))]
#     if resource_exists('bs4', 'docs'):
#         for name in resource_listdir('bs4', 'docs'):
#             if name.endswith('.txt'):
#                 doctest_files.append(
#                     os.path.abspath(
#                         resource_filename('bs4', 'docs/%s' % name)))
#     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
#     atexit.register(cleanup_resources)
#     return unittest.TestSuite((
#         doctest.DocFileSuite(*doctest_files, **kwargs)))
@ -0,0 +1,72 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""

import warnings

try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError, e:
    HTML5LIB_PRESENT = False

from bs4.element import SoupStrainer
from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)

@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
    <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEquals("<p>foo</p>", soup.p.encode())
@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""

from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder

class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):

    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass
@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""

import re
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError, e:
    LXML_PRESENT = False

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)

@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=False) as w:
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilderForXML()
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@ -0,0 +1,26 @@
import inspect, os, sys

my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
    module = __import__(modulename)

    for name in vars(module):
        data = getattr(module, name)
        globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
    fpath = os.path.join(my_path, fname)
    fbasename, fext = os.path.splitext(fname)

    if os.path.isdir(fpath):
        if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
            # This is a python directory module
            _import_module_into_scope(fname)
    elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
        # This is a python file module
        _import_module_into_scope(fbasename)

sys.path.remove(my_path)
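
This loader pulls every sibling module's names into the package scope, so any scraper file dropped into the directory is picked up automatically and its classes become attributes of the package itself. A minimal usage sketch, assuming this file is the scrapers package's __init__.py and that module names match their contents (e.g. coursera.py for the Coursera scraper below); in practice the update script further down wires in a database through shared.Environment instead:

# Hypothetical interactive use of the auto-importing package above.
import scrapers

scraper = scrapers.Coursera()  # re-exported from coursera.py by the loader
scraper.run()                  # without a database, insert_topic() would fail; see the update script below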
@ -0,0 +1,50 @@
import datetime, json, sys
import requests
import shared

class Coursera(shared.Scraper):
    provider_id = 2

    def run(self):
        self.retrieve_dataset()
        self.parse_dataset()

    def retrieve_dataset(self):
        self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()

    def parse_dataset(self):
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        inserted, row_id = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        if inserted:
            self.env.log("Inserted topic %s" % item["name"])
        else:
            self.env.log("Skipped topic %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, row_id)

    def process_course(self, course, topicid):
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except TypeError, e:
            start_date = None

        title = self.generate_title(course['name'], start_date)

        inserted, row_id = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            self.env.log("Inserted item %s" % title)
        else:
            self.env.log("Skipped item %s" % title)

    def generate_title(self, name, date):
        if date is None:
            return "%s (date undetermined)" % name
        else:
            return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared

from bs4 import BeautifulSoup
import bs4

rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'

class OpenCourseWare(shared.Scraper):
    def run(self):
        overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
        soup = BeautifulSoup(overview)

        for element in soup.find(id="pagecontent")("a"):
            #if "Hopkins" not in element.string:
            #    continue
            self.process_source(int(element["href"].split("/")[-1]), element.string)

    def process_source(self, source_id, source_name):
        data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
        soup = BeautifulSoup(data)

        courses = soup.select("table#cfResultsTable tr")

        for course in courses[:3]:
            links = course("a")

            if len(links) > 0:
                external = links[0]
                details = links[1]

                self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)

    def parse_course(self, course_name, course_url, course_id, source_name):
        self.env.log("Parsing %s" % course_url)

        # First fetch metadata from ocwconsortium.org
        ocw_data = self._metadata_ocw(course_id)
        ocw_data["providername"] = source_name
        ocw_data["url"] = course_url

        # Now fetch metadata from the particular course provider
        provider_data = self._metadata_provider(course_url)

        if provider_data != False:
            data = ocw_data.copy()
            data.update(provider_data)

            # TODO: insert data
            self.env.log(repr(data))

    def _metadata_ocw(self, course_id):
        soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
        metadata = soup.select("dl.coursepage")[0]

        if len(metadata) > 0:
            data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
        else:
            # No metadata provided by ocwconsortium.
            data = {}

        return data

    def _parse_ocw_dl(self, dd, dt):
        data = {}

        for i in xrange(0, len(dd)):
            label = dd[i].string.strip().rstrip(":")
            value = dt[i].string

            if value is not None:
                value = value.strip()

            if label == "Tags":
                if value == None:
                    data["tags"] = []
                else:
                    data["tags"] = [x.strip() for x in value.split(",")]
            elif label == "Source":
                data["providername"] = value
            elif label == "Language":
                data["language"] = value
            elif label == "Link":
                # We can ignore this, we already have it anyway
                pass
            elif label == "Author":
                if value == None:
                    data["author"] = None
                else:
                    data["author"] = value
            elif label == "License":
                if value == None:
                    data["license"] = None
                else:
                    data["license"] = value
            elif label == "Date Published":
                data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
            else:
                self.env.log("UNKNOWN: %s => %s" % (label, value), True)

        return data

    def _metadata_provider(self, url):
        providers = {
            "oer.avu.org": self._metadata_avu,
            "ocw.capilanou.ca": self._metadata_capilano,
            "ocw.hokudai.ac.jp": self._metadata_hokkaido,
            "ocw.ie.edu": self._metadata_ie,
            "ocw.jhsph.edu": self._metadata_hopkins,
        }

        host = url.split("/")[2]
        data = {}

        for provider, func in providers.iteritems():
            if host.endswith(provider):
                return func(url)

        return False

    def _metadata_avu(self, url):
        # African Virtual University
        soup = BeautifulSoup(rsess.get(url + "?show=full").text)
        table = soup.select("table.ds-includeSet-table")[0]
        data = {"providername": "African Virtual University"}

        for row in table("tr"):
            cells = row("td")
            label = cells[0].string
            value = cells[1].string

            if label == "dc.identifier.uri":
                data["identifier_uri"] = value
            elif label == "dc.type":
                data["object_type"] = value
            elif label == "dc.date.accessioned":
                data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.date.issued":
                data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
            elif label == "dc.date.available":
                data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.language.iso":
                data["language"] = value
            elif label == "dc.description.abstract":
                data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
            elif label == "dc.contributor.author":
                data["author"] = value
            elif label == "dc.title":
                data["title"] = value
            else:
                self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)

        return data

    def _metadata_capilano(self, url):
        # Capilano University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Capilano University"}

        data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
        data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()

        return data

    def _metadata_hokkaido(self, url):
        # Hokkaido University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Hokkaido University"}

        data["title"] = soup.select("#MAIN h1")[0].string.strip()
        data["description"] = soup.select("#MAIN p")[0].string.strip()

        return data

    def _metadata_ie(self, url):
        # IE University
        course_id = url.split("=")[1]
        soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
        data = {"providername": "IE University"}

        data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
        data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
        data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()

        return data

    def _metadata_hopkins(self, url):
        # Johns Hopkins Bloomberg School of Public Health
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}

        data["title"] = self.soup_to_text(soup.select("h1")[-1])
        data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
        data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))

        return data
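
Each _metadata_* method above is a per-site parser, registered by hostname in the providers dict inside _metadata_provider(). Supporting another OCW member site therefore means one new dict entry plus one new method returning the same kind of data dict. A rough sketch of the shape such a parser takes; the hostname, provider name, and selectors here are invented for illustration:

    # Hypothetical entry for the providers dict in _metadata_provider():
    #     "ocw.example.edu": self._metadata_example,
    def _metadata_example(self, url):
        # Example University (made up for illustration)
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Example University"}

        data["title"] = self.soup_to_text(soup.select("h1")[0])
        data["description"] = self.soup_to_text(soup.select("#content > p"))

        return data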
@ -0,0 +1,197 @@
import datetime, json, sys
import requests
import shared

class KhanAcademy(shared.Scraper):
    provider_id = 1

    def run(self):
        self.retrieve_dataset()
        self.process_item(self.dataset, 0)

    def retrieve_dataset(self):
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()

    def process_item(self, item, level, parent=None):
        try:
            kind = item["kind"]
        except KeyError, e:
            return

        if kind == "Topic":
            self.process_topic(item, level, parent=parent)
        elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
            self.process_object(item, level, parent=parent)
        elif kind == "Separator":
            pass # Ignore separators
        else:
            self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)

        try:
            children = item["children"]
        except KeyError, e:
            return

        for child in children:
            self.process_item(child, level + 1, item)

    def process_topic(self, item, level, parent=None):
        unique_id = item["id"]

        try:
            parent_id = parent["_cl_id"]
        except TypeError, e:
            parent_id = 0

        # Check if a title is set
        if item["title"] is not None:
            title = item["title"]
        else:
            # No title was set - log this as an error and default to 'Untitled'.
            self.env.log("No title found for item: %s" % repr(item), True)
            title = "Untitled"

        # Check if a description is set, and default to no description if not
        if item["description"] is not None:
            description = item["description"]
        else:
            description = None

        # Insert the topic
        inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)

        # Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
        item["_cl_id"] = row_id

        if inserted:
            self.env.log("Inserted %s" % title)
        else:
            self.env.log("Skipped %s" % title)

    def process_object(self, item, level, parent=None):
        unique_id = None

        # First check for the 'readable_id' property
        try:
            unique_id = item["readable_id"]
        except KeyError, e:
            pass

        # If no identifier was found, check for the 'name' property
        if unique_id is None:
            try:
                unique_id = item["name"]
            except KeyError, e:
                pass

        # If still no identifier was found, check for the 'id' property
        if unique_id is None:
            try:
                unique_id = str(item["id"])
            except KeyError, e:
                pass

        # If we *still* do not have an identifier, log the error and bail out
        if unique_id is None:
            self.env.log("No suitable identifier found for item: %s" % repr(item), True)
            return

        # Determine the object type
        if item["kind"] == "Video":
            itemtype = self.VIDEO
        elif item["kind"] == "Exercise":
            itemtype = self.EXERCISE
        elif item["kind"] == "Article":
            itemtype = self.ARTICLE
        elif item["kind"] == "Scratchpad":
            itemtype = self.SANDBOX

        source_url = None

        # Determine the source URL via the 'ka_url' property
        try:
            source_url = item["ka_url"]
        except KeyError, e:
            pass

        # If no source URL was found, try the 'url' property
        if source_url is None:
            try:
                source_url = item["url"]
            except KeyError, e:
                pass

        # If still no source URL was found...
        if source_url is None:
            if itemtype == self.ARTICLE:
                # Articles can lack a URL.
                source_url = None
            else:
                # There was no source URL, but this wasn't an article. Log the error and bail out.
                self.env.log("No source URL found for non-article object: %s" % repr(item), True)
                return

        # Determine the (external) item URL
        try:
            item_url = item["url"]
        except KeyError, e:
            # Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
            item_url = source_url

        # If the object is an article, we'll want to use the actual article content as description.
        if itemtype == self.ARTICLE:
            description = item["content"]
        else:
            # Otherwise, we'll check if there's a 'description' property. If not, leave empty.
            try:
                description = item["description"]
            except KeyError, e:
                description = None

        title = None

        # First check the 'title' property for an object title.
        try:
            title = item["title"]
        except KeyError, e:
            pass

        # As second option, check the 'display_name' property.
        if title is None:
            try:
                title = item["display_name"]
            except KeyError, e:
                # Apparently it really does not have a title. Log the error and default to 'Untitled'.
                self.env.log("No object title found for item: %s" % repr(item), True)
                title = "Untitled"

        # If a 'views' property is present, include it.
        try:
            views = item["views"]
        except KeyError, e:
            views = None

        # If a creation date is present, include it.
        try:
            date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
        except KeyError, e:
            date = None

        # Check if there is a parent ID
        try:
            parent_id = parent["_cl_id"]
        except KeyError, e:
            # No parent ID present - log this as an error and default to 0.
            self.env.log("No parent ID found for item: %s" % repr(item), True)
            parent_id = 0

        # Insert the item
        inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)

        # Store the resulting row ID in the item so that the children know the ID of their parent.
        item["_cl_id"] = row_id

        if inserted:
            self.env.log("Inserted %s" % title)
        else:
            self.env.log("Skipped %s" % title)
@ -0,0 +1,55 @@
import datetime, json, simplejson, sys, re
import requests
import shared

class UniversityOfReddit(shared.Scraper):
    provider_id = 3

    def run(self):
        data = requests.get("http://ureddit.com/api?type=catalog").json()

        for category in data["categories"]:
            self.parse_category(category['id'], category['value'])

    def parse_category(self, category_id, category_name):
        try:
            data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
        except simplejson.decoder.JSONDecodeError, e:
            return

        for _class in data["classes"]:
            if not self.topic_exists(_class['id']):
                self.parse_class(_class['id'], _class['value'], category_name)
            else:
                self.env.log("Skipped class %s" % _class['value'])

    def parse_class(self, class_id, class_name, category_name):
        try:
            data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
        except simplejson.decoder.JSONDecodeError, e:
            self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
            return

        if data["status"] == '1' or data["status"] == '3' or data["status"] == '5':
            try:
                creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
            except ValueError, e:
                creation_date = None

            class_page = data["url"]

            inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)

            if inserted:
                self.env.log("Inserted topic %s" % data["name"])
            else:
                self.env.log("Skipped topic %s" % data["name"])

            inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])

            if inserted:
                self.env.log("Inserted item %s" % data["name"])
            else:
                self.env.log("Skipped item %s" % data["name"])
        else:
            self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))
@ -0,0 +1,26 @@
import inspect, os, sys

my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
    module = __import__(modulename)

    for name in vars(module):
        data = getattr(module, name)
        globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
    fpath = os.path.join(my_path, fname)
    fbasename, fext = os.path.splitext(fname)

    if os.path.isdir(fpath):
        if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
            # This is a python directory module
            _import_module_into_scope(fname)
    elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
        # This is a python file module
        _import_module_into_scope(fbasename)

sys.path.remove(my_path)
@ -0,0 +1,17 @@
import oursql, sys

class Environment(object):
    def connect(self, host="localhost", username="root", password="", database="learn"):
        self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
        self.connected = True

    def log(self, text, is_error=False):
        if is_error == False:
            sys.stdout.write(text + "\n")
        else:
            sys.stderr.write(text + "\n")

    def Scraper(self, scraper_class):
        s = scraper_class(self.db)
        s.env = self
        return s
@ -0,0 +1,122 @@
class Scraper(object):
    UNKNOWN = 0
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9
    LECTURE = 10
    SANDBOX = 11

    provider_id = 0

    def __init__(self, database=None):
        if database is not None:
            self.db = database
            self.can_store = True
        else:
            self.can_store = False

    def run(self, *args, **kwargs):
        raise Exception("No run() method was specified for this scraper.")

    def topic_exists(self, unique_id):
        c = self.db.cursor()
        c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
        return (len(c.fetchall()) > 0)

    def item_exists(self, unique_id):
        c = self.db.cursor()
        c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
        return (len(c.fetchall()) > 0)

    def insert_topic(self, unique_id, title, override=False, **kwargs):
        defaults = {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
            "provider_name": ""
        }

        for kwarg, val in defaults.iteritems():
            try:
                if kwargs[kwarg] == None:
                    kwargs[kwarg] = defaults[kwarg]
            except KeyError, e:
                kwargs[kwarg] = defaults[kwarg]

        c = self.db.cursor()

        if override == True:
            exists = False
        else:
            c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            exists = (len(results) > 0)

        if exists == True:
            return (False, results[0][0])
        else:
            c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
                      "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
                      kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))

            return (True, c.lastrowid)

    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
        defaults = {
            "views": None,
            "has_topic": False,
            "itemtype": 0,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
            "start_date": None,
            "end_date": None,
            "provider_name": ""
        }

        for kwarg, val in defaults.iteritems():
            try:
                if kwargs[kwarg] == None:
                    kwargs[kwarg] = defaults[kwarg]
            except KeyError, e:
                kwargs[kwarg] = defaults[kwarg]

        c = self.db.cursor()

        if override == True:
            exists = False
        else:
            c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            exists = (len(results) > 0)

        if exists == True:
            return (False, results[0][0])
        else:
            c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
                      "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
                      kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))

            return (True, c.lastrowid)

    def soup_to_text(self, soup):
        strings = []

        try:
            for el in soup:
                strings += el._all_strings(True, True)
        except AttributeError, e:
            strings = soup._all_strings(True, True)

        return " ".join(strings)
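
The queries above key every topic and item on (Provider, ProviderId), which is what makes insert_topic() and insert_item() idempotent across scraper runs. A minimal sketch of a new scraper built on this base class; the provider ID, endpoint, and response shape are invented for illustration, and only run() must be overridden, since the base implementation raises:

import requests
import shared

class ExampleProvider(shared.Scraper):
    provider_id = 99  # hypothetical; 1-3 are used by the real scrapers above

    def run(self):
        # Invented endpoint returning a list of {"id": ..., "name": ..., "url": ...} dicts.
        for course in requests.get("http://provider.example.com/api/courses").json():
            inserted, row_id = self.insert_item(str(course["id"]), course["name"], course["url"], itemtype=self.COURSE)

            if inserted:
                self.env.log("Inserted item %s" % course["name"])
            else:
                self.env.log("Skipped item %s" % course["name"])

Instantiating it through shared.Environment.Scraper(), as the update script below does for OpenCourseWare, supplies both self.db and self.env.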
@ -0,0 +1,4 @@
import update_ocw

c = update_ocw.OpenCourseWareCrawler()
print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")
@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers

env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")

scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()
@ -1,131 +0,0 @@
import requests
import oursql
import datetime
import json
import lib

class KhanUniversityCrawler(object):
    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
        #self.dataset = json.loads(open("data.json", "r").read())

    def parse_dataset(self):
        self.process_item(self.dataset, 0)

    def process_item(self, item, level, parent=None):
        try:
            kind = item["kind"]
        except KeyError, e:
            return

        if kind == "Topic":
            unique_id = item["id"]

            try:
                parent_id = parent["_cl_id"]
            except TypeError, e:
                parent_id = 0

            if item["title"] is not None:
                title = item["title"]
            else:
                title = ""

            inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
            item["_cl_id"] = rowid

            if inserted:
                print "Inserted %s" % title
            else:
                print "Skipped %s" % title
        elif kind in ("Video", "Exercise", "Article"):
            try:
                unique_id = item["readable_id"]
            except KeyError, e:
                try:
                    unique_id = item["name"]
                except KeyError, e:
                    try:
                        unique_id = str(item["id"])
                    except KeyError, e:
                        print repr(item)
                        sys.stderr.write("WARNING: No suitable identifier found for item\n")
                        raise
                        return

            if item["kind"] == "Video":
                itemtype = self.db.VIDEO
            elif item["kind"] == "Exercise":
                itemtype = self.db.EXERCISE
            elif item["kind"] == "Article":
                itemtype = self.db.ARTICLE

            try:
                source_url = item["ka_url"]
            except KeyError, e:
                if itemtype == self.db.ARTICLE:
                    source_url = ""
                else:
                    return

            try:
                item_url = item["url"]
            except KeyError, e:
                try:
                    item_url = item["ka_url"]
                except KeyError, e:
                    item_url = None

            if itemtype == self.db.ARTICLE:
                description = item["content"]
            else:
                try:
                    description = item["description"]
                except KeyError, e:
                    description = None

            try:
                title = item["title"]
            except KeyError, e:
                try:
                    title = item["display_name"]
                except KeyError, e:
                    title = "Untitled"

            try:
                views = item["views"]
            except KeyError, e:
                views = None

            try:
                date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
            except KeyError, e:
                date = None

            inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
            item["_cl_id"] = rowid

            if inserted:
                print "Inserted %s" % title
            else:
                print "Skipped %s" % title
        elif kind == "Separator":
            pass # Ignore separators
        else:
            sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
            sys.stderr.write("%s\n" % (repr(item)))

        try:
            children = item["children"]
        except KeyError, e:
            pass
        else:
            for child in children:
                self.process_item(child, level + 1, item)

crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()
@ -0,0 +1,288 @@
import requests
import oursql
import datetime
import json
import lib
from bs4 import BeautifulSoup
import bs4

def combine_dict(a, b):
    c = a.copy()
    c.update(b)
    return c

rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'

class OpenCourseWareCrawler(object):
    def __init__(self):
        self.db = lib.Database("localhost", "root", password="")

    def parse_catalog(self):
        overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
        soup = BeautifulSoup(overview)

        for element in soup.find(id="pagecontent")("a"):
            self.parse_source(int(element["href"].split("/")[-1]), element.string)

    def parse_source(self, source_id, source_name):
        data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
        soup = BeautifulSoup(data)

        courses = soup.select("table#cfResultsTable tr")

        print "# " + source_name

        for course in courses[:2]:
            links = course("a")

            if len(links) > 0:
                external = links[0]
                details = links[1]

                self.parse_course(external.string, external["href"], details["href"].split("/")[-1])

    def parse_course(self, course_name, course_url, course_id):
        # First fetch metadata from ocwconsortium.org

        print course_url

        metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)

        metadata = metadata_soup.select("dl.coursepage")[0]

        if len(metadata) > 0:
            data = self.parse_dl(metadata.select("dd"), metadata.select("dt"))
        else:
            # No metadata provided by ocwconsortium.
            data = {}

        # Now fetch metadata from the particular course provider
        provider_data = self.get_provider_data(course_url)

        if provider_data != {}:
            print repr(provider_data)

    def parse_dl(self, dd, dt):
        data = {}

        for i in xrange(0, len(dd)):
            label = dd[i].string.strip().rstrip(":")
            value = dt[i].string

            if value is not None:
                value = value.strip()

            if label == "Tags":
                if value == None:
                    data["tags"] = []
                else:
                    data["tags"] = [x.strip() for x in value.split(",")]
            elif label == "Source":
                data["source"] = value
            elif label == "Language":
                data["language"] = value
            elif label == "Link":
                # We can ignore this, we already have it anyway
                pass
            elif label == "Author":
                if value == None:
                    data["author"] = None
                else:
                    data["author"] = value
            elif label == "License":
                if value == None:
                    data["license"] = None
                else:
                    data["license"] = value
            elif label == "Date Published":
                data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
            else:
                print "UNKNOWN: %s => %s" % (label, value)

        return data

    def get_provider_data(self, url):
        providers = {
            "oer.avu.org": self._data_avu,
            "ocw.capilanou.ca": self._data_capilano,
            "ocw.hokudai.ac.jp": self._data_hokkaido,
            "ocw.ie.edu": self._data_ie,
            "ocw.jhsph.edu": self._data_hopkins,
        }

        """,
        "ocw.kaplan.edu": self._data_kaplan,
        "ocw.korea.edu": self._data_korea,
        "kyotomm.jp": self._data_kyoto,
        "ocw.kyushu-u.ac.jp": self._data_kyushu,

        "open-marhi.ru": self._data_moscow,
        "yctrtrc.ncku.edu.tw": self._data_chengkung,
        "ocw.nctu.edu.tw": self._data_chiaotung,
        "opencourse.ndhu.edu.tw": self._data_donghwa,
        "ocw.njit.edu": self._data_njit,
        "graduateschool.paristech.fr": self._data_paris,
        "peoples-uni.org": self._data_oaei,
        "ocw.sbu.ac.ir": self._data_shahid,
        "studentscircle.net": self._data_studentscircle,
        "ocw.tmu.edu.tw:8080": self._data_taipei,
        "openlearn.open.ac.uk": self._data_openuni,
        "www.ocw.titech.ac.jp": self._data_tokyo,
        "feedproxy.google.com": self._data_tudelft,
        "ocw.tufts.edu": self._data_tufts,
        "ocw.unu.edu": self._data_un,
        "ocw.uc3m.es": self._data_madrid,
        "ocw.ua.es": self._data_alicante,
        "ocw.unican.es": self._data_cantabria,
        "ocw.ugr.es": self._data_granada,
        "ocw.udem.edu.mx": self._data_monterrey,
        "ocw.um.es": self._data_murcia,
        "ocw.uniovi.es": self._data_oviedo,
        "ocw.usal.es": self._data_salamanca,
        "ocwus.us.es": self._data_sevilla,
        "ocw.unizar.es": self._data_zaragoza,
        "ocw.univalle.edu.co3": self._data_colombia,
        "ocw.uned.ac.cr": self._data_distancia,
        "www.icesi.edu.co": self._data_icesi,
        "ocw.innova.uned.es": self._data_innova,
        "upv.es": self._data_valencia,
        "ocw.upm.es": self._data_upm,
        "ocw.utpl.edu.ec": self._data_utpl,
        "ocw.uab.cat": self._data_uab,
        "ocw.ub.edu": self._data_ub,
        "ocw.uib.es": self._data_uib,
        "ocw.udl.cat": self._data_udl,
        "ocw.uv.es": self._data_uv,
        "e-ujier.uji.e": self._data_uji,
        "ocw.uoc.edu": self._data_uoc,
        "ocw.utm.my": self._data_utm,
        "ocw.uci.edu": self._data_uci,
        "opencontent.uct.ac.za": self._data_uct,
        "ocw.umb.edu:8080": self._data_boston,
        "open.umich.edu": self._data_michigan,
        "ocw.nd.edu": self._data_notredame,
        "ocw.usu.ac.id": self._data_usu,
        "ocw.tsukuba.ac.jp": self._data_tsukaba"""

        host = url.split("/")[2]
        data = {}

        for provider, func in providers.iteritems():
            if host.endswith(provider):
                data = func(url)

        return data

    def _data_avu(self, url):
        # African Virtual University
        soup = BeautifulSoup(rsess.get(url + "?show=full").text)
        table = soup.select("table.ds-includeSet-table")[0]
        data = {"providername": "African Virtual University"}

        for row in table("tr"):
            cells = row("td")
            label = cells[0].string
            value = cells[1].string

            if label == "dc.identifier.uri":
                data["identifier_uri"] = value
            elif label == "dc.type":
                data["object_type"] = value
            elif label == "dc.date.accessioned":
                data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.date.issued":
                data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
            elif label == "dc.date.available":
                data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.language.iso":
                data["language"] = value
            elif label == "dc.description.abstract":
                data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
            elif label == "dc.contributor.author":
                data["author"] = value
            elif label == "dc.title":
                data["title"] = value
            else:
                print "UNKNOWN KEY: %s => %s" % (label, value)

        return data

    def _data_capilano(self, url):
        # Capilano University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Capilano University"}

        data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
        data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()

        return data

    def _data_hokkaido(self, url):
        # Hokkaido University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Hokkaido University"}

        data["title"] = soup.select("#MAIN h1")[0].string.strip()
        data["description"] = soup.select("#MAIN p")[0].string.strip()

        return data

    def _data_ie(self, url):
        # IE University
        course_id = url.split("=")[1]
        soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
        data = {"providername": "IE University"}

        data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
        data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
        data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()

        return data

    def _data_hopkins(self, url):
        # Johns Hopkins Bloomberg School of Public Health
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}

        data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
        data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
        data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()

        return data

    def parse_dataset(self):
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        if inserted:
            print "Inserted %s" % item["name"]
        else:
            print "Skipped %s" % item["name"]

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except TypeError, e:
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            print "\tInserted %s" % title
        else:
            print "\tSkipped %s" % title

#crawler = OpenCourseWareCrawler()
#crawler.parse_catalog()