Compare commits


31 Commits

Author SHA1 Message Date
Sven Slootweg d98ee113bc Rewrite generic OCW parser, BeautifulSoup fix to allow exclusion of comments for string retrieval, and fix BS4 bug 11 years ago
Sven Slootweg 98340b38a0 Rewrite University of Reddit crawler - now with less hacks! 11 years ago
Sven Slootweg 8bbffb9429 Add topic_exists and item_exists methods to Scraper class 11 years ago
Sven Slootweg 0e4df4549f No need to import oursql from within the scrapers 11 years ago
Sven Slootweg 2c3bcc5418 Rewrite Khan Academy crawler 11 years ago
Sven Slootweg d9034b6215 Consistently use row_id, and not itemid or rowid 11 years ago
Sven Slootweg 8c0033074b Support both output logging and error logging in the Environment.log() method 11 years ago
Sven Slootweg b3edd35ecf Add support for lectures and sandboxes 11 years ago
Sven Slootweg d6d8eb70b9 Fix typo - it should be Khan Academy, not Khan University. 11 years ago
Sven Slootweg fb6c43a38f Rewrite scraper to be more modular, and convert the Coursera crawler to the new model 11 years ago
Sven Slootweg c2a8a66dac Update README to fix dependencies list 11 years ago
Sven Slootweg a690cb2c8f Add rudimentary first version of the OCW scraper 11 years ago
Sven Slootweg f188d443d1 Add README 11 years ago
Sven Slootweg 43c700ac2b Add list of various OCW sources for parser development 11 years ago
Sven Slootweg 26b68952fa Add table structure updates for new version of updater 11 years ago
Sven Slootweg a4e744f892 Add list of sources for book data 11 years ago
Sven Slootweg d3bd59f813 Add modified version of BeautifulSoup4 (nth-of-type pseudoselector and full-featured direct descendant support) 11 years ago
Sven Slootweg 8e951f6b27 Add simple script for searching from a terminal 11 years ago
Sven Slootweg d387541822 Support custom provider names 11 years ago
Sven Slootweg a6e350c0d9 Add dumping script 11 years ago
Sven Slootweg 0f5cade812 Simple dumper 11 years ago
Sven Slootweg fa74d394a7 Filter _ search terms 11 years ago
Sven Slootweg a9d2576eaf Add donation link 11 years ago
Sven Slootweg f57d45fa53 Add header message 11 years ago
Sven Slootweg 1503c1f75f Add 404 page 11 years ago
Sven Slootweg bfbfd821b5 Include a small preview in the search results 11 years ago
Sven Slootweg efeef5f70e Change search term requirements 11 years ago
Sven Slootweg 3f02174ba3 Implement some very basic methods to prevent overloading 11 years ago
Sven Slootweg 1fbb21e6d8 Properly use the password when connecting the crawlers 11 years ago
Sven Slootweg dd4c62bc4e Very basic error handling 11 years ago
Sven Slootweg 6ec1a2d90b Add crawlers for coursera and ureddit, get first quick and dirty version of frontend done, and fix buigs and stuff 11 years ago

@ -0,0 +1,9 @@
# Cryto Learn
This is the source code for http://learn.cryto.net/. It consists of the following:
* The updater script, consisting of a few very rudimentary scrapers for various educational sources. Requires Python 2. Dependencies are [oursql](http://packages.python.org/oursql/), [requests](http://docs.python-requests.org/en/latest/), and BeautifulSoup 4 (a custom version is included). Located in `updater/`.
* The frontend, a fairly hacky and messy PHP-based search interface. It needs cleaning up, but that is not an immediate priority. Requires PHP 5.3+ and uses [CPHP](http://github.com/joepie91/cphp). Located in `frontend/`.
* A simple shell search script that uses the Cryto Learn API to search for the specified string and prints the results to stdout. Requires Python 2. Also very rudimentary.
Licensed under the WTFPL. It may or may not work on your system; use at your own risk, etc.
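Both the web interface and the bundled shell script talk to the frontend over a small JSON API. As a minimal sketch (the `/api/search` route and the `q` parameter are taken from the frontend code in this diff; the query value and everything else is illustrative), a search can be issued like this:

# Minimal sketch: query the Cryto Learn search API (Python 2, matching the updater).
import requests

results = requests.post("http://learn.cryto.net/api/search", {"q": "calculus"}).json()
if isinstance(results, dict) and "error" in results:
    print results["error"]
else:
    for topic in results:
        print "%s (%s)" % (topic["title"], topic["provider"])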

@ -0,0 +1,7 @@
API:
http://www.goodreads.com/api
https://developers.google.com/books/docs/getting-started#books_api_v1
Dumps:
http://openlibrary.org/data/ol_dump_latest.txt.gz
http://www.librarything.com/feeds/

@ -0,0 +1,30 @@
{
"database": {
"driver": "mysql",
"pdo": true,
"hostname": "localhost",
"username": "root",
"password": "",
"database": "learn"
},
"locale": {
"path": "locales",
"extension": "lng",
"default_locale": "english",
"default_timezone": "Europe/Amsterdam"
},
"memcache": {
"enabled": true,
"compressed": true,
"hostname": "localhost",
"port": 11211
},
"class_map": {
"item": "Item",
"topic": "Topic"
},
"components": [
"router",
"errorhandler"
]
}

@ -0,0 +1,152 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
class Item extends CPHPDatabaseRecordClass
{
public $table_name = "items";
public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";
public $prototype = array(
'string' => array(
'Title' => "Title",
'Description' => "Description",
'SourceUrl' => "SourceUrl",
'ItemUrl' => "ItemUrl"
),
'numeric' => array(
'Type' => "Type",
'Provider' => "Provider",
'Views' => "Views",
'TopicId' => "TopicId",
'ParentId' => "ParentId"
),
'boolean' => array(
'HasTopic' => "HasTopic"
),
'timestamp' => array(
'CreationDate' => "Date",
'StartDate' => "StartDate",
'EndDate' => "EndDate"
),
'topic' => array(
'Topic' => "TopicId"
),
'item' => array(
'Parent' => "ParentId"
)
);
public function __get($name)
{
switch($name)
{
case "sTypeName":
return $this->GetTypeName();
break;
case "sProviderName":
return $this->GetProviderName();
break;
default:
return parent::__get($name);
break;
}
}
public function GetTypeName()
{
switch($this->sType)
{
case 1:
return "topic";
case 2:
return "course";
case 3:
return "video";
case 4:
return "article";
case 5:
return "exercise";
case 6:
return "quiz";
case 7:
return "test";
case 8:
return "book";
case 9:
return "audiobook";
case 10:
return "lecture";
case 11:
return "sandbox";
default:
return "unknown";
}
}
public function GetProviderName()
{
switch($this->sProvider)
{
case 1:
return "Khan Academy";
case 2:
return "Coursera";
case 3:
return "University of Reddit";
default:
return "Unknown";
}
}
public function GetChildren()
{
try
{
return Item::CreateFromQuery("SELECT * FROM items WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
public function AsDataset($fetch_children = true)
{
$child_data = array();
if($fetch_children == true)
{
foreach($this->GetChildren() as $child)
{
$child_data[] = $child->AsDataset();
}
}
return array(
"title" => $this->uTitle,
"description" => $this->uDescription,
"url" => $this->uItemUrl,
"source" => $this->uSourceUrl,
"created" => $this->sCreationDate,
"start" => $this->sStartDate,
"end" => $this->sEndDate,
"type" => $this->sTypeName,
"provider" => $this->sProviderName,
"views" => $this->sViews,
"children" => $child_data
);
}
}

@ -0,0 +1,131 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
class Topic extends CPHPDatabaseRecordClass
{
public $table_name = "topics";
public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";
public $prototype = array(
'string' => array(
'Title' => "Title",
'ProviderId' => "ProviderId",
'Description' => "Description"
),
'numeric' => array(
'ParentId' => "ParentId",
'Provider' => "Provider"
),
'boolean' => array(
'NeedsEnrollment' => "NeedsEnrollment"
),
'timestamp' => array(
'CreationDate' => "Created",
'StartDate' => "StartDate",
'EndDate' => "EndDate"
),
'topic' => array(
'Parent' => "ParentId"
)
);
public function __get($name)
{
switch($name)
{
case "sProviderName":
return $this->GetProviderName();
break;
default:
return parent::__get($name);
break;
}
}
public function GetProviderName()
{
switch($this->sProvider)
{
case 1:
return "Khan Academy";
case 2:
return "Coursera";
case 3:
return "University of Reddit";
default:
return "Unknown";
}
}
public function AsDataset($fetch_children = true, $fetch_items = true)
{
$child_data = array();
if($fetch_children == true)
{
foreach($this->GetChildren() as $child)
{
$child_data[] = $child->AsDataset();
}
}
$item_data = array();
if($fetch_items == true)
{
foreach($this->GetItems() as $item)
{
$item_data[] = $item->AsDataset();
}
}
return array(
"title" => $this->uTitle,
"description" => $this->uDescription,
"created" => $this->sCreationDate,
"start" => $this->sStartDate,
"end" => $this->sEndDate,
"provider" => $this->sProviderName,
"needs_enrollment" => $this->sNeedsEnrollment,
"children" => $child_data,
"items" => $item_data
);
}
public function GetItems()
{
try
{
return Item::CreateFromQuery("SELECT * FROM items WHERE `TopicId` = :TopicId", array(':TopicId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
public function GetChildren()
{
try
{
return Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
}

@ -0,0 +1 @@
../../cphp

File diff suppressed because one or more lines are too long

@ -0,0 +1,26 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");
function __autoload($class_name)
{
global $_APP;
$class_name = str_replace("\\", "/", strtolower($class_name));
require_once("classes/{$class_name}.php");
}

@ -0,0 +1,14 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
require("rewrite.php");

@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S
event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minutes-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago

@ -0,0 +1,28 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
if($_GET['key'] !== "derp")
{
die();
}
$data = array();
foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
{
$data[] = $topic->AsDataset();
}
echo(json_encode($data));
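The dump module checks a hard-coded `key` parameter and then serialises every root topic (`ParentId = 0`) to JSON. A hedged usage sketch, with the endpoint and key value taken from the route table and the check above:

# Fetch the full topic dump exposed at /api/dump (see rewrite.php for the route).
import requests

dump = requests.get("http://learn.cryto.net/api/dump", params={"key": "derp"}).json()
print "%d root topics in dump" % len(dump)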

@ -0,0 +1,69 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
if(empty($_POST['q']))
{
die(json_encode(array(
"error" => "No search query specified."
)));
}
else
{
$query = $_POST['q'];
$terms = explode(" ", $query);
$db_query_terms = array();
$db_query_arguments = array();
$valid_term = false;
foreach($terms as $term)
{
$db_query_terms[] = "`Title` LIKE ?";
$term = str_replace("%", "\%", $term);
$term = str_replace("_", "\_", $term);
$valid_term = $valid_term || (strlen($term) >= 2);
$db_query_arguments[] = "%{$term}%";
}
if($valid_term)
{
$db_query = implode(" AND ", $db_query_terms);
// Re-index the argument array so that it starts at 1; the positional query parameters presumably expect 1-based keys.
array_unshift($db_query_arguments, '');
unset($db_query_arguments[0]);
try
{
$results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);
$return_objects = array();
foreach($results_topics as $topic)
{
$return_objects[] = $topic->AsDataset();
}
$sPageContents = json_encode($return_objects);
}
catch (NotFoundException $e)
{
$sPageContents = json_encode(array("error" => "No results found for the specified query.", "query" => $query));
}
}
else
{
die(json_encode(array(
"error" => "No valid search query specified."
)));
}
}
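The search module escapes the SQL wildcard characters `%` and `_` in every term, requires at least one term of two or more characters, and then ANDs one `LIKE` clause per term against the topic titles. The same pattern construction, restated in Python purely for illustration:

# Illustrative restatement of the LIKE-clause construction in search.php.
def build_search_query(query):
    clauses, arguments = [], []
    valid = False
    for term in query.split(" "):
        term = term.replace("%", "\\%").replace("_", "\\_")  # escape LIKE wildcards
        valid = valid or len(term) >= 2
        clauses.append("`Title` LIKE ?")
        arguments.append("%" + term + "%")
    if not valid:
        return None
    return "SELECT * FROM topics WHERE " + " AND ".join(clauses), arguments

print build_search_query("linear algebra")
# ('SELECT * FROM topics WHERE `Title` LIKE ? AND `Title` LIKE ?', ['%linear%', '%algebra%'])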

@ -0,0 +1,18 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());
$sPageType = "ui";

@ -0,0 +1,34 @@
<?php
$_APP = true;
require("includes/base.php");
$sPageContents = "";
$router = new CPHPRouter();
$router->allow_slash = true;
$router->ignore_query = true;
$router->routes = array(
0 => array(
"^/$" => "modules/ui/index.php",
"^/api/search$" => "modules/api/search.php",
"^/api/dump$" => "modules/api/dump.php"
)
);
try
{
$router->RouteRequest();
}
catch (RouterException $e)
{
http_status_code(404);
$sPageContents = "404 not found";
}
echo($sPageContents);
/*
* */
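CPHPRouter matches the request path against the regular expressions in the route table (the outer `0` key is a priority level) and runs the matching module, falling back to the 404 branch when nothing matches. A rough Python equivalent of that dispatch, only to illustrate what the table means; CPHPRouter's actual implementation is not part of this diff:

# Illustrative dispatch over the route table in rewrite.php.
import re

ROUTES = [
    (r"^/$", "modules/ui/index.php"),
    (r"^/api/search$", "modules/api/search.php"),
    (r"^/api/dump$", "modules/api/dump.php"),
]

def resolve(path):
    for pattern, module in ROUTES:
        if re.search(pattern, path):
            return module
    return None  # corresponds to the 404 branch above

print resolve("/api/dump")  # modules/api/dump.php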

Binary file not shown.


@ -6,6 +6,11 @@ body
font-family: sans-serif;
}
#templates
{
display: none;
}
.header
{
background-color: #C9F9DF;
@ -19,6 +24,12 @@ body
font-weight: normal;
}
.header h2
{
margin: 2px;
font-size: 17px;
}
.search-large
{
color: #006824;
@ -55,3 +66,95 @@ body
font-size: 26px;
width: 180px;
}
.spinner
{
margin-left: 14px;
}
.topic, .item
{
padding: 9px 12px;
margin: 5px 20px;
background-color: #79E1A8;
font-size: 20px;
width: 960px;
}
.topic
{
margin-top: 19px;
cursor: pointer;
}
.item
{
margin-left: 34px;
width: 926px;
font-size: 18px;
background-color: #97F3C1;
display: none;
}
.type
{
font-size: 18px;
color: gray;
}
.type:after
{
content: ":";
}
a.title
{
color: #041F9F;
}
.toggler
{
display: block;
float: left;
width: 16px;
height: 16px;
margin-top: 2px;
margin-right: 8px;
font-size: 13px;
text-align: center;
font-weight: bold;
border: 1px solid black;
background-color: #D2ECCF;
}
.providername
{
font-size: 18px;
color: gray;
}
.providername:before
{
content: "(";
}
.providername:after
{
content: ")";
}
.error
{
margin: 8px 16px;
font-size: 19px;
}
.description
{
margin-top: 4px;
font-size: 13px;
max-height: 15px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}

@ -0,0 +1,160 @@
<!doctype html>
<html>
<head>
<title>learn.cryto.net</title>
<link rel="stylesheet" href="style.css">
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
<script>
var search_timeout = null;
$(function(){
/*$("input").val("data");
runSearch();*/
$("input").keyup(function(){
if(search_timeout !== null)
{
clearTimeout(search_timeout);
}
search_timeout = setTimeout(runSearch, 800)
});
});
function runSearch()
{
$(".search-large").removeClass("search-large").addClass("search-top");
$(".spinner").show();
var query = $("input#query").val();
if(query.length >= 3)
{
$.post("/api/search", {q: query}, function(response){
$(".spinner").hide();
$(".results").html("");
if(typeof response.error == "undefined")
{
for(i in response)
{
if(response[i].items.length > 0)
{
var result_wrapper = instantiateTemplate("result_wrapper");
var result_block = instantiateTemplate("result_topic");
result_block.children(".title").html(response[i].title);
result_block.children(".description").html(response[i].description);
result_block.children(".providername").html(response[i].provider);
result_block.appendTo(result_wrapper);
for(x in response[i].items)
{
item = response[i].items[x];
var item_block = instantiateTemplate("result_item");
item_block.children(".title").html(item.title);
item_block.children(".title").attr("href", item.url);
item_block.children(".type").html(item.type);
item_block.insertAfter(result_block);
}
result_wrapper.appendTo(".results");
}
}
}
else
{
$(".results").html("<div class='error'>No results.</div>");
}
setHandlers();
}, "json");
}
else
{
$(".spinner").hide();
$(".results").html("<div class='error'>Enter at least 3 characters.</div>");
}
}
function setHandlers()
{
$(".toggler, .topic").each(
function(){
$(this).click(function(event){
toggleItems(this, event);
});
}
);
}
function instantiateTemplate(template_name)
{
var instance = $("#template_" + template_name).clone();
instance.removeAttr("id");
return instance;
}
function toggleItems(ctx, event)
{
var parent = $(ctx).parentsUntil(".wrapper");
if(parent.length == 0)
{
var wrapper = $(ctx).parent();
}
else
{
var wrapper = parent.parent();
}
var toggler = wrapper.find(".toggler");
if(typeof toggler.data("toggled") == "undefined" || toggler.data("toggled") == false)
{
toggler.data("toggled", true);
toggler.html("-");
wrapper.find(".item").show();
}
else
{
toggler.data("toggled", false);
toggler.html("+");
wrapper.find(".item").hide();
}
event.stopPropagation();
}
</script>
</head>
<body>
<div class="header">
<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
<h2>Currently searching Coursera, Khan Academy, University of Reddit. Comments? <a href="mailto:learn@cryto.net">learn@cryto.net</a> or
<a href="irc://irc.cryto.net/crytocc">irc.cryto.net #crytocc</a></h2>
<h2>Like the service and wish to donate? <a href="http://cryto.net/~joepie91/donate.html">You can do that here :)</a></h2>
</div>
<div class="main">
<div class="search-large">
I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
</div>
<div class="results">
</div>
</div>
<div id="templates">
<div id="template_result_wrapper" class="wrapper"></div>
<div id="template_result_topic" class="topic">
<span class="toggler">+</span>
<strong>Topic: </strong>
<span class="title"></span>
<span class="providername"></span>
<div class="description"></div>
</div>
<div id="template_result_item" class="item">
<span class="type"></span>
<a href="#" class="title"></a>
</div>
</div>
</body>
</html>
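The `runSearch()` handler above expects the `/api/search` response to be a list of topic datasets as produced by `Topic::AsDataset()`, each with nested item datasets from `Item::AsDataset()`, and only renders topics that have at least one item. For reference, one response element looks roughly like this; the field names come from the two classes earlier in this diff, while the values are made up:

# Example shape of a single /api/search result element (illustrative values).
example_topic = {
    "title": "Example course",
    "description": "Example description",
    "created": None,          # Topic CreationDate
    "start": None,
    "end": None,
    "provider": "Coursera",
    "needs_enrollment": False,
    "children": [],           # nested topic datasets
    "items": [{
        "title": "Lecture 1",
        "description": "Example description",
        "url": "http://example.com/lecture-1",
        "source": "http://example.com/",
        "created": None,
        "start": None,
        "end": None,
        "type": "video",      # see Item::GetTypeName()
        "provider": "Coursera",
        "views": 0,
        "children": [],
    }],
}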

@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co3": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.e": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba

@ -0,0 +1,116 @@
# AGH University of Science and Technology
http://open.agh.edu.pl/course/view.php?id=97
# Fundação Getulio Vargas - FGV Online
http://www5.fgv.br/fgvonline/CursosGratuitosFormulario.aspx?id_curso=OCWAJUEAD_00_01/2011_1
# Gunadarma University
http://ocw.gunadarma.ac.id/course/about
# Johns Hopkins Bloomberg School of Public Health
http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss
# Kaplan University Online & Campus Learning
http://ocw.kaplan.edu/arts-and-sciences/academic-strategies
# Korea University
http://ocw.korea.edu/ocw/college-of-science/general-physics-i
# Kyoto Seika University
http://www.kyotomm.jp/event/exh/kyotomagic2012.php
# Kyushu University
http://ocw.kyushu-u.ac.jp/90901/0007/index.html
# Massachusetts Institute of Technology
http://ocw.mit.edu/courses/civil-and-environmental-engineering/1-00-introduction-to-computers-and-engineering-problem-solving-fall-2005
# MOSCOW ARCHITECTURAL INSTITUTE
http://www.open-marhi.ru/courses/detail/index.php?ID=6631
# National Cheng Kung University
http://yctrtrc.ncku.edu.tw/site2/newocwcourse/OCW_MAIN.php?cid=141
# National Chiao Tung University
http://ocw.nctu.edu.tw/riki_detail.php?pgid=335
# National Dong Hwa University
http://opencourse.ndhu.edu.tw/moodle/mod/forum/discuss.php?d=3
# New Jersey Institute of Technology
http://ocw.njit.edu/ocw/som/acct/acct-615-anandarajan/index.php
# Paris Tech
http://graduateschool.paristech.fr/cours.php?id=309132
# People's Open Access Education Initiative
http://www.peoples-uni.org/node/236
# Shahid Beheshti University
http://ocw.sbu.ac.ir/Default.aspx?tabid=5352&language=fa-IR
# Students Circle Network
http://studentscircle.net/live/2011/07/a-guide-before-learning-a-new-javascript-framework/
# Taipei Medical University
http://ocw.tmu.edu.tw:8080/eduCommons/general-education/53f28a1882076b7753f24eba72698a556790-shih-chi-analysis-on-historical-figures
# The Open University
http://openlearn.open.ac.uk/course/view.php?name=DD208_3
# The Open University of Israel
http://peer-news.blogspot.com/2011/12/2-10934.html
# Tokyo Institute of Technology
http://www.ocw.titech.ac.jp/index.php?module=General&Nendo=2012&action=T0300&GakubuCD=223&GakkaCD=224710&KougiCD=70030&Gakki=1&lang=EN
# TU Delft
http://feedproxy.google.com/~r/tudelft/OCW/~3/0sA6qPQKcOg/bachelor-civiele-techniek
# Tufts University
http://ocw.tufts.edu/Course/39
# UNISUL - Universidade do Sul de Santa Catarina
http://labspace.open.ac.uk
# United Nations University
http://ocw.unu.edu/international-institute-for-software-technology/building-a-community-of-practice-for-electronic-governance
# Universidad Carlos III de Madrid
http://ocw.uc3m.es/ingenieria-electrica/accionamientos-electricos
# Universidad de Alicante
http://ocw.ua.es/Ciencias_Sociales_y_Juridicas/actividades-deportivas-medio-ambiente
# Universidad de Cantabria
http://ocw.unican.es/ciencias-de-la-salud/actuacion-en-situaciones-especiales
# Universidad de Granada
http://ocw.ugr.es/course/view.php?id=23&topic=1
# Universidad de Monterrey
http://ocw.udem.edu.mx/cursos-de-profesional/administracion-de-tecnologias-de-informacion
# Universidad de Murcia
http://ocw.um.es/cc.-sociales/actividad-fisica-en-el-envejecimiento
# Universidad de Oviedo
http://ocw.uniovi.es/course/view.php?id=28&ocw=1
# Universidad de Salamanca
http://ocw.usal.es/ciencias-sociales-1/curso-cero-matematicas-para-ciencias-sociales-nivelacion-de-conocimientos
# Universidad de Sevilla
http://ocwus.us.es/matematica-aplicada/pp-3
# Universidad de Zaragoza
http://ocw.unizar.es/ocw/ciencias-de-la-salud-1/actividades-fisicas-y-deportivas-aereas
# Universidad del Valle - Colombia
http://ocw.univalle.edu.co/ocw/ingenieria-electronica-telecomunicaciones-y-afines/arquitectura-de-procesos-industriales
# Universidad Estatal a Distancia
http://ocw.uned.ac.cr/eduCommons/ciencias-de-la-administracion/compras-y-almacenamiento
# Universidad Icesi
http://www.icesi.edu.co/ocw/tic/administracion_plataformas_y_seguridad
# Universidad Nacional de Educacion a Distancia
http://ocw.innova.uned.es/ocwuniversia/psicologia/analisis-de-datos-en-Psico-I
# Universidad Politécnica de Valencia
http://www.upv.es/ocwasi/2010/6842
# Universidad Politécnica de Madrid
http://ocw.upm.es/ingenieria-cartografica-geodesica-y-fotogrametria/3d-scanning-and-modeling
# UNIVERSIDAD TECNICA PARTICULAR DE LOJA
http://ocw.utpl.edu.ec/economia
# Universitat Autònoma de Barcelona
http://ocw.uab.cat/enginyeries/apunts-de-calcul-matricial-i-resolucio-de-sistemes
# Universitat de Barcelona
http://ocw.ub.edu/admistracio-i-direccio-dempreses
# Universitat de les Illes Balears
http://ocw.uib.es/ocw/infermeria/atencion-de-enfermeria-frente-situaciones-de
# Universitat de Lleida
http://ocw.udl.cat/arts-i-humanitats
# Universitat de València
http://ocw.uv.es/ciencias-sociales-y-juridicas/2-2
# Universitat Jaume I
http://e-ujier.uji.es/pls/www/!gri_www.euji22101?p_id=15&p_tipo=A&p_curso=IG23&p_idioma=CA
# Universitat Oberta de Catalunya
http://ocw.uoc.edu/informatica-tecnologia-i-multimedia/administracio-avancada-del-sistema-operatiu-gnu-linux
# Universiti Teknologi Malaysia
http://ocw.utm.my/course/view.php?id=90
# University of California, Irvine
http://ocw.uci.edu/courses/course.aspx?id=113
# University of Cape Town
http://opencontent.uct.ac.za/Centre-for-Higher-Education-Development/Centre-for-Open-Learning/A-developmental-state-The-challenge-ahead
# University of Massachusetts Boston
http://ocw.umb.edu:8080/eduCommons/about
# University of Michigan
http://open.umich.edu/education/med/oernetwork/med/em/aetc-redirect/2009
# University of Notre Dame
http://ocw.nd.edu/history/african-american-history-ii
# University of Sumatera Utara
http://ocw.usu.ac.id/course/detail/teknik-sipil-s1/4110000007-struktur-bangunan-sipil-i.html
# University of Tsukuba
http://ocw.tsukuba.ac.jp/6570740672698cea79d15b6678147a7679d130fb65705b665c02653b/66f87c4d7d394ecb
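Every entry in this source list should have a matching handler in the hostname table earlier in this diff, so a quick coverage check is possible by extracting the hostname of each non-comment line. A small sketch, assuming the list is stored as a plain text file (the filename and the `handlers_by_hostname` name are hypothetical):

# Cross-check the OCW source list against the hostname-to-handler mapping.
import urlparse

def hostnames_from_source_list(path):
    hostnames = set()
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            hostnames.add(urlparse.urlparse(line).netloc)
    return hostnames

# hostnames_from_source_list("ocw_sources.txt") - set(handlers_by_hostname)
# lists sources that do not have a metadata handler yet.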

@ -0,0 +1,22 @@
#!/usr/bin/env python
# Simple terminal search client for the Cryto Learn API (Python 2).
import requests, sys, re

query = sys.argv[1]
results = requests.post("http://learn.cryto.net/api/search", {"q": query}).json()

# The API returns a JSON object with an "error" key when nothing matched.
if isinstance(results, dict) and "error" in results:
    sys.exit(results["error"])

for result in results:
    name = result["title"].rstrip()
    description = result["description"].strip().replace("\n", " ")
    if len(description) > 200:
        # Truncate long descriptions at a word boundary within the first 300 characters.
        match = re.match(r"^(.{0,300})\W", description)
        if match is not None:
            description = match.group(1) + "..."
    print "## %s\n%s" % (name, description)
    for item in result["items"]:
        name = item["title"].ljust(70)
        print "\t[%s] %s\t%s" % (item["type"], name, item["url"])
    print ""

@ -0,0 +1,2 @@
ALTER TABLE `items` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
ALTER TABLE `topics` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;

@ -0,0 +1,361 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, previous_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
previous_element = previous_element or self.previous_element
o.setup(parent, previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].prefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
class FeatureNotFound(ValueError):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
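This is a vendored, locally modified copy of Beautiful Soup 4.1.3. According to the commit messages above, the modifications add an `nth-of-type` pseudo-selector and full direct-descendant support to `select()`, plus a way to exclude comments when retrieving strings. A hedged example of what the selector changes enable (stock 4.1.3 would not accept this selector):

# Relies on the modified BeautifulSoup bundled here (see the commit messages);
# stock bs4 4.1.3 does not support :nth-of-type in select().
from bs4 import BeautifulSoup

soup = BeautifulSoup("<ul><li>a</li><li>b</li><li>c</li></ul>")
print soup.select("ul > li:nth-of-type(2)")[0].text  # b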

@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
# tree. On the second call the attribute value
# here is already a list. If this happens,
# leave the value alone rather than trying to
# split it again.
values = value
attrs[cdata_list_attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass

@ -0,0 +1,221 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# XXX This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
else:
self.soup.object_was_parsed(node.element, parent=self.element)
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(
TextNode(child, self.soup))
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError

@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True
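A minimal sketch of selecting this builder explicitly; 'html.parser' is the HTMLPARSER feature string registered above, and the builder needs nothing beyond the standard library:
from bs4 import BeautifulSoup
soup = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
print(soup.b.string)  # -> u'world'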

@ -0,0 +1,196 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
if parser is None:
# Use the default parser.
parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn them into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
return None
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
@property
def default_parser(self):
return etree.HTMLParser
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
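A short sketch contrasting the two builders defined above (assuming lxml is installed): 'lxml' selects LXMLTreeBuilder for HTML, while 'xml' selects LXMLTreeBuilderForXML, which keeps case and namespaces and emits an XML declaration:
from bs4 import BeautifulSoup
html_soup = BeautifulSoup("<p>foo", "lxml")
xml_soup = BeautifulSoup("<root><item/></root>", "xml")
print(html_soup.p)        # -> <p>foo</p>
print(xml_soup.encode())  # -> <?xml version="1.0" encoding="utf-8"?>\n<root><item/></root>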

@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import logging
# Import a library to autodetect character encodings.
chardet_type = None
try:
# First try the fast C implementation.
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
return cchardet.detect(s)['encoding']
except ImportError:
try:
# Fall back to the pure Python implementation
# Debian package: python-chardet
# PyPI package: chardet
import chardet
def chardet_dammit(s):
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
# No chardet available.
def chardet_dammit(s):
return None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
def quoted_attribute_value(self, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
new_markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, is_html)
self.markup = new_markup
u = None
if new_markup != markup:
# _detectEncoding modified the markup, then converted it to
# Unicode and then to UTF-8. So convert it from UTF-8.
u = self._convert_from("utf8")
self.original_encoding = sniffed_encoding
if not u:
for proposed_encoding in (
override_encodings + [document_encoding, sniffed_encoding]):
if proposed_encoding is not None:
u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and not isinstance(self.markup, unicode):
u = self._convert_from(chardet_dammit(self.markup))
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convert_from(proposed_encoding)
if u:
break
# As an absolute last resort, try the encodings again with
# character replacement.
if not u:
for proposed_encoding in (
override_encodings + [
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
if proposed_encoding != "ascii":
u = self._convert_from(proposed_encoding, "replace")
if u is not None:
logging.warning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER.")
self.contains_replacement_characters = True
break
# We could at this point force it to ASCII, but that would
# destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
(xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
except:
xml_encoding_match = None
xml_encoding_match = xml_encoding_re.match(xml_data)
if not xml_encoding_match and is_html:
xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
b'\x9f': ('Yuml', ''),}
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
b'\xa9' : '',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
b'\xb4' : ("'", 'acute'),
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
0x91 : b'\xe2\x80\x98', # ‘
0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
if embedded_encoding.replace('_', '-').lower() not in (
'windows-1252', 'windows_1252'):
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
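A hedged illustration of this module's two main entry points, following the patterns in the Beautiful Soup documentation (the detected encoding can vary depending on whether cchardet or chardet is installed):
from bs4 import UnicodeDammit
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
dammit.original_encoding   # -> 'latin-1'
dammit.unicode_markup      # -> u'Sacr\xe9 bleu!'
# detwingle() repairs Windows-1252 bytes embedded in an otherwise UTF-8 document.
utf8_part = u"\N{SNOWMAN}".encode("utf8")
cp1252_part = u"\N{LEFT DOUBLE QUOTATION MARK}Hi\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
fixed = UnicodeDammit.detwingle(utf8_part + cp1252_part)
fixed.decode("utf8")       # -> snowman followed by curly-quoted "Hi"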

File diff suppressed because it is too large

@ -0,0 +1,554 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator
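A hedged sketch of how these helpers are combined in practice: a concrete test module subclasses SoupTest plus one of the smoke-test mixins and points default_builder at the builder under test, exactly as the test modules later in this diff do:
from bs4.builder import HTMLParserTreeBuilder
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
class ExampleBuilderTest(SoupTest, HTMLTreeBuilderSmokeTest):
    # Every smoke test in the mixin runs against whatever builder
    # this property returns.
    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()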

@ -0,0 +1 @@
"The beautifulsoup tests."

@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
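The lookup semantics exercised above can be summarised with a small sketch against the module-level registry (which classes come back depends on whether lxml and html5lib are installed, so the comments are hedged):
from bs4.builder import builder_registry
builder_registry.lookup('html.parser')   # HTMLParserTreeBuilder, registered unconditionally
builder_registry.lookup('fast', 'html')  # LXMLTreeBuilder if lxml is available, otherwise None
builder_registry.lookup()                # the most recently registered builder overall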

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

@ -0,0 +1,72 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder()
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEquals("<p>foo</p>", soup.p.encode())

@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass

@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=False) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
def test_real_xhtml_document(self):
"""lxml strips the XML definition from an XHTML doc, which is fine."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b''),
markup.replace(b'\n', b'').replace(
b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

@ -0,0 +1,378 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import logging
import unittest
import sys
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
@skipIf(
not LXML_PRESENT,
"lxml not present, not testing BeautifulStoneSoup.")
def test_beautifulstonesoup(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<markup>")
self.assertTrue(isinstance(soup, BeautifulSoup))
self.assertTrue("BeautifulStoneSoup class is deprecated")
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii")
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_sniffed_xml_encoding(self):
# A document written in UTF-16LE will be converted by a different
# code path that sniffs the byte order markers.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through UnicodeDammit.detwingle, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamedspacedAttribute(SoupTest):
def test_name_may_be_none(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@ -10,9 +10,20 @@ class Database(object):
TEST = 7
BOOK = 8
AUDIOBOOK = 9
LECTURE = 10
def __init__(self, host, user, password=None, database="learn"):
self.database = oursql.connect(host=host, user=user, db=database)
self.database = oursql.connect(host=host, user=user, passwd=password, db=database)
def topic_exists(self, provider, unique_id):
c = self.database.cursor()
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
return (len(c.fetchall()) > 0)
def item_exists(self, provider, unique_id):
c = self.database.cursor()
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
return (len(c.fetchall()) > 0)
def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
defaults = {
@ -21,7 +32,8 @@ class Database(object):
"start_date": None,
"end_date": None,
"parent_id": 0,
"description": ""
"description": "",
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
@ -43,9 +55,9 @@ class Database(object):
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date']))
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
return (True, c.lastrowid)
@ -56,7 +68,10 @@ class Database(object):
"topic_id": 0,
"parent_id": 0,
"description": "",
"date": None
"date": None,
"start_date": None,
"end_date": None,
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
@ -78,8 +93,8 @@ class Database(object):
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"]))
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid)
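A quick usage sketch for the updated Database class (editorial illustration, not part of the diff): the constructor now forwards the password to oursql, and the new topic_exists/item_exists helpers let a crawler skip known records before inserting. The provider id 2 matches the Coursera crawler; the unique id and provider name below are placeholders.

import lib

db = lib.Database("localhost", "root", password="")  # password is now passed through to oursql
if not db.topic_exists(2, "example-provider-id"):  # provider 2 = Coursera; the id is a placeholder
    inserted, row_id = db.insert_topic(2, "example-provider-id", "Example course",
                                       provider_name="Example University")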

@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
module = __import__(modulename)
for name in vars(module):
data = getattr(module, name)
globals()[name] = data
sys.path.insert(0, my_path)
for fname in os.listdir(my_path):
fpath = os.path.join(my_path, fname)
fbasename, fext = os.path.splitext(fname)
if os.path.isdir(fpath):
if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
# This is a python directory module
_import_module_into_scope(fname)
elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
# This is a python file module
_import_module_into_scope(fbasename)
sys.path.remove(my_path)
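For clarity (editorial note): this __init__.py imports every sibling module or package and copies its top-level names into the package namespace, which is what lets the run script further down refer to scraper classes as attributes of the package. A minimal sketch, assuming this is the scrapers package and that the Coursera class is defined in one of its modules:

import scrapers

scraper_class = scrapers.Coursera  # hoisted into the package namespace by the loader above
print scraper_class.provider_id  # -> 2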

@ -0,0 +1,50 @@
import datetime, json, sys
import requests
import shared
class Coursera(shared.Scraper):
provider_id = 2
def run(self):
self.retrieve_dataset()
self.parse_dataset()
def retrieve_dataset(self):
self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
def parse_dataset(self):
for item in self.dataset:
self.process_item(item)
def process_item(self, item):
inserted, row_id = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
if inserted:
self.env.log("Inserted topic %s" % item["name"])
else:
self.env.log("Skipped topic %s" % item["name"])
for course in item["courses"]:
self.process_course(course, row_id)
def process_course(self, course, topicid):
try:
start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
except TypeError, e:
start_date = None
title = self.generate_title(course['name'], start_date)
inserted, row_id = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
if inserted:
self.env.log("Inserted item %s" % title)
else:
self.env.log("Skipped item %s" % title)
def generate_title(self, name, date):
if date is None:
return "%s (date undetermined)" % name
else:
return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))

@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared
from bs4 import BeautifulSoup
import bs4
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWare(shared.Scraper):
def run(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
#if "Hopkins" not in element.string:
# continue
self.process_source(int(element["href"].split("/")[-1]), element.string)
def process_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
for course in courses[:3]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
def parse_course(self, course_name, course_url, course_id, source_name):
self.env.log("Parsing %s" % course_url)
# First fetch metadata from ocwconsortium.org
ocw_data = self._metadata_ocw(course_id)
ocw_data["providername"] = source_name
ocw_data["url"] = course_url
# Now fetch metadata from the particular course provider
provider_data = self._metadata_provider(course_url)
if provider_data != False:
data = ocw_data.copy()
data.update(provider_data)
# TODO: insert data
self.env.log(repr(data))
def _metadata_ocw(self, course_id):
soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
return data
def _parse_ocw_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["providername"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
self.env.log("UNKNOWN: %s => %s" % (label, value), True)
return data
def _metadata_provider(self, url):
providers = {
"oer.avu.org": self._metadata_avu,
"ocw.capilanou.ca": self._metadata_capilano,
"ocw.hokudai.ac.jp": self._metadata_hokkaido,
"ocw.ie.edu": self._metadata_ie,
"ocw.jhsph.edu": self._metadata_hopkins,
}
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
return func(url)
return False
def _metadata_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
return data
def _metadata_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _metadata_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _metadata_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _metadata_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = self.soup_to_text(soup.select("h1")[-1])
data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
return data
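Supporting an additional OCW source would presumably amount to adding a host entry to the providers mapping in _metadata_provider plus a matching handler. A hypothetical sketch (the host, selectors and provider name are invented):

def _metadata_example(self, url):
    # Hypothetical handler for an imaginary ocw.example.edu source.
    soup = BeautifulSoup(rsess.get(url).text)
    data = {"providername": "Example University"}
    data["title"] = self.soup_to_text(soup.select("h1")[0])
    data["description"] = self.soup_to_text(soup.select("#content > p"))
    return data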

@ -0,0 +1,197 @@
import datetime, json, sys
import requests
import shared
class KhanAcademy(shared.Scraper):
provider_id = 1
def run(self):
self.retrieve_dataset()
self.process_item(self.dataset, 0)
def retrieve_dataset(self):
self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
def process_item(self, item, level, parent=None):
try:
kind = item["kind"]
except KeyError, e:
return
if kind == "Topic":
self.process_topic(item, level, parent=parent)
elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
self.process_object(item, level, parent=parent)
elif kind == "Separator":
pass # Ignore separators
else:
self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)
try:
children = item["children"]
except KeyError, e:
return
for child in children:
self.process_item(child, level + 1, item)
def process_topic(self, item, level, parent=None):
unique_id = item["id"]
try:
parent_id = parent["_cl_id"]
except TypeError, e:
parent_id = 0
# Check if a title is set
if item["title"] is not None:
title = item["title"]
else:
# No title was set - log this as an error and default to 'Untitled'.
self.env.log("No title found for item: %s" % repr(item), True)
title = "Untitled"
# Check if a description is set, and default to no description if not
if item["description"] is not None:
description = item["description"]
else:
description = None
# Insert the topic
inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False, parent_id=parent_id)
# Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
item["_cl_id"] = row_id
if inserted:
self.env.log("Inserted %s" % title)
else:
self.env.log("Skipped %s" % title)
def process_object(self, item, level, parent=None):
unique_id = None
# First check for the 'readable_id' property
try:
unique_id = item["readable_id"]
except KeyError, e:
pass
# If no identifier was found, check for the 'name' property
if unique_id is None:
try:
unique_id = item["name"]
except KeyError, e:
pass
# If still no identifier was found, check for the 'id' property
if unique_id is None:
try:
unique_id = str(item["id"])
except KeyError, e:
pass
# If we *still* do not have an identifier, log the error and bail out
if unique_id is None:
self.env.log("No suitable identifier found for item: %s" % repr(item), True)
return
# Determine the object type
if item["kind"] == "Video":
itemtype = self.VIDEO
elif item["kind"] == "Exercise":
itemtype = self.EXERCISE
elif item["kind"] == "Article":
itemtype = self.ARTICLE
elif item["kind"] == "Scratchpad":
itemtype = self.SANDBOX
source_url = None
# Determine the source URL via the 'ka_url' property
try:
source_url = item["ka_url"]
except KeyError, e:
pass
# If no source URL was found, try the 'url' property
if source_url is None:
try:
source_url = item["url"]
except KeyError, e:
pass
# If still no source URL was found...
if source_url is None:
if itemtype == self.ARTICLE:
# Articles can lack a URL.
source_url = None
else:
# There was no source URL, but this wasn't an article. Log the error and bail out.
self.env.log("No source URL found for non-article object: %s" % repr(item), True)
return
# Determine the (external) item URL
try:
item_url = item["url"]
except KeyError, e:
# Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
item_url = source_url
# If the object is an article, we'll want to use the actual article content as description.
if itemtype == self.ARTICLE:
description = item["content"]
else:
# Otherwise, we'll check if there's a 'description' property. If not, leave empty.
try:
description = item["description"]
except KeyError, e:
description = None
title = None
# First check the 'title' property for an object title.
try:
title = item["title"]
except KeyError, e:
pass
# As second option, check the 'display_name' property.
if title is None:
try:
title = item["display_name"]
except KeyError, e:
# Apparently it really does not have a title. Log the error and default to 'Untitled'.
self.env.log("No object title found for item: %s" % repr(item), True)
title = "Untitled"
# If a 'views' property is present, include it.
try:
views = item["views"]
except KeyError, e:
views = None
# If a creation date is present, include it.
try:
date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
except KeyError, e:
date = None
# Check if there is a parent ID
try:
parent_id = parent["_cl_id"]
except KeyError, e:
# No parent ID present - log this as an error and default to 0.
self.env.log("No parent ID found for item: %s" % repr(item), True)
parent_id = 0
# Insert the item
inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)
# Store the resulting row ID in the item so that the children know the ID of their parent.
item["_cl_id"] = row_id
if inserted:
self.env.log("Inserted %s" % title)
else:
self.env.log("Skipped %s" % title)

@ -0,0 +1,55 @@
import datetime, json, simplejson, sys, re
import requests
import shared
class UniversityOfReddit(shared.Scraper):
provider_id = 3
def run(self):
data = requests.get("http://ureddit.com/api?type=catalog").json()
for category in data["categories"]:
self.parse_category(category['id'], category['value'])
def parse_category(self, category_id, category_name):
try:
data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
except simplejson.decoder.JSONDecodeError, e:
return
for _class in data["classes"]:
if not self.topic_exists(_class['id']):
self.parse_class(_class['id'], _class['value'], category_name)
else:
self.env.log("Skipped class %s" % _class['value'])
def parse_class(self, class_id, class_name, category_name):
try:
data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
except simplejson.decoder.JSONDecodeError, e:
self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
return
if data["status"] == '1' or data["status"] == '3' or data["status"] == '5':
try:
creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
except ValueError, e:
creation_date = None
class_page = data["url"]
inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)
if inserted:
self.env.log("Inserted topic %s" % data["name"])
else:
self.env.log("Skipped topic %s" % data["name"])
inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])
if inserted:
self.env.log("Inserted item %s" % data["name"])
else:
self.env.log("Skipped item %s" % data["name"])
else:
self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))

@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
module = __import__(modulename)
for name in vars(module):
data = getattr(module, name)
globals()[name] = data
sys.path.insert(0, my_path)
for fname in os.listdir(my_path):
fpath = os.path.join(my_path, fname)
fbasename, fext = os.path.splitext(fname)
if os.path.isdir(fpath):
if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
# This is a python directory module
_import_module_into_scope(fname)
elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
# This is a python file module
_import_module_into_scope(fbasename)
sys.path.remove(my_path)

@ -0,0 +1,17 @@
import oursql, sys
class Environment(object):
def connect(self, host="localhost", username="root", password="", database="learn"):
self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
self.connected = True
def log(self, text, is_error=False):
if is_error == False:
sys.stdout.write(text + "\n")
else:
sys.stderr.write(text + "\n")
def Scraper(self, scraper_class):
s = scraper_class(self.db)
s.env = self
return s

@ -0,0 +1,122 @@
class Scraper(object):
UNKNOWN = 0
TOPIC = 1
COURSE = 2
VIDEO = 3
ARTICLE = 4
EXERCISE = 5
QUIZ = 6
TEST = 7
BOOK = 8
AUDIOBOOK = 9
LECTURE = 10
SANDBOX = 11
provider_id = 0
def __init__(self, database=None):
if database is not None:
self.db = database
self.can_store = True
else:
self.can_store = False
def run(self, *args, **kwargs):
raise Exception("No run() method was specified for this scraper.")
def topic_exists(self, unique_id):
c = self.db.cursor()
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
return (len(c.fetchall()) > 0)
def item_exists(self, unique_id):
c = self.db.cursor()
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
return (len(c.fetchall()) > 0)
def insert_topic(self, unique_id, title, override=False, **kwargs):
defaults = {
"needs_enrollment": False,
"creation_date": None,
"start_date": None,
"end_date": None,
"parent_id": 0,
"description": "",
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
try:
if kwargs[kwarg] == None:
kwargs[kwarg] = defaults[kwarg]
except KeyError, e:
kwargs[kwarg] = defaults[kwarg]
c = self.db.cursor()
if override == True:
exists = False
else:
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
results = c.fetchall()
exists = (len(results) > 0)
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
return (True, c.lastrowid)
def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
defaults = {
"views": None,
"has_topic": False,
"itemtype": 0,
"source_url": item_url,
"topic_id": 0,
"parent_id": 0,
"description": "",
"date": None,
"start_date": None,
"end_date": None,
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
try:
if kwargs[kwarg] == None:
kwargs[kwarg] = defaults[kwarg]
except KeyError, e:
kwargs[kwarg] = defaults[kwarg]
c = self.db.cursor()
if override == True:
exists = False
else:
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
results = c.fetchall()
exists = (len(results) > 0)
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid)
def soup_to_text(self, soup):
strings = []
try:
for el in soup:
strings += el._all_strings(True, True)
except AttributeError, e:
strings = soup._all_strings(True, True)
return " ".join(strings)

@ -0,0 +1,4 @@
import update_ocw
c = update_ocw.OpenCourseWareCrawler()
print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")

@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers
env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")
scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()

@ -1,131 +0,0 @@
import requests
import oursql
import datetime
import json
import lib
class KhanUniversityCrawler(object):
def __init__(self):
self.db = lib.Database("localhost", "root")
def retrieve_dataset(self):
self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
#self.dataset = json.loads(open("data.json", "r").read())
def parse_dataset(self):
self.process_item(self.dataset, 0)
def process_item(self, item, level, parent=None):
try:
kind = item["kind"]
except KeyError, e:
return
if kind == "Topic":
unique_id = item["id"]
try:
parent_id = parent["_cl_id"]
except TypeError, e:
parent_id = 0
if item["title"] is not None:
title = item["title"]
else:
title = ""
inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
item["_cl_id"] = rowid
if inserted:
print "Inserted %s" % title
else:
print "Skipped %s" % title
elif kind in ("Video", "Exercise", "Article"):
try:
unique_id = item["readable_id"]
except KeyError, e:
try:
unique_id = item["name"]
except KeyError, e:
try:
unique_id = str(item["id"])
except KeyError, e:
print repr(item)
sys.stderr.write("WARNING: No suitable identifier found for item\n")
raise
return
if item["kind"] == "Video":
itemtype = self.db.VIDEO
elif item["kind"] == "Exercise":
itemtype = self.db.EXERCISE
elif item["kind"] == "Article":
itemtype = self.db.ARTICLE
try:
source_url = item["ka_url"]
except KeyError, e:
if itemtype == self.db.ARTICLE:
source_url = ""
else:
return
try:
item_url = item["url"]
except KeyError, e:
try:
item_url = item["ka_url"]
except KeyError, e:
item_url = None
if itemtype == self.db.ARTICLE:
description = item["content"]
else:
try:
description = item["description"]
except KeyError, e:
description = None
try:
title = item["title"]
except KeyError, e:
try:
title = item["display_name"]
except KeyError, e:
title = "Untitled"
try:
views = item["views"]
except KeyError, e:
views = None
try:
date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
except KeyError, e:
date = None
inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
item["_cl_id"] = rowid
if inserted:
print "Inserted %s" % title
else:
print "Skipped %s" % title
elif kind == "Separator":
pass # Ignore separators
else:
sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
sys.stderr.write("%s\n" % (repr(item)))
try:
children = item["children"]
except KeyError, e:
pass
else:
for child in children:
self.process_item(child, level + 1, item)
crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()

@ -0,0 +1,288 @@
import requests
import oursql
import datetime
import json
import lib
from bs4 import BeautifulSoup
import bs4
def combine_dict(a, b):
c = a.copy()
c.update(b)
return c
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWareCrawler(object):
def __init__(self):
self.db = lib.Database("localhost", "root", password="")
def parse_catalog(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
self.parse_source(int(element["href"].split("/")[-1]), element.string)
def parse_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
print "# " + source_name
for course in courses[:2]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1])
def parse_course(self, course_name, course_url, course_id):
# First fetch metadata from ocwconsortium.org
print course_url
metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = metadata_soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self.parse_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
# Now fetch metadata from the particular course provider
provider_data = self.get_provider_data(course_url)
if provider_data != {}:
print repr(provider_data)
def parse_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["source"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
print "UNKNOWN: %s => %s" % (label, value)
return data
def get_provider_data(self, url):
providers = {
"oer.avu.org": self._data_avu,
"ocw.capilanou.ca": self._data_capilano,
"ocw.hokudai.ac.jp": self._data_hokkaido,
"ocw.ie.edu": self._data_ie,
"ocw.jhsph.edu": self._data_hopkins,
}
""",
"ocw.kaplan.edu": self._data_kaplan,
"ocw.korea.edu": self._data_korea,
"kyotomm.jp": self._data_kyoto,
"ocw.kyushu-u.ac.jp": self._data_kyushu,
"open-marhi.ru": self._data_moscow,
"yctrtrc.ncku.edu.tw": self._data_chengkung,
"ocw.nctu.edu.tw": self._data_chiaotung,
"opencourse.ndhu.edu.tw": self._data_donghwa,
"ocw.njit.edu": self._data_njit,
"graduateschool.paristech.fr": self._data_paris,
"peoples-uni.org": self._data_oaei,
"ocw.sbu.ac.ir": self._data_shahid,
"studentscircle.net": self._data_studentscircle,
"ocw.tmu.edu.tw:8080": self._data_taipei,
"openlearn.open.ac.uk": self._data_openuni,
"www.ocw.titech.ac.jp": self._data_tokyo,
"feedproxy.google.com": self._data_tudelft,
"ocw.tufts.edu": self._data_tufts,
"ocw.unu.edu": self._data_un,
"ocw.uc3m.es": self._data_madrid,
"ocw.ua.es": self._data_alicante,
"ocw.unican.es": self._data_cantabria,
"ocw.ugr.es": self._data_granada,
"ocw.udem.edu.mx": self._data_monterrey,
"ocw.um.es": self._data_murcia,
"ocw.uniovi.es": self._data_oviedo,
"ocw.usal.es": self._data_salamanca,
"ocwus.us.es": self._data_sevilla,
"ocw.unizar.es": self._data_zaragoza,
"ocw.univalle.edu.co3": self._data_colombia,
"ocw.uned.ac.cr": self._data_distancia,
"www.icesi.edu.co": self._data_icesi,
"ocw.innova.uned.es": self._data_innova,
"upv.es": self._data_valencia,
"ocw.upm.es": self._data_upm,
"ocw.utpl.edu.ec": self._data_utpl,
"ocw.uab.cat": self._data_uab,
"ocw.ub.edu": self._data_ub,
"ocw.uib.es": self._data_uib,
"ocw.udl.cat": self._data_udl,
"ocw.uv.es": self._data_uv,
"e-ujier.uji.e": self._data_uji,
"ocw.uoc.edu": self._data_uoc,
"ocw.utm.my": self._data_utm,
"ocw.uci.edu": self._data_uci,
"opencontent.uct.ac.za": self._data_uct,
"ocw.umb.edu:8080": self._data_boston,
"open.umich.edu": self._data_michigan,
"ocw.nd.edu": self._data_notredame,
"ocw.usu.ac.id": self._data_usu,
"ocw.tsukuba.ac.jp": self._data_tsukaba"""
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
data = func(url)
return data
def _data_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
print "UNKNOWN KEY: %s => %s" % (label, value)
return data
def _data_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _data_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _data_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _data_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()
return data
def parse_dataset(self):
for item in self.dataset:
self.process_item(item)
def process_item(self, item):
inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
if inserted:
print "Inserted %s" % item["name"]
else:
print "Skipped %s" % item["name"]
for course in item["courses"]:
self.process_course(course, rowid)
def process_course(self, course, topicid):
try:
start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
except TypeError, e:
start_date = None
title = "%s (date undetermined)" % (course["name"])
inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
if inserted:
print "\tInserted %s" % title
else:
print "\tSkipped %s" % title
#crawler = OpenCourseWareCrawler()
#crawler.parse_catalog()