Compare commits


31 Commits

Author SHA1 Message Date
Sven Slootweg d98ee113bc Rewrite generic OCW parser, BeautifulSoup fix to allow exclusion of comments for string retrieval, and fix BS4 bug 12 years ago
Sven Slootweg 98340b38a0 Rewrite University of Reddit crawler - now with fewer hacks! 12 years ago
Sven Slootweg 8bbffb9429 Add topic_exists and item_exists methods to Scraper class 12 years ago
Sven Slootweg 0e4df4549f No need to import oursql from within the scrapers 12 years ago
Sven Slootweg 2c3bcc5418 Rewrite Khan Academy crawler 12 years ago
Sven Slootweg d9034b6215 Consistently use row_id, and not itemid or rowid 12 years ago
Sven Slootweg 8c0033074b Support both output logging and error logging in the Environment.log() method 12 years ago
Sven Slootweg b3edd35ecf Add support for lectures and sandboxes 12 years ago
Sven Slootweg d6d8eb70b9 Fix typo - it should be Khan Academy, not Khan University. 12 years ago
Sven Slootweg fb6c43a38f Rewrite scraper to be more modular, and convert the Coursera crawler to the new model 12 years ago
Sven Slootweg c2a8a66dac Update README to fix dependencies list 12 years ago
Sven Slootweg a690cb2c8f Add rudimentary first version of the OCW scraper 12 years ago
Sven Slootweg f188d443d1 Add README 12 years ago
Sven Slootweg 43c700ac2b Add list of various OCW sources for parser development 12 years ago
Sven Slootweg 26b68952fa Add table structure updates for new version of updater 12 years ago
Sven Slootweg a4e744f892 Add list of sources for book data 12 years ago
Sven Slootweg d3bd59f813 Add modified version of BeautifulSoup4 (nth-of-type pseudoselector and full-featured direct descendant support) 12 years ago
Sven Slootweg 8e951f6b27 Add simple script for searching from a terminal 12 years ago
Sven Slootweg d387541822 Support custom provider names 12 years ago
Sven Slootweg a6e350c0d9 Add dumping script 12 years ago
Sven Slootweg 0f5cade812 Simple dumper 12 years ago
Sven Slootweg fa74d394a7 Filter _ search terms 12 years ago
Sven Slootweg a9d2576eaf Add donation link 12 years ago
Sven Slootweg f57d45fa53 Add header message 12 years ago
Sven Slootweg 1503c1f75f Add 404 page 12 years ago
Sven Slootweg bfbfd821b5 Include a small preview in the search results 12 years ago
Sven Slootweg efeef5f70e Change search term requirements 12 years ago
Sven Slootweg 3f02174ba3 Implement some very basic methods to prevent overloading 12 years ago
Sven Slootweg 1fbb21e6d8 Properly use the password when connecting the crawlers 12 years ago
Sven Slootweg dd4c62bc4e Very basic error handling 12 years ago
Sven Slootweg 6ec1a2d90b Add crawlers for coursera and ureddit, get first quick and dirty version of frontend done, and fix bugs and stuff 12 years ago

@@ -0,0 +1,9 @@
# Cryto Learn
This is the source code for http://learn.cryto.net/. It consists of the following:
* The updating script, comprising a few very rudimentary scrapers for various educational sources. Requires Python 2. Dependencies are [oursql](http://packages.python.org/oursql/), [requests](http://docs.python-requests.org/en/latest/), and BeautifulSoup 4 (a custom version is included). Located in `updater/`.
* The frontend, a fairly hacky and messy PHP-based search interface. Needs cleaning up, but not an immediate priority. Requires PHP 5.3+ and uses [CPHP](http://github.com/joepie91/cphp). Located in `frontend/`.
* A simple shell search script, using the Cryto Learn API to search for the specified string and print results to stdout. Requires Python 2. Also very rudimentary.
Licensed under the WTFPL. It may or may not work on your system, use at your own risk, etc. etc.
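
A minimal sketch of querying that API directly (Python 2, to match the rest of the repository; the endpoint path and error format are taken from the frontend code further down):

import requests

# POST a query to the search endpoint; it returns either a list of
# topics or an object with an "error" key.
results = requests.post("http://learn.cryto.net/api/search", {"q": "physics"}).json()

if isinstance(results, dict) and "error" in results:
    print results["error"]
else:
    for topic in results:
        print topic["title"]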

@@ -0,0 +1,7 @@
API:
http://www.goodreads.com/api
https://developers.google.com/books/docs/getting-started#books_api_v1
Dumps:
http://openlibrary.org/data/ol_dump_latest.txt.gz
http://www.librarything.com/feeds/
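
A hedged sketch of using the Google Books API linked above (the volumes endpoint is the standard v1 search endpoint; an API key may be required depending on quota):

import requests

# Look up a book by ISBN through the Books API v1.
response = requests.get("https://www.googleapis.com/books/v1/volumes",
    params={"q": "isbn:9780596007126"})
for volume in response.json().get("items", []):
    print volume["volumeInfo"]["title"]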

@@ -0,0 +1,30 @@
{
"database": {
"driver": "mysql",
"pdo": true,
"hostname": "localhost",
"username": "root",
"password": "",
"database": "learn"
},
"locale": {
"path": "locales",
"extension": "lng",
"default_locale": "english",
"default_timezone": "Europe/Amsterdam"
},
"memcache": {
"enabled": true,
"compressed": true,
"hostname": "localhost",
"port": 11211
},
"class_map": {
"item": "Item",
"topic": "Topic"
},
"components": [
"router",
"errorhandler"
]
}
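
The "database" block is presumably what the Python updater connects with as well; a sketch under that assumption (oursql is the driver named in the README; the actual config-loading code is not part of this diff):

import json, oursql

with open("config.json") as config_file:
    database = json.load(config_file)["database"]

# Connect using the same credentials as the PHP frontend.
connection = oursql.connect(
    host=database["hostname"],
    user=database["username"],
    passwd=database["password"],
    db=database["database"]
)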

@@ -0,0 +1,152 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
class Item extends CPHPDatabaseRecordClass
{
public $table_name = "items";
public $fill_query = "SELECT * FROM items WHERE `Id` = :Id";
public $verify_query = "SELECT * FROM items WHERE `Id` = :Id";
public $prototype = array(
'string' => array(
'Title' => "Title",
'Description' => "Description",
'SourceUrl' => "SourceUrl",
'ItemUrl' => "ItemUrl"
),
'numeric' => array(
'Type' => "Type",
'Provider' => "Provider",
'Views' => "Views",
'TopicId' => "TopicId",
'ParentId' => "ParentId"
),
'boolean' => array(
'HasTopic' => "HasTopic"
),
'timestamp' => array(
'CreationDate' => "Date",
'StartDate' => "StartDate",
'EndDate' => "EndDate"
),
'topic' => array(
'Topic' => "TopicId"
),
'item' => array(
'Parent' => "ParentId"
)
);
public function __get($name)
{
switch($name)
{
case "sTypeName":
return $this->GetTypeName();
break;
case "sProviderName":
return $this->GetProviderName();
break;
default:
return parent::__get($name);
break;
}
}
public function GetTypeName()
{
switch($this->sType)
{
case 1:
return "topic";
case 2:
return "course";
case 3:
return "video";
case 4:
return "article";
case 5:
return "exercise";
case 6:
return "quiz";
case 7:
return "test";
case 8:
return "book";
case 9:
return "audiobook";
case 10:
return "lecture";
case 11:
return "sandbox";
default:
return "unknown";
}
}
public function GetProviderName()
{
switch($this->sProvider)
{
case 1:
return "Khan Academy";
case 2:
return "Coursera";
case 3:
return "University of Reddit";
default:
return "Unknown";
}
}
public function GetChildren()
{
try
{
return Item::CreateFromQuery("SELECT * FROM items WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
public function AsDataset($fetch_children = true)
{
$child_data = array();
if($fetch_children == true)
{
foreach($this->GetChildren() as $child)
{
$child_data[] = $child->AsDataset();
}
}
return array(
"title" => $this->uTitle,
"description" => $this->uDescription,
"url" => $this->uItemUrl,
"source" => $this->uSourceUrl,
"created" => $this->sCreationDate,
"start" => $this->sStartDate,
"end" => $this->sEndDate,
"type" => $this->sTypeName,
"provider" => $this->sProviderName,
"views" => $this->sViews,
"children" => $child_data
);
}
}

@@ -0,0 +1,131 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
class Topic extends CPHPDatabaseRecordClass
{
public $table_name = "topics";
public $fill_query = "SELECT * FROM topics WHERE `Id` = :Id";
public $verify_query = "SELECT * FROM topics WHERE `Id` = :Id";
public $prototype = array(
'string' => array(
'Title' => "Title",
'ProviderId' => "ProviderId",
'Description' => "Description"
),
'numeric' => array(
'ParentId' => "ParentId",
'Provider' => "Provider"
),
'boolean' => array(
'NeedsEnrollment' => "NeedsEnrollment"
),
'timestamp' => array(
'CreationDate' => "Created",
'StartDate' => "StartDate",
'EndDate' => "EndDate"
),
'topic' => array(
'Parent' => "ParentId"
)
);
public function __get($name)
{
switch($name)
{
case "sProviderName":
return $this->GetProviderName();
break;
default:
return parent::__get($name);
break;
}
}
public function GetProviderName()
{
switch($this->sProvider)
{
case 1:
return "Khan Academy";
case 2:
return "Coursera";
case 3:
return "University of Reddit";
default:
return "Unknown";
}
}
public function AsDataset($fetch_children = true, $fetch_items = true)
{
$child_data = array();
if($fetch_children == true)
{
foreach($this->GetChildren() as $child)
{
$child_data[] = $child->AsDataset();
}
}
$item_data = array();
if($fetch_items == true)
{
foreach($this->GetItems() as $item)
{
$item_data[] = $item->AsDataset();
}
}
return array(
"title" => $this->uTitle,
"description" => $this->uDescription,
"created" => $this->sCreationDate,
"start" => $this->sStartDate,
"end" => $this->sEndDate,
"provider" => $this->sProviderName,
"needs_enrollment" => $this->sNeedsEnrollment,
"children" => $child_data,
"items" => $item_data
);
}
public function GetItems()
{
try
{
return Item::CreateFromQuery("SELECT * FROM items WHERE `TopicId` = :TopicId", array(':TopicId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
public function GetChildren()
{
try
{
return Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = :ParentId", array(':ParentId' => $this->sId));
}
catch (NotFoundException $e)
{
return array();
}
}
}

@@ -0,0 +1 @@
../../cphp

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
$_CPHP = true;
$_CPHP_CONFIG = "../config.json";
require("cphp/base.php");
function __autoload($class_name)
{
global $_APP;
$class_name = str_replace("\\", "/", strtolower($class_name));
require_once("classes/{$class_name}.php");
}

@@ -0,0 +1,14 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
require("rewrite.php");

@@ -0,0 +1,24 @@
_locale; en_US.UTF-8,en_US
_datetime_short; %d/%m/%Y %H:%M:%S
_datetime_long; %A %B %d, %Y %H:%M:%S
_date_short; %d/%m/%Y
_date_long; %A %B %d, %Y
_time; %H:%M:%S
event-now; now
event-future; in the future
event-past; in the past
event-1second-ago; 1 second ago
event-seconds-ago; %1$d seconds ago
event-1minute-ago; 1 minute ago
event-minutes-ago; %1$d minutes ago
event-1hour-ago; 1 hour ago
event-hours-ago; %1$d hours ago
event-1day-ago; 1 day ago
event-days-ago; %1$d days ago
event-1week-ago; 1 week ago
event-weeks-ago; %1$d weeks ago
event-1month-ago; 1 month ago
event-months-ago; %1$d months ago
event-1year-ago; 1 year ago
event-years-ago; %1$d years ago

@@ -0,0 +1,28 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
if(!isset($_GET['key']) || $_GET['key'] !== "derp")
{
die();
}
$data = array();
foreach(Topic::CreateFromQuery("SELECT * FROM topics WHERE `ParentId` = 0") as $topic)
{
$data[] = $topic->AsDataset();
}
echo(json_encode($data));
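
Consuming the dump endpoint from Python, as a sketch (the key value comes from the check above, and AsDataset() nests child topics under "children"):

import requests

def walk(topics, depth=0):
    # Recursively print the topic tree returned by /api/dump.
    for topic in topics:
        print "%s%s" % ("  " * depth, topic["title"].encode("utf-8"))
        walk(topic["children"], depth + 1)

data = requests.get("http://learn.cryto.net/api/dump", params={"key": "derp"}).json()
walk(data)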

@@ -0,0 +1,69 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
if(empty($_POST['q']))
{
die(json_encode(array(
"error" => "No search query specified."
)));
}
else
{
$query = $_POST['q'];
$terms = explode(" ", $query);
$db_query_terms = array();
$db_query_arguments = array();
$valid_term = false;
foreach($terms as $term)
{
$db_query_terms[] = "`Title` LIKE ?";
/* Escape the LIKE wildcards in user input. */
$term = str_replace("%", "\%", $term);
$term = str_replace("_", "\_", $term);
/* At least one term of two or more characters is required. */
$valid_term = $valid_term || (strlen($term) >= 2);
$db_query_arguments[] = "%{$term}%";
}
if($valid_term)
{
$db_query = implode(" AND ", $db_query_terms);
/* Re-key the argument array to start at 1, as positional query
 * parameters expect. */
array_unshift($db_query_arguments, '');
unset($db_query_arguments[0]);
try
{
$results_topics = Topic::CreateFromQuery("SELECT * FROM topics WHERE {$db_query}", $db_query_arguments);
$return_objects = array();
foreach($results_topics as $topic)
{
$return_objects[] = $topic->AsDataset();
}
$sPageContents = json_encode($return_objects);
}
catch (NotFoundException $e)
{
$sPageContents = json_encode(array("error" => "No results found for the specified query.", "query" => $query));
}
}
else
{
die(json_encode(array(
"error" => "No valid search query specified."
)));
}
}
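
For clarity, the same escaping and validation logic restated as a Python sketch (function and variable names are illustrative, not from the codebase):

def build_search_clause(query):
    # One LIKE clause per space-separated term, with SQL wildcards
    # escaped; the query is valid if any term has >= 2 characters.
    clauses = []
    arguments = []
    valid = False
    for term in query.split(" "):
        clauses.append("`Title` LIKE ?")
        term = term.replace("%", r"\%").replace("_", r"\_")
        valid = valid or len(term) >= 2
        arguments.append("%" + term + "%")
    if not valid:
        return None
    return " AND ".join(clauses), arguments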

@@ -0,0 +1,18 @@
<?php
/*
* Cryto Learn is more free software. It is licensed under the WTFPL, which
* allows you to do pretty much anything with it, without having to
* ask permission. Commercial use is allowed, and no attribution is
* required. We do politely request that you share your modifications
* to benefit other developers, but you are under no enforced
* obligation to do so :)
*
* Please read the accompanying LICENSE document for the full WTFPL
* licensing text.
*/
if(!isset($_APP)) { die("Unauthorized."); }
$sPageContents = NewTemplater::Render("ui/index", $locale->strings, array());
$sPageType = "ui";

@@ -0,0 +1,34 @@
<?php
$_APP = true;
require("includes/base.php");
$sPageContents = "";
$router = new CPHPRouter();
$router->allow_slash = true;
$router->ignore_query = true;
$router->routes = array(
0 => array(
"^/$" => "modules/ui/index.php",
"^/api/search$" => "modules/api/search.php",
"^/api/dump$" => "modules/api/dump.php"
)
);
try
{
$router->RouteRequest();
}
catch (RouterException $e)
{
http_status_code(404);
$sPageContents = "404 not found";
}
echo($sPageContents);

Binary file not shown (image, 1.8 KiB).

@@ -6,6 +11,11 @@ body
font-family: sans-serif;
}
#templates
{
display: none;
}
.header
{
background-color: #C9F9DF;
@@ -19,6 +24,12 @@ body
font-weight: normal;
}
.header h2
{
margin: 2px;
font-size: 17px;
}
.search-large
{
color: #006824;
@@ -55,3 +66,95 @@ body
font-size: 26px;
width: 180px;
}
.spinner
{
margin-left: 14px;
}
.topic, .item
{
padding: 9px 12px;
margin: 5px 20px;
background-color: #79E1A8;
font-size: 20px;
width: 960px;
}
.topic
{
margin-top: 19px;
cursor: pointer;
}
.item
{
margin-left: 34px;
width: 926px;
font-size: 18px;
background-color: #97F3C1;
display: none;
}
.type
{
font-size: 18px;
color: gray;
}
.type:after
{
content: ":";
}
a.title
{
color: #041F9F;
}
.toggler
{
display: block;
float: left;
width: 16px;
height: 16px;
margin-top: 2px;
margin-right: 8px;
font-size: 13px;
text-align: center;
font-weight: bold;
border: 1px solid black;
background-color: #D2ECCF;
}
.providername
{
font-size: 18px;
color: gray;
}
.providername:before
{
content: "(";
}
.providername:after
{
content: ")";
}
.error
{
margin: 8px 16px;
font-size: 19px;
}
.description
{
margin-top: 4px;
font-size: 13px;
max-height: 15px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}

@@ -0,0 +1,160 @@
<!doctype html>
<html>
<head>
<title>learn.cryto.net</title>
<link rel="stylesheet" href="style.css">
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
<script>
var search_timeout = null;
$(function(){
/*$("input").val("data");
runSearch();*/
$("input").keyup(function(){
if(search_timeout !== null)
{
clearTimeout(search_timeout);
}
search_timeout = setTimeout(runSearch, 800);
});
});
function runSearch()
{
$(".search-large").removeClass("search-large").addClass("search-top");
$(".spinner").show();
var query = $("input#query").val();
if(query.length >= 3)
{
$.post("/api/search", {q: query}, function(response){
$(".spinner").hide();
$(".results").html("");
if(typeof response.error == "undefined")
{
for(var i in response)
{
if(response[i].items.length > 0)
{
var result_wrapper = instantiateTemplate("result_wrapper");
var result_block = instantiateTemplate("result_topic");
result_block.children(".title").html(response[i].title);
result_block.children(".description").html(response[i].description);
result_block.children(".providername").html(response[i].provider);
result_block.appendTo(result_wrapper);
for(var x in response[i].items)
{
var item = response[i].items[x];
var item_block = instantiateTemplate("result_item");
item_block.children(".title").html(item.title);
item_block.children(".title").attr("href", item.url);
item_block.children(".type").html(item.type);
item_block.insertAfter(result_block);
}
result_wrapper.appendTo(".results");
}
}
}
else
{
$(".results").html("<div class='error'>No results.</div>");
}
setHandlers();
}, "json");
}
else
{
$(".spinner").hide();
$(".results").html("<div class='error'>Enter at least 3 characters.</div>");
}
}
function setHandlers()
{
$(".toggler, .topic").each(
function(){
$(this).click(function(event){
toggleItems(this, event);
});
}
);
}
function instantiateTemplate(template_name)
{
var instance = $("#template_" + template_name).clone();
instance.removeAttr("id");
return instance;
}
function toggleItems(ctx, event)
{
var parent = $(ctx).parentsUntil(".wrapper");
if(parent.length == 0)
{
var wrapper = $(ctx).parent();
}
else
{
var wrapper = parent.parent();
}
var toggler = wrapper.find(".toggler");
if(typeof toggler.data("toggled") == "undefined" || toggler.data("toggled") == false)
{
toggler.data("toggled", true);
toggler.html("-");
wrapper.find(".item").show();
}
else
{
toggler.data("toggled", false);
toggler.html("+");
wrapper.find(".item").hide();
}
event.stopPropagation();
}
</script>
</head>
<body>
<div class="header">
<h1><strong>learn.cryto.net</strong> :: Learn something new!</h1>
<h2>Currently searching Coursera, Khan Academy, University of Reddit. Comments? <a href="mailto:learn@cryto.net">learn@cryto.net</a> or
<a href="irc://irc.cryto.net/crytocc">irc.cryto.net #crytocc</a></h2>
<h2>Like the service and wish to donate? <a href="http://cryto.net/~joepie91/donate.html">You can do that here :)</a></h2>
</div>
<div class="main">
<div class="search-large">
I want to learn about <input type="text" id="query">. <img src="/static/spinner.gif" class="spinner" style="display: none;">
</div>
<div class="results">
</div>
</div>
<div id="templates">
<div id="template_result_wrapper" class="wrapper"></div>
<div id="template_result_topic" class="topic">
<span class="toggler">+</span>
<strong>Topic: </strong>
<span class="title"></span>
<span class="providername"></span>
<div class="description"></div>
</div>
<div id="template_result_item" class="item">
<span class="type"></span>
<a href="#" class="title"></a>
</div>
</div>
</body>
</html>

@@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co3": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.e": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba

@@ -0,0 +1,116 @@
# AGH University of Science and Technology
http://open.agh.edu.pl/course/view.php?id=97
# Fundação Getulio Vargas - FGV Online
http://www5.fgv.br/fgvonline/CursosGratuitosFormulario.aspx?id_curso=OCWAJUEAD_00_01/2011_1
# Gunadarma University
http://ocw.gunadarma.ac.id/course/about
# Johns Hopkins Bloomberg School of Public Health
http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss
# Kaplan University Online & Campus Learning
http://ocw.kaplan.edu/arts-and-sciences/academic-strategies
# Korea University
http://ocw.korea.edu/ocw/college-of-science/general-physics-i
# Kyoto Seika University
http://www.kyotomm.jp/event/exh/kyotomagic2012.php
# Kyushu University
http://ocw.kyushu-u.ac.jp/90901/0007/index.html
# Massachusetts Institute of Technology
http://ocw.mit.edu/courses/civil-and-environmental-engineering/1-00-introduction-to-computers-and-engineering-problem-solving-fall-2005
# Moscow Architectural Institute
http://www.open-marhi.ru/courses/detail/index.php?ID=6631
# National Cheng Kung University
http://yctrtrc.ncku.edu.tw/site2/newocwcourse/OCW_MAIN.php?cid=141
# National Chiao Tung University
http://ocw.nctu.edu.tw/riki_detail.php?pgid=335
# National Dong Hwa University
http://opencourse.ndhu.edu.tw/moodle/mod/forum/discuss.php?d=3
# New Jersey Institute of Technology
http://ocw.njit.edu/ocw/som/acct/acct-615-anandarajan/index.php
# Paris Tech
http://graduateschool.paristech.fr/cours.php?id=309132
# People's Open Access Education Initiative
http://www.peoples-uni.org/node/236
# Shahid Beheshti University
http://ocw.sbu.ac.ir/Default.aspx?tabid=5352&language=fa-IR
# Students Circle Network
http://studentscircle.net/live/2011/07/a-guide-before-learning-a-new-javascript-framework/
# Taipei Medical University
http://ocw.tmu.edu.tw:8080/eduCommons/general-education/53f28a1882076b7753f24eba72698a556790-shih-chi-analysis-on-historical-figures
# The Open University
http://openlearn.open.ac.uk/course/view.php?name=DD208_3
# The Open University of Israel
http://peer-news.blogspot.com/2011/12/2-10934.html
# Tokyo Institute of Technology
http://www.ocw.titech.ac.jp/index.php?module=General&Nendo=2012&action=T0300&GakubuCD=223&GakkaCD=224710&KougiCD=70030&Gakki=1&lang=EN
# TU Delft
http://feedproxy.google.com/~r/tudelft/OCW/~3/0sA6qPQKcOg/bachelor-civiele-techniek
# Tufts University
http://ocw.tufts.edu/Course/39
# UNISUL - Universidade do Sul de Santa Catarina
http://labspace.open.ac.uk
# United Nations University
http://ocw.unu.edu/international-institute-for-software-technology/building-a-community-of-practice-for-electronic-governance
# Universidad Carlos III de Madrid
http://ocw.uc3m.es/ingenieria-electrica/accionamientos-electricos
# Universidad de Alicante
http://ocw.ua.es/Ciencias_Sociales_y_Juridicas/actividades-deportivas-medio-ambiente
# Universidad de Cantabria
http://ocw.unican.es/ciencias-de-la-salud/actuacion-en-situaciones-especiales
# Universidad de Granada
http://ocw.ugr.es/course/view.php?id=23&topic=1
# Universidad de Monterrey
http://ocw.udem.edu.mx/cursos-de-profesional/administracion-de-tecnologias-de-informacion
# Universidad de Murcia
http://ocw.um.es/cc.-sociales/actividad-fisica-en-el-envejecimiento
# Universidad de Oviedo
http://ocw.uniovi.es/course/view.php?id=28&ocw=1
# Universidad de Salamanca
http://ocw.usal.es/ciencias-sociales-1/curso-cero-matematicas-para-ciencias-sociales-nivelacion-de-conocimientos
# Universidad de Sevilla
http://ocwus.us.es/matematica-aplicada/pp-3
# Universidad de Zaragoza
http://ocw.unizar.es/ocw/ciencias-de-la-salud-1/actividades-fisicas-y-deportivas-aereas
# Universidad del Valle - Colombia
http://ocw.univalle.edu.co/ocw/ingenieria-electronica-telecomunicaciones-y-afines/arquitectura-de-procesos-industriales
# Universidad Estatal a Distancia
http://ocw.uned.ac.cr/eduCommons/ciencias-de-la-administracion/compras-y-almacenamiento
# Universidad Icesi
http://www.icesi.edu.co/ocw/tic/administracion_plataformas_y_seguridad
# Universidad Nacional de Educación a Distancia
http://ocw.innova.uned.es/ocwuniversia/psicologia/analisis-de-datos-en-Psico-I
# Universidad Politécnica de Valencia
http://www.upv.es/ocwasi/2010/6842
# Universidad Politécnica de Madrid
http://ocw.upm.es/ingenieria-cartografica-geodesica-y-fotogrametria/3d-scanning-and-modeling
# Universidad Técnica Particular de Loja
http://ocw.utpl.edu.ec/economia
# Universitat Autònoma de Barcelona
http://ocw.uab.cat/enginyeries/apunts-de-calcul-matricial-i-resolucio-de-sistemes
# Universitat de Barcelona
http://ocw.ub.edu/admistracio-i-direccio-dempreses
# Universitat de les Illes Balears
http://ocw.uib.es/ocw/infermeria/atencion-de-enfermeria-frente-situaciones-de
# Universitat de Lleida
http://ocw.udl.cat/arts-i-humanitats
# Universitat de València
http://ocw.uv.es/ciencias-sociales-y-juridicas/2-2
# Universitat Jaume I
http://e-ujier.uji.es/pls/www/!gri_www.euji22101?p_id=15&p_tipo=A&p_curso=IG23&p_idioma=CA
# Universitat Oberta de Catalunya
http://ocw.uoc.edu/informatica-tecnologia-i-multimedia/administracio-avancada-del-sistema-operatiu-gnu-linux
# Universiti Teknologi Malaysia
http://ocw.utm.my/course/view.php?id=90
# University of California, Irvine
http://ocw.uci.edu/courses/course.aspx?id=113
# University of Cape Town
http://opencontent.uct.ac.za/Centre-for-Higher-Education-Development/Centre-for-Open-Learning/A-developmental-state-The-challenge-ahead
# University of Massachusetts Boston
http://ocw.umb.edu:8080/eduCommons/about
# University of Michigan
http://open.umich.edu/education/med/oernetwork/med/em/aetc-redirect/2009
# University of Notre Dame
http://ocw.nd.edu/history/african-american-history-ii
# University of Sumatera Utara
http://ocw.usu.ac.id/course/detail/teknik-sipil-s1/4110000007-struktur-bangunan-sipil-i.html
# University of Tsukuba
http://ocw.tsukuba.ac.jp/6570740672698cea79d15b6678147a7679d130fb65705b665c02653b/66f87c4d7d394ecb

@@ -0,0 +1,22 @@
#!/usr/bin/env python
# Usage: python search.py "<query>"
import requests, sys, re

query = sys.argv[1]
results = requests.post("http://learn.cryto.net/api/search", {"q": query}).json()

for result in results:
    name = result["title"].rstrip()
    description = result["description"].strip().replace("\n", " ")
    if len(description) > 300:
        # Cut the description at the last word boundary within 300 characters.
        match = re.match(r"^(.{0,300})\W", description)
        description = (match.group(1) if match else description[:300]) + "..."
    print "## %s\n%s" % (name, description)
    for item in result["items"]:
        name = item["title"].ljust(70)
        print "\t[%s] %s\t%s" % (item["type"], name, item["url"])
    print ""

@@ -0,0 +1,2 @@
ALTER TABLE `items` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;
ALTER TABLE `topics` ADD `CustomProviderName` VARCHAR( 250 ) NULL DEFAULT NULL;

@@ -0,0 +1,361 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, previous_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
previous_element = previous_element or self.previous_element
o.setup(parent, previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].prefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
class FeatureNotFound(ValueError):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
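
Basic usage of the bundled module, as a generic sketch (the custom nth-of-type and direct-descendant selector support mentioned in the commit log is not visible in this file):

from bs4 import BeautifulSoup

# Parse a fragment with the default builder and navigate the tree.
soup = BeautifulSoup("<div><p>first</p><p>second</p></div>")
print soup.find_all("p")[1].string  # prints: second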

@@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
# tree. On the second call the attribute value
# here is already a list. If this happens,
# leave the value alone rather than trying to
# split it again.
values = value
attrs[cdata_list_attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
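
The feature lookup described in the comments above can be exercised directly; a sketch (which concrete builder comes back depends on which parser libraries are installed):

from bs4.builder import builder_registry

# Ask the registry for a builder matching these features; returns
# None when nothing matches.
builder_class = builder_registry.lookup('html', 'fast')
if builder_class is not None:
    print builder_class.features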

@@ -0,0 +1,221 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
# Imported here rather than at module level to avoid a circular import.
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# XXX This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
else:
self.soup.object_was_parsed(node.element, parent=self.element)
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(
TextNode(child, self.soup))
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
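
Selecting this builder from user code, as a sketch (requires html5lib to be installed; the feature name comes from the features list above):

from bs4 import BeautifulSoup

# html5lib builds a full, repaired document tree even for sloppy markup.
soup = BeautifulSoup("<p>one<p>two", "html5lib")
print soup.body.contents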

@@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True
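A minimal sketch of this builder in use (assuming the vendored bs4 is importable; Python 2 — "html.parser" is the HTMLPARSER feature string defined above):

from bs4 import BeautifulSoup

# The stdlib-backed builder needs no third-party parser installed.
soup = BeautifulSoup("<p>caf&eacute; &#233;</p>", "html.parser")
print(soup.p.string)  # entity and character references come out as Unicode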

@ -0,0 +1,196 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
if parser is None:
# Use the default parser.
parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn them into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
return None
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
@property
def default_parser(self):
return etree.HTMLParser
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
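A minimal sketch contrasting the two lxml-backed builders above (assuming lxml is installed; Python 2):

from bs4 import BeautifulSoup

markup = "<root><a>x</a></root>"
html_soup = BeautifulSoup(markup, "lxml")  # LXMLTreeBuilder wraps it in <html><body>
xml_soup = BeautifulSoup(markup, "xml")    # LXMLTreeBuilderForXML keeps the bare <root>
print(html_soup)
print(xml_soup.encode())  # carries an XML declaration, per test_fragment_to_document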

@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import logging
# Import a library to autodetect character encodings.
chardet_type = None
try:
# First try the fast C implementation.
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
return cchardet.detect(s)['encoding']
except ImportError:
try:
# Fall back to the pure Python implementation
# Debian package: python-chardet
# PyPI package: chardet
import chardet
def chardet_dammit(s):
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
# No chardet available.
def chardet_dammit(s):
return None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
def quoted_attribute_value(cls, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
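# A sketch of the substitution behaviour above (expected values, assuming
# the standard entity tables shipped with this module):
#   substitute_xml('a < b & c', make_quoted_attribute=True)
#       -> '"a &lt; b &amp; c"'
#   substitute_html(u'caf\xe9') -> u'caf&eacute;'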
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
new_markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, is_html)
self.markup = new_markup
u = None
if new_markup != markup:
# _detectEncoding modified the markup, then converted it to
# Unicode and then to UTF-8. So convert it from UTF-8.
u = self._convert_from("utf8")
self.original_encoding = sniffed_encoding
if not u:
for proposed_encoding in (
override_encodings + [document_encoding, sniffed_encoding]):
if proposed_encoding is not None:
u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and not isinstance(self.markup, unicode):
u = self._convert_from(chardet_dammit(self.markup))
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convert_from(proposed_encoding)
if u:
break
# As an absolute last resort, try the encodings again with
# character replacement.
if not u:
for proposed_encoding in (
override_encodings + [
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
if proposed_encoding != "ascii":
u = self._convert_from(proposed_encoding, "replace")
if u is not None:
logging.warning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER.")
self.contains_replacement_characters = True
break
# We could at this point force it to ASCII, but that would
# destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
(xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
except:
xml_encoding_match = None
xml_encoding_match = xml_encoding_re.match(xml_data)
if not xml_encoding_match and is_html:
xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
b'\x9f': ('Yuml', ''),}
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
b'\xa9' : '',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
b'\xb4' : ("'", 'acute'),
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
0x91 : b'\xe2\x80\x98', # ‘
0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
if embedded_encoding.replace('_', '-').lower() not in (
'windows-1252', 'windows_1252'):
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
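A minimal sketch of the two entry points above, UnicodeDammit and detwingle() (Python 2):

from bs4.dammit import UnicodeDammit

# Detection: bytes in, Unicode out, with the winning codec recorded.
dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
print(dammit.unicode_markup)     # u'Sacr\xe9 bleu!'
print(dammit.original_encoding)  # 'latin-1'

# detwingle(): Windows-1252 bytes embedded in otherwise-UTF-8 data.
utf8_part = u"\N{SNOWMAN}".encode("utf8")
cp1252_part = u"\N{LEFT DOUBLE QUOTATION MARK}Hi\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
print(UnicodeDammit.detwingle(utf8_part + cp1252_part).decode("utf8"))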

File diff suppressed because it is too large

@ -0,0 +1,554 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator
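A sketch of how the skipIf() helper above is meant to be used (the flag and class names here are hypothetical):

from bs4.testing import SoupTest, skipIf

SOME_PARSER_PRESENT = False  # stand-in for a real availability check

@skipIf(not SOME_PARSER_PRESENT,
        "parser seems not to be present, not testing its tree builder.")
class HypotheticalBuilderTest(SoupTest):
    # With the condition true, the decorator swaps the whole class for a
    # do-nothing function, so unittest discovery collects no tests from it.
    def test_unclosed_tag(self):
        self.assertSoupEquals("<p>", "<p></p>")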

@ -0,0 +1 @@
"The beautifulsoup tests."

@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
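A minimal sketch of the registry lookups these tests exercise (Python 2; the printed values are indicative only):

from bs4.builder import builder_registry

print(builder_registry.lookup('html.parser'))      # HTMLParserTreeBuilder, always present
print(builder_registry.lookup('no-such-feature'))  # None; BeautifulSoup(...) raises ValueError instead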

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

@ -0,0 +1,72 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder()
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual("<p>foo</p>", soup.p.encode())

@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass

@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
def test_real_xhtml_document(self):
"""lxml strips the XML definition from an XHTML doc, which is fine."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b''),
markup.replace(b'\n', b'').replace(
b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

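The common thread in these smoke tests is that a tree builder is selected by feature name when constructing a soup; a minimal sketch ("html.parser" ships with Python, while "lxml" and "html5lib" only resolve if those packages are installed):

from bs4 import BeautifulSoup

markup = "<p>A <b>bold</b> statement.</p>"
soup = BeautifulSoup(markup, "html.parser")  # or "lxml" / "html5lib"
print soup.b.string  # -> bold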
@ -0,0 +1,378 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import logging
import unittest
import sys
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
@skipIf(
not LXML_PRESENT,
"lxml not present, not testing BeautifulStoneSoup.")
def test_beautifulstonesoup(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<markup>")
self.assertTrue(isinstance(soup, BeautifulSoup))
self.assertTrue("BeautifulStoneSoup class is deprecated")
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii")
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (i.e. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_sniffed_xml_encoding(self):
# A document written in UTF-16LE will be converted by a different
# code path that sniffs the byte order markers.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through fix_embedded_windows_1252, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamespacedAttribute(SoupTest):
def test_name_may_be_none(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

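The UnicodeDammit behaviour covered above is what lets the scrapers hand raw bytes to BeautifulSoup without worrying about encodings; a minimal standalone sketch:

from bs4.dammit import UnicodeDammit

raw = "Sacr\xc3\xa9 bleu!"                   # UTF-8 bytes (Python 2 str)
dammit = UnicodeDammit(raw, ["iso-8859-8"])  # implausible guesses are ignored
print dammit.original_encoding               # -> utf-8
print dammit.unicode_markup.encode("utf-8")  # -> Sacré bleu!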
File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@ -10,9 +10,20 @@ class Database(object):
TEST = 7
BOOK = 8
AUDIOBOOK = 9
LECTURE = 10
def __init__(self, host, user, password=None, database="learn"):
self.database = oursql.connect(host=host, user=user, db=database)
self.database = oursql.connect(host=host, user=user, passwd=password, db=database)
def topic_exists(self, provider, unique_id):
c = self.database.cursor()
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
return (len(c.fetchall()) > 0)
def item_exists(self, provider, unique_id):
c = self.database.cursor()
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (provider, unique_id))
return (len(c.fetchall()) > 0)
def insert_topic(self, provider, unique_id, title, override=False, **kwargs):
defaults = {
@ -21,7 +32,8 @@ class Database(object):
"start_date": None,
"end_date": None,
"parent_id": 0,
"description": ""
"description": "",
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
@ -43,9 +55,9 @@ class Database(object):
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date']))
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], provider, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
return (True, c.lastrowid)
@ -56,7 +68,10 @@ class Database(object):
"topic_id": 0,
"parent_id": 0,
"description": "",
"date": None
"date": None,
"start_date": None,
"end_date": None,
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
@ -78,8 +93,8 @@ class Database(object):
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"]))
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (has_topic, itemtype, provider, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid)

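From a crawler's point of view, the updated `lib.Database` API shown above is used roughly as follows; a hedged sketch, assuming a local MySQL server with the `learn` schema (the provider id and unique id are made-up examples):

import lib

db = lib.Database("localhost", "root", password="", database="learn")

provider_id = 2  # hypothetical provider
if not db.topic_exists(provider_id, "ml-001"):
    inserted, row_id = db.insert_topic(
        provider_id, "ml-001", "Machine Learning",
        description="Introductory ML course.",
        needs_enrollment=True)
    print "Inserted topic, row id %d" % row_id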
@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
module = __import__(modulename)
for name in vars(module):
data = getattr(module, name)
globals()[name] = data
sys.path.insert(0, my_path)
for fname in os.listdir(my_path):
fpath = os.path.join(my_path, fname)
fbasename, fext = os.path.splitext(fname)
if os.path.isdir(fpath):
if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
# This is a python directory module
_import_module_into_scope(fname)
elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
# This is a python file module
_import_module_into_scope(fbasename)
sys.path.remove(my_path)

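The effect of this loader is, presumably, that any module dropped into the package directory has its public names hoisted into the package scope; a hypothetical sketch:

# Hypothetical layout: this loader is scrapers/__init__.py and
# scrapers/coursera.py defines class Coursera.
import scrapers

scraper = scrapers.Coursera()  # name was hoisted by the loader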
@ -0,0 +1,50 @@
import datetime, json, sys
import requests
import shared
class Coursera(shared.Scraper):
provider_id = 2
def run(self):
self.retrieve_dataset()
self.parse_dataset()
def retrieve_dataset(self):
self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
def parse_dataset(self):
for item in self.dataset:
self.process_item(item)
def process_item(self, item):
inserted, row_id = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
if inserted:
self.env.log("Inserted topic %s" % item["name"])
else:
self.env.log("Skipped topic %s" % item["name"])
for course in item["courses"]:
self.process_course(course, row_id)
def process_course(self, course, topicid):
try:
start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
except TypeError, e:
start_date = None
title = self.generate_title(course['name'], start_date)
inserted, row_id = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
if inserted:
self.env.log("Inserted item %s" % title)
else:
self.env.log("Skipped item %s" % title)
def generate_title(self, name, date):
if date is None:
return "%s (date undetermined)" % name
else:
return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))

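Like the other scrapers, this one would be run through the Environment factory (mirroring the run_ocw.py script later in this diff); a minimal sketch:

import shared, scrapers

env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")
env.Scraper(scrapers.Coursera).run()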
@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared
from bs4 import BeautifulSoup
import bs4
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWare(shared.Scraper):
def run(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
#if "Hopkins" not in element.string:
# continue
self.process_source(int(element["href"].split("/")[-1]), element.string)
def process_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
for course in courses[:3]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
def parse_course(self, course_name, course_url, course_id, source_name):
self.env.log("Parsing %s" % course_url)
# First fetch metadata from ocwconsortium.org
ocw_data = self._metadata_ocw(course_id)
ocw_data["providername"] = source_name
ocw_data["url"] = course_url
# Now fetch metadata from the particular course provider
provider_data = self._metadata_provider(course_url)
if provider_data != False:
data = ocw_data.copy()
data.update(provider_data)
# TODO: insert data
self.env.log(repr(data))
def _metadata_ocw(self, course_id):
soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
return data
def _parse_ocw_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["providername"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
self.env.log("UNKNOWN: %s => %s" % (label, value), True)
return data
def _metadata_provider(self, url):
providers = {
"oer.avu.org": self._metadata_avu,
"ocw.capilanou.ca": self._metadata_capilano,
"ocw.hokudai.ac.jp": self._metadata_hokkaido,
"ocw.ie.edu": self._metadata_ie,
"ocw.jhsph.edu": self._metadata_hopkins,
}
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
return func(url)
return False
def _metadata_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
return data
def _metadata_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _metadata_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _metadata_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _metadata_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = self.soup_to_text(soup.select("h1")[-1])
data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
return data

@ -0,0 +1,197 @@
import datetime, json, sys
import requests
import shared
class KhanAcademy(shared.Scraper):
provider_id = 1
def run(self):
self.retrieve_dataset()
self.process_item(self.dataset, 0)
def retrieve_dataset(self):
self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
def process_item(self, item, level, parent=None):
try:
kind = item["kind"]
except KeyError, e:
return
if kind == "Topic":
self.process_topic(item, level, parent=parent)
elif kind in ("Video", "Exercise", "Article", "Scratchpad"):
self.process_object(item, level, parent=parent)
elif kind == "Separator":
pass # Ignore separators
else:
self.env.log("Unrecognized kind: %s" % repr(item["kind"]), True)
try:
children = item["children"]
except KeyError, e:
return
for child in children:
self.process_item(child, level + 1, item)
def process_topic(self, item, level, parent=None):
unique_id = item["id"]
try:
parent_id = parent["_cl_id"]
except TypeError, e:
parent_id = 0
# Check if a title is set
if item["title"] is not None:
title = item["title"]
else:
# No title was set - log this as an error and default to 'Untitled'.
self.env.log("No title found for item: %s" % repr(item), True)
title = "Untitled"
# Check if a description is set, and default to no description if not
if item["description"] is not None:
description = item["description"]
else:
description = None
# Insert the topic
inserted, row_id = self.insert_topic(unique_id, title, description=description, needs_enrollment=False)
# Set the ID of the newly inserted row so that all objects in this topic know the ID of their topic.
item["_cl_id"] = row_id
if inserted:
self.env.log("Inserted %s" % title)
else:
self.env.log("Skipped %s" % title)
def process_object(self, item, level, parent=None):
unique_id = None
# First check for the 'readable_id' property
try:
unique_id = item["readable_id"]
except KeyError, e:
pass
# If no identifier was found, check for the 'name' property
if unique_id is None:
try:
unique_id = item["name"]
except KeyError, e:
pass
# If still no identifier was found, check for the 'id' property
if unique_id is None:
try:
unique_id = str(item["id"])
except KeyError, e:
pass
# If we *still* do not have an identifier, log the error and bail out
if unique_id is None:
self.env.log("No suitable identifier found for item: %s" % repr(item), True)
return
# Determine the object type
if item["kind"] == "Video":
itemtype = self.VIDEO
elif item["kind"] == "Exercise":
itemtype = self.EXERCISE
elif item["kind"] == "Article":
itemtype = self.ARTICLE
elif item["kind"] == "Scratchpad":
itemtype = self.SANDBOX
source_url = None
# Determine the source URL via the 'ka_url' property
try:
source_url = item["ka_url"]
except KeyError, e:
pass
# If no source URL was found, try the 'url' property
if source_url is None:
try:
source_url = item["url"]
except KeyError, e:
pass
# If still no source URL was found...
if source_url is None:
if itemtype == self.ARTICLE:
# Articles can lack a URL.
source_url = None
else:
# There was no source URL, but this wasn't an article. Log the error and bail out.
self.env.log("No source URL found for non-article object: %s" % repr(item), True)
return
# Determine the (external) item URL
try:
item_url = item["url"]
except KeyError, e:
# Apparently there was no external item URL. Use the source URL as item URL - this will most likely be correct.
item_url = source_url
# If the object is an article, we'll want to use the actual article content as description.
if itemtype == self.ARTICLE:
description = item["content"]
else:
# Otherwise, we'll check if there's a 'description' property. If not, leave empty.
try:
description = item["description"]
except KeyError, e:
description = None
title = None
# First check the 'title' property for an object title.
try:
title = item["title"]
except KeyError, e:
pass
# As second option, check the 'display_name' property.
if title is None:
try:
title = item["display_name"]
except KeyError, e:
# Apparently it really does not have a title. Log the error and default to 'Untitled'.
self.env.log("No object title found for item: %s" % repr(item), True)
title = "Untitled"
# If a 'views' property is present, include it.
try:
views = item["views"]
except KeyError, e:
views = None
# If a creation date is present, include it.
try:
date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
except KeyError, e:
date = None
# Check if there is a parent ID
try:
parent_id = parent["_cl_id"]
except KeyError, e:
# No parent ID present - log this as an error and default to 0.
self.env.log("No parent ID found for item: %s" % repr(item), True)
parent_id = 0
# Insert the item
inserted, row_id = self.insert_item(unique_id, title, item_url, itemtype=itemtype, has_topic=True, source_url=source_url, description=description, views=views, topic_id=parent_id, date=date)
# Store the resulting row ID in the item so that the children know the ID of their parent.
item["_cl_id"] = row_id
if inserted:
self.env.log("Inserted %s" % title)
else:
self.env.log("Skipped %s" % title)

@ -0,0 +1,55 @@
import datetime, json, simplejson, sys, re
import requests
import shared
class UniversityOfReddit(shared.Scraper):
provider_id = 3
def run(self):
data = requests.get("http://ureddit.com/api?type=catalog").json()
for category in data["categories"]:
self.parse_category(category['id'], category['value'])
def parse_category(self, category_id, category_name):
try:
data = requests.get("http://ureddit.com/api?type=category&id=%s" % category_id).json()
except simplejson.decoder.JSONDecodeError, e:
return
for _class in data["classes"]:
if not self.topic_exists(_class['id']):
self.parse_class(_class['id'], _class['value'], category_name)
else:
self.env.log("Skipped class %s" % _class['value'])
def parse_class(self, class_id, class_name, category_name):
try:
data = requests.get("http://ureddit.com/api?type=class&id=%s" % class_id).json()
except simplejson.decoder.JSONDecodeError, e:
self.env.log("Skipped %s due to JSON formatting error" % class_name, True)
return
if data["status"] == '1' or data["status"] == '3' or data["status"] == '5':
try:
creation_date = datetime.datetime.strptime(data["created"], '%Y-%m-%d %H:%M:%S')
except ValueError, e:
creation_date = None
class_page = data["url"]
inserted, topic_id = self.insert_topic(str(class_id), data["name"], needs_enrollment=True, description=data["description"], creation_date=creation_date)
if inserted:
self.env.log("Inserted topic %s" % data["name"])
else:
self.env.log("Skipped topic %s" % data["name"])
inserted, item_id = self.insert_item(str(class_id), data["name"], class_page, itemtype=self.COURSE, has_topic=True, topic_id=topic_id, date=creation_date, description=data["description"])
if inserted:
self.env.log("Inserted item %s" % data["name"])
else:
self.env.log("Skipped item %s" % data["name"])
else:
self.env.log("Skipped %s due to status (%s)" % (data["name"], data["status_description"]))

@ -0,0 +1,26 @@
import inspect, os, sys
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))
def _import_module_into_scope(modulename):
module = __import__(modulename)
for name in vars(module):
data = getattr(module, name)
globals()[name] = data
sys.path.insert(0, my_path)
for fname in os.listdir(my_path):
fpath = os.path.join(my_path, fname)
fbasename, fext = os.path.splitext(fname)
if os.path.isdir(fpath):
if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
# This is a python directory module
_import_module_into_scope(fname)
elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
# This is a python file module
_import_module_into_scope(fbasename)
sys.path.remove(my_path)

@ -0,0 +1,17 @@
import oursql, sys
class Environment(object):
def connect(self, host="localhost", username="root", password="", database="learn"):
self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
self.connected = True
def log(self, text, is_error=False):
if is_error == False:
sys.stdout.write(text + "\n")
else:
sys.stderr.write(text + "\n")
def Scraper(self, scraper_class):
s = scraper_class(self.db)
s.env = self
return s

@ -0,0 +1,122 @@
class Scraper(object):
UNKNOWN = 0
TOPIC = 1
COURSE = 2
VIDEO = 3
ARTICLE = 4
EXERCISE = 5
QUIZ = 6
TEST = 7
BOOK = 8
AUDIOBOOK = 9
LECTURE = 10
SANDBOX = 11
provider_id = 0
def __init__(self, database=None):
if database is not None:
self.db = database
self.can_store = True
else:
self.can_store = False
def run(self, *args, **kwargs):
raise NotImplementedError("No run() method was specified for this scraper.")
def topic_exists(self, unique_id):
c = self.db.cursor()
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
return (len(c.fetchall()) > 0)
def item_exists(self, unique_id):
c = self.db.cursor()
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
return (len(c.fetchall()) > 0)
def insert_topic(self, unique_id, title, override=False, **kwargs):
defaults = {
"needs_enrollment": False,
"creation_date": None,
"start_date": None,
"end_date": None,
"parent_id": 0,
"description": "",
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
try:
if kwargs[kwarg] == None:
kwargs[kwarg] = defaults[kwarg]
except KeyError, e:
kwargs[kwarg] = defaults[kwarg]
c = self.db.cursor()
if override == True:
exists = False
else:
c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
results = c.fetchall()
exists = (len(results) > 0)
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))
return (True, c.lastrowid)
def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
defaults = {
"views": None,
"has_topic": False,
"itemtype": 0,
"source_url": item_url,
"topic_id": 0,
"parent_id": 0,
"description": "",
"date": None,
"start_date": None,
"end_date": None,
"provider_name": ""
}
for kwarg, val in defaults.iteritems():
try:
if kwargs[kwarg] == None:
kwargs[kwarg] = defaults[kwarg]
except KeyError, e:
kwargs[kwarg] = defaults[kwarg]
c = self.db.cursor()
if override == True:
exists = False
else:
c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
results = c.fetchall()
exists = (len(results) > 0)
if exists == True:
return (False, results[0][0])
else:
c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid)
def soup_to_text(self, soup):
strings = []
try:
for el in soup:
strings += el._all_strings(True, True)
except AttributeError, e:
strings = soup._all_strings(True, True)
return " ".join(strings)

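Adding a provider then comes down to subclassing `Scraper` with a unique `provider_id` and a `run()` method; a hedged sketch against the base class above (the provider id, unique ids, and URL are made up):

import shared

class ExampleProvider(shared.Scraper):
    provider_id = 99  # made-up provider id

    def run(self):
        inserted, topic_id = self.insert_topic("example-topic", "Example Topic")
        self.insert_item("example-item", "Example Item",
                         "http://example.com/item",
                         itemtype=self.COURSE, has_topic=True,
                         topic_id=topic_id)

env = shared.Environment()
env.connect()  # defaults: localhost / root / empty password / learn
env.Scraper(ExampleProvider).run()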
@ -0,0 +1,4 @@
import update_ocw
c = update_ocw.OpenCourseWareCrawler()
print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")

@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers
env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")
scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()

@ -1,131 +0,0 @@
import requests
import oursql
import datetime
import json
import lib
class KhanUniversityCrawler(object):
def __init__(self):
self.db = lib.Database("localhost", "root")
def retrieve_dataset(self):
self.dataset = requests.get("http://www.khanacademy.org/api/v1/topictree").json()
#self.dataset = json.loads(open("data.json", "r").read())
def parse_dataset(self):
self.process_item(self.dataset, 0)
def process_item(self, item, level, parent=None):
try:
kind = item["kind"]
except KeyError, e:
return
if kind == "Topic":
unique_id = item["id"]
try:
parent_id = parent["_cl_id"]
except TypeError, e:
parent_id = 0
if item["title"] is not None:
title = item["title"]
else:
title = ""
inserted, rowid = self.db.insert_topic(1, unique_id, title, description=item["description"], needs_enrollment=False)
item["_cl_id"] = rowid
if inserted:
print "Inserted %s" % title
else:
print "Skipped %s" % title
elif kind in ("Video", "Exercise", "Article"):
try:
unique_id = item["readable_id"]
except KeyError, e:
try:
unique_id = item["name"]
except KeyError, e:
try:
unique_id = str(item["id"])
except KeyError, e:
print repr(item)
sys.stderr.write("WARNING: No suitable identifier found for item\n")
raise
return
if item["kind"] == "Video":
itemtype = self.db.VIDEO
elif item["kind"] == "Exercise":
itemtype = self.db.EXERCISE
elif item["kind"] == "Article":
itemtype = self.db.ARTICLE
try:
source_url = item["ka_url"]
except KeyError, e:
if itemtype == self.db.ARTICLE:
source_url = ""
else:
return
try:
item_url = item["url"]
except KeyError, e:
try:
item_url = item["ka_url"]
except KeyError, e:
item_url = None
if itemtype == self.db.ARTICLE:
description = item["content"]
else:
try:
description = item["description"]
except KeyError, e:
description = None
try:
title = item["title"]
except KeyError, e:
try:
title = item["display_name"]
except KeyError, e:
title = "Untitled"
try:
views = item["views"]
except KeyError, e:
views = None
try:
date = datetime.datetime.strptime(item["date_added"], "%Y-%m-%dT%H:%M:%SZ")
except KeyError, e:
date = None
inserted, rowid = self.db.insert_item(1, unique_id, True, itemtype, title, item_url, source_url=source_url, description=description, views=views, topic_id=parent["_cl_id"], date=date)
item["_cl_id"] = rowid
if inserted:
print "Inserted %s" % title
else:
print "Skipped %s" % title
elif kind == "Separator":
pass # Ignore separators
else:
sys.stderr.write("Unrecognized kind: %s\n" % item["kind"])
sys.stderr.write("%s\n" % (repr(item)))
try:
children = item["children"]
except KeyError, e:
pass
else:
for child in children:
self.process_item(child, level + 1, item)
crawler = KhanUniversityCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()

@ -0,0 +1,288 @@
import requests
import oursql
import datetime
import json
import lib
from bs4 import BeautifulSoup
import bs4
def combine_dict(a, b):
c = a.copy()
c.update(b)
return c
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWareCrawler(object):
def __init__(self):
self.db = lib.Database("localhost", "root", password="")
def parse_catalog(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
self.parse_source(int(element["href"].split("/")[-1]), element.string)
def parse_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
print "# " + source_name
for course in courses[:2]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1])
def parse_course(self, course_name, course_url, course_id):
# First fetch metadata from ocwconsortium.org
print course_url
metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = metadata_soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self.parse_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
# Now fetch metadata from the particular course provider
provider_data = self.get_provider_data(course_url)
if provider_data != {}:
print repr(provider_data)
def parse_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["source"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
print "UNKNOWN: %s => %s" % (label, value)
return data
def get_provider_data(self, url):
providers = {
"oer.avu.org": self._data_avu,
"ocw.capilanou.ca": self._data_capilano,
"ocw.hokudai.ac.jp": self._data_hokkaido,
"ocw.ie.edu": self._data_ie,
"ocw.jhsph.edu": self._data_hopkins,
}
""",
"ocw.kaplan.edu": self._data_kaplan,
"ocw.korea.edu": self._data_korea,
"kyotomm.jp": self._data_kyoto,
"ocw.kyushu-u.ac.jp": self._data_kyushu,
"open-marhi.ru": self._data_moscow,
"yctrtrc.ncku.edu.tw": self._data_chengkung,
"ocw.nctu.edu.tw": self._data_chiaotung,
"opencourse.ndhu.edu.tw": self._data_donghwa,
"ocw.njit.edu": self._data_njit,
"graduateschool.paristech.fr": self._data_paris,
"peoples-uni.org": self._data_oaei,
"ocw.sbu.ac.ir": self._data_shahid,
"studentscircle.net": self._data_studentscircle,
"ocw.tmu.edu.tw:8080": self._data_taipei,
"openlearn.open.ac.uk": self._data_openuni,
"www.ocw.titech.ac.jp": self._data_tokyo,
"feedproxy.google.com": self._data_tudelft,
"ocw.tufts.edu": self._data_tufts,
"ocw.unu.edu": self._data_un,
"ocw.uc3m.es": self._data_madrid,
"ocw.ua.es": self._data_alicante,
"ocw.unican.es": self._data_cantabria,
"ocw.ugr.es": self._data_granada,
"ocw.udem.edu.mx": self._data_monterrey,
"ocw.um.es": self._data_murcia,
"ocw.uniovi.es": self._data_oviedo,
"ocw.usal.es": self._data_salamanca,
"ocwus.us.es": self._data_sevilla,
"ocw.unizar.es": self._data_zaragoza,
"ocw.univalle.edu.co3": self._data_colombia,
"ocw.uned.ac.cr": self._data_distancia,
"www.icesi.edu.co": self._data_icesi,
"ocw.innova.uned.es": self._data_innova,
"upv.es": self._data_valencia,
"ocw.upm.es": self._data_upm,
"ocw.utpl.edu.ec": self._data_utpl,
"ocw.uab.cat": self._data_uab,
"ocw.ub.edu": self._data_ub,
"ocw.uib.es": self._data_uib,
"ocw.udl.cat": self._data_udl,
"ocw.uv.es": self._data_uv,
"e-ujier.uji.e": self._data_uji,
"ocw.uoc.edu": self._data_uoc,
"ocw.utm.my": self._data_utm,
"ocw.uci.edu": self._data_uci,
"opencontent.uct.ac.za": self._data_uct,
"ocw.umb.edu:8080": self._data_boston,
"open.umich.edu": self._data_michigan,
"ocw.nd.edu": self._data_notredame,
"ocw.usu.ac.id": self._data_usu,
"ocw.tsukuba.ac.jp": self._data_tsukaba"""
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
data = func(url)
return data
def _data_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
print "UNKNOWN KEY: %s => %s" % (label, value)
return data
def _data_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _data_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _data_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _data_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()
return data
def parse_dataset(self):
for item in self.dataset:
self.process_item(item)
def process_item(self, item):
inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
if inserted:
print "Inserted %s" % item["name"]
else:
print "Skipped %s" % item["name"]
for course in item["courses"]:
self.process_course(course, rowid)
def process_course(self, course, topicid):
try:
start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
except TypeError, e:
start_date = None
title = "%s (date undetermined)" % (course["name"])
inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
if inserted:
print "\tInserted %s" % title
else:
print "\tSkipped %s" % title
#crawler = OpenCourseWareCrawler()
#crawler.parse_catalog()