Rewrite generic OCW parser, patch BeautifulSoup to allow excluding comments during string retrieval, and fix a BS4 bug

develop
Sven Slootweg 11 years ago
parent 98340b38a0
commit d98ee113bc

@@ -0,0 +1,51 @@
"ocw.kaplan.edu": self._metadata_kaplan,
"ocw.korea.edu": self._metadata_korea,
"kyotomm.jp": self._metadata_kyoto,
"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
"open-marhi.ru": self._metadata_moscow,
"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
"ocw.nctu.edu.tw": self._metadata_chiaotung,
"opencourse.ndhu.edu.tw": self._metadata_donghwa,
"ocw.njit.edu": self._metadata_njit,
"graduateschool.paristech.fr": self._metadata_paris,
"peoples-uni.org": self._metadata_oaei,
"ocw.sbu.ac.ir": self._metadata_shahid,
"studentscircle.net": self._metadata_studentscircle,
"ocw.tmu.edu.tw:8080": self._metadata_taipei,
"openlearn.open.ac.uk": self._metadata_openuni,
"www.ocw.titech.ac.jp": self._metadata_tokyo,
"feedproxy.google.com": self._metadata_tudelft,
"ocw.tufts.edu": self._metadata_tufts,
"ocw.unu.edu": self._metadata_un,
"ocw.uc3m.es": self._metadata_madrid,
"ocw.ua.es": self._metadata_alicante,
"ocw.unican.es": self._metadata_cantabria,
"ocw.ugr.es": self._metadata_granada,
"ocw.udem.edu.mx": self._metadata_monterrey,
"ocw.um.es": self._metadata_murcia,
"ocw.uniovi.es": self._metadata_oviedo,
"ocw.usal.es": self._metadata_salamanca,
"ocwus.us.es": self._metadata_sevilla,
"ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co3": self._metadata_colombia,
"ocw.uned.ac.cr": self._metadata_distancia,
"www.icesi.edu.co": self._metadata_icesi,
"ocw.innova.uned.es": self._metadata_innova,
"upv.es": self._metadata_valencia,
"ocw.upm.es": self._metadata_upm,
"ocw.utpl.edu.ec": self._metadata_utpl,
"ocw.uab.cat": self._metadata_uab,
"ocw.ub.edu": self._metadata_ub,
"ocw.uib.es": self._metadata_uib,
"ocw.udl.cat": self._metadata_udl,
"ocw.uv.es": self._metadata_uv,
"e-ujier.uji.e": self._metadata_uji,
"ocw.uoc.edu": self._metadata_uoc,
"ocw.utm.my": self._metadata_utm,
"ocw.uci.edu": self._metadata_uci,
"opencontent.uct.ac.za": self._metadata_uct,
"ocw.umb.edu:8080": self._metadata_boston,
"open.umich.edu": self._metadata_michigan,
"ocw.nd.edu": self._metadata_notredame,
"ocw.usu.ac.id": self._metadata_usu,
"ocw.tsukuba.ac.jp": self._metadata_tsukaba

@@ -1,361 +0,0 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, previous_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
previous_element = previous_element or self.previous_element
o.setup(parent, previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].prefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
class FeatureNotFound(ValueError):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
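The docstring of this deleted file describes the event interface that tree builders drive. Purely as an illustration (not part of this commit), hand-feeding the events for <p>hi<br></p> to a soup object would look roughly like this:

from bs4 import BeautifulSoup

soup = BeautifulSoup("")                    # empty soup, driven by hand below
soup.handle_starttag("p", None, None, {})   # 'start tag' event
soup.handle_data("hi")                      # 'data' event, buffered
soup.handle_starttag("br", None, None, {})  # empty-element tag: start tag...
soup.handle_endtag("br")                    # ...immediately followed by end tag
soup.handle_endtag("p")                     # flushes the buffered data via endData()
print(soup.p)                               # roughly: <p>hi<br/></p>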

@@ -832,11 +832,13 @@ class Tag(PageElement):
self.clear()
self.append(string.__class__(string))
def _all_strings(self, strip=False):
def _all_strings(self, strip=False, no_comments=False):
"""Yield all child strings, possibly stripping them."""
for descendant in self.descendants:
if not isinstance(descendant, NavigableString):
continue
if no_comments and isinstance(descendant, Comment):
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
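This hunk is the comment-exclusion change named in the commit message: Comment is a NavigableString subclass, so the stock _all_strings happily yields comment text. A minimal sketch of the difference, assuming this patched bs4 copy is importable:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>visible<!-- internal note --> text</p>")
# Stock behaviour: the comment leaks into the extracted strings.
print(" ".join(soup.p._all_strings(strip=True)))                    # visible internal note text
# With the patch applied: comments are skipped.
print(" ".join(soup.p._all_strings(strip=True, no_comments=True)))  # visible text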

@@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared
from bs4 import BeautifulSoup
import bs4
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWare(shared.Scraper):
def run(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
#if "Hopkins" not in element.string:
# continue
self.process_source(int(element["href"].split("/")[-1]), element.string)
def process_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
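# NOTE: only the first three result rows are processed here; this looks like a development limit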
for course in courses[:3]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
def parse_course(self, course_name, course_url, course_id, source_name):
self.env.log("Parsing %s" % course_url)
# First fetch metadata from ocwconsortium.org
ocw_data = self._metadata_ocw(course_id)
ocw_data["providername"] = source_name
ocw_data["url"] = course_url
# Now fetch metadata from the particular course provider
provider_data = self._metadata_provider(course_url)
if provider_data is not False:
data = ocw_data.copy()
data.update(provider_data)
# TODO: insert data
self.env.log(repr(data))
def _metadata_ocw(self, course_id):
soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
return data
def _parse_ocw_dl(self, dd, dt):
data = {}
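# NOTE: labels are read from the <dd> cells and values from the <dt> cells, which is
# inverted relative to usual <dl> semantics; presumably this matches the site's markup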
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["providername"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
self.env.log("UNKNOWN: %s => %s" % (label, value), True)
return data
def _metadata_provider(self, url):
providers = {
"oer.avu.org": self._metadata_avu,
"ocw.capilanou.ca": self._metadata_capilano,
"ocw.hokudai.ac.jp": self._metadata_hokkaido,
"ocw.ie.edu": self._metadata_ie,
"ocw.jhsph.edu": self._metadata_hopkins,
}
host = url.split("/")[2]
for provider, func in providers.iteritems():
if host.endswith(provider):
return func(url)
return False
def _metadata_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
return data
def _metadata_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _metadata_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _metadata_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _metadata_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = self.soup_to_text(soup.select("h1")[-1])
data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
return data
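Two fragile spots worth flagging in the scraper above: the "Date Published" branch in _parse_ocw_dl hard-codes a US-style month-name format, and _metadata_provider grabs the host with url.split("/")[2], which assumes an absolute URL. A hedged sketch of both assumptions:

import datetime
from urlparse import urlparse  # urllib.parse on Python 3

# The strptime format only matches dates shaped like "Mar 14, 2012".
print(datetime.datetime.strptime("Mar 14, 2012", "%b %d, %Y"))  # 2012-03-14 00:00:00

# urlparse is the defensive spelling of url.split("/")[2]; like the split,
# it keeps any port (cf. the ocw.tmu.edu.tw:8080 entry in the provider table).
print(urlparse("http://ocw.tmu.edu.tw:8080/course/1").netloc)   # ocw.tmu.edu.tw:8080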

@@ -109,3 +109,14 @@ class Scraper(object):
kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
return (True, c.lastrowid)
def soup_to_text(self, soup):
strings = []
try:
for el in soup:
strings += el._all_strings(True, True)
except AttributeError:
strings = soup._all_strings(True, True)
return " ".join(strings)

@@ -4,5 +4,5 @@ import shared, scrapers
env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")
scraper = env.Scraper(scrapers.UniversityOfReddit)
scraper = env.Scraper(scrapers.OpenCourseWare)
scraper.run()
