From d98ee113bce7fefe690fce37ac02cc2b2631bc5c Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Thu, 31 Jan 2013 01:36:20 +0100
Subject: [PATCH] Rewrite generic OCW parser, fix BeautifulSoup to allow
 excluding comments from string retrieval, and fix a BS4 bug

---
 ocw_functions.txt              |  51 +++++
 updater/bs4/bs4/__init__.py    | 361 ---------------------------------
 updater/bs4/element.py         |   4 +-
 updater/scrapers/genericocw.py | 201 ++++++++++++++++++
 updater/shared/scraper.py      |  11 +
 updater/update.py              |   2 +-
 6 files changed, 267 insertions(+), 363 deletions(-)
 create mode 100644 ocw_functions.txt
 delete mode 100644 updater/bs4/bs4/__init__.py
 create mode 100644 updater/scrapers/genericocw.py

diff --git a/ocw_functions.txt b/ocw_functions.txt
new file mode 100644
index 0000000..fd90fe5
--- /dev/null
+++ b/ocw_functions.txt
@@ -0,0 +1,51 @@
+"ocw.kaplan.edu": self._metadata_kaplan,
+"ocw.korea.edu": self._metadata_korea,
+"kyotomm.jp": self._metadata_kyoto,
+"ocw.kyushu-u.ac.jp": self._metadata_kyushu,
+"open-marhi.ru": self._metadata_moscow,
+"yctrtrc.ncku.edu.tw": self._metadata_chengkung,
+"ocw.nctu.edu.tw": self._metadata_chiaotung,
+"opencourse.ndhu.edu.tw": self._metadata_donghwa,
+"ocw.njit.edu": self._metadata_njit,
+"graduateschool.paristech.fr": self._metadata_paris,
+"peoples-uni.org": self._metadata_oaei,
+"ocw.sbu.ac.ir": self._metadata_shahid,
+"studentscircle.net": self._metadata_studentscircle,
+"ocw.tmu.edu.tw:8080": self._metadata_taipei,
+"openlearn.open.ac.uk": self._metadata_openuni,
+"www.ocw.titech.ac.jp": self._metadata_tokyo,
+"feedproxy.google.com": self._metadata_tudelft,
+"ocw.tufts.edu": self._metadata_tufts,
+"ocw.unu.edu": self._metadata_un,
+"ocw.uc3m.es": self._metadata_madrid,
+"ocw.ua.es": self._metadata_alicante,
+"ocw.unican.es": self._metadata_cantabria,
+"ocw.ugr.es": self._metadata_granada,
+"ocw.udem.edu.mx": self._metadata_monterrey,
+"ocw.um.es": self._metadata_murcia,
+"ocw.uniovi.es": self._metadata_oviedo,
+"ocw.usal.es": self._metadata_salamanca,
+"ocwus.us.es": self._metadata_sevilla,
+"ocw.unizar.es": self._metadata_zaragoza,
+"ocw.univalle.edu.co": self._metadata_colombia,
+"ocw.uned.ac.cr": self._metadata_distancia,
+"www.icesi.edu.co": self._metadata_icesi,
+"ocw.innova.uned.es": self._metadata_innova,
+"upv.es": self._metadata_valencia,
+"ocw.upm.es": self._metadata_upm,
+"ocw.utpl.edu.ec": self._metadata_utpl,
+"ocw.uab.cat": self._metadata_uab,
+"ocw.ub.edu": self._metadata_ub,
+"ocw.uib.es": self._metadata_uib,
+"ocw.udl.cat": self._metadata_udl,
+"ocw.uv.es": self._metadata_uv,
+"e-ujier.uji.es": self._metadata_uji,
+"ocw.uoc.edu": self._metadata_uoc,
+"ocw.utm.my": self._metadata_utm,
+"ocw.uci.edu": self._metadata_uci,
+"opencontent.uct.ac.za": self._metadata_uct,
+"ocw.umb.edu:8080": self._metadata_boston,
+"open.umich.edu": self._metadata_michigan,
+"ocw.nd.edu": self._metadata_notredame,
+"ocw.usu.ac.id": self._metadata_usu,
+"ocw.tsukuba.ac.jp": self._metadata_tsukuba
diff --git a/updater/bs4/bs4/__init__.py b/updater/bs4/bs4/__init__.py
deleted file mode 100644
index fe2656b..0000000
--- a/updater/bs4/bs4/__init__.py
+++ /dev/null
@@ -1,361 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a pluggable XML or HTML parser to parse a
-(possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
-
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
-and/or html5lib is installed.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-"""
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.3"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
-__license__ = "MIT"
-
-__all__ = ['BeautifulSoup']
-
-import re
-import warnings
-
-from .builder import builder_registry
-from .dammit import UnicodeDammit
-from .element import (
-    CData,
-    Comment,
-    DEFAULT_OUTPUT_ENCODING,
-    Declaration,
-    Doctype,
-    NavigableString,
-    PageElement,
-    ProcessingInstruction,
-    ResultSet,
-    SoupStrainer,
-    Tag,
-    )
-
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
-class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
-
-    These methods will be called by the parser:
-      reset()
-      feed(markup)
-
-    The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
-
-    No matter how complicated the underlying parser is, you should be
-    able to build a tree using 'start tag' events, 'end tag' events,
-    'data' events, and "done with data" events.
-
-    If you encounter an empty-element tag (aka a self-closing tag,
-    like HTML's <br> tag), call handle_starttag and then
-    handle_endtag.
-    """
-    ROOT_TAG_NAME = u'[document]'
-
-    # If the end-user gives no indication which tree builder they
-    # want, look for one with these features.
-    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
-    # Used when determining whether a text node is all whitespace and
-    # can be replaced with a single space. A text node that contains
-    # fancy Unicode spaces (usually non-breaking) should be left
-    # alone.
-    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
-
-    def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
-
-        if 'convertEntities' in kwargs:
-            warnings.warn(
-                "BS4 does not respect the convertEntities argument to the "
-                "BeautifulSoup constructor. Entities are always converted "
-                "to Unicode characters.")
-
-        if 'markupMassage' in kwargs:
-            del kwargs['markupMassage']
-            warnings.warn(
-                "BS4 does not respect the markupMassage argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for any necessary markup massage.")
-
-        if 'smartQuotesTo' in kwargs:
-            del kwargs['smartQuotesTo']
-            warnings.warn(
-                "BS4 does not respect the smartQuotesTo argument to the "
-                "BeautifulSoup constructor. Smart quotes are always converted "
-                "to Unicode characters.")
-
-        if 'selfClosingTags' in kwargs:
-            del kwargs['selfClosingTags']
-            warnings.warn(
-                "BS4 does not respect the selfClosingTags argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for understanding self-closing tags.")
-
-        if 'isHTML' in kwargs:
-            del kwargs['isHTML']
-            warnings.warn(
-                "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
-
-        def deprecated_argument(old_name, new_name):
-            if old_name in kwargs:
-                warnings.warn(
-                    'The "%s" argument to the BeautifulSoup constructor '
-                    'has been renamed to "%s."' % (old_name, new_name))
-                value = kwargs[old_name]
-                del kwargs[old_name]
-                return value
-            return None
-
-        parse_only = parse_only or deprecated_argument(
-            "parseOnlyThese", "parse_only")
-
-        from_encoding = from_encoding or deprecated_argument(
-            "fromEncoding", "from_encoding")
-
-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-
-        if builder is None:
-            if isinstance(features, basestring):
-                features = [features]
-            if features is None or len(features) == 0:
-                features = self.DEFAULT_BUILDER_FEATURES
-            builder_class = builder_registry.lookup(*features)
-            if builder_class is None:
-                raise FeatureNotFound(
-                    "Couldn't find a tree builder with the features you "
-                    "requested: %s. Do you need to install a parser library?"
-                    % ",".join(features))
-            builder = builder_class()
-        self.builder = builder
-        self.is_xml = builder.is_xml
-        self.builder.soup = self
-
-        self.parse_only = parse_only
-
-        self.reset()
-
-        if hasattr(markup, 'read'):        # It's a file-type object.
-            markup = markup.read()
-        (self.markup, self.original_encoding, self.declared_html_encoding,
-         self.contains_replacement_characters) = (
-            self.builder.prepare_markup(markup, from_encoding))
-
-        try:
-            self._feed()
-        except StopParsing:
-            pass
-
-        # Clear out the markup and remove the builder's circular
-        # reference to this object.
-        self.markup = None
-        self.builder.soup = None
-
-    def _feed(self):
-        # Convert the document to Unicode.
-        self.builder.reset()
-
-        self.builder.feed(self.markup)
-        # Close out any unfinished strings and close all the open tags.
-        self.endData()
-        while self.currentTag.name != self.ROOT_TAG_NAME:
-            self.popTag()
-
-    def reset(self):
-        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
-        self.hidden = 1
-        self.builder.reset()
-        self.currentData = []
-        self.currentTag = None
-        self.tagStack = []
-        self.pushTag(self)
-
-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
-        """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
-
-    def new_string(self, s):
-        """Create a new NavigableString associated with this soup."""
-        navigable = NavigableString(s)
-        navigable.setup()
-        return navigable
-
-    def insert_before(self, successor):
-        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
-
-    def insert_after(self, successor):
-        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
-
-    def popTag(self):
-        tag = self.tagStack.pop()
-        #print "Pop", tag.name
-        if self.tagStack:
-            self.currentTag = self.tagStack[-1]
-        return self.currentTag
-
-    def pushTag(self, tag):
-        #print "Push", tag.name
-        if self.currentTag:
-            self.currentTag.contents.append(tag)
-        self.tagStack.append(tag)
-        self.currentTag = self.tagStack[-1]
-
-    def endData(self, containerClass=NavigableString):
-        if self.currentData:
-            currentData = u''.join(self.currentData)
-            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
-                not set([tag.name for tag in self.tagStack]).intersection(
-                    self.builder.preserve_whitespace_tags)):
-                if '\n' in currentData:
-                    currentData = '\n'
-                else:
-                    currentData = ' '
-            self.currentData = []
-            if self.parse_only and len(self.tagStack) <= 1 and \
-                   (not self.parse_only.text or \
-                    not self.parse_only.search(currentData)):
-                return
-            o = containerClass(currentData)
-            self.object_was_parsed(o)
-
-    def object_was_parsed(self, o, parent=None, previous_element=None):
-        """Add an object to the parse tree."""
-        parent = parent or self.currentTag
-        previous_element = previous_element or self.previous_element
-        o.setup(parent, previous_element)
-        if self.previous_element:
-            self.previous_element.next_element = o
-        self.previous_element = o
-        parent.contents.append(o)
-
-    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
-        """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
-        if name == self.ROOT_TAG_NAME:
-            return
-
-        numPops = 0
-        mostRecentTag = None
-
-        for i in range(len(self.tagStack) - 1, 0, -1):
-            if (name == self.tagStack[i].name
-                and nsprefix == self.tagStack[i].prefix):
-                numPops = len(self.tagStack) - i
-                break
-        if not inclusivePop:
-            numPops = numPops - 1
-
-        for i in range(0, numPops):
-            mostRecentTag = self.popTag()
-        return mostRecentTag
-
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
-        """Push a start tag on to the stack.
-
-        If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
-        in the document. For instance, if this was a self-closing tag,
-        don't call handle_endtag.
-        """
-
-        # print "Start tag %s: %s" % (name, attrs)
-        self.endData()
-
-        if (self.parse_only and len(self.tagStack) <= 1
-            and (self.parse_only.text
-                 or not self.parse_only.search_tag(name, attrs))):
-            return None
-
-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self.previous_element)
-        if tag is None:
-            return tag
-        if self.previous_element:
-            self.previous_element.next_element = tag
-        self.previous_element = tag
-        self.pushTag(tag)
-        return tag
-
-    def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
-        self.endData()
-        self._popToTag(name, nsprefix)
-
-    def handle_data(self, data):
-        self.currentData.append(data)
-
-    def decode(self, pretty_print=False,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
-
-        if self.is_xml:
-            # Print the XML declaration
-            encoding_part = ''
-            if eventual_encoding != None:
-                encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
-        else:
-            prefix = u''
-        if not pretty_print:
-            indent_level = None
-        else:
-            indent_level = 0
-        return prefix + super(BeautifulSoup, self).decode(
-            indent_level, eventual_encoding, formatter)
-
-class BeautifulStoneSoup(BeautifulSoup):
-    """Deprecated interface to an XML parser."""
-
-    def __init__(self, *args, **kwargs):
-        kwargs['features'] = 'xml'
-        warnings.warn(
-            'The BeautifulStoneSoup class is deprecated. Instead of using '
-            'it, pass features="xml" into the BeautifulSoup constructor.')
-        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
-
-
-class StopParsing(Exception):
-    pass
-
-
-class FeatureNotFound(ValueError):
-    pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
-    import sys
-    soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
diff --git a/updater/bs4/element.py b/updater/bs4/element.py
index 5989809..586d33d 100644
--- a/updater/bs4/element.py
+++ b/updater/bs4/element.py
@@ -832,11 +832,13 @@ class Tag(PageElement):
         self.clear()
         self.append(string.__class__(string))
 
-    def _all_strings(self, strip=False):
+    def _all_strings(self, strip=False, no_comments=False):
         """Yield all child strings, possibly stripping them."""
         for descendant in self.descendants:
             if not isinstance(descendant, NavigableString):
                 continue
+            if no_comments == True and isinstance(descendant, Comment):
+                continue
             if strip:
                 descendant = descendant.strip()
                 if len(descendant) == 0:
diff --git a/updater/scrapers/genericocw.py b/updater/scrapers/genericocw.py
new file mode 100644
index 0000000..6fdc69b
--- /dev/null
+++ b/updater/scrapers/genericocw.py
@@ -0,0 +1,201 @@
+import requests
+import oursql
+import datetime
+import json
+import sys, os
+import shared
+
+from bs4 import BeautifulSoup
+import bs4
+
+rsess = requests.Session()
+rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
+
+class OpenCourseWare(shared.Scraper):
+	def run(self):
+		overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
+		soup = BeautifulSoup(overview)
+
+		for element in soup.find(id="pagecontent")("a"):
+			#if "Hopkins" not in element.string:
+			#	continue
+			self.process_source(int(element["href"].split("/")[-1]), element.string)
+
+	def process_source(self, source_id, source_name):
+		data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
+		soup = BeautifulSoup(data)
+
+		courses = soup.select("table#cfResultsTable tr")
+
+		for course in courses[:3]:
+			links = course("a")
+
+			if len(links) > 0:
+				external = links[0]
+				details = links[1]
+
+				self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
+
+	def parse_course(self, course_name, course_url, course_id, source_name):
+		self.env.log("Parsing %s" % course_url)
+
+		# First fetch metadata from ocwconsortium.org
+		ocw_data = self._metadata_ocw(course_id)
+		ocw_data["providername"] = source_name
+		ocw_data["url"] = course_url
+
+		# Now fetch metadata from the particular course provider
+		provider_data = self._metadata_provider(course_url)
+
+		if provider_data != False:
+			data = ocw_data.copy()
+			data.update(provider_data)
+
+			# TODO: insert data
+			self.env.log(repr(data))
+
+	def _metadata_ocw(self, course_id):
+		soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
+		metadata = soup.select("dl.coursepage")[0]
+
+		if len(metadata) > 0:
+			data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
+		else:
+			# No metadata provided by ocwconsortium.
+			data = {}
+
+		return data
+
+	def _parse_ocw_dl(self, dd, dt):
+		data = {}
+
+		for i in xrange(0, len(dd)):
+			label = dd[i].string.strip().rstrip(":")
+			value = dt[i].string
+
+			if value is not None:
+				value = value.strip()
+
+			if label == "Tags":
+				if value == None:
+					data["tags"] = []
+				else:
+					data["tags"] = [x.strip() for x in value.split(",")]
+			elif label == "Source":
+				data["providername"] = value
+			elif label == "Language":
+				data["language"] = value
+			elif label == "Link":
+				# We can ignore this, we already have it anyway
+				pass
+			elif label == "Author":
+				if value == None:
+					data["author"] = None
+				else:
+					data["author"] = value
+			elif label == "License":
+				if value == None:
+					data["license"] = None
+				else:
+					data["license"] = value
+			elif label == "Date Published":
+				data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
+			else:
+				self.env.log("UNKNOWN: %s => %s" % (label, value), True)
+
+		return data
+
+	def _metadata_provider(self, url):
+		providers = {
+			"oer.avu.org": self._metadata_avu,
+			"ocw.capilanou.ca": self._metadata_capilano,
+			"ocw.hokudai.ac.jp": self._metadata_hokkaido,
+			"ocw.ie.edu": self._metadata_ie,
+			"ocw.jhsph.edu": self._metadata_hopkins,
+		}
+
+		host = url.split("/")[2]
+		data = {}
+
+		for provider, func in providers.iteritems():
+			if host.endswith(provider):
+				return func(url)
+
+		return False
+
+	def _metadata_avu(self, url):
+		# African Virtual University
+		soup = BeautifulSoup(rsess.get(url + "?show=full").text)
+		table = soup.select("table.ds-includeSet-table")[0]
+		data = {"providername": "African Virtual University"}
+
+		for row in table("tr"):
+			cells = row("td")
+			label = cells[0].string
+			value = cells[1].string
+
+			if label == "dc.identifier.uri":
+				data["identifier_uri"] = value
+			elif label == "dc.type":
+				data["object_type"] = value
+			elif label == "dc.date.accessioned":
+				data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
+			elif label == "dc.date.issued":
+				data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
+			elif label == "dc.date.available":
+				data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
+			elif label == "dc.language.iso":
+				data["language"] = value
+			elif label == "dc.description.abstract":
+				data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
+			elif label == "dc.contributor.author":
+				data["author"] = value
+			elif label == "dc.title":
+				data["title"] = value
+			else:
+				self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
+
+		return data
+
+	def _metadata_capilano(self, url):
+		# Capilano University
+		soup = BeautifulSoup(rsess.get(url).text)
+		data = {"providername": "Capilano University"}
+
+		data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
+		data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
+
+		return data
+
+	def _metadata_hokkaido(self, url):
+		# Hokkaido University
+		soup = BeautifulSoup(rsess.get(url).text)
+		data = {"providername": "Hokkaido University"}
+
+		data["title"] = soup.select("#MAIN h1")[0].string.strip()
+		data["description"] = soup.select("#MAIN p")[0].string.strip()
+
+		return data
+
+	def _metadata_ie(self, url):
+		# IE University
+		course_id = url.split("=")[1]
+		soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
+		data = {"providername": "IE University"}
+
+		data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
+		data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
+		data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
+
+		return data
+
+	def _metadata_hopkins(self, url):
+		# Johns Hopkins Bloomberg School of Public Health
+		soup = BeautifulSoup(rsess.get(url).text)
+		data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
+
+		data["title"] = self.soup_to_text(soup.select("h1")[-1])
+		data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
+		data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
+
+		return data
diff --git a/updater/shared/scraper.py b/updater/shared/scraper.py
index df1978d..f04cbe3 100644
--- a/updater/shared/scraper.py
+++ b/updater/shared/scraper.py
@@ -109,3 +109,14 @@ class Scraper(object):
 			kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"],
 			kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
 		return (True, c.lastrowid)
+
+	def soup_to_text(self, soup):
+		strings = []
+
+		try:
+			for el in soup:
+				strings += el._all_strings(True, True)
+		except AttributeError, e:
+			strings = soup._all_strings(True, True)
+
+		return " ".join(strings)
diff --git a/updater/update.py b/updater/update.py
index 4d4d335..ef021ef 100644
--- a/updater/update.py
+++ b/updater/update.py
@@ -4,5 +4,5 @@ import shared, scrapers
 env = shared.Environment()
 env.connect(host="localhost", username="root", password="", database="learn")
 
-scraper = env.Scraper(scrapers.UniversityOfReddit)
+scraper = env.Scraper(scrapers.OpenCourseWare)
 scraper.run()