Rewrite the generic OCW parser, patch BeautifulSoup so string retrieval can exclude comments, and fix a BS4 bug
parent 98340b38a0
commit d98ee113bc
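Aside: the "exclude comments" part of the title targets the fact that bs4 Comment nodes subclass NavigableString, so plain text searches return them too. A rough illustration with the stock bs4 API (not the patch itself):

    from bs4 import BeautifulSoup, Comment

    soup = BeautifulSoup("<p>visible<!-- hidden --></p>")
    strings = [s for s in soup.find_all(text=True)
               if not isinstance(s, Comment)]  # drops the comment node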
@@ -0,0 +1,51 @@
            "ocw.kaplan.edu": self._metadata_kaplan,
            "ocw.korea.edu": self._metadata_korea,
            "kyotomm.jp": self._metadata_kyoto,
            "ocw.kyushu-u.ac.jp": self._metadata_kyushu,
            "open-marhi.ru": self._metadata_moscow,
            "yctrtrc.ncku.edu.tw": self._metadata_chengkung,
            "ocw.nctu.edu.tw": self._metadata_chiaotung,
            "opencourse.ndhu.edu.tw": self._metadata_donghwa,
            "ocw.njit.edu": self._metadata_njit,
            "graduateschool.paristech.fr": self._metadata_paris,
            "peoples-uni.org": self._metadata_oaei,
            "ocw.sbu.ac.ir": self._metadata_shahid,
            "studentscircle.net": self._metadata_studentscircle,
            "ocw.tmu.edu.tw:8080": self._metadata_taipei,
            "openlearn.open.ac.uk": self._metadata_openuni,
            "www.ocw.titech.ac.jp": self._metadata_tokyo,
            "feedproxy.google.com": self._metadata_tudelft,
            "ocw.tufts.edu": self._metadata_tufts,
            "ocw.unu.edu": self._metadata_un,
            "ocw.uc3m.es": self._metadata_madrid,
            "ocw.ua.es": self._metadata_alicante,
            "ocw.unican.es": self._metadata_cantabria,
            "ocw.ugr.es": self._metadata_granada,
            "ocw.udem.edu.mx": self._metadata_monterrey,
            "ocw.um.es": self._metadata_murcia,
            "ocw.uniovi.es": self._metadata_oviedo,
            "ocw.usal.es": self._metadata_salamanca,
            "ocwus.us.es": self._metadata_sevilla,
            "ocw.unizar.es": self._metadata_zaragoza,
"ocw.univalle.edu.co3": self._metadata_colombia,
            "ocw.uned.ac.cr": self._metadata_distancia,
            "www.icesi.edu.co": self._metadata_icesi,
            "ocw.innova.uned.es": self._metadata_innova,
            "upv.es": self._metadata_valencia,
            "ocw.upm.es": self._metadata_upm,
            "ocw.utpl.edu.ec": self._metadata_utpl,
            "ocw.uab.cat": self._metadata_uab,
            "ocw.ub.edu": self._metadata_ub,
            "ocw.uib.es": self._metadata_uib,
            "ocw.udl.cat": self._metadata_udl,
            "ocw.uv.es": self._metadata_uv,
"e-ujier.uji.e": self._metadata_uji,
            "ocw.uoc.edu": self._metadata_uoc,
            "ocw.utm.my": self._metadata_utm,
            "ocw.uci.edu": self._metadata_uci,
            "opencontent.uct.ac.za": self._metadata_uct,
            "ocw.umb.edu:8080": self._metadata_boston,
            "open.umich.edu": self._metadata_michigan,
            "ocw.nd.edu": self._metadata_notredame,
            "ocw.usu.ac.id": self._metadata_usu,
            "ocw.tsukuba.ac.jp": self._metadata_tsukaba
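Aside: these entries extend the host-to-parser dispatch table in _metadata_provider (added in full in the new file below); lookup is by host suffix:

    host = url.split("/")[2]            # e.g. "ocw.kaplan.edu"
    for provider, func in providers.iteritems():
        if host.endswith(provider):
            return func(url)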
@@ -1,361 +0,0 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):  # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, previous_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = previous_element or self.previous_element
        o.setup(parent, previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
|
|
||||||
instance of the given tag. If inclusivePop is false, pops the tag
|
|
||||||
stack up to but *not* including the most recent instqance of
|
|
||||||
the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
|
|
||||||
|
|
||||||
If this method returns None, the tag was rejected by the
|
|
||||||
SoupStrainer. You should proceed as if the tag had not occured
|
|
||||||
in the document. For instance, if this was a self-closing tag,
|
|
||||||
don't call handle_endtag.
|
|
||||||
"""
        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


class FeatureNotFound(ValueError):
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
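Aside: the BeautifulSoup class docstring above describes the event interface the tree builders drive. A toy, hand-driven illustration (not how the library is normally used):

    soup = BeautifulSoup("")
    soup.handle_starttag("p", None, None, {})  # open a <p> tag
    soup.handle_data("hello")                  # accumulate text
    soup.handle_endtag("p")                    # flush the text node, pop <p>
    print soup.p.string                        # -> hello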
@@ -0,0 +1,201 @@
import requests
import oursql
import datetime
import json
import sys, os
import shared

from bs4 import BeautifulSoup
import bs4

rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'

class OpenCourseWare(shared.Scraper):
    def run(self):
        overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
        soup = BeautifulSoup(overview)

        for element in soup.find(id="pagecontent")("a"):
            #if "Hopkins" not in element.string:
            #    continue
            self.process_source(int(element["href"].split("/")[-1]), element.string)

    def process_source(self, source_id, source_name):
        data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
        soup = BeautifulSoup(data)

        courses = soup.select("table#cfResultsTable tr")

        for course in courses[:3]:
            links = course("a")

            if len(links) > 0:
                external = links[0]
                details = links[1]

                self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)

    def parse_course(self, course_name, course_url, course_id, source_name):
        self.env.log("Parsing %s" % course_url)

        # First fetch metadata from ocwconsortium.org
        ocw_data = self._metadata_ocw(course_id)
        ocw_data["providername"] = source_name
        ocw_data["url"] = course_url

        # Now fetch metadata from the particular course provider
        provider_data = self._metadata_provider(course_url)

        if provider_data != False:
            data = ocw_data.copy()
            data.update(provider_data)

            # TODO: insert data
            self.env.log(repr(data))

    def _metadata_ocw(self, course_id):
        soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
        metadata = soup.select("dl.coursepage")[0]

        if len(metadata) > 0:
            data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
        else:
            # No metadata provided by ocwconsortium.
            data = {}

        return data

    def _parse_ocw_dl(self, dd, dt):
        data = {}

        for i in xrange(0, len(dd)):
            label = dd[i].string.strip().rstrip(":")
            value = dt[i].string

            if value is not None:
                value = value.strip()

            if label == "Tags":
                if value == None:
                    data["tags"] = []
                else:
                    data["tags"] = [x.strip() for x in value.split(",")]
            elif label == "Source":
                data["providername"] = value
            elif label == "Language":
                data["language"] = value
            elif label == "Link":
                # We can ignore this, we already have it anyway
                pass
            elif label == "Author":
                if value == None:
                    data["author"] = None
                else:
                    data["author"] = value
            elif label == "License":
                if value == None:
                    data["license"] = None
                else:
                    data["license"] = value
            elif label == "Date Published":
                data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
            else:
                self.env.log("UNKNOWN: %s => %s" % (label, value), True)

        return data

    def _metadata_provider(self, url):
        providers = {
            "oer.avu.org": self._metadata_avu,
            "ocw.capilanou.ca": self._metadata_capilano,
            "ocw.hokudai.ac.jp": self._metadata_hokkaido,
            "ocw.ie.edu": self._metadata_ie,
            "ocw.jhsph.edu": self._metadata_hopkins,
        }

        host = url.split("/")[2]
        data = {}

        for provider, func in providers.iteritems():
            if host.endswith(provider):
                return func(url)

        return False
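    # Aside (sketch, not part of the original commit): url.split("/")[2]
    # assumes a well-formed absolute URL. A more defensive host extraction
    # using the Python 2 stdlib, matching this codebase, would be:
    #
    #   from urlparse import urlparse
    #   host = urlparse(url).netloc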
    def _metadata_avu(self, url):
        # African Virtual University
        soup = BeautifulSoup(rsess.get(url + "?show=full").text)
        table = soup.select("table.ds-includeSet-table")[0]
        data = {"providername": "African Virtual University"}

        for row in table("tr"):
            cells = row("td")
            label = cells[0].string
            value = cells[1].string

            if label == "dc.identifier.uri":
                data["identifier_uri"] = value
            elif label == "dc.type":
                data["object_type"] = value
            elif label == "dc.date.accessioned":
                data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.date.issued":
                data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
            elif label == "dc.date.available":
                data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            elif label == "dc.language.iso":
                data["language"] = value
            elif label == "dc.description.abstract":
                data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
            elif label == "dc.contributor.author":
                data["author"] = value
            elif label == "dc.title":
                data["title"] = value
            else:
                self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)

        return data

    def _metadata_capilano(self, url):
        # Capilano University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Capilano University"}

        data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
        data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()

        return data

    def _metadata_hokkaido(self, url):
        # Hokkaido University
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Hokkaido University"}

        data["title"] = soup.select("#MAIN h1")[0].string.strip()
        data["description"] = soup.select("#MAIN p")[0].string.strip()

        return data

    def _metadata_ie(self, url):
        # IE University
        course_id = url.split("=")[1]
        soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
        data = {"providername": "IE University"}

        data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
        data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
        data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()

        return data

    def _metadata_hopkins(self, url):
        # Johns Hopkins Bloomberg School of Public Health
        soup = BeautifulSoup(rsess.get(url).text)
        data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}

        data["title"] = self.soup_to_text(soup.select("h1")[-1])
        data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
        data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))

        return data