From a690cb2c8fc63395ce34a9acf7f729704edb5620 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Wed, 30 Jan 2013 13:41:27 +0100
Subject: [PATCH] Add rudimentary first version of the OCW scraper

---
 updater/test_ocw.py   |   4 +
 updater/update_ocw.py | 288 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 updater/test_ocw.py
 create mode 100644 updater/update_ocw.py

diff --git a/updater/test_ocw.py b/updater/test_ocw.py
new file mode 100644
index 0000000..f4cb7bd
--- /dev/null
+++ b/updater/test_ocw.py
@@ -0,0 +1,4 @@
+import update_ocw
+
+c = update_ocw.OpenCourseWareCrawler()
+print c.get_provider_data("http://ocw.jhsph.edu/courses/AdolHealthDev/?source=rss")
diff --git a/updater/update_ocw.py b/updater/update_ocw.py
new file mode 100644
index 0000000..76a0447
--- /dev/null
+++ b/updater/update_ocw.py
@@ -0,0 +1,288 @@
+import requests
+import oursql
+import datetime
+import json
+import lib
+from bs4 import BeautifulSoup
+import bs4
+
+def combine_dict(a, b):
+    c = a.copy()
+    c.update(b)
+    return c
+
+rsess = requests.Session()
+rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
+
+class OpenCourseWareCrawler(object):
+    def __init__(self):
+        self.db = lib.Database("localhost", "root", password="")
+
+    def parse_catalog(self):
+        overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
+        soup = BeautifulSoup(overview)
+
+        # Calling a Tag is shorthand for find_all(), so this iterates over
+        # every link in the page content.
+        for element in soup.find(id="pagecontent")("a"):
+            self.parse_source(int(element["href"].split("/")[-1]), element.string)
+
+    def parse_source(self, source_id, source_name):
+        data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
+        soup = BeautifulSoup(data)
+
+        courses = soup.select("table#cfResultsTable tr")
+
+        print "# " + source_name
+
+        # NOTE: only the first two rows are processed for now.
+        for course in courses[:2]:
+            links = course("a")
+
+            # Skip rows (such as the table header) that don't carry both the
+            # external course link and the details link.
+            if len(links) >= 2:
+                external = links[0]
+                details = links[1]
+
+                self.parse_course(external.string, external["href"], details["href"].split("/")[-1])
+
+    def parse_course(self, course_name, course_url, course_id):
+        # First fetch metadata from ocwconsortium.org
+        print course_url
+
+        metadata_soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
+
+        metadata = metadata_soup.select("dl.coursepage")
+
+        if len(metadata) > 0:
+            data = self.parse_dl(metadata[0].select("dt"), metadata[0].select("dd"))
+        else:
+            # No metadata provided by ocwconsortium.
+            data = {}
+
+        # Now fetch metadata from the particular course provider
+        provider_data = self.get_provider_data(course_url)
+
+        if provider_data:
+            print repr(provider_data)
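+    # parse_dl walks the paired <dt>/<dd> elements of the metadata
+    # definition list, assuming the usual HTML layout where <dt> holds the
+    # field label ("Tags:", "Author:", ...) and <dd> holds its value, and
+    # maps those labels onto our own field names.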
+    def parse_dl(self, dts, dds):
+        data = {}
+
+        for i in xrange(0, len(dts)):
+            label = dts[i].string.strip().rstrip(":")
+            value = dds[i].string
+
+            if value is not None:
+                value = value.strip()
+
+            if label == "Tags":
+                if value is None:
+                    data["tags"] = []
+                else:
+                    data["tags"] = [x.strip() for x in value.split(",")]
+            elif label == "Source":
+                data["source"] = value
+            elif label == "Language":
+                data["language"] = value
+            elif label == "Link":
+                # We can ignore this, we already have it anyway
+                pass
+            elif label == "Author":
+                data["author"] = value
+            elif label == "License":
+                data["license"] = value
+            elif label == "Date Published":
+                data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
+            else:
+                print "UNKNOWN: %s => %s" % (label, value)
+
+        return data
+
+    def get_provider_data(self, url):
+        providers = {
+            "oer.avu.org": self._data_avu,
+            "ocw.capilanou.ca": self._data_capilano,
+            "ocw.hokudai.ac.jp": self._data_hokkaido,
+            "ocw.ie.edu": self._data_ie,
+            "ocw.jhsph.edu": self._data_hopkins,
+        }
+
+        # Providers that still need a _data_* scraper:
+        """
+        "ocw.kaplan.edu": self._data_kaplan,
+        "ocw.korea.edu": self._data_korea,
+        "kyotomm.jp": self._data_kyoto,
+        "ocw.kyushu-u.ac.jp": self._data_kyushu,
+        "open-marhi.ru": self._data_moscow,
+        "yctrtrc.ncku.edu.tw": self._data_chengkung,
+        "ocw.nctu.edu.tw": self._data_chiaotung,
+        "opencourse.ndhu.edu.tw": self._data_donghwa,
+        "ocw.njit.edu": self._data_njit,
+        "graduateschool.paristech.fr": self._data_paris,
+        "peoples-uni.org": self._data_oaei,
+        "ocw.sbu.ac.ir": self._data_shahid,
+        "studentscircle.net": self._data_studentscircle,
+        "ocw.tmu.edu.tw:8080": self._data_taipei,
+        "openlearn.open.ac.uk": self._data_openuni,
+        "www.ocw.titech.ac.jp": self._data_tokyo,
+        "feedproxy.google.com": self._data_tudelft,
+        "ocw.tufts.edu": self._data_tufts,
+        "ocw.unu.edu": self._data_un,
+        "ocw.uc3m.es": self._data_madrid,
+        "ocw.ua.es": self._data_alicante,
+        "ocw.unican.es": self._data_cantabria,
+        "ocw.ugr.es": self._data_granada,
+        "ocw.udem.edu.mx": self._data_monterrey,
+        "ocw.um.es": self._data_murcia,
+        "ocw.uniovi.es": self._data_oviedo,
+        "ocw.usal.es": self._data_salamanca,
+        "ocwus.us.es": self._data_sevilla,
+        "ocw.unizar.es": self._data_zaragoza,
+        "ocw.univalle.edu.co3": self._data_colombia,
+        "ocw.uned.ac.cr": self._data_distancia,
+        "www.icesi.edu.co": self._data_icesi,
+        "ocw.innova.uned.es": self._data_innova,
+        "upv.es": self._data_valencia,
+        "ocw.upm.es": self._data_upm,
+        "ocw.utpl.edu.ec": self._data_utpl,
+        "ocw.uab.cat": self._data_uab,
+        "ocw.ub.edu": self._data_ub,
+        "ocw.uib.es": self._data_uib,
+        "ocw.udl.cat": self._data_udl,
+        "ocw.uv.es": self._data_uv,
+        "e-ujier.uji.e": self._data_uji,
+        "ocw.uoc.edu": self._data_uoc,
+        "ocw.utm.my": self._data_utm,
+        "ocw.uci.edu": self._data_uci,
+        "opencontent.uct.ac.za": self._data_uct,
+        "ocw.umb.edu:8080": self._data_boston,
+        "open.umich.edu": self._data_michigan,
+        "ocw.nd.edu": self._data_notredame,
+        "ocw.usu.ac.id": self._data_usu,
+        "ocw.tsukuba.ac.jp": self._data_tsukaba
+        """
+
+        host = url.split("/")[2]
+        data = {}
+
+        for provider, func in providers.iteritems():
+            if host.endswith(provider):
+                data = func(url)
+
+        return data
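+    # Provider-specific scrapers. Each _data_* method fetches the course
+    # page from one OCW provider and returns a dict of normalized fields
+    # (providername, title, description, author, ...); get_provider_data
+    # dispatches to the right one based on the hostname of the course URL.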
+    def _data_avu(self, url):
+        # African Virtual University
+        soup = BeautifulSoup(rsess.get(url + "?show=full").text)
+        table = soup.select("table.ds-includeSet-table")[0]
+        data = {"providername": "African Virtual University"}
+
+        for row in table("tr"):
+            cells = row("td")
+            label = cells[0].string
+            value = cells[1].string
+
+            if label == "dc.identifier.uri":
+                data["identifier_uri"] = value
+            elif label == "dc.type":
+                data["object_type"] = value
+            elif label == "dc.date.accessioned":
+                data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
+            elif label == "dc.date.issued":
+                data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
+            elif label == "dc.date.available":
+                data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
+            elif label == "dc.language.iso":
+                data["language"] = value
+            elif label == "dc.description.abstract":
+                data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
+            elif label == "dc.contributor.author":
+                data["author"] = value
+            elif label == "dc.title":
+                data["title"] = value
+            else:
+                print "UNKNOWN KEY: %s => %s" % (label, value)
+
+        return data
+
+    def _data_capilano(self, url):
+        # Capilano University
+        soup = BeautifulSoup(rsess.get(url).text)
+        data = {"providername": "Capilano University"}
+
+        data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
+        data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
+
+        return data
+
+    def _data_hokkaido(self, url):
+        # Hokkaido University
+        soup = BeautifulSoup(rsess.get(url).text)
+        data = {"providername": "Hokkaido University"}
+
+        data["title"] = soup.select("#MAIN h1")[0].string.strip()
+        data["description"] = soup.select("#MAIN p")[0].string.strip()
+
+        return data
+
+    def _data_ie(self, url):
+        # IE University
+        course_id = url.split("=")[1]
+        soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
+        data = {"providername": "IE University"}
+
+        data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
+        data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
+        data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
+
+        return data
+
+    def _data_hopkins(self, url):
+        # Johns Hopkins Bloomberg School of Public Health
+        soup = BeautifulSoup(rsess.get(url).text)
+        data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
+
+        data["title"] = " ".join(x.strip() for x in soup.select("h1")[-1].strings if type(x) != bs4.element.Comment)
+        data["author"] = soup.select("#courseInfoBox p")[0].string.strip()
+        data["description"] = soup.select("#courseImageAndInfoBox p")[-1].string.strip()
+
+        return data
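+    # NOTE: the methods below expect self.dataset to be filled in elsewhere;
+    # nothing in this file assigns it yet. They push the scraped topics and
+    # courses into the database through lib.Database.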
+    def parse_dataset(self):
+        for item in self.dataset:
+            self.process_item(item)
+
+    def process_item(self, item):
+        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
+
+        if inserted:
+            print "Inserted %s" % item["name"]
+        else:
+            print "Skipped %s" % item["name"]
+
+        for course in item["courses"]:
+            self.process_course(course, rowid)
+
+    def process_course(self, course, topicid):
+        try:
+            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
+            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
+        except TypeError, e:
+            # A date with None components raises TypeError.
+            start_date = None
+            title = "%s (date undetermined)" % course["name"]
+
+        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
+
+        if inserted:
+            print "\tInserted %s" % title
+        else:
+            print "\tSkipped %s" % title
+
+#crawler = OpenCourseWareCrawler()
+#crawler.parse_catalog()