You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
202 lines
6.2 KiB
Python
202 lines
6.2 KiB
Python
import requests
|
|
import oursql
|
|
import datetime
|
|
import json
|
|
import sys, os
|
|
import shared
|
|
|
|
from bs4 import BeautifulSoup
|
|
import bs4
|
|
|
|
# One HTTP session shared by every request this module makes. The custom
# User-Agent names the project and gives site operators a contact address.
rsess = requests.Session()
rsess.headers.update({
    'User-Agent': 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)',
})
|
|
|
|
class OpenCourseWare(shared.Scraper):
|
|
def run(self):
    """Walk the OCW Consortium "browse by source" index.

    Fetches the overview page, looks up the element with id
    ``pagecontent`` and, for every link inside it whose last path
    segment is an integer, calls ``process_source`` with that integer
    and the link text.

    Links without an ``href`` or with a non-numeric last segment are
    skipped instead of aborting the whole run.
    """
    overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
    soup = BeautifulSoup(overview)

    container = soup.find(id="pagecontent")

    if container is None:
        # find() returns None when nothing matches; calling None would
        # raise a TypeError. The second argument mirrors how the other
        # warnings in this class are logged (presumably an error flag;
        # confirm against the env implementation).
        self.env.log("Could not find the source list on the overview page", True)
        return

    for element in container("a"):
        last_segment = element.get("href", "").split("/")[-1]

        try:
            source_id = int(last_segment)
        except ValueError:
            # Not a link to a source page (no href, or no numeric id).
            continue

        self.process_source(source_id, element.string)
|
|
|
|
def process_source(self, source_id, source_name, limit=3):
    """Scrape the course listing of a single source.

    Fetches the listing page for ``source_id``, selects the rows of the
    ``cfResultsTable`` table and calls ``parse_course`` for every row
    that carries both an external course link and a details link.

    source_id   -- integer id of the source, interpolated into the URL.
    source_name -- name handed through to ``parse_course`` unchanged.
    limit       -- maximum number of table rows to look at; ``None``
                   means all rows. The default of 3 keeps the row cap
                   that used to be hard-coded here.
    """
    data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
    soup = BeautifulSoup(data)

    rows = soup.select("table#cfResultsTable tr")

    if limit is not None:
        rows = rows[:limit]

    for course in rows:
        links = course("a")

        # Both links[0] and links[1] are read below, so a row with fewer
        # than two links (e.g. one without any anchors) cannot be parsed.
        if len(links) < 2:
            continue

        external, details = links[0], links[1]

        # The course id is the last path segment of the details link.
        self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
|
|
|
|
def parse_course(self, course_name, course_url, course_id, source_name):
    """Collect and log the combined metadata for one course.

    Metadata from ocwconsortium.org (``_metadata_ocw``) is extended with
    the source name and course URL, then overlaid with whatever the
    provider-specific scraper (``_metadata_provider``) returns. When the
    provider scraper returns ``False`` nothing further is logged.

    ``course_name`` is accepted but not used by this method.
    """
    self.env.log("Parsing %s" % course_url)

    # First fetch metadata from ocwconsortium.org
    ocw_data = self._metadata_ocw(course_id)
    ocw_data["providername"] = source_name
    ocw_data["url"] = course_url

    # Now fetch metadata from the particular course provider
    provider_data = self._metadata_provider(course_url)

    if provider_data is False:
        # _metadata_provider returned its "no data" marker, so there is
        # nothing to merge.
        return

    # Provider values win over the ocwconsortium values on key clashes.
    data = ocw_data.copy()
    data.update(provider_data)

    # TODO: insert data
    self.env.log(repr(data))
|
|
|
|
def _metadata_ocw(self, course_id):
    """Fetch the ocwconsortium.org metadata for one course.

    Downloads the course page for ``course_id`` and parses its
    ``dl.coursepage`` definition list with ``_parse_ocw_dl``.

    Returns the parsed dict, or an empty dict when the page has no such
    list.
    """
    soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
    matches = soup.select("dl.coursepage")

    if not matches:
        # No metadata provided by ocwconsortium. Checking before indexing
        # avoids the IndexError that [0] raises on an empty result.
        return {}

    metadata = matches[0]

    # An empty list yields no dd/dt elements, which parses to {} as well.
    return self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
|
|
|
|
def _parse_ocw_dl(self, dd, dt):
|
|
data = {}
|
|
|
|
for i in xrange(0, len(dd)):
|
|
label = dd[i].string.strip().rstrip(":")
|
|
value = dt[i].string
|
|
|
|
if value is not None:
|
|
value = value.strip()
|
|
|
|
if label == "Tags":
|
|
if value == None:
|
|
data["tags"] = []
|
|
else:
|
|
data["tags"] = [x.strip() for x in value.split(",")]
|
|
elif label == "Source":
|
|
data["providername"] = value
|
|
elif label == "Language":
|
|
data["language"] = value
|
|
elif label == "Link":
|
|
# We can ignore this, we already have it anyway
|
|
pass
|
|
elif label == "Author":
|
|
if value == None:
|
|
data["author"] = None
|
|
else:
|
|
data["author"] = value
|
|
elif label == "License":
|
|
if value == None:
|
|
data["license"] = None
|
|
else:
|
|
data["license"] = value
|
|
elif label == "Date Published":
|
|
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
|
|
else:
|
|
self.env.log("UNKNOWN: %s => %s" % (label, value), True)
|
|
|
|
return data
|
|
|
|
def _metadata_provider(self, url):
|
|
providers = {
|
|
"oer.avu.org": self._metadata_avu,
|
|
"ocw.capilanou.ca": self._metadata_capilano,
|
|
"ocw.hokudai.ac.jp": self._metadata_hokkaido,
|
|
"ocw.ie.edu": self._metadata_ie,
|
|
"ocw.jhsph.edu": self._metadata_hopkins,
|
|
}
|
|
|
|
host = url.split("/")[2]
|
|
data = {}
|
|
|
|
for provider, func in providers.iteritems():
|
|
if host.endswith(provider):
|
|
return func(url)
|
|
|
|
return False
|
|
|
|
def _metadata_avu(self, url):
    """Scrape course metadata from an African Virtual University page.

    Requests ``url`` with ``?show=full`` appended and reads the first
    ``table.ds-includeSet-table``: the first cell of each row is taken
    as a ``dc.*`` label and the second as its value.

    Returns a dict that always contains "providername" plus one entry
    for each recognised label found. Rows with fewer than two cells are
    skipped; unknown labels and unparseable dates are logged and left
    out. If the table is missing, only "providername" is returned.
    """
    # African Virtual University
    soup = BeautifulSoup(rsess.get(url + "?show=full").text)
    data = {"providername": "African Virtual University"}

    tables = soup.select("table.ds-includeSet-table")

    if not tables:
        self.env.log("No metadata table found at %s" % url, True)
        return data

    # Labels whose cell text is stored as-is under the given key.
    text_fields = {
        "dc.identifier.uri": "identifier_uri",
        "dc.type": "object_type",
        "dc.language.iso": "language",
        "dc.contributor.author": "author",
        "dc.title": "title",
    }

    # Labels holding dates: (result key, strptime format).
    date_fields = {
        "dc.date.accessioned": ("creation_date", "%Y-%m-%dT%H:%M:%SZ"),
        "dc.date.issued": ("issued_date", "%Y-%m-%d"),
        "dc.date.available": ("available_date", "%Y-%m-%dT%H:%M:%SZ"),
    }

    for row in tables[0]("tr"):
        cells = row("td")

        if len(cells) < 2:
            # Both a label cell and a value cell are needed.
            continue

        label = cells[0].string
        value = cells[1].string

        if label in text_fields:
            data[text_fields[label]] = value
        elif label in date_fields:
            key, date_format = date_fields[label]

            try:
                data[key] = datetime.datetime.strptime(value, date_format)
            except (TypeError, ValueError):
                # TypeError: the cell had no single text value (None).
                # ValueError: the text does not match the expected format.
                self.env.log("UNPARSEABLE DATE: %s => %s" % (label, value), True)
        elif label == "dc.description.abstract":
            # The abstract can span several paragraphs; join all their text.
            data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
        else:
            self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)

    return data
|
|
|
|
def _metadata_capilano(self, url):
    """Scrape title and description from a Capilano University page.

    The title is the stripped text of the first
    ``h1.documentFirstHeading``; the description is the text of every
    ``#about > p`` paragraph joined with single spaces and stripped.
    """
    # Capilano University
    page = BeautifulSoup(rsess.get(url).text)

    heading = page.select("h1.documentFirstHeading")[0]
    title = heading.string.strip()

    fragments = []
    for paragraph in page.select("#about > p"):
        fragments.extend(paragraph.strings)
    description = " ".join(fragments).strip()

    return {
        "providername": "Capilano University",
        "title": title,
        "description": description,
    }
|
|
|
|
def _metadata_hokkaido(self, url):
    """Scrape title and description from a Hokkaido University page.

    The title comes from the first ``#MAIN h1`` and the description
    from the first ``#MAIN p``; both are stripped of surrounding
    whitespace.
    """
    # Hokkaido University
    page = BeautifulSoup(rsess.get(url).text)

    first_heading = page.select("#MAIN h1")[0]
    title = first_heading.string.strip()

    first_paragraph = page.select("#MAIN p")[0]
    description = first_paragraph.string.strip()

    return {
        "providername": "Hokkaido University",
        "title": title,
        "description": description,
    }
|
|
|
|
def _metadata_ie(self, url):
    """Scrape title, description and author from an IE University page.

    The course id is the text after the first ``=`` in ``url``. It is
    zero-padded to two characters and used to build the address of the
    page that is actually fetched.
    """
    # IE University
    course_id = url.split("=")[1]
    page_url = "http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)
    page = BeautifulSoup(rsess.get(page_url).text)

    result = {"providername": "IE University"}
    result["title"] = page.select(".ari_18_negrita")[0].string.strip()

    # The same selection serves both fields: its last element holds the
    # description, its third the author block.
    blocks = page.select(".ari_12_negra")
    result["description"] = " ".join(piece.strip() for piece in blocks[-1].strings)

    author_node = blocks[2].select(".ari_12_negrita")[0]
    result["author"] = author_node.string.strip()

    return result
|
|
|
|
def _metadata_hopkins(self, url):
    """Scrape title, author and description from a Johns Hopkins page.

    Each field is produced by passing a selection from the page to
    ``self.soup_to_text``. That helper is not defined in the visible
    part of this file; note that the title passes a single element
    while author and description pass the whole result of ``select``.
    """
    # Johns Hopkins Bloomberg School of Public Health
    page = BeautifulSoup(rsess.get(url).text)
    render = self.soup_to_text

    title = render(page.select("h1")[-1])
    author = render(page.select("#courseInfoBox p:nth-of-type(1)"))
    description = render(page.select("#courseImageAndInfoBox > p"))

    return {
        "providername": "Johns Hopkins Bloomberg School of Public Health",
        "title": title,
        "author": author,
        "description": description,
    }
|