You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

202 lines
6.2 KiB
Python

import requests
import oursql
import datetime
import json
import sys, os
import shared
from bs4 import BeautifulSoup
import bs4
rsess = requests.Session()
rsess.headers['User-Agent'] = 'http://learn.cryto.net/ (scraper@cryto.net) - We mean no harm, thanks for making knowledge free :)'
class OpenCourseWare(shared.Scraper):
def run(self):
overview = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource").text
soup = BeautifulSoup(overview)
for element in soup.find(id="pagecontent")("a"):
#if "Hopkins" not in element.string:
# continue
self.process_source(int(element["href"].split("/")[-1]), element.string)
def process_source(self, source_id, source_name):
data = rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/browse/source/%d" % source_id).text
soup = BeautifulSoup(data)
courses = soup.select("table#cfResultsTable tr")
for course in courses[:3]:
links = course("a")
if len(links) > 0:
external = links[0]
details = links[1]
self.parse_course(external.string, external["href"], details["href"].split("/")[-1], source_name)
def parse_course(self, course_name, course_url, course_id, source_name):
self.env.log("Parsing %s" % course_url)
# First fetch metadata from ocwconsortium.org
ocw_data = self._metadata_ocw(course_id)
ocw_data["providername"] = source_name
ocw_data["url"] = course_url
# Now fetch metadata from the particular course provider
provider_data = self._metadata_provider(course_url)
if provider_data != False:
data = ocw_data.copy()
data.update(provider_data)
# TODO: insert data
self.env.log(repr(data))
def _metadata_ocw(self, course_id):
soup = BeautifulSoup(rsess.get("http://www.ocwconsortium.org/en/courses/browsesource/course/%s" % course_id).text)
metadata = soup.select("dl.coursepage")[0]
if len(metadata) > 0:
data = self._parse_ocw_dl(metadata.select("dd"), metadata.select("dt"))
else:
# No metadata provided by ocwconsortium.
data = {}
return data
def _parse_ocw_dl(self, dd, dt):
data = {}
for i in xrange(0, len(dd)):
label = dd[i].string.strip().rstrip(":")
value = dt[i].string
if value is not None:
value = value.strip()
if label == "Tags":
if value == None:
data["tags"] = []
else:
data["tags"] = [x.strip() for x in value.split(",")]
elif label == "Source":
data["providername"] = value
elif label == "Language":
data["language"] = value
elif label == "Link":
# We can ignore this, we already have it anyway
pass
elif label == "Author":
if value == None:
data["author"] = None
else:
data["author"] = value
elif label == "License":
if value == None:
data["license"] = None
else:
data["license"] = value
elif label == "Date Published":
data["creation_date"] = datetime.datetime.strptime(value, "%b %d, %Y")
else:
self.env.log("UNKNOWN: %s => %s" % (label, value), True)
return data
def _metadata_provider(self, url):
providers = {
"oer.avu.org": self._metadata_avu,
"ocw.capilanou.ca": self._metadata_capilano,
"ocw.hokudai.ac.jp": self._metadata_hokkaido,
"ocw.ie.edu": self._metadata_ie,
"ocw.jhsph.edu": self._metadata_hopkins,
}
host = url.split("/")[2]
data = {}
for provider, func in providers.iteritems():
if host.endswith(provider):
return func(url)
return False
def _metadata_avu(self, url):
# African Virtual University
soup = BeautifulSoup(rsess.get(url + "?show=full").text)
table = soup.select("table.ds-includeSet-table")[0]
data = {"providername": "African Virtual University"}
for row in table("tr"):
cells = row("td")
label = cells[0].string
value = cells[1].string
if label == "dc.identifier.uri":
data["identifier_uri"] = value
elif label == "dc.type":
data["object_type"] = value
elif label == "dc.date.accessioned":
data["creation_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.date.issued":
data["issued_date"] = datetime.datetime.strptime(value, "%Y-%m-%d")
elif label == "dc.date.available":
data["available_date"] = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
elif label == "dc.language.iso":
data["language"] = value
elif label == "dc.description.abstract":
data["description"] = " ".join(x for y in cells[1]("p") for x in y.strings)
elif label == "dc.contributor.author":
data["author"] = value
elif label == "dc.title":
data["title"] = value
else:
self.env.log("UNKNOWN KEY: %s => %s" % (label, value), True)
return data
def _metadata_capilano(self, url):
# Capilano University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Capilano University"}
data["title"] = soup.select("h1.documentFirstHeading")[0].string.strip()
data["description"] = " ".join(x for y in soup.select("#about > p") for x in y.strings).strip()
return data
def _metadata_hokkaido(self, url):
# Hokkaido University
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Hokkaido University"}
data["title"] = soup.select("#MAIN h1")[0].string.strip()
data["description"] = soup.select("#MAIN p")[0].string.strip()
return data
def _metadata_ie(self, url):
# IE University
course_id = url.split("=")[1]
soup = BeautifulSoup(rsess.get("http://ocw.ie.edu/ocw/cur%s01_esp.html" % course_id.zfill(2)).text)
data = {"providername": "IE University"}
data["title"] = soup.select(".ari_18_negrita")[0].string.strip()
data["description"] = " ".join(x.strip() for x in soup.select(".ari_12_negra")[-1].strings)
data["author"] = soup.select(".ari_12_negra")[2].select(".ari_12_negrita")[0].string.strip()
return data
def _metadata_hopkins(self, url):
# Johns Hopkins Bloomberg School of Public Health
soup = BeautifulSoup(rsess.get(url).text)
data = {"providername": "Johns Hopkins Bloomberg School of Public Health"}
data["title"] = self.soup_to_text(soup.select("h1")[-1])
data["author"] = self.soup_to_text(soup.select("#courseInfoBox p:nth-of-type(1)"))
data["description"] = self.soup_to_text(soup.select("#courseImageAndInfoBox > p"))
return data