Rewrite scraper to be more modular, and convert the Coursera crawler to the new model
parent
c2a8a66dac
commit
fb6c43a38f
@ -0,0 +1,26 @@
|
|||||||
|
import inspect, os, sys
|
||||||
|
|
||||||
|
# Absolute directory that contains this file. It is made absolute so that the
# directory listing and the sys.path entry built from it keep working when the
# file name reported by inspect is relative (dirname() would then be '' or a
# path that depends on the current working directory).
my_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
||||||
|
|
||||||
|
def _import_module_into_scope(modulename):
    """Import ``modulename`` and copy its attributes into this module's globals.

    Dunder attributes (``__name__``, ``__file__``, ``__builtins__``, ...) are
    skipped: copying them would overwrite this module's own metadata with the
    metadata of the imported module.

    :param modulename: module name, passed unchanged to ``__import__``.
    """
    module = __import__(modulename)
    scope = globals()

    for name, data in vars(module).items():
        if name.startswith("__") and name.endswith("__"):
            continue
        scope[name] = data
|
||||||
|
|
||||||
|
# Temporarily put this package's directory first on sys.path so the sibling
# modules can be imported by their bare names.
sys.path.insert(0, my_path)

try:
    for fname in os.listdir(my_path):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)

        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(fpath, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Always undo the sys.path change, even when one of the imports raises.
    sys.path.remove(my_path)
|
@ -0,0 +1,50 @@
|
|||||||
|
import datetime, json, sys
|
||||||
|
import requests, oursql
|
||||||
|
import shared
|
||||||
|
|
||||||
|
class Coursera(shared.Scraper):
    """Scraper for the Coursera topic list.

    Downloads the topic list as JSON, stores every topic through
    ``insert_topic`` and every course of a topic through ``insert_item``.
    """

    # Stored in the `Provider` column by the inherited insert helpers.
    provider_id = 2

    # Endpoint that returns the topic list (with each topic's courses) as JSON.
    dataset_url = "https://www.coursera.org/maestro/api/topic/list?full=1"

    # Seconds to wait on the HTTP request instead of blocking indefinitely.
    request_timeout = 60

    def run(self):
        """Download the dataset and store everything it contains."""
        self.retrieve_dataset()
        self.parse_dataset()

    def retrieve_dataset(self):
        """Fetch the topic list and keep the decoded JSON in ``self.dataset``.

        Raises ``requests.HTTPError`` on a non-success status code, so an
        error page is reported as such rather than as a JSON decoding failure.
        """
        response = requests.get(self.dataset_url, timeout=self.request_timeout)
        response.raise_for_status()
        self.dataset = response.json()

    def parse_dataset(self):
        """Process every topic entry of the downloaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic and then every course listed under it.

        :param item: topic dict; the keys ``id``, ``name``,
            ``short_description`` and ``courses`` are read.
        """
        inserted, rowid = self.insert_topic(
            str(item["id"]),
            item["name"],
            description=item["short_description"],
            needs_enrollment=True,
        )

        if inserted:
            self.env.log("Inserted topic %s" % item["name"])
        else:
            self.env.log("Skipped topic %s" % item["name"])

        # Courses are processed even when the topic already existed; rowid is
        # then the id of the existing topic row.
        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course as an item that belongs to topic ``topicid``.

        :param course: course dict; the keys ``id``, ``name``, ``home_link``,
            ``certificate_description`` and ``start_year``/``start_month``/
            ``start_day`` are read.
        :param topicid: row id of the topic the course is attached to.
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except (TypeError, ValueError):
            # TypeError: a date part is not an integer (presumably null when
            # no start date is announced - verify against the API).
            # ValueError: the parts do not form a valid calendar date.
            start_date = None

        title = self.generate_title(course['name'], start_date)

        inserted, itemid = self.insert_item(
            str(course["id"]),
            title,
            course["home_link"],
            has_topic=True,
            itemtype=self.COURSE,
            description=course["certificate_description"],
            start_date=start_date,
            topic_id=topicid,
        )

        if inserted:
            self.env.log("Inserted item %s" % title)
        else:
            self.env.log("Skipped item %s" % title)

    def generate_title(self, name, date):
        """Return the course title, suffixed with its start date when known.

        :param name: course name.
        :param date: ``datetime`` of the course start, or ``None``.
        """
        if date is None:
            return "%s (date undetermined)" % name
        return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
|
||||||
|
|
@ -0,0 +1,26 @@
|
|||||||
|
import inspect, os, sys
|
||||||
|
|
||||||
|
# Absolute directory that contains this file. It is made absolute so that the
# directory listing and the sys.path entry built from it keep working when the
# file name reported by inspect is relative (dirname() would then be '' or a
# path that depends on the current working directory).
my_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
||||||
|
|
||||||
|
def _import_module_into_scope(modulename):
    """Import ``modulename`` and copy its attributes into this module's globals.

    Dunder attributes (``__name__``, ``__file__``, ``__builtins__``, ...) are
    skipped: copying them would overwrite this module's own metadata with the
    metadata of the imported module.

    :param modulename: module name, passed unchanged to ``__import__``.
    """
    module = __import__(modulename)
    scope = globals()

    for name, data in vars(module).items():
        if name.startswith("__") and name.endswith("__"):
            continue
        scope[name] = data
|
||||||
|
|
||||||
|
# Temporarily put this package's directory first on sys.path so the sibling
# modules can be imported by their bare names.
sys.path.insert(0, my_path)

try:
    for fname in os.listdir(my_path):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)

        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(fpath, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Always undo the sys.path change, even when one of the imports raises.
    sys.path.remove(my_path)
|
@ -0,0 +1,14 @@
|
|||||||
|
import oursql
|
||||||
|
|
||||||
|
class Environment(object):
    """Runtime context shared by scrapers: a database handle and logging."""

    def __init__(self):
        # Defined up front so that Scraper() and checks of `connected` work
        # before connect() has been called.
        self.db = None
        self.connected = False

    def connect(self, host="localhost", username="root", password="", database="learn"):
        """Open the oursql connection and keep it in ``self.db``.

        :param host: database server host name.
        :param username: account used to log in.
        :param password: password for that account.
        :param database: name of the database to select.
        """
        self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
        self.connected = True

    def log(self, text):
        """Write ``text`` to standard output."""
        # With a single argument this form prints the same under Python 2 and 3.
        print(text)

    def Scraper(self, scraper_class):
        """Instantiate ``scraper_class`` bound to this environment.

        The class is called with the current database handle (``None`` until
        ``connect`` has run) and the new instance gets this environment as
        its ``env`` attribute.

        :param scraper_class: callable accepting the database handle.
        :returns: the new scraper instance.
        """
        s = scraper_class(self.db)
        s.env = self
        return s
|
@ -0,0 +1,100 @@
|
|||||||
|
class Scraper(object):
    """Base class for provider scrapers.

    Subclasses set ``provider_id`` and implement ``run``; ``insert_topic`` and
    ``insert_item`` store rows in the ``topics`` and ``items`` tables unless a
    row with the same provider and provider-side id already exists.
    """

    # Item type codes; insert_item stores the chosen code in the `Type` column.
    UNKNOWN = 0
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9
    LECTURE = 10

    # Stored in the `Provider` column; subclasses override it.
    provider_id = 0

    def __init__(self, database=None):
        """Remember the database handle, if any.

        :param database: connection object providing ``cursor()``; when it is
            ``None`` the scraper is flagged as unable to store
            (``can_store`` is ``False``) and ``db`` is left unset.
        """
        self.can_store = database is not None
        if self.can_store:
            self.db = database

    def run(self, *args, **kwargs):
        """Entry point of a scraper; must be overridden by subclasses."""
        raise NotImplementedError("No run() method was specified for this scraper.")

    @staticmethod
    def _apply_defaults(kwargs, defaults):
        """Fill every key that is missing from ``kwargs`` or set to ``None``."""
        for key, value in defaults.items():
            if kwargs.get(key) is None:
                kwargs[key] = value
        return kwargs

    def insert_topic(self, unique_id, title, override=False, **kwargs):
        """Insert a topic row unless it is already stored.

        :param unique_id: provider-side identifier (``ProviderId`` column).
        :param title: topic title.
        :param override: when true, skip the existence check and always insert.
        :param kwargs: optional ``needs_enrollment``, ``creation_date``,
            ``start_date``, ``end_date``, ``parent_id``, ``description`` and
            ``provider_name``; missing or ``None`` values get the defaults
            listed below.
        :returns: ``(False, existing_id)`` when a row with this provider and
            ``unique_id`` exists, otherwise ``(True, new_row_id)``.
        """
        defaults = {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
            "provider_name": "",
        }
        self._apply_defaults(kwargs, defaults)

        c = self.db.cursor()

        if not override:
            c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            if results:
                return (False, results[0][0])

        # NOTE(review): nothing here commits; confirm the connection persists
        # the insert on its own.
        c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs['parent_id'], self.provider_id, unique_id, title, kwargs['description'], kwargs['creation_date'],
                  kwargs['needs_enrollment'], kwargs['start_date'], kwargs['end_date'], kwargs["provider_name"]))

        return (True, c.lastrowid)

    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
        """Insert an item row unless it is already stored.

        :param unique_id: provider-side identifier (``ProviderId`` column).
        :param title: item title.
        :param item_url: URL of the item; also the default ``source_url``.
        :param override: when true, skip the existence check and always insert.
        :param kwargs: optional ``views``, ``has_topic``, ``itemtype``,
            ``source_url``, ``topic_id``, ``parent_id``, ``description``,
            ``date``, ``start_date``, ``end_date`` and ``provider_name``;
            missing or ``None`` values get the defaults listed below.
        :returns: ``(False, existing_id)`` when a row with this provider and
            ``unique_id`` exists, otherwise ``(True, new_row_id)``.
        """
        defaults = {
            "views": None,
            "has_topic": False,
            "itemtype": 0,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
            "start_date": None,
            "end_date": None,
            "provider_name": "",
        }
        self._apply_defaults(kwargs, defaults)

        c = self.db.cursor()

        if not override:
            c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            if results:
                return (False, results[0][0])

        # NOTE(review): nothing here commits; confirm the connection persists
        # the insert on its own.
        c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
                  kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))

        return (True, c.lastrowid)
|
@ -0,0 +1,8 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import shared, scrapers
|
||||||
|
|
||||||
|
# Connection settings for the database that stores the scraped data.
db_settings = {
    "host": "localhost",
    "username": "root",
    "password": "",
    "database": "learn",
}

env = shared.Environment()
env.connect(**db_settings)

# Build the Coursera scraper through the environment, then run it.
scraper = env.Scraper(scrapers.Coursera)
scraper.run()
|
@ -1,47 +0,0 @@
|
|||||||
import requests
|
|
||||||
import oursql
|
|
||||||
import datetime
|
|
||||||
import json
|
|
||||||
import lib
|
|
||||||
|
|
||||||
class CourseraCrawler(object):
    """Crawler that stores Coursera topics and courses through ``lib.Database``."""

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Load the topic list from the local ``coursera.json`` dump.

        The live endpoint is
        https://www.coursera.org/maestro/api/topic/list?full=1 ; the local
        file is read in its place.
        """
        # The context manager closes the file even if decoding fails.
        with open("coursera.json", "r") as dump:
            self.dataset = json.load(dump)

    def parse_dataset(self):
        """Process every topic entry of the loaded dataset."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic and then every course listed under it.

        :param item: topic dict; the keys ``id``, ``name``,
            ``short_description`` and ``courses`` are read.
        """
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course as an item that belongs to topic ``topicid``.

        :param course: course dict; the keys ``id``, ``name``, ``home_link``,
            ``certificate_description`` and ``start_year``/``start_month``/
            ``start_day`` are read.
        :param topicid: row id of the topic the course is attached to.
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except (TypeError, ValueError):
            # TypeError: a date part is not an integer (presumably null when
            # no start date is announced - verify against the data).
            # ValueError: the parts do not form a valid calendar date.
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
|
|
||||||
|
|
||||||
# Run the crawl: load the dataset first, then store what it contains.
crawler = CourseraCrawler()
for stage in (crawler.retrieve_dataset, crawler.parse_dataset):
    stage()
|
|
Loading…
Reference in New Issue