Rewrite scraper to be more modular, and convert the Coursera crawler to the new model

develop
Sven Slootweg 11 years ago
parent c2a8a66dac
commit fb6c43a38f

@ -0,0 +1,26 @@
import inspect, os, sys
# Absolute directory of this package. Resolved with abspath so the sys.path
# entry and the directory listing below do not depend on the process's
# current working directory (the file path may be reported as relative).
my_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
def _import_module_into_scope(modulename):
    """Import *modulename* and copy its attributes into this module's globals.

    This flattens a sub-module into the package namespace, so that a class
    defined in ``package/foo.py`` can be reached as ``package.SomeClass``.

    Dunder attributes (``__name__``, ``__file__``, ``__doc__``,
    ``__builtins__`` ...) are deliberately not copied: they describe the
    imported module, and copying them would overwrite this package's own
    metadata.

    :param modulename: Name of the module to import, as accepted by
        ``__import__``.
    """
    module = __import__(modulename)
    target = globals()
    # Iterate over a snapshot so the loop stays valid even if the imported
    # module's namespace is the very dict being written to.
    for name, value in list(vars(module).items()):
        if name.startswith("__") and name.endswith("__"):
            continue
        target[name] = value
# Temporarily put the package directory first on sys.path so that every
# sibling module can be imported by its bare name.
sys.path.insert(0, my_path)
try:
    # os.listdir returns entries in arbitrary order. Later imports overwrite
    # earlier names in this namespace, so sort to keep the result reproducible.
    for fname in sorted(os.listdir(my_path)):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)
        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(fpath, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Undo the sys.path change even when one of the imports fails, so a
    # broken module does not leave the interpreter's import path modified.
    sys.path.remove(my_path)

@ -0,0 +1,50 @@
import datetime, json, sys
import requests, oursql
import shared
class Coursera(shared.Scraper):
    """Scraper that stores Coursera topics and their courses.

    Each entry of the downloaded dataset becomes a topic, and every element
    of its ``courses`` list becomes an item of type ``COURSE`` attached to
    that topic.
    """

    provider_id = 2

    # Endpoint that returns the complete topic list as JSON.
    dataset_url = "https://www.coursera.org/maestro/api/topic/list?full=1"

    def run(self):
        """Download the dataset and store everything it contains."""
        self.retrieve_dataset()
        self.parse_dataset()

    def retrieve_dataset(self):
        """Fetch the topic list and keep the decoded JSON in ``self.dataset``.

        Raises ``requests.HTTPError`` for a 4xx/5xx response, so an error
        page is reported as such instead of as a JSON decoding failure.
        """
        response = requests.get(self.dataset_url)
        response.raise_for_status()
        self.dataset = response.json()

    def parse_dataset(self):
        """Process every topic entry in ``self.dataset``."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic and then each of the courses listed under it.

        :param item: Mapping with the keys ``id``, ``name``,
            ``short_description`` and ``courses``.
        """
        inserted, rowid = self.insert_topic(
            str(item["id"]),
            item["name"],
            description=item["short_description"],
            needs_enrollment=True,
        )

        if inserted:
            self.env.log("Inserted topic %s" % item["name"])
        else:
            self.env.log("Skipped topic %s" % item["name"])

        # rowid identifies the topic whether it was just inserted or
        # already existed, so courses are attached in both cases.
        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course as an item belonging to topic *topicid*.

        :param course: Mapping with the keys ``id``, ``name``, ``home_link``,
            ``certificate_description``, ``start_year``, ``start_month`` and
            ``start_day``.
        :param topicid: Row id of the topic this course belongs to.
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except (TypeError, ValueError):
            # TypeError: a date component is not an integer (presumably null
            # for unscheduled courses -- verify against the API).
            # ValueError: the components do not form a valid calendar date.
            # Either way the course is stored without a start date rather
            # than aborting the whole run.
            start_date = None

        title = self.generate_title(course['name'], start_date)

        inserted, itemid = self.insert_item(
            str(course["id"]),
            title,
            course["home_link"],
            has_topic=True,
            itemtype=self.COURSE,
            description=course["certificate_description"],
            start_date=start_date,
            topic_id=topicid,
        )

        if inserted:
            self.env.log("Inserted item %s" % title)
        else:
            self.env.log("Skipped item %s" % title)

    def generate_title(self, name, date):
        """Return the item title: the course name plus its start date.

        :param name: Course name.
        :param date: ``datetime`` of the course start, or ``None`` when it
            could not be determined.
        """
        if date is None:
            return "%s (date undetermined)" % name
        return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))

@ -0,0 +1,26 @@
import inspect, os, sys
# Absolute directory of this package. Resolved with abspath so the sys.path
# entry and the directory listing below do not depend on the process's
# current working directory (the file path may be reported as relative).
my_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
def _import_module_into_scope(modulename):
    """Import *modulename* and copy its attributes into this module's globals.

    This flattens a sub-module into the package namespace, so that a class
    defined in ``package/foo.py`` can be reached as ``package.SomeClass``.

    Dunder attributes (``__name__``, ``__file__``, ``__doc__``,
    ``__builtins__`` ...) are deliberately not copied: they describe the
    imported module, and copying them would overwrite this package's own
    metadata.

    :param modulename: Name of the module to import, as accepted by
        ``__import__``.
    """
    module = __import__(modulename)
    target = globals()
    # Iterate over a snapshot so the loop stays valid even if the imported
    # module's namespace is the very dict being written to.
    for name, value in list(vars(module).items()):
        if name.startswith("__") and name.endswith("__"):
            continue
        target[name] = value
# Temporarily put the package directory first on sys.path so that every
# sibling module can be imported by its bare name.
sys.path.insert(0, my_path)
try:
    # os.listdir returns entries in arbitrary order. Later imports overwrite
    # earlier names in this namespace, so sort to keep the result reproducible.
    for fname in sorted(os.listdir(my_path)):
        fpath = os.path.join(my_path, fname)
        fbasename, fext = os.path.splitext(fname)
        if os.path.isdir(fpath):
            if os.path.isfile(os.path.join(fpath, "__init__.py")):
                # This is a python directory module
                _import_module_into_scope(fname)
        elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
            # This is a python file module
            _import_module_into_scope(fbasename)
finally:
    # Undo the sys.path change even when one of the imports fails, so a
    # broken module does not leave the interpreter's import path modified.
    sys.path.remove(my_path)

@ -0,0 +1,14 @@
import oursql
class Environment(object):
    """Runtime context shared by scrapers: a database handle and logging.

    Attributes:
        db: The oursql connection created by :meth:`connect`, or ``None``
            while no connection has been made.
        connected: ``True`` once :meth:`connect` has succeeded.
    """

    def __init__(self):
        # Define both attributes up front so Scraper() can be called before
        # connect() without raising AttributeError.
        self.db = None
        self.connected = False

    def connect(self, host="localhost", username="root", password="", database="learn"):
        """Open the MySQL connection and remember it on this environment.

        :param host: Database server host name.
        :param username: Database user.
        :param password: Password for *username*.
        :param database: Name of the schema to use.
        """
        self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
        self.connected = True

    def log(self, text):
        """Write *text* to standard output, followed by a newline."""
        # Parenthesised single-argument form behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print(text)

    def Scraper(self, scraper_class):
        """Instantiate *scraper_class* bound to this environment.

        The class is called with the current database handle (``None`` if
        not connected) and the new instance gets this environment as its
        ``env`` attribute.

        :param scraper_class: Class (or callable) accepting the database
            handle as its single argument.
        :returns: The created scraper instance.
        """
        instance = scraper_class(self.db)
        instance.env = self
        return instance

@ -0,0 +1,100 @@
class Scraper(object):
    """Base class for provider scrapers that store topics and items.

    Subclasses set ``provider_id`` and implement :meth:`run`; they use
    :meth:`insert_topic` and :meth:`insert_item` to write rows, which skip
    records that already exist for the same provider and provider-side id.

    Attributes:
        db: Database connection passed to the constructor, or ``None``.
        can_store: ``True`` when a database connection is available.
    """

    # Item type codes, written to the `Type` column of the items table.
    UNKNOWN = 0
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9
    LECTURE = 10

    # Value written to the `Provider` column; subclasses override it.
    provider_id = 0

    def __init__(self, database=None):
        """Create a scraper, optionally bound to a database connection.

        :param database: Connection object providing ``cursor()``, or
            ``None`` to create a scraper that cannot store anything.
        """
        # Always define the attribute so a missing connection is reported
        # clearly by _cursor() instead of as an AttributeError.
        self.db = database
        self.can_store = database is not None

    def run(self, *args, **kwargs):
        """Entry point of the scraper; must be overridden by subclasses.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("No run() method was specified for this scraper.")

    @staticmethod
    def _with_defaults(supplied, defaults):
        """Return *defaults* overlaid with the non-``None`` values of *supplied*."""
        merged = dict(defaults)
        merged.update((key, value) for key, value in supplied.items() if value is not None)
        return merged

    def _cursor(self):
        """Return a new database cursor, or raise if there is no database."""
        if not self.can_store:
            raise RuntimeError("This scraper has no database connection to store data in.")
        return self.db.cursor()

    def insert_topic(self, unique_id, title, override=False, **kwargs):
        """Insert a topic unless one with the same provider id already exists.

        :param unique_id: Identifier of the topic at the provider.
        :param title: Topic title.
        :param override: When true, skip the existence check and always
            insert a new row.
        :param kwargs: Optional fields ``needs_enrollment``,
            ``creation_date``, ``start_date``, ``end_date``, ``parent_id``,
            ``description`` and ``provider_name``. A missing or ``None``
            value falls back to the field's default.
        :returns: ``(True, new_row_id)`` after an insert, or
            ``(False, existing_row_id)`` when the topic was already stored.
        :raises RuntimeError: If the scraper was created without a database.
        """
        fields = self._with_defaults(kwargs, {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
            "provider_name": "",
        })

        c = self._cursor()

        if not override:
            c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            if results:
                return (False, results[0][0])

        c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`)"
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (fields['parent_id'], self.provider_id, unique_id, title, fields['description'], fields['creation_date'],
                  fields['needs_enrollment'], fields['start_date'], fields['end_date'], fields["provider_name"]))

        return (True, c.lastrowid)

    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
        """Insert an item unless one with the same provider id already exists.

        :param unique_id: Identifier of the item at the provider.
        :param title: Item title.
        :param item_url: URL of the item; also used as the source URL when
            no ``source_url`` is given.
        :param override: When true, skip the existence check and always
            insert a new row.
        :param kwargs: Optional fields ``views``, ``has_topic``,
            ``itemtype``, ``source_url``, ``topic_id``, ``parent_id``,
            ``description``, ``date``, ``start_date``, ``end_date`` and
            ``provider_name``. A missing or ``None`` value falls back to
            the field's default.
        :returns: ``(True, new_row_id)`` after an insert, or
            ``(False, existing_row_id)`` when the item was already stored.
        :raises RuntimeError: If the scraper was created without a database.
        """
        fields = self._with_defaults(kwargs, {
            "views": None,
            "has_topic": False,
            "itemtype": 0,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
            "start_date": None,
            "end_date": None,
            "provider_name": "",
        })

        c = self._cursor()

        if not override:
            c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            if results:
                return (False, results[0][0])

        c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`)"
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (fields["has_topic"], fields["itemtype"], self.provider_id, unique_id, title, fields["description"], item_url, fields["source_url"],
                  fields["views"], fields["topic_id"], fields["parent_id"], fields["date"], fields["start_date"], fields["end_date"], fields["provider_name"]))

        return (True, c.lastrowid)

@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers
# Only connect to the database and run the scrape when this file is executed
# as a script; importing it must not trigger network or database activity.
if __name__ == "__main__":
    env = shared.Environment()
    env.connect(host="localhost", username="root", password="", database="learn")

    # Bind the Coursera scraper to the environment's database and logger.
    scraper = env.Scraper(scrapers.Coursera)
    scraper.run()

@ -1,47 +0,0 @@
import requests
import oursql
import datetime
import json
import lib
class CourseraCrawler(object):
    """Crawler that stores Coursera topics and courses through ``lib.Database``.

    The dataset is read from a local ``coursera.json`` file; every entry is
    stored as a topic and each element of its ``courses`` list as an item.
    """

    def __init__(self):
        self.db = lib.Database("localhost", "root")

    def retrieve_dataset(self):
        """Load the dataset from ``coursera.json`` into ``self.dataset``."""
        # Live source, currently disabled in favour of the local file:
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        # Use a context manager so the file handle is closed deterministically.
        with open("coursera.json", "r") as handle:
            self.dataset = json.load(handle)

    def parse_dataset(self):
        """Process every topic entry in ``self.dataset``."""
        for item in self.dataset:
            self.process_item(item)

    def process_item(self, item):
        """Store one topic and then each of the courses listed under it.

        :param item: Mapping with the keys ``id``, ``name``,
            ``short_description`` and ``courses``.
        """
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)

        # Parenthesised single-argument print works as both the Python 2
        # statement and the Python 3 function.
        if inserted:
            print("Inserted %s" % item["name"])
        else:
            print("Skipped %s" % item["name"])

        for course in item["courses"]:
            self.process_course(course, rowid)

    def process_course(self, course, topicid):
        """Store one course as an item belonging to topic *topicid*.

        :param course: Mapping with the keys ``id``, ``name``, ``home_link``,
            ``certificate_description``, ``start_year``, ``start_month`` and
            ``start_day``.
        :param topicid: Row id of the topic this course belongs to.
        """
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except (TypeError, ValueError):
            # TypeError: a date component is not an integer (e.g. None).
            # ValueError: the components do not form a valid calendar date.
            # In both cases store the course without a start date instead
            # of aborting the crawl.
            start_date = None
            title = "%s (date undetermined)" % (course["name"])

        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)

        if inserted:
            print("\tInserted %s" % title)
        else:
            print("\tSkipped %s" % title)
# Only run the crawl when this file is executed as a script; importing the
# module must not open a database connection or read the dataset.
if __name__ == "__main__":
    crawler = CourseraCrawler()
    crawler.retrieve_dataset()
    crawler.parse_dataset()
Loading…
Cancel
Save