Rewrite scraper to be more modular, and convert the Coursera crawler to the new model
parent c2a8a66dac
commit fb6c43a38f
@ -0,0 +1,26 @@
import inspect, os, sys

# Directory that contains this __init__.py; used as the module search path below.
my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
    module = __import__(modulename)
    
    # Copy every top-level name from the imported module into this package's namespace.
    for name in vars(module):
        data = getattr(module, name)
        globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
    fpath = os.path.join(my_path, fname)
    fbasename, fext = os.path.splitext(fname)
    
    if os.path.isdir(fpath):
        if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
            # This is a python directory module
            _import_module_into_scope(fname)
    elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
        # This is a python file module
        _import_module_into_scope(fbasename)

sys.path.remove(my_path)
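The `__init__.py` above auto-imports every sibling module and lifts its top-level names into the package namespace. The practical effect, which the runner script later in this commit relies on, is that a class defined in a submodule can be referenced directly on the package. A minimal sketch of what this enables (the `scrapers` package name is taken from that runner script):

    import scrapers
    
    # Coursera is defined in a submodule, but _import_module_into_scope()
    # has copied it into the package's globals():
    scraper_class = scrapers.Coursera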
@ -0,0 +1,50 @@
import datetime, json, sys

import requests, oursql

import shared


class Coursera(shared.Scraper):
    provider_id = 2
    
    def run(self):
        self.retrieve_dataset()
        self.parse_dataset()
    
    def retrieve_dataset(self):
        self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
    
    def parse_dataset(self):
        for item in self.dataset:
            self.process_item(item)
    
    def process_item(self, item):
        inserted, rowid = self.insert_topic(str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
        
        if inserted:
            self.env.log("Inserted topic %s" % item["name"])
        else:
            self.env.log("Skipped topic %s" % item["name"])
        
        for course in item["courses"]:
            self.process_course(course, rowid)
    
    def process_course(self, course, topicid):
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
        except TypeError, e:
            # Any of the date components may be None when the course is unscheduled.
            start_date = None
        
        title = self.generate_title(course['name'], start_date)
        
        inserted, itemid = self.insert_item(str(course["id"]), title, course["home_link"], has_topic=True, itemtype=self.COURSE, description=course["certificate_description"], start_date=start_date, topic_id=topicid)
        
        if inserted:
            self.env.log("Inserted item %s" % title)
        else:
            self.env.log("Skipped item %s" % title)
    
    def generate_title(self, name, date):
        if date is None:
            return "%s (date undetermined)" % name
        else:
            return "%s (starting %s)" % (name, date.strftime("%b %d, %Y"))
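A quick worked example of `generate_title` (the inputs are hypothetical): for `name = "Machine Learning"` and `date = datetime.datetime(2013, 1, 7)` it returns "Machine Learning (starting Jan 07, 2013)", since `strftime("%b %d, %Y")` zero-pads the day; for `date = None` it returns "Machine Learning (date undetermined)".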
@ -0,0 +1,26 @@
import inspect, os, sys

my_path = os.path.dirname(inspect.getfile(inspect.currentframe()))

def _import_module_into_scope(modulename):
    module = __import__(modulename)
    
    for name in vars(module):
        data = getattr(module, name)
        globals()[name] = data

sys.path.insert(0, my_path)

for fname in os.listdir(my_path):
    fpath = os.path.join(my_path, fname)
    fbasename, fext = os.path.splitext(fname)
    
    if os.path.isdir(fpath):
        if os.path.isfile(os.path.join(my_path, fname, "__init__.py")):
            # This is a python directory module
            _import_module_into_scope(fname)
    elif os.path.isfile(fpath) and fext == ".py" and fbasename != "__init__":
        # This is a python file module
        _import_module_into_scope(fbasename)

sys.path.remove(my_path)
@ -0,0 +1,14 @@
import oursql


class Environment(object):
    def connect(self, host="localhost", username="root", password="", database="learn"):
        self.db = oursql.connect(host=host, user=username, passwd=password, db=database)
        self.connected = True
    
    def log(self, text):
        print text
    
    def Scraper(self, scraper_class):
        # Factory method: instantiate the given scraper class with this
        # environment's database connection, and hand it a reference back
        # to the environment (for logging, etc.).
        s = scraper_class(self.db)
        s.env = self
        return s
@ -0,0 +1,100 @@
class Scraper(object):
    # Item type constants, used for the `itemtype` option of insert_item().
    UNKNOWN = 0
    TOPIC = 1
    COURSE = 2
    VIDEO = 3
    ARTICLE = 4
    EXERCISE = 5
    QUIZ = 6
    TEST = 7
    BOOK = 8
    AUDIOBOOK = 9
    LECTURE = 10
    
    provider_id = 0
    
    def __init__(self, database=None):
        if database is not None:
            self.db = database
            self.can_store = True
        else:
            self.can_store = False
    
    def run(self, *args, **kwargs):
        raise Exception("No run() method was specified for this scraper.")
    
    def insert_topic(self, unique_id, title, override=False, **kwargs):
        defaults = {
            "needs_enrollment": False,
            "creation_date": None,
            "start_date": None,
            "end_date": None,
            "parent_id": 0,
            "description": "",
            "provider_name": ""
        }
        
        # Fill in the default for every option that is missing or explicitly None.
        for kwarg, val in defaults.iteritems():
            if kwargs.get(kwarg) is None:
                kwargs[kwarg] = val
        
        c = self.db.cursor()
        
        if override:
            exists = False
        else:
            # Check whether this provider already has a topic with this id.
            c.execute("SELECT `Id` FROM topics WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            exists = (len(results) > 0)
        
        if exists:
            return (False, results[0][0])
        else:
            c.execute("INSERT INTO topics (`ParentId`, `Provider`, `ProviderId`, `Title`, `Description`, `Created`, `NeedsEnrollment`, `StartDate`, `EndDate`, `CustomProviderName`) "
                      "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["parent_id"], self.provider_id, unique_id, title, kwargs["description"], kwargs["creation_date"],
                      kwargs["needs_enrollment"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
            
            return (True, c.lastrowid)
    
    def insert_item(self, unique_id, title, item_url, override=False, **kwargs):
        defaults = {
            "views": None,
            "has_topic": False,
            "itemtype": 0,
            "source_url": item_url,
            "topic_id": 0,
            "parent_id": 0,
            "description": "",
            "date": None,
            "start_date": None,
            "end_date": None,
            "provider_name": ""
        }
        
        # Fill in the default for every option that is missing or explicitly None.
        for kwarg, val in defaults.iteritems():
            if kwargs.get(kwarg) is None:
                kwargs[kwarg] = val
        
        c = self.db.cursor()
        
        if override:
            exists = False
        else:
            # Check whether this provider already has an item with this id.
            c.execute("SELECT `Id` FROM items WHERE `Provider` = ? AND `ProviderId` = ? LIMIT 1", (self.provider_id, unique_id))
            results = c.fetchall()
            exists = (len(results) > 0)
        
        if exists:
            return (False, results[0][0])
        else:
            c.execute("INSERT INTO items (`HasTopic`, `Type`, `Provider`, `ProviderId`, `Title`, `Description`, `ItemUrl`, `SourceUrl`, `Views`, `TopicId`, `ParentId`, `Date`, `StartDate`, `EndDate`, `CustomProviderName`) "
                      "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (kwargs["has_topic"], kwargs["itemtype"], self.provider_id, unique_id, title, kwargs["description"], item_url, kwargs["source_url"],
                      kwargs["views"], kwargs["topic_id"], kwargs["parent_id"], kwargs["date"], kwargs["start_date"], kwargs["end_date"], kwargs["provider_name"]))
            
            return (True, c.lastrowid)
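Under the new model, adding a provider means subclassing `Scraper` and implementing `run()`; deduplication against the database is handled by the base class. A minimal hypothetical sketch (the provider id, ids, titles, and URL below are invented for illustration):

    import shared
    
    class Example(shared.Scraper):
        provider_id = 99  # hypothetical provider id
        
        def run(self):
            # insert_topic()/insert_item() return (inserted, rowid) and
            # skip rows that already exist for this provider.
            inserted, topic_id = self.insert_topic("topic-1", "Example Topic", description="An example topic")
            self.insert_item("item-1", "Example Item", "http://example.com/item", has_topic=True, itemtype=self.COURSE, topic_id=topic_id)
            self.env.log("Example scraper finished")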
@ -0,0 +1,8 @@
#!/usr/bin/env python
import shared, scrapers

env = shared.Environment()
env.connect(host="localhost", username="root", password="", database="learn")

scraper = env.Scraper(scrapers.Coursera)
scraper.run()
@ -1,47 +0,0 @@
import requests
import oursql
import datetime
import json
import lib


class CourseraCrawler(object):
    def __init__(self):
        self.db = lib.Database("localhost", "root")
    
    def retrieve_dataset(self):
        #self.dataset = requests.get("https://www.coursera.org/maestro/api/topic/list?full=1").json()
        self.dataset = json.loads(open("coursera.json", "r").read())
    
    def parse_dataset(self):
        for item in self.dataset:
            self.process_item(item)
    
    def process_item(self, item):
        inserted, rowid = self.db.insert_topic(2, str(item["id"]), item["name"], description=item["short_description"], needs_enrollment=True)
        
        if inserted:
            print "Inserted %s" % item["name"]
        else:
            print "Skipped %s" % item["name"]
        
        for course in item["courses"]:
            self.process_course(course, rowid)
    
    def process_course(self, course, topicid):
        try:
            start_date = datetime.datetime(course["start_year"], course["start_month"], course["start_day"])
            title = "%s: %s-%s-%s" % (course["name"], str(course["start_year"]).zfill(4), str(course["start_month"]).zfill(2), str(course["start_day"]).zfill(2))
        except TypeError, e:
            start_date = None
            title = "%s (date undetermined)" % (course["name"])
        
        inserted, itemid = self.db.insert_item(2, str(course["id"]), True, self.db.COURSE, title, course["home_link"], description=course["certificate_description"], start_date=start_date, topic_id=topicid)
        
        if inserted:
            print "\tInserted %s" % title
        else:
            print "\tSkipped %s" % title

crawler = CourseraCrawler()
crawler.retrieve_dataset()
crawler.parse_dataset()