Bits and pieces of a new scraper and task distribution mechanism, a (probably over-engineered) revisioned dict, and some ISBN scraping stuff

feature/insanity
Sven Slootweg 11 years ago
parent 6a0654b7cb
commit 4927b5e7a3

@@ -0,0 +1,7 @@
import requests, re

source = "http://www.loc.gov/z3950/gateway.html"

for match in re.findall('"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"', requests.get(source).text):
    host, port = match
    print "%s:%s" % (host, port)

@@ -0,0 +1,133 @@
import string, random

def random_id():
    return "".join(random.choice(string.lowercase + string.uppercase + string.digits) for x in xrange(0, 14))

class RevisionedDict(object):
    def __init__(self, parent=None):
        self.latest_revision = ""
        self.parent = parent
        self.revisions = {"": (None, {})} # revision ID -> (parent revision, object map); start with an empty root revision
        self.objects = {}

    def __eq__(self, other):
        # This is a tricky one... we need to compare this RevisionedDict against the other thing - which is almost certainly a dict.
        # We'll just compare keys and values.
        try:
            if set(self.keys()) != set(other.keys()):
                return False
        except AttributeError, e:
            return False # Not a dict(-like)

        latest_rev = self._get_latest_revision()

        for key, value in other.iteritems():
            if self.objects[latest_rev[key]] != value:
                return False

        return True

    def __len__(self):
        return len(self._get_latest_revision())

    def __getitem__(self, key):
        return self.objects[self._get_latest_revision()[key]]

    def __setitem__(self, key, value):
        obj = self._dump_latest_revision()
        obj[key] = value
        self.update(obj)

    def __delitem__(self, key):
        obj = self._dump_latest_revision()
        del obj[key]
        self.update(obj)

    def __contains__(self, key):
        return (key in self._get_latest_revision())

    def keys(self):
        return self._get_latest_revision().keys()

    def values(self):
        return [self.objects[id_] for id_ in self._get_latest_revision().values()]

    def items(self):
        return [(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()]

    # The below are awful... this really isn't how iterators are supposed to work
    def iterkeys(self):
        return iter(self._get_latest_revision().keys())

    def itervalues(self):
        return iter([self.objects[id_] for id_ in self._get_latest_revision().values()])

    def iteritems(self):
        return iter([(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()])

    # TODO: __iter__, __reversed__

    def _add_revision(self, data):
        object_map = {}
        latest_rev = self._get_latest_revision()
        anything_changed = False

        for key in data.keys():
            try:
                stored = self.objects[latest_rev[key]]
                # Nested dicts are stored as a (RevisionedDict, sub_revision_id) tuple
                is_nested = isinstance(stored, tuple) and isinstance(stored[0], RevisionedDict)

                if is_nested:
                    unchanged = (stored[0] == data[key])
                else:
                    unchanged = (stored == data[key])
            except KeyError, e:
                # Doesn't exist in last rev, new key
                is_nested = False
                unchanged = False

            if unchanged:
                # Leave as it is
                object_map[key] = latest_rev[key]
            else:
                # New data!
                new_id = random_id()

                if isinstance(data[key], dict):
                    if is_nested:
                        # Existing nested RevisionedDict, just need to update its values
                        sub_dict = stored[0]
                    else:
                        # New nested dict, give it its own RevisionedDict
                        sub_dict = RevisionedDict(parent=self)

                    new_sub_rev = sub_dict.update(data[key])
                    self.objects[new_id] = (sub_dict, new_sub_rev)
                else:
                    self.objects[new_id] = data[key]

                object_map[key] = new_id
                anything_changed = True

        if anything_changed:
            new_rev = random_id()
            self.revisions[new_rev] = (self.latest_revision, object_map) # (parent revision, new object map)
            return new_rev
        else:
            return self.latest_revision

    def _get_latest_revision(self):
        return self.revisions[self.latest_revision][1] # just the object map, not the parent pointer

    def _dump_latest_revision(self):
        obj = {}

        for key, id_ in self._get_latest_revision().iteritems():
            obj[key] = self.objects[id_]

        return obj

    def update(self, data):
        rev_id = self._add_revision(data)
        self.latest_revision = rev_id
        return rev_id
# TODO: compare!
# Problems:
# - How to handle list diffs? Can't just replace, would still lose data..
# - Over-engineering? Python already interns primitives, so no point in storing object references rather than just direct revision maps?
# -> Would still need to pre-process dicts and lists before storage, and compare them...
# Ideas:
# - Download PDF/EPUB headers and extract metadata from there
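
For reference, a minimal usage sketch of the RevisionedDict above (the keys and values are made up; as implemented, update() returns the ID of the newly stored revision):

d = RevisionedDict()
rev_a = d.update({"title": "Some book", "language": "eng"})
rev_b = d.update({"title": "Some book", "language": "nld"}) # only "language" gets a new object ID

print d["language"]   # -> "nld"; reads always follow the latest revision
print rev_a != rev_b  # -> True; every change produces a new revision ID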

@@ -0,0 +1,103 @@
import lxml.html, requests, urlparse, re
from lxml import etree
from datetime import datetime
endpoint = "http://caltsardragon.com:8080"
def get_date(string):
    # Because for whatever reason, strptime doesn't work
    month, year = string.split()

    month_map = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12
    }

    return (int(year), month_map[month])
# We'll retrieve a list of all book IDs for this installation
response = requests.get("%s/browse/category/allbooks" % endpoint)
xml = lxml.html.fromstring(response.text.encode("utf-8"))
book_ids = {}
for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
    response = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
    xml_titles = lxml.html.fromstring(response.json().encode("utf-8"))
    title_map = {}

    for subitem in xml_titles.xpath("//div[@class='summary']"):
        #print str(etree.tostring(subitem))
        id_ = subitem.xpath("div[@class='details-href']/@title")[0].split("/")[-1]
        title = subitem.xpath("div/div[@class='title']/strong/text()")
        book_ids[id_] = title

    print "Done %s..." % item
for id_, title in book_ids.iteritems():
    details_url = "/browse/details/%s" % id_
    cover_url = "/get/cover/%s" % id_

    response = requests.get(endpoint + details_url)
    xml = lxml.html.fromstring(response.json().encode("utf-8"))
    #print etree.tostring(xml)

    downloads = {}

    for item in xml.xpath("//div[@class='field formats']/a"):
        filetype = item.get("title")
        url = endpoint + item.get("href")
        downloads[filetype.lower()] = url

    isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
    amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
    google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")
    tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
    publish_date = [get_date(date) for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")]
    language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
    publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
    authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")
    series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")

    if len(series) > 0:
        try:
            series_title, series_id = re.match("(.+) \[(.+)\]$", series[0]).groups()
        except AttributeError, e:
            series_title, series_id = (None, None)
    else:
        series_title, series_id = (None, None)

    print "%s: %s" % (series_title, series_id)

    obj = {
        "ids": {
            "isbn": isbn,
            "amazon": amazon,
            "google": google,
        },
        "title": title,
        "authors": authors,
        "publishers": publishers,
        "publish_date": publish_date,
        "language": language,
        "tags": tags,
        "urls": downloads,
        "cover_url": cover_url,
        "series": [
            {
                "title": series_title,
                "item": series_id
            }
        ]
    }

    print obj
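
The obj dict built above roughly matches what BookTaskClassifier.get_pools() (further down in this commit) expects under "book_data": the classifier iterates over ids["isbn"] and calls .strip() on ids["amazon"] and ids["google"], so the list-valued fields here would presumably still need flattening. A sketch of the intended hand-off, with scrape_details() as a hypothetical helper wrapping the per-book loop above:

task_manager = BookTaskManager()

for id_, title in book_ids.iteritems():
    obj = scrape_details(id_, title) # hypothetical helper, not part of this commit
    task_manager.new(obj)            # routed into the "isbn"/"amazon"/"google"/"title" pools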

@@ -0,0 +1,65 @@
import requests, lxml.html, urlparse, time, socket
sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})
base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
uri = base_uri
all_results = []
while True:
    response = sess.get(uri)
    xml = lxml.html.fromstring(response.text)

    results = xml.xpath("//h3[@class='r']/a/@href")
    next_ = xml.xpath("//a[@id='pnnext']/@href")

    for result in results:
        all_results.append(result)

    if len(next_) > 0:
        uri = urlparse.urljoin(uri, next_[0])
    else:
        break

    time.sleep(1)
unique_results = []
for result in all_results:
    print "Testing %s..." % result

    try:
        response = requests.get(result, timeout=10)
    except requests.exceptions.RequestException, e:
        # Dead, skip
        continue
    except socket.timeout, e:
        # Also dead, this might be thrown instead of above (see https://github.com/kennethreitz/requests/issues/1797)
        continue

    if "Donate to support the development of calibre" not in response.text:
        # Fake...
        continue

    # Find base URI for this Calibre
    xml = lxml.html.fromstring(response.text.encode("utf-8"))

    try:
        base_path = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")[0]
    except IndexError, e:
        # Not found... probably not a Calibre, just a very good fake?
        continue

    result = urlparse.urljoin(result, base_path).rstrip("/")

    if result.endswith("/browse"):
        result = result[:-7]

    if result not in unique_results:
        print result
        unique_results.append(result)
for result in unique_results:
    pass # print result
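
Each entry in unique_results is the base URL of a discovered Calibre installation; presumably these are meant to take the place of the hard-coded endpoint in the details scraper above, along the lines of this sketch:

for endpoint in unique_results:
    # same per-installation walk as above: list /browse/category/allbooks,
    # then fetch /browse/details/<id> and /get/cover/<id> for every book found
    response = requests.get("%s/browse/category/allbooks" % endpoint)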

@@ -0,0 +1,156 @@
import uuid, os, json, shutil, re
from collections import defaultdict, deque

class TaskManager(object):
    def __init__(self):
        self.tasks = {}

    def get(self, task_id):
        return self.tasks[task_id]

    def put(self, task_id, task_data):
        # Persist on disk
        try:
            os.makedirs("task_data/%s" % task_id)
        except OSError, e:
            pass

        # The task is stored as JSON inside its own directory (the exact filename is an assumption)
        with open("task_data/%s/task.json" % task_id, "w") as task_file:
            task_file.write(json.dumps(task_data))

        # Store in RAM
        self.tasks[task_id] = task_data

    def delete(self, task_id):
        shutil.rmtree("task_data/%s" % task_id, ignore_errors=True)
        del self.tasks[task_id]
class TaskDistributor(object):
    def __init__(self):
        self.pools = defaultdict(deque)
        self.processing = defaultdict(list)

    def put(self, pool, task_id):
        self.pools[pool].append(task_id)

    def get(self, pool):
        task_id = self.pools[pool].popleft()
        self.processing[pool].append(task_id)
        return task_id

    def get_pools(self, task_id):
        return [pool for pool in self.pools if task_id in self.pools[pool]]

    def done(self, pool, task_id):
        self.processing[pool].remove(task_id)

    def fail(self, pool, task_id):
        # Re-add
        self.processing[pool].remove(task_id)
        self.put(pool, task_id)
class IsbnProcessor(object):
    def clean(self, isbn):
        isbn = isbn.upper().replace("-", "").replace(" ", "")

        if len(isbn) == 9: # 9 digit SBN
            isbn = "0" + isbn

        return isbn

    def validate(self, isbn):
        isbn = self.clean(isbn)

        if len(isbn) == 10:
            total = 0

            for i in xrange(0, 9):
                total += (int(isbn[i]) * (10 - i))

            check_digit = (11 - (total % 11)) % 11 # a remainder of 0 means a check digit of 0, not 11

            if check_digit == 10:
                check_digit = "X"
            else:
                check_digit = str(check_digit)

            return (check_digit == isbn[9])
        elif len(isbn) == 13:
            # 1-indexed odd positions are weighted 1, even positions are weighted 3
            odd = True
            total = 0

            for i in xrange(0, 12):
                if odd:
                    total += int(isbn[i])
                else:
                    total += int(isbn[i]) * 3

                odd = not odd

            check_digit = 10 - (total % 10)

            if check_digit == 10:
                check_digit = 0

            check_digit = str(check_digit)

            return (check_digit == isbn[12])
        else:
            return False
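
    # Worked example (illustrative): for the valid ISBN-10 "0306406152" the
    # weighted sum of the first nine digits is
    #   0*10 + 3*9 + 0*8 + 6*7 + 4*6 + 0*5 + 6*4 + 1*3 + 5*2 = 130,
    # and (11 - (130 % 11)) % 11 = 2 matches the last digit. For the matching
    # ISBN-13 "9780306406157" the alternating 1/3-weighted sum of the first
    # twelve digits is 93, and 10 - (93 % 10) = 7 matches the check digit, so
    # validate() returns True for both forms.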
class BookTaskClassifier(object):
    def __init__(self):
        self.isbn = IsbnProcessor()

    def get_pools(self, task_data):
        eligible_pools = []

        try:
            for isbn in [isbn.strip() for isbn in task_data["book_data"]["ids"]["isbn"]]:
                if self.isbn.validate(isbn) and "isbn" not in task_data["pools_done"]:
                    eligible_pools.append("isbn")
        except KeyError, e:
            pass

        for identifier in ("amazon", "google"):
            try:
                if task_data["book_data"]["ids"][identifier].strip() != "" and identifier not in task_data["pools_done"]:
                    eligible_pools.append(identifier)
            except KeyError, e:
                pass

        if len(eligible_pools) == 0 and "title" not in task_data["pools_done"]:
            eligible_pools.append("title")

        return eligible_pools
class BookTaskManager(object):
    def __init__(self):
        self.manager = TaskManager()
        self.distributor = TaskDistributor()
        self.classifier = BookTaskClassifier()

    def new(self, book_data):
        task_id = str(uuid.uuid4()) # stored as a string so it stays JSON-serializable

        task_data = {
            "id": task_id,
            "book_data": book_data,
            "pools_done": [],
            "logs": [],
            "flags": []
        }

        self.manager.put(task_id, task_data)
        self.enqueue(task_id)

    def enqueue(self, task_id):
        task_data = self.manager.get(task_id)
        pools = self.classifier.get_pools(task_data)

        if len(pools) > 0:
            for pool in pools:
                self.distributor.put(pool, task_id)
        else:
            # No more pools to put this into... this is the best we have!
            pass

    def get(self, pool):
        task_id = self.distributor.get(pool)
        return task_id

    def done(self, pool, task_data):
        self.distributor.done(pool, task_data["id"])
        self.manager.put(task_data["id"], task_data) # conflicts..
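
A rough end-to-end sketch of how these pieces appear intended to fit together; the book_data values are made up, and a worker loop like this is not part of the commit:

book_manager = BookTaskManager()

# Queue a scraped book; get_pools() routes it into the "isbn" pool because the
# ISBN below passes IsbnProcessor.validate().
book_manager.new({
    "title": ["Example Book"],
    "ids": {"isbn": ["978-0-306-40615-7"], "amazon": "", "google": ""}
})

# A hypothetical ISBN worker then pulls work from its pool:
task_id = book_manager.get("isbn")
task_data = book_manager.manager.get(task_id)
# ... look up the ISBN somewhere, append results to task_data["logs"], mark the pool done ...
task_data["pools_done"].append("isbn")
book_manager.done("isbn", task_data)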