Compare commits
2 Commits
master...feature/in
| Author | SHA1 | Date |
|---|---|---|
| Sven Slootweg | 1f32cac481 | 11 years ago |
| Sven Slootweg | 4927b5e7a3 | 11 years ago |
@@ -0,0 +1,7 @@
import requests, re

source = "http://www.loc.gov/z3950/gateway.html"

for match in re.findall('"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"', requests.get(source).text):
    host, port = match
    print "%s:%s" % (host, port)
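For reference, a minimal sketch of what that regex captures, run against a hypothetical sample of the gateway markup (the host and port are made up, not taken from the real page):

import re

sample = '"http://www.loc.gov/cgi-bin/zgstart?ACTION=INIT&FORM_HOST_PORT=/prod/www/data/z3950/dummy.html,z3950.example.org,210"'

# findall with two groups yields (host, port) tuples
for host, port in re.findall('"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"', sample):
    print "%s:%s" % (host, port)  # z3950.example.org:210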
@@ -0,0 +1,159 @@
# Problems:
# - How to handle list diffs? Can't just replace, would still lose data...
# - Over-engineering? Python already interns primitives, so there's no point in storing object references rather than just direct revision maps?
#   -> Would still need to pre-process dicts and lists before storage, and compare them...

# Ideas:
# - Download PDF/EPUB headers and extract metadata from there

import string, random, copy
from collections import OrderedDict
from difflib import SequenceMatcher

class RevisionedDict(object):
    def __init__(self, data):
        self._revisions = OrderedDict({})
        self._applied_diffs = []
        self._add_revision(data)

    def _generate_revision_id(self):
        return "".join(random.choice(string.lowercase + string.uppercase + string.digits) for x in xrange(0, 14))

    def _add_revision(self, data):
        new_revision_id = self._generate_revision_id()
        self._revisions[new_revision_id] = copy.deepcopy(data)
        self._last_revision = new_revision_id
        return new_revision_id

    def _get_last_revision(self): # Always returns a copy!
        base_revision = copy.deepcopy(self._revisions[self._last_revision])
        base_revision["_rev"] = self._last_revision # Makes it possible to identify the source revision of a modified serialized object later
        return base_revision

    def _apply_diff(self, diff):
        new_data = diff.apply(self._get_last_revision(), self._diffs_since(diff.origin_revision))
        new_revision_id = self._add_revision(new_data)
        self._applied_diffs.append((new_revision_id, diff))

    def _diffs_since(self, revision_id):
        try:
            revision_index = next(x for x in enumerate(self._applied_diffs) if x[1][0] == revision_id)
            return [x[1] for x in self._applied_diffs[revision_index[0] + 1:]]
        except StopIteration, e:
            return [x[1] for x in self._applied_diffs]

    def update(self, data):
        diff = self.autodiff(data)
        self._apply_diff(diff)

    def diff(self, data, origin_revision):
        # Figure out if any revisions happened in the meantime
        return RevisionedDictDiff(data, self._revisions[origin_revision], origin_revision)

    def autodiff(self, data):
        # Takes the revision number from the data
        return self.diff(data, data["_rev"])

class RevisionedDictDiff(object):
    def __init__(self, data, origin_data, origin_revision):
        self.origin_revision = origin_revision
        self._diff_data = self._diff_structure(data, origin_data)

    def _diff_structure(self, structure, origin_structure, structure_key=None):
        if isinstance(structure, dict):
            if isinstance(origin_structure, dict):
                # Compare dicts
                opcodes = []

                removed_keys = set(origin_structure.keys()) - set(structure.keys())

                for key in removed_keys:
                    opcodes.append(("delete", key))

                new_keys = set(structure.keys()) - set(origin_structure.keys())

                for key in new_keys:
                    if key != "_rev": # Ignore the added _rev key
                        opcodes.append(("insert", key, structure[key]))

                for key, value in structure.iteritems():
                    if key not in new_keys:
                        if value == origin_structure[key]:
                            opcodes.append(("equal", key))
                        else:
                            if isinstance(value, dict) or isinstance(value, list):
                                opcodes.append(self._diff_structure(value, origin_structure[key], structure_key=key))
                            else:
                                opcodes.append(("replace", key, value))

                return ("dict", structure_key, opcodes)
            else:
                return ("replace", structure)
        elif isinstance(structure, list):
            if isinstance(origin_structure, list):
                # Compare lists (does NOT support nested dictionaries yet!)
                return ("list", structure_key, SequenceMatcher(a=origin_structure, b=structure, autojunk=False).get_opcodes())
            else:
                return ("replace", structure)
        else:
            return ("replace", structure)

    def _apply_structure(self, structure, diff_data, intermediate_diffs):
        pass
        # for every key
        #   if list
        #     calculate_offsets (TODO)
        #     apply structure
        #   if dict
        #     apply structure
        #   else
        #     apply diff data
        # return key

    def apply(self, data, intermediate_diffs=None):
        # This will apply the diff against the specified source data
        if intermediate_diffs is None:
            intermediate_diffs = []
        data = copy.deepcopy(data)
        self._apply_structure(data, self._diff_data, intermediate_diffs)
        return data # _apply_diff expects the new data back

origin = {
    "type": "message",
    "data": {
        "title": "Sample title",
        "author": "Sample author",
        "isbn": ["a0", "a1", "a2", "a3"],
        "description": ["test one", "test two"],
        "eq": ["a", "b", "c"]
    }
}

"""
revdict = RevisionedDict(origin)
origin = revdict._get_last_revision()

origin["herp"] = "derp"
origin["data"]["isbn"].remove("a2")
origin["data"]["isbn"].insert(0, "a4")
origin["data"]["author"] = "Other author"

#import json
#print json.dumps(revdict.autodiff(origin)._diff_data, indent=4)

revdict.update(origin)
"""

revdict = RevisionedDict(origin)

for i in xrange(0, 5):
    x = revdict._add_revision("blah")
    revdict._applied_diffs.append((x, i))

base_rev = revdict._last_revision

for i in xrange(5, 10):
    x = revdict._add_revision("blah")
    revdict._applied_diffs.append((x, i))

print revdict._diffs_since(base_rev)
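The open question under "# Problems" above is how list diffs avoid wholesale replacement; the list branch of _diff_structure leans on difflib's SequenceMatcher opcodes for exactly that. A minimal sketch, reusing the isbn mutations from the commented-out test above:

from difflib import SequenceMatcher

old = ["a0", "a1", "a2", "a3"]
new = ["a4", "a0", "a1", "a3"]  # "a2" removed, "a4" inserted at the front

# Each opcode is (tag, i1, i2, j1, j2): turn old[i1:i2] into new[j1:j2]
for opcode in SequenceMatcher(a=old, b=new, autojunk=False).get_opcodes():
    print opcode
# ('insert', 0, 0, 0, 1)
# ('equal', 0, 2, 1, 3)
# ('delete', 2, 3, 3, 3)
# ('equal', 3, 4, 3, 4)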
@@ -0,0 +1,103 @@
import lxml.html, requests, urlparse, re
from lxml import etree
from datetime import datetime

endpoint = "http://caltsardragon.com:8080"

def get_date(string):
    # Because for whatever reason, strptime doesn't work
    month, year = string.split()
    month_map = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12
    }

    return (int(year), month_map[month])

# We'll retrieve a list of all book IDs for this installation
response = requests.get("%s/browse/category/allbooks" % endpoint)
xml = lxml.html.fromstring(response.text.encode("utf-8"))
book_ids = {}

for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
    response = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
    xml_titles = lxml.html.fromstring(response.json().encode("utf-8"))
    title_map = {}

    for subitem in xml_titles.xpath("//div[@class='summary']"):
        #print str(etree.tostring(subitem))
        id_ = subitem.xpath("div[@class='details-href']/@title")[0].split("/")[-1]
        title = subitem.xpath("div/div[@class='title']/strong/text()")
        book_ids[id_] = title

    print "Done %s..." % item

for id_, title in book_ids.iteritems():
    details_url = "/browse/details/%s" % id_
    cover_url = "/get/cover/%s" % id_

    response = requests.get(endpoint + details_url)
    xml = lxml.html.fromstring(response.json().encode("utf-8"))
    #print etree.tostring(xml)

    downloads = {}

    for item in xml.xpath("//div[@class='field formats']/a"):
        filetype = item.get("title")
        url = endpoint + item.get("href")
        downloads[filetype.lower()] = url

    isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
    amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
    google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")

    tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
    publish_date = [get_date(date) for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")]
    language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
    publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
    authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")

    series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")
    if len(series) > 0:
        try:
            series_title, series_id = re.match("(.+) \[(.+)\]$", series[0]).groups()
        except AttributeError, e:
            series_title, series_id = (None, None)
    else:
        series_title, series_id = (None, None)

    print "%s: %s" % (series_title, series_id)

    obj = {
        "ids": {
            "isbn": isbn,
            "amazon": amazon,
            "google": google,
        },
        "title": title,
        "authors": authors,
        "publishers": publishers,
        "publish_date": publish_date,
        "language": language,
        "tags": tags,
        "urls": downloads,
        "cover_url": cover_url,
        "series": [
            {
                "title": series_title,
                "item": series_id
            }
        ]
    }

    print obj
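All of the field extraction above is plain XPath against Calibre's /browse HTML. A self-contained sketch of the same pattern, run against a hypothetical fragment shaped the way those expressions expect (the markup is illustrative, not a captured page):

import lxml.html

# Hypothetical fragment mirroring the class names the XPath above targets
snippet = """
<div class="field formats">
    <a title="EPUB" href="/get/epub/42">EPUB</a>
    <a title="MOBI" href="/get/mobi/42">MOBI</a>
</div>
"""

xml = lxml.html.fromstring(snippet)

downloads = {}
for item in xml.xpath("//div[@class='field formats']/a"):
    downloads[item.get("title").lower()] = item.get("href")

print downloads  # {'epub': '/get/epub/42', 'mobi': '/get/mobi/42'}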
@@ -0,0 +1,65 @@
import requests, lxml.html, urlparse, time, socket

sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})

base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"

uri = base_uri

all_results = []

while True:
    response = sess.get(uri)
    xml = lxml.html.fromstring(response.text)

    results = xml.xpath("//h3[@class='r']/a/@href")
    next_ = xml.xpath("//a[@id='pnnext']/@href")

    for result in results:
        all_results.append(result)

    if len(next_) > 0:
        uri = urlparse.urljoin(uri, next_[0])
    else:
        break

    time.sleep(1)

unique_results = []

for result in all_results:
    print "Testing %s..." % result
    try:
        response = requests.get(result, timeout=10)
    except requests.exceptions.RequestException, e:
        # Dead, skip
        continue
    except socket.timeout, e:
        # Also dead, this might be thrown instead of the above (see https://github.com/kennethreitz/requests/issues/1797)
        continue

    if "Donate to support the development of calibre" not in response.text:
        # Fake...
        continue

    # Find base URI for this Calibre
    xml = lxml.html.fromstring(response.text.encode("utf-8"))

    try:
        base_path = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")[0]
    except IndexError, e:
        # Not found... probably not a Calibre, just a very good fake?
        continue

    result = urlparse.urljoin(result, base_path).rstrip("/")

    if result.endswith("/browse"):
        result = result[:-7]

    if result not in unique_results:
        print result
        unique_results.append(result)

for result in unique_results:
    pass  # print result
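The pagination loop depends on urlparse.urljoin resolving Google's root-relative "next" links against the current URI. A one-line illustration (the start=10 path is made up for the example, not captured from a live results page):

import urlparse

uri = "http://www.google.com/search?q=calibre"
# A root-relative link replaces the path and query of the base URI
print urlparse.urljoin(uri, "/search?q=calibre&start=10")
# http://www.google.com/search?q=calibre&start=10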
@@ -0,0 +1,156 @@
import uuid, os, json, shutil, re
from collections import defaultdict, deque

class TaskManager(object):
    def __init__(self):
        self.tasks = {}

    def get(self, task_id):
        return self.tasks[task_id]

    def put(self, task_id, task_data):
        # Persist on disk
        try:
            os.makedirs("task_data/%s" % task_id)
        except OSError, e:
            pass

        with open("task_data/%s/task.json" % task_id, "w") as task_file: # "task.json" is an assumed file name for the per-task data
            task_file.write(json.dumps(task_data))

        # Store in RAM
        self.tasks[task_id] = task_data

    def delete(self, task_id):
        shutil.rmtree("task_data/%s" % task_id, ignore_errors=True)
        del self.tasks[task_id]

class TaskDistributor(object):
    def __init__(self):
        self.pools = defaultdict(deque)
        self.processing = defaultdict(list)

    def put(self, pool, task_id):
        self.pools[pool].append(task_id)

    def get(self, pool):
        task_id = self.pools[pool].popleft()
        self.processing[pool].append(task_id)
        return task_id

    def get_pools(self, task_id):
        return [pool for pool in self.pools if task_id in self.pools[pool]]

    def done(self, pool, task_id):
        self.processing[pool].remove(task_id)

    def fail(self, pool, task_id):
        # Re-add
        self.processing[pool].remove(task_id)
        self.put(pool, task_id)

class IsbnProcessor(object):
    def clean(self, isbn):
        isbn = isbn.upper().replace("-", "").replace(" ", "")

        if len(isbn) == 9: # 9-digit SBN
            isbn = "0" + isbn

        return isbn

    def validate(self, isbn):
        isbn = self.clean(isbn)

        if len(isbn) == 10:
            total = 0
            for i in xrange(0, 9):
                total += (int(isbn[i]) * (10 - i))

            check_digit = (11 - (total % 11)) % 11 # the outer modulo maps 11 back to 0
            if check_digit == 10:
                check_digit = "X"
            else:
                check_digit = str(check_digit)

            return (check_digit == isbn[9])
        elif len(isbn) == 13:
            odd = True # 1-based position 1 is odd and carries weight 1; even positions carry weight 3
            total = 0
            for i in xrange(0, 12):
                if odd:
                    total += int(isbn[i])
                else:
                    total += int(isbn[i]) * 3
                odd = not odd

            check_digit = 10 - (total % 10)
            if check_digit == 10:
                check_digit = 0
            check_digit = str(check_digit)

            return (check_digit == isbn[12])
        else:
            return False

class BookTaskClassifier(object):
    def __init__(self):
        self.isbn = IsbnProcessor()

    def get_pools(self, task_data):
        eligible_pools = []

        try:
            for isbn in [isbn.strip() for isbn in task_data["book_data"]["ids"]["isbn"]]:
                if self.isbn.validate(isbn) and "isbn" not in task_data["pools_done"]:
                    eligible_pools.append("isbn")
        except KeyError, e:
            pass

        for identifier in ("amazon", "google"):
            try:
                if task_data["book_data"]["ids"][identifier].strip() != "" and identifier not in task_data["pools_done"]:
                    eligible_pools.append(identifier)
            except KeyError, e:
                pass

        if len(eligible_pools) == 0 and "title" not in task_data["pools_done"]:
            eligible_pools.append("title")

        return eligible_pools

class BookTaskManager(object):
    def __init__(self):
        self.manager = TaskManager()
        self.distributor = TaskDistributor()
        self.classifier = BookTaskClassifier()

    def new(self, book_data):
        task_id = str(uuid.uuid4()) # stringified so the task data stays JSON-serializable
        task_data = {
            "id": task_id,
            "book_data": book_data,
            "pools_done": [],
            "logs": [],
            "flags": []
        }

        self.manager.put(task_id, task_data)
        self.enqueue(task_id)

    def enqueue(self, task_id):
        task_data = self.manager.get(task_id)
        pools = self.classifier.get_pools(task_data)

        if len(pools) > 0:
            for pool in pools:
                self.distributor.put(pool, task_id)
        else:
            # No more pools to put this into... this is the best we have!
            pass

    def get(self, pool):
        task_id = self.distributor.get(pool)
        return task_id

    def done(self, pool, task_data):
        self.distributor.done(pool, task_data["id"])
        self.manager.put(task_data["id"], task_data) # conflicts..
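The validator implements the standard ISBN-10 (mod 11) and ISBN-13 (mod 10) check-digit algorithms. A quick sanity check against a well-known valid pair, ISBN-10 0-306-40615-2 and its ISBN-13 form 978-0-306-40615-7:

proc = IsbnProcessor()

print proc.validate("0-306-40615-2")      # True  (weighted sum 130, check digit (11 - 130 % 11) % 11 = 2)
print proc.validate("978-0-306-40615-7")  # True  (weighted sum 93, check digit 10 - 93 % 10 = 7)
print proc.validate("0-306-40615-9")      # False (wrong check digit)

And a TaskDistributor round trip, showing how a task moves from a pool's queue into its processing list and back out:

dist = TaskDistributor()
dist.put("isbn", "task-1")
task_id = dist.get("isbn")   # "task-1" moves into dist.processing["isbn"]
dist.done("isbn", task_id)   # removed from processing; finished for this pool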