From 4927b5e7a380e5fdfda1517b9743810186e0f823 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Sun, 15 Dec 2013 17:45:35 +0100
Subject: [PATCH] Bits and pieces of a new scraper and task distribution
 mechanism, and a (probably over-engineered) revisioned dict, also some ISBN
 scraping stuff

---
 isbn-scraper/tools/get-z3950-sources.py |   7 ++
 newcrawler/rev.py                       | 133 ++++++++++++++++++++
 newcrawler/scrape/calibre.py            | 103 ++++++++++++++++
 newcrawler/scrape/find-calibre.py       |  65 ++++++++++
 newcrawler/tasks.py                     | 156 ++++++++++++++++++++++++
 5 files changed, 464 insertions(+)
 create mode 100644 isbn-scraper/tools/get-z3950-sources.py
 create mode 100644 newcrawler/rev.py
 create mode 100644 newcrawler/scrape/calibre.py
 create mode 100644 newcrawler/scrape/find-calibre.py
 create mode 100644 newcrawler/tasks.py

diff --git a/isbn-scraper/tools/get-z3950-sources.py b/isbn-scraper/tools/get-z3950-sources.py
new file mode 100644
index 0000000..b5a8d72
--- /dev/null
+++ b/isbn-scraper/tools/get-z3950-sources.py
@@ -0,0 +1,7 @@
+import requests, re
+
+source = "http://www.loc.gov/z3950/gateway.html"
+
+for match in re.findall('"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"', requests.get(source).text):
+    host, port = match
+    print "%s:%s" % (host, port)
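For reference, each matching gateway link on the LoC page carries the human-readable host and port as its last two comma-separated fields; a minimal sketch of what the regex extracts (the sample href below is made up for illustration):

import re

pattern = r'"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"'
# Hypothetical link as it would appear in the gateway page's HTML source:
sample = '"http://www.loc.gov/cgi-bin/zgstart?ACTION=INIT&FORM_HOST_PORT=/prod/www/data/z3950/example.html,z3950.example.edu,210"'

print re.findall(pattern, sample)  # [('z3950.example.edu', '210')]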
diff --git a/newcrawler/rev.py b/newcrawler/rev.py
new file mode 100644
index 0000000..9d5902a
--- /dev/null
+++ b/newcrawler/rev.py
@@ -0,0 +1,133 @@
+import string, random
+
+def random_id():
+    return "".join(random.choice(string.lowercase + string.uppercase + string.digits) for x in xrange(0, 14))
+
+class RevisionedDict(object):
+    def __init__(self, parent=None):
+        self.latest_revision = ""
+        self.parent = parent
+        self.revisions = {"": (None, {})}  # start with an empty root revision so lookups work on a fresh dict
+        self.objects = {}
+
+    def __eq__(self, other):
+        # This is a tricky one... we need to compare this RevisionedDict against the other thing - which is almost certainly a dict.
+        # We'll just compare keys and values.
+        try:
+            if set(self.keys()) != set(other.keys()):
+                return False
+        except AttributeError, e:
+            return False  # Not a dict(-like)
+
+        latest_rev = self._get_latest_revision()
+        for key, value in other.iteritems():
+            if self.objects[latest_rev[key]] != value:
+                return False
+
+        return True
+
+    def __len__(self):
+        return len(self._get_latest_revision())
+
+    def __getitem__(self, key):
+        return self.objects[self._get_latest_revision()[key]]
+
+    def __setitem__(self, key, value):
+        obj = self._dump_latest_revision()
+        obj[key] = value
+        self.update(obj)
+
+    def __delitem__(self, key):
+        obj = self._dump_latest_revision()
+        del obj[key]
+        self.update(obj)
+
+    def __contains__(self, key):
+        return (key in self._get_latest_revision())
+
+    def keys(self):
+        return self._get_latest_revision().keys()
+
+    def values(self):
+        return [self.objects[id_] for id_ in self._get_latest_revision().values()]
+
+    def items(self):
+        return [(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()]
+
+    # The below are awful... this really isn't how iterators are supposed to work.
+
+    def iterkeys(self):
+        return iter(self._get_latest_revision().keys())
+
+    def itervalues(self):
+        return iter([self.objects[id_] for id_ in self._get_latest_revision().values()])
+
+    def iteritems(self):
+        return iter([(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()])
+
+    # TODO: __iter__, __reversed__
+
+    def _add_revision(self, data):
+        object_map = {}
+        latest_rev = self._get_latest_revision()
+        anything_changed = False
+
+        for key in data.keys():
+            is_dict = False
+
+            try:
+                try:
+                    is_dict = isinstance(self.objects[latest_rev[key]][0], RevisionedDict)
+                except (IndexError, TypeError), e:
+                    is_dict = False
+
+                if is_dict:
+                    unchanged = (self.objects[latest_rev[key]][0] == data[key])
+                else:
+                    unchanged = (self.objects[latest_rev[key]] == data[key])
+            except KeyError, e:
+                # Doesn't exist in last rev, new key
+                unchanged = False
+
+            if unchanged:
+                # Leave as it is
+                object_map[key] = latest_rev[key]
+            else:
+                # New data!
+                new_id = random_id()
+
+                if is_dict and isinstance(data[key], dict):
+                    # Existing nested RevisionedDict; just push the new values into it
+                    sub_dict = self.objects[latest_rev[key]][0]
+                    new_sub_rev = sub_dict.update(data[key])
+                    self.objects[new_id] = (sub_dict, new_sub_rev)
+                else:
+                    self.objects[new_id] = data[key]
+
+                object_map[key] = new_id
+                anything_changed = True
+
+        if anything_changed:
+            new_rev = random_id()
+            self.revisions[new_rev] = (self.latest_revision, object_map)  # (parent revision, new object map)
+            return new_rev
+        else:
+            return self.latest_revision
+
+    def _get_latest_revision(self):
+        # Only the object map; the parent revision id lives in the first tuple element.
+        return self.revisions[self.latest_revision][1]
+
+    def _dump_latest_revision(self):
+        obj = {}
+        for key, id_ in self._get_latest_revision().iteritems():
+            obj[key] = self.objects[id_]
+        return obj
+
+    def update(self, data):
+        rev_id = self._add_revision(data)
+        self.latest_revision = rev_id
+        return rev_id
+
+    # TODO: compare!
+
+# Problems:
+# - How to handle list diffs? Can't just replace, would still lose data...
+# - Over-engineering? Python already interns primitives, so no point in storing object references rather than just direct revision maps?
+#   -> Would still need to pre-process dicts and lists before storage, and compare them...
+
+# Ideas:
+# - Download PDF/EPUB headers and extract metadata from there
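A minimal usage sketch of the revisioned dict, assuming rev.py is importable as a module named rev; unchanged keys keep their stored object ids, while changed values produce a new revision id:

from rev import RevisionedDict

book = RevisionedDict()
first_rev = book.update({"title": "Some Book", "language": "eng"})
second_rev = book.update({"title": "Some Book", "language": "spa"})

print book["language"]          # "spa" - reads always go through the latest revision
print len(book.revisions)       # 3 - the empty root revision plus one per update() that changed something
print first_rev == second_rev   # False - the second update changed a value, so it got a new revision id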
diff --git a/newcrawler/scrape/calibre.py b/newcrawler/scrape/calibre.py
new file mode 100644
index 0000000..9534826
--- /dev/null
+++ b/newcrawler/scrape/calibre.py
@@ -0,0 +1,103 @@
+import lxml.html, requests, urlparse, re
+from lxml import etree
+from datetime import datetime
+
+endpoint = "http://caltsardragon.com:8080"
+
+def get_date(string):
+    # Because for whatever reason, strptime doesn't work
+    month, year = string.split()
+    month_map = {
+        "Jan": 1,
+        "Feb": 2,
+        "Mar": 3,
+        "Apr": 4,
+        "May": 5,
+        "Jun": 6,
+        "Jul": 7,
+        "Aug": 8,
+        "Sep": 9,
+        "Oct": 10,
+        "Nov": 11,
+        "Dec": 12
+    }
+
+    return (int(year), month_map[month])
+
+
+# We'll retrieve a list of all book IDs for this installation
+response = requests.get("%s/browse/category/allbooks" % endpoint)
+xml = lxml.html.fromstring(response.text.encode("utf-8"))
+book_ids = {}
+
+for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
+    response = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
+    xml_titles = lxml.html.fromstring(response.json().encode("utf-8"))
+    title_map = {}
+
+    for subitem in xml_titles.xpath("//div[@class='summary']"):
+        #print str(etree.tostring(subitem))
+        id_ = subitem.xpath("div[@class='details-href']/@title")[0].split("/")[-1]
+        title = subitem.xpath("div/div[@class='title']/strong/text()")
+        book_ids[id_] = title
+    print "Done %s..." % item
+
+for id_, title in book_ids.iteritems():
+    details_url = "/browse/details/%s" % id_
+    cover_url = "/get/cover/%s" % id_
+
+    response = requests.get(endpoint + details_url)
+    xml = lxml.html.fromstring(response.json().encode("utf-8"))
+    #print etree.tostring(xml)
+
+    downloads = {}
+
+    for item in xml.xpath("//div[@class='field formats']/a"):
+        filetype = item.get("title")
+        url = endpoint + item.get("href")
+        downloads[filetype.lower()] = url
+
+    isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
+    amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
+    google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")
+
+    tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
+    publish_date = [get_date(date) for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")]
+    language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
+    publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
+    authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")
+
+    series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")
+    if len(series) > 0:
+        try:
+            series_title, series_id = re.match("(.+) \[(.+)\]$", series[0]).groups()
+        except AttributeError, e:
+            series_title, series_id = (None, None)
+    else:
+        series_title, series_id = (None, None)
+
+    print "%s: %s" % (series_title, series_id)
+
+    obj = {
+        "ids": {
+            "isbn": isbn,
+            "amazon": amazon,
+            "google": google,
+        },
+        "title": title,
+        "authors": authors,
+        "publishers": publishers,
+        "publish_date": publish_date,
+        "language": language,
+        "tags": tags,
+        "urls": downloads,
+        "cover_url": cover_url,
+        "series": [
+            {
+                "title": series_title,
+                "item": series_id
+            }
+        ]
+    }
+
+    print obj
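The get_date helper above only needs to turn Calibre's "Mon YYYY" display strings into sortable (year, month) tuples, for example:

get_date("Mar 2009")   # -> (2009, 3)
get_date("Dec 2013")   # -> (2013, 12)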
diff --git a/newcrawler/scrape/find-calibre.py b/newcrawler/scrape/find-calibre.py
new file mode 100644
index 0000000..8efdbce
--- /dev/null
+++ b/newcrawler/scrape/find-calibre.py
@@ -0,0 +1,65 @@
+import requests, lxml.html, urlparse, time, socket
+
+sess = requests.Session()
+sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})
+
+base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
+
+uri = base_uri
+
+all_results = []
+
+while True:
+    response = sess.get(uri)
+    xml = lxml.html.fromstring(response.text)
+
+    results = xml.xpath("//h3[@class='r']/a/@href")
+    next_ = xml.xpath("//a[@id='pnnext']/@href")
+
+    for result in results:
+        all_results.append(result)
+
+    if len(next_) > 0:
+        uri = urlparse.urljoin(uri, next_[0])
+    else:
+        break
+
+    time.sleep(1)
+
+unique_results = []
+
+for result in all_results:
+    print "Testing %s..." % result
+    try:
+        response = requests.get(result, timeout=10)
+    except requests.exceptions.RequestException, e:
+        # Dead, skip
+        continue
+    except socket.timeout, e:
+        # Also dead, this might be thrown instead of the above (see https://github.com/kennethreitz/requests/issues/1797)
+        continue
+
+    if "Donate to support the development of calibre" not in response.text:
+        # Fake...
+        continue
+
+    # Find the base URI for this Calibre installation
+    xml = lxml.html.fromstring(response.text.encode("utf-8"))
+
+    try:
+        base_path = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")[0]
+    except IndexError, e:
+        # Not found... probably not a Calibre, just a very good fake?
+        continue
+
+    result = urlparse.urljoin(result, base_path).rstrip("/")
+
+    if result.endswith("/browse"):
+        result = result[:-7]
+
+    if result not in unique_results:
+        print result
+        unique_results.append(result)
+
+for result in unique_results:
+    pass  # print result
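For illustration, the normalization at the end of the loop reduces whatever page Google returned to the root of the Calibre installation; the values below are hypothetical:

import urlparse

result = "http://calibre.example.com:8080/browse/category/allbooks"   # hypothetical search hit
base_path = "/browse"                                                  # hypothetical header link

root = urlparse.urljoin(result, base_path).rstrip("/")   # "http://calibre.example.com:8080/browse"
if root.endswith("/browse"):
    root = root[:-7]                                     # "http://calibre.example.com:8080"
print root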
diff --git a/newcrawler/tasks.py b/newcrawler/tasks.py
new file mode 100644
index 0000000..b4afb5e
--- /dev/null
+++ b/newcrawler/tasks.py
@@ -0,0 +1,156 @@
+import uuid, os, json, shutil, re
+from collections import defaultdict, deque
+
+class TaskManager(object):
+    def __init__(self):
+        self.tasks = {}
+
+    def get(self, task_id):
+        return self.tasks[task_id]
+
+    def put(self, task_id, task_data):
+        # Persist on disk
+        try:
+            os.makedirs("task_data/%s" % task_id)
+        except OSError, e:
+            pass
+
+        # Assumption: the task blob lives in a JSON file inside its task directory.
+        with open("task_data/%s/task.json" % task_id, "w") as task_file:
+            task_file.write(json.dumps(task_data))
+
+        # Store in RAM
+        self.tasks[task_id] = task_data
+
+    def delete(self, task_id):
+        shutil.rmtree("task_data/%s" % task_id, ignore_errors=True)
+        del self.tasks[task_id]
+
+class TaskDistributor(object):
+    def __init__(self):
+        self.pools = defaultdict(deque)
+        self.processing = defaultdict(list)
+
+    def put(self, pool, task_id):
+        self.pools[pool].append(task_id)
+
+    def get(self, pool):
+        task_id = self.pools[pool].popleft()
+        self.processing[pool].append(task_id)
+        return task_id
+
+    def get_pools(self, task_id):
+        return [pool for pool in self.pools if task_id in self.pools[pool]]
+
+    def done(self, pool, task_id):
+        self.processing[pool].remove(task_id)
+
+    def fail(self, pool, task_id):
+        # Re-add
+        self.processing[pool].remove(task_id)
+        self.put(pool, task_id)
+
+class IsbnProcessor(object):
+    def clean(self, isbn):
+        isbn = isbn.upper().replace("-", "").replace(" ", "")
+
+        if len(isbn) == 9:  # 9 digit SBN
+            isbn = "0" + isbn
+
+        return isbn
+
+    def validate(self, isbn):
+        isbn = self.clean(isbn)
+
+        if len(isbn) == 10:
+            total = 0
+            for i in xrange(0, 9):
+                total += (int(isbn[i]) * (10 - i))
+
+            check_digit = (11 - (total % 11)) % 11
+            if check_digit == 10:
+                check_digit = "X"
+            else:
+                check_digit = str(check_digit)
+
+            return (check_digit == isbn[9])
+        elif len(isbn) == 13:
+            odd = True  # positions are 1-based: odd positions weigh 1, even positions weigh 3
+            total = 0
+            for i in xrange(0, 12):
+                if odd:
+                    total += int(isbn[i])
+                else:
+                    total += int(isbn[i]) * 3
+                odd = not odd
+
+            check_digit = 10 - (total % 10)
+            if check_digit == 10:
+                check_digit = 0
+            check_digit = str(check_digit)
+
+            return (check_digit == isbn[12])
+        else:
+            return False
+
+class BookTaskClassifier(object):
+    def __init__(self):
+        self.isbn = IsbnProcessor()
+
+    def get_pools(self, task_data):
+        eligible_pools = []
+
+        try:
+            for isbn in [isbn.strip() for isbn in task_data["book_data"]["ids"]["isbn"]]:
+                if self.isbn.validate(isbn) and "isbn" not in task_data["pools_done"]:
+                    eligible_pools.append("isbn")
+        except KeyError, e:
+            pass
+
+        for identifier in ("amazon", "google"):
+            try:
+                if task_data["book_data"]["ids"][identifier].strip() != "" and identifier not in task_data["pools_done"]:
+                    eligible_pools.append(identifier)
+            except KeyError, e:
+                pass
+
+        if len(eligible_pools) == 0 and "title" not in task_data["pools_done"]:
+            eligible_pools.append("title")
+
+        return eligible_pools
+
+class BookTaskManager(object):
+    def __init__(self):
+        self.manager = TaskManager()
+        self.distributor = TaskDistributor()
+        self.classifier = BookTaskClassifier()
+
+    def new(self, book_data):
+        task_id = str(uuid.uuid4())  # stringify so the task data stays JSON-serializable
+        task_data = {
+            "id": task_id,
+            "book_data": book_data,
+            "pools_done": [],
+            "logs": [],
+            "flags": []
+        }
+
+        self.manager.put(task_id, task_data)
+        self.enqueue(task_id)
+
+    def enqueue(self, task_id):
+        task_data = self.manager.get(task_id)
+        pools = self.classifier.get_pools(task_data)
+
+        if len(pools) > 0:
+            for pool in pools:
+                self.distributor.put(pool, task_id)
+        else:
+            # No more pools to put this into... this is the best we have!
+            pass
+
+    def get(self, pool):
+        task_id = self.distributor.get(pool)
+        return task_id
+
+    def done(self, pool, task_data):
+        self.distributor.done(pool, task_data["id"])
+        self.manager.put(task_data["id"], task_data)  # conflicts..
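A quick sanity check of IsbnProcessor against the commonly used example ISBN pair 0-306-40615-2 / 978-0-306-40615-7, assuming tasks.py is importable as tasks:

from tasks import IsbnProcessor

isbn = IsbnProcessor()
print isbn.validate("0-306-40615-2")      # True  - ISBN-10, check digit 2
print isbn.validate("978-0-306-40615-7")  # True  - ISBN-13, check digit 7
print isbn.validate("0-306-40615-3")      # False - wrong check digit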