Bits and pieces of a new scraper and task distribution mechanism, and a (probably over-engineered) revisioned dict, also some ISBN scraping stuff

11 years ago · 4927b5e7a3
parent 6a0654b7cb
commit 4927b5e7a3
5 changed files with 464 additions and 0 deletions
--- a/isbn-scraper/tools/get-z3950-sources.py
+++ b/isbn-scraper/tools/get-z3950-sources.py
@ -0,0 +1,7 @@
 import requests, re
 # Page that lists the public Z39.50 gateways; every gateway link embeds "<...>.html,<host>,<port>"
 source = "http://www.loc.gov/z3950/gateway.html"
 gateway_link = '"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"'
 page = requests.get(source).text
 # findall() yields one (host, port) tuple per gateway link, since the pattern has two groups
 for host, port in re.findall(gateway_link, page):
 	print("%s:%s" % (host, port))
--- a/newcrawler/rev.py
+++ b/newcrawler/rev.py
@ -0,0 +1,133 @@
 import string, random
 def random_id():
 	"""Return a random 14-character identifier made of ASCII letters and digits."""
 	# string.lowercase/string.uppercase depend on the locale and no longer exist in Python 3;
 	# ascii_letters is the same a-z + A-Z alphabet everywhere.
 	alphabet = string.ascii_letters + string.digits
 	return "".join(random.choice(alphabet) for _ in range(14))
 class RevisionedDict(object):
 	"""Dict-like container that records every change as a new revision.
 	Storage layout (all plain dicts):
 	  - objects: object id -> stored value
 	  - revisions: revision id -> (parent revision id, object map), where the
 	    object map maps each key to an object id
 	  - latest_revision: id of the current revision, "" while nothing is stored
 	A plain dict value is stored as a (RevisionedDict, sub-revision id) tuple so
 	that nested dicts get their own history; reading such a key returns that tuple.
 	"""
 	def __init__(self, parent=None):
 		self.latest_revision = ""
 		self.parent = parent
 		self.revisions = {}
 		self.objects = {}
 	def __eq__(self, other):
 		# This is a tricky one... we need to compare this RevisionedDict against the other thing - which is almost certainly a dict.
 		# We'll just compare keys and values.
 		try:
 			other_items = list(other.items())
 		except AttributeError:
 			return False # Not a dict(-like)
 		latest_map = self._get_latest_revision()
 		if set(latest_map) != set(key for key, _ in other_items):
 			return False
 		for key, value in other_items:
 			if not self._same_value(self.objects[latest_map[key]], value):
 				return False
 		return True
 	def __ne__(self, other):
 		# Python 2 does not derive != from __eq__
 		return not self.__eq__(other)
 	def __len__(self):
 		return len(self._get_latest_revision())
 	def __getitem__(self, key):
 		return self.objects[self._get_latest_revision()[key]]
 	def __setitem__(self, key, value):
 		obj = self._dump_latest_revision()
 		obj[key] = value
 		self.update(obj)
 	def __delitem__(self, key):
 		obj = self._dump_latest_revision()
 		del obj[key]
 		self.update(obj)
 	def __contains__(self, key):
 		return (key in self._get_latest_revision())
 	def keys(self):
 		return list(self._get_latest_revision())
 	def values(self):
 		return [self.objects[id_] for id_ in self._get_latest_revision().values()]
 	def items(self):
 		return [(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()]
 	# The below are awful... this really isn't how iterators are supposed to work
 	def iterkeys(self):
 		return iter(self.keys())
 	def itervalues(self):
 		return iter(self.values())
 	def iteritems(self):
 		return iter(self.items())
 	# TODO: __iter__, __reversed__
 	@staticmethod
 	def _is_nested(stored):
 		"""Tell whether a stored object is a (RevisionedDict, sub-revision id) pair."""
 		return isinstance(stored, tuple) and len(stored) == 2 and isinstance(stored[0], RevisionedDict)
 	def _same_value(self, stored, value):
 		"""Tell whether a stored object represents the same data as value."""
 		if stored == value:
 			return True
 		# A nested dict is stored as a pair; compare its contents against the incoming value
 		return self._is_nested(stored) and stored[0] == value
 	def _add_revision(self, data):
 		"""Record data as a new revision and return its id.
 		Returns the current revision id unchanged when data equals the latest revision.
 		The caller is responsible for making the returned revision the latest one.
 		"""
 		latest_map = self._get_latest_revision()
 		object_map = {}
 		# A key that disappeared is a change too, even when every remaining value is untouched
 		anything_changed = any(key not in data for key in latest_map)
 		for key in data.keys():
 			value = data[key]
 			known = key in latest_map
 			if known and self._same_value(self.objects[latest_map[key]], value):
 				# Leave as it is
 				object_map[key] = latest_map[key]
 				continue
 			# New data!
 			if isinstance(value, dict):
 				# dict: continue the history of the existing sub-dict, or start a new one
 				previous = self.objects[latest_map[key]] if known else None
 				sub_dict = previous[0] if self._is_nested(previous) else RevisionedDict(parent=self)
 				value = (sub_dict, sub_dict.update(value))
 			new_id = random_id()
 			self.objects[new_id] = value
 			object_map[key] = new_id
 			anything_changed = True
 		if not anything_changed:
 			return self.latest_revision
 		new_rev = random_id()
 		self.revisions[new_rev] = (self.latest_revision, object_map) # (parent revision, new object map)
 		return new_rev
 	def _get_latest_revision(self):
 		"""Return the key -> object id map of the latest revision ({} when nothing is stored yet)."""
 		if self.latest_revision not in self.revisions:
 			return {}
 		return self.revisions[self.latest_revision][1]
 	def _dump_latest_revision(self):
 		"""Return the latest revision as a plain dict of key -> stored value."""
 		return dict(self.items())
 	def update(self, data):
 		"""Replace the whole contents with data and return the resulting revision id.
 		Unlike dict.update this does not merge: keys missing from data are dropped.
 		"""
 		rev_id = self._add_revision(data)
 		self.latest_revision = rev_id
 		return rev_id
 	# TODO: compare!
 # Problems:
 #  - How to handle list diffs? Can't just replace, would still lose data..
 #  - Over-engineering? Python already interns primitives, so no point in storing object references rather than just direct revision maps?
 #      -> Would still need to pre-process dicts and lists before storage, and compare them...
 # Ideas:
 #  - Download PDF/EPUB headers and extract metadata from there
--- a/newcrawler/scrape/calibre.py
+++ b/newcrawler/scrape/calibre.py
@ -0,0 +1,103 @@
 import lxml.html, requests, urlparse, re
 from lxml import etree
 from datetime import datetime
 # Base URL of the Calibre installation to scrape (no trailing slash: request paths such as "/browse/..." are appended directly)
 endpoint = "http://caltsardragon.com:8080"
 def get_date(string):
 	"""Turn a "<Mon> <year>" string such as "Mar 2001" into a (year, month) tuple of ints.
 	The month has to be a three-letter English abbreviation; an unknown one raises KeyError.
 	"""
 	# Parsed by hand because, for whatever reason, strptime doesn't work here
 	month_name, year_text = string.split()
 	abbreviations = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
 	month_numbers = dict((name, number) for number, name in enumerate(abbreviations, 1))
 	return (int(year_text), month_numbers[month_name])
 # Build book_ids: book id -> title text nodes, for every book this installation lists
 listing = requests.get("%s/browse/category/allbooks" % endpoint)
 xml = lxml.html.fromstring(listing.text.encode("utf-8"))
 book_ids = {}
 # Every 'load_data' title attribute is posted back as the "ids" field to fetch one page of book summaries
 for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
 	page = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
 	xml_titles = lxml.html.fromstring(page.json().encode("utf-8"))
 	for subitem in xml_titles.xpath("//div[@class='summary']"):
 		# The book id is the last path segment of the details link
 		details_href = subitem.xpath("div[@class='details-href']/@title")[0]
 		id_ = details_href.split("/")[-1]
 		book_ids[id_] = subitem.xpath("div/div[@class='title']/strong/text()")
 	print("Done %s..." % item)
 # Fetch the details page of every collected book, scrape its metadata and print it as one dict per book
 for id_, title in book_ids.iteritems():
 	details_url = "/browse/details/%s" % id_
 	# NOTE(review): cover_url stays a path without the endpoint prefix, unlike the download URLs below - confirm this is intended
 	cover_url = "/get/cover/%s" % id_
 	response = requests.get(endpoint + details_url)
 	# The decoded JSON payload is handed to the HTML parser - presumably it is a JSON-encoded HTML string; TODO confirm
 	xml = lxml.html.fromstring(response.json().encode("utf-8"))
 	#print etree.tostring(xml)
 	# Map each lower-cased format name (the link's title attribute) to its absolute download URL
 	downloads = {}
 	for item in xml.xpath("//div[@class='field formats']/a"):
 		filetype = item.get("title")
 		url = endpoint + item.get("href")
 		downloads[filetype.lower()] = url
 	# Every xpath() result below is a list, so each field may hold zero, one or several entries
 	isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
 	amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
 	google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")
 	tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
 	# Each matching text node is converted to a (year, month) tuple by get_date
 	publish_date = [get_date(date) for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")]
 	language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
 	publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
 	authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")
 	series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")
 	# Only the first series entry is used; it is split into the text before the brackets and the text inside them
 	if len(series) > 0:
 		try:
 			series_title, series_id = re.match("(.+) \[(.+)\]$", series[0]).groups(1)
 		except AttributeError, e:
 			# re.match returned None: the text has no " [...]" suffix
 			series_title, series_id = (None, None)
 	else:
 		series_title, series_id = (None, None)
 	print "%s: %s" % (series_title, series_id)
 	# "series" always holds exactly one entry, with None values when no series was found
 	obj = {
 		"ids": {
 			"isbn": isbn,
 			"amazon": amazon,
 			"google": google,
 		},
 		"title": title,
 		"authors": authors,
 		"publishers": publishers,
 		"publish_date": publish_date,
 		"language": language,
 		"tags": tags,
 		"urls": downloads,
 		"cover_url": cover_url,
 		"series": [
 			{
 				"title": series_title,
 				"item": series_id
 			}
 		]
 	}
 	print obj
--- a/newcrawler/scrape/find-calibre.py
+++ b/newcrawler/scrape/find-calibre.py
@ -0,0 +1,65 @@
 import socket
 import requests, lxml.html, urlparse, time
 # Walk through every Google result page for the Calibre search query and collect the result links
 sess = requests.Session()
 sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})
 base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
 uri = base_uri
 all_results = []
 while True:
 	xml = lxml.html.fromstring(sess.get(uri).text)
 	all_results.extend(xml.xpath("//h3[@class='r']/a/@href"))
 	next_ = xml.xpath("//a[@id='pnnext']/@href")
 	if not next_:
 		# No "next" link, so this was the last result page
 		break
 	uri = urlparse.urljoin(uri, next_[0])
 	# Wait a second between result pages
 	time.sleep(1)
 # Probe every search hit, keep only the ones that look like a real Calibre server, and print each distinct base URI once
 unique_results = []
 for result in all_results:
 	print("Testing %s..." % result)
 	try:
 		response = requests.get(result, timeout=10)
 	except (requests.exceptions.RequestException, socket.timeout):
 		# Dead, skip. socket.timeout might be thrown instead of a RequestException (see https://github.com/kennethreitz/requests/issues/1797)
 		continue
 	if "Donate to support the development of calibre" not in response.text:
 		# Fake...
 		continue
 	# Find base URI for this Calibre
 	xml = lxml.html.fromstring(response.text.encode("utf-8"))
 	try:
 		base_path = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")[0]
 	except IndexError:
 		# Not found... probably not a Calibre, just a very good fake?
 		continue
 	result = urlparse.urljoin(result, base_path).rstrip("/")
 	if result.endswith("/browse"):
 		result = result[:-len("/browse")]
 	if result not in unique_results:
 		print(result)
 		unique_results.append(result)
--- a/newcrawler/tasks.py
+++ b/newcrawler/tasks.py
@ -0,0 +1,156 @@
 import uuid, os, json, shutil, re
 from collections import defaultdict, deque
 class TaskManager(object):
 	"""Keeps task data in memory and mirrors every write to disk as JSON.
 	Each task is persisted to task_data/<task_id>/task.json, relative to the current working directory.
 	"""
 	def __init__(self):
 		self.tasks = {}
 	def get(self, task_id):
 		"""Return the in-memory data of a task; raises KeyError for an unknown id."""
 		return self.tasks[task_id]
 	def put(self, task_id, task_data):
 		"""Store (or overwrite) a task on disk and in memory. task_data must be JSON-serialisable."""
 		# Persist on disk
 		task_dir = "task_data/%s" % task_id
 		try:
 			os.makedirs(task_dir)
 		except OSError:
 			# Only an already existing directory is fine; anything else is a real error
 			if not os.path.isdir(task_dir):
 				raise
 		with open(os.path.join(task_dir, "task.json"), "w") as task_file:
 			task_file.write(json.dumps(task_data))
 		# Store in RAM
 		self.tasks[task_id] = task_data
 	def delete(self, task_id):
 		"""Remove a task from disk and memory; raises KeyError for an unknown id."""
 		shutil.rmtree("task_data/%s" % task_id, ignore_errors=True)
 		del self.tasks[task_id]
 class TaskDistributor(object):
 	"""Hands out task ids from named FIFO pools and tracks the ones being processed.
 	pools maps a pool name to a deque of waiting task ids; processing maps a pool name
 	to the list of task ids that were handed out and are not done yet.
 	"""
 	def __init__(self):
 		self.pools = defaultdict(deque)
 		self.processing = defaultdict(list)
 	def put(self, pool, task_id):
 		"""Append a task id to the end of a pool's queue."""
 		self.pools[pool].append(task_id)
 	def get(self, pool):
 		"""Take the oldest waiting task id from a pool and mark it as processing.
 		Raises IndexError when the pool has no waiting tasks.
 		"""
 		task_id = self.pools[pool].popleft()
 		self.processing[pool].append(task_id)
 		return task_id
 	def get_pools(self, task_id):
 		"""Return the names of all pools in which the task id is currently waiting."""
 		return [pool for pool, queue in self.pools.items() if task_id in queue]
 	def done(self, pool, task_id):
 		"""Mark a processing task as finished; raises ValueError if it was not processing."""
 		self.processing[pool].remove(task_id)
 	def fail(self, pool, task_id):
 		"""Mark a processing task as failed and put it back at the end of its pool."""
 		# Re-add
 		self.processing[pool].remove(task_id)
 		self.put(pool, task_id)
 class IsbnProcessor(object):
 	"""Normalises and checksum-validates ISBN-10, ISBN-13 and 9-digit SBN strings."""
 	def clean(self, isbn):
 		"""Return the ISBN upper-cased, without hyphens and spaces; a 9-character SBN gets a leading "0"."""
 		isbn = isbn.upper().replace("-", "").replace(" ", "")
 		if len(isbn) == 9: # 9 digit SBN
 			isbn = "0" + isbn
 		return isbn
 	def validate(self, isbn):
 		"""Return True when the (cleaned) ISBN has a valid length, only digits and a matching check digit."""
 		isbn = self.clean(isbn)
 		if len(isbn) not in (10, 13):
 			return False
 		body, check_char = isbn[:-1], isbn[-1]
 		# Anything but plain digits before the check digit can never be valid
 		if any(char not in "0123456789" for char in body):
 			return False
 		digits = [int(char) for char in body]
 		if len(isbn) == 10:
 			# Weights run from 10 down to 2; the outer modulo maps a result of 11 to check digit 0
 			total = sum(digit * (10 - position) for position, digit in enumerate(digits))
 			check_value = (11 - total % 11) % 11
 			expected = "X" if check_value == 10 else str(check_value)
 		else:
 			# Weights alternate 1, 3, 1, 3, ... starting with 1 for the first digit
 			total = sum(digit * (3 if position % 2 else 1) for position, digit in enumerate(digits))
 			expected = str((10 - total % 10) % 10)
 		return check_char == expected
 class BookTaskClassifier(object):
 	"""Decides which lookup pools ("isbn", "amazon", "google", "title") a book task is eligible for."""
 	def __init__(self):
 		self.isbn = IsbnProcessor()
 	def _as_list(self, value):
 		"""Normalise an identifier value (None, a single string or a sequence of strings) to a list."""
 		if value is None:
 			return []
 		if hasattr(value, "strip"):
 			return [value]
 		return list(value)
 	def get_pools(self, task_data):
 		"""Return the pools this task can still be processed in, each at most once.
 		"title" is only a fallback for tasks that have no usable identifier left.
 		Raises KeyError when task_data has no "pools_done" entry.
 		"""
 		pools_done = task_data["pools_done"]
 		try:
 			ids = task_data["book_data"]["ids"]
 		except KeyError:
 			ids = {}
 		eligible_pools = []
 		# One valid ISBN is enough; the pool must not be listed once per ISBN
 		if "isbn" not in pools_done:
 			isbns = self._as_list(ids.get("isbn"))
 			if any(self.isbn.validate(isbn.strip()) for isbn in isbns):
 				eligible_pools.append("isbn")
 		for identifier in ("amazon", "google"):
 			if identifier in pools_done:
 				continue
 			# The scraper may deliver a list of values rather than one string; accept both
 			values = self._as_list(ids.get(identifier))
 			if any(value.strip() != "" for value in values):
 				eligible_pools.append(identifier)
 		if len(eligible_pools) == 0 and "title" not in pools_done:
 			eligible_pools.append("title")
 		return eligible_pools
 class BookTaskManager(object):
 	"""Creates book tasks, stores them and queues them in every pool they are eligible for."""
 	def __init__(self):
 		self.manager = TaskManager()
 		self.distributor = TaskDistributor()
 		self.classifier = BookTaskClassifier()
 	def new(self, book_data):
 		"""Create, store and enqueue a task for book_data; returns the new task id."""
 		# A string id, because the task data is serialised to JSON and UUID objects are not serialisable
 		task_id = str(uuid.uuid4())
 		task_data = {
 			"id": task_id,
 			"book_data": book_data,
 			"pools_done": [],
 			"logs": [],
 			"flags": []
 		}
 		self.manager.put(task_id, task_data)
 		self.enqueue(task_id)
 		return task_id
 	def enqueue(self, task_id):
 		"""Queue the task in every pool the classifier still considers eligible."""
 		task_data = self.manager.get(task_id)
 		pools = self.classifier.get_pools(task_data)
 		# A single eligible pool (e.g. only "title") must be queued as well
 		for pool in pools:
 			self.distributor.put(pool, task_id)
 		# No more pools to put this into... this is the best we have!
 	def get(self, pool):
 		"""Take the next task id from a pool."""
 		task_id = self.distributor.get(pool)
 		return task_id
 	def done(self, pool, task_data):
 		"""Mark the task as finished in a pool and store its updated data."""
 		self.distributor.done(pool, task_data["id"])
 		self.manager.put(task_data["id"], task_data) # conflicts..