In the process of redoing RevisionedDict

Bits and pieces of a new scraper and task distribution mechanism, and a (probably over-engineered) revisioned dict, also some ISBN scraping stuff
5 changed files with 490 additions and 0 deletions
--- a/isbn-scraper/tools/get-z3950-sources.py
+++ b/isbn-scraper/tools/get-z3950-sources.py
@ -0,0 +1,7 @@
+import requests, re
+
+# Library of Congress gateway page whose links are scanned below.
+source = "http://www.loc.gov/z3950/gateway.html"
+
+# Matches quoted zgstart links; the two groups capture the comma-separated
+# fields that follow the ".html" path, which are unpacked as host and port.
+link_pattern = '"http:\/\/www\.loc\.gov\/cgi-bin\/zgstart\?ACTION=INIT&FORM_HOST_PORT=\/prod\/www\/data\/z3950\/.+\.html,([^,]+),([0-9]+)"'
+page_text = requests.get(source).text
+
+# Print one "host:port" line per matching link.
+for host, port in re.findall(link_pattern, page_text):
+	print "%s:%s" % (host, port)
--- a/newcrawler/rev.py
+++ b/newcrawler/rev.py
@ -0,0 +1,159 @@
+# Problems:
+#  - How to handle list diffs? Can't just replace, would still lose data..
+#  - Over-engineering? Python already interns primitives, so no point in storing object references rather than just direct revision maps?
+#      -> Would still need to pre-process dicts and lists before storage, and compare them...
+
+# Ideas:
+#  - Download PDF/EPUB headers and extract metadata from there
+
+import string, random, copy
+from collections import OrderedDict
+from difflib import SequenceMatcher
+
+class RevisionedDict(object):
+	"""Keep every revision of a piece of data, keyed by a random revision ID.
+
+	Each revision is stored as a deep copy, in insertion order. Callers take a
+	working copy tagged with a "_rev" key (see _get_last_revision), modify it
+	and hand it back through update(); the "_rev" tag names the revision the
+	modifications were based on.
+	"""
+
+	# Alphabet for revision IDs. The ASCII constants are used because
+	# string.lowercase / string.uppercase are locale-dependent in Python 2.
+	_ID_ALPHABET = string.ascii_letters + string.digits
+	_ID_LENGTH = 14
+
+	def __init__(self, data):
+		"""Create the store with `data` as its first revision."""
+		self._revisions = OrderedDict()
+		# (revision_id, diff) pairs, in the order the diffs were applied.
+		self._applied_diffs = []
+		self._add_revision(data)
+
+	def _generate_revision_id(self):
+		"""Return a random 14-character ASCII alphanumeric ID."""
+		return "".join(random.choice(self._ID_ALPHABET) for _ in range(self._ID_LENGTH))
+
+	def _add_revision(self, data):
+		"""Store a deep copy of `data` as the newest revision and return its ID."""
+		new_revision_id = self._generate_revision_id()
+		while new_revision_id in self._revisions:
+			# Never overwrite an existing revision on an ID collision.
+			new_revision_id = self._generate_revision_id()
+		self._revisions[new_revision_id] = copy.deepcopy(data)
+		self._last_revision = new_revision_id
+		return new_revision_id
+
+	def _get_last_revision(self): # Always returns a copy!
+		"""Return a deep copy of the newest revision, tagged with its ID under "_rev"."""
+		base_revision = copy.deepcopy(self._revisions[self._last_revision])
+		base_revision["_rev"] = self._last_revision # This is to be able to identify the source revision for a modified serialized object later
+		return base_revision
+
+	def _apply_diff(self, diff):
+		"""Apply `diff` on top of the newest revision and record the result as a new revision."""
+		new_data = diff.apply(self._get_last_revision(), self._diffs_since(diff.origin_revision))
+		new_revision_id = self._add_revision(new_data)
+		self._applied_diffs.append((new_revision_id, diff))
+
+	def _diffs_since(self, revision_id):
+		"""Return the diffs applied after `revision_id`, oldest first.
+
+		If `revision_id` was not produced by an applied diff (for example the
+		initial revision), every applied diff is returned.
+		"""
+		for index, (applied_revision_id, _) in enumerate(self._applied_diffs):
+			if applied_revision_id == revision_id:
+				return [applied_diff for _, applied_diff in self._applied_diffs[index + 1:]]
+		return [applied_diff for _, applied_diff in self._applied_diffs]
+
+	def update(self, data):
+		"""Diff `data` against the revision named by its "_rev" key and apply the result."""
+		diff = self.autodiff(data)
+		self._apply_diff(diff)
+
+	def diff(self, data, origin_revision):
+		"""Return a RevisionedDictDiff of `data` against the stored `origin_revision`.
+
+		Raises KeyError if `origin_revision` is not a known revision ID.
+		"""
+		# TODO: Figure out if any revisions happened in the meantime
+		return RevisionedDictDiff(data, self._revisions[origin_revision], origin_revision)
+
+	def autodiff(self, data):
+		"""Like diff(), taking the origin revision from data["_rev"]."""
+		# Takes the revision number from the data
+		return self.diff(data, data["_rev"])
+		
+class RevisionedDictDiff(object):
+	"""Structural diff between a modified structure and the revision it was based on.
+
+	The diff is kept in self._diff_data as nested opcode tuples:
+	  ("dict", key, [opcodes])    per-key opcodes for a dict (key is None at the top level)
+	  ("list", key, [opcodes])    difflib.SequenceMatcher opcodes for a list
+	  ("delete", key), ("insert", key, value), ("equal", key), ("replace", key, value)
+	  ("replace", value)          the top-level structure is replaced outright
+	"""
+
+	def __init__(self, data, origin_data, origin_revision):
+		"""Compute the diff that turns `origin_data` into `data`."""
+		self.origin_revision = origin_revision
+		self._diff_data = self._diff_structure(data, origin_data)
+
+	def _replace_opcode(self, structure, structure_key):
+		"""Build a replace opcode, keeping the key when replacing a nested value."""
+		if structure_key is None:
+			return ("replace", structure)
+		return ("replace", structure_key, structure)
+
+	def _diff_structure(self, structure, origin_structure, structure_key=None):
+		"""Return the opcode tuple describing how `structure` differs from `origin_structure`.
+
+		`structure_key` is the key under which `structure` lives in its parent
+		dict, or None for the top-level structure.
+		"""
+		if isinstance(structure, dict) and isinstance(origin_structure, dict):
+			# Compare dicts
+			opcodes = []
+
+			for key in set(origin_structure) - set(structure):
+				opcodes.append(("delete", key))
+
+			new_keys = set(structure) - set(origin_structure)
+
+			for key in new_keys:
+				if key != "_rev": # Ignore added _rev key
+					opcodes.append(("insert", key, structure[key]))
+
+			for key, value in structure.items():
+				if key in new_keys:
+					continue
+
+				if value == origin_structure[key]:
+					opcodes.append(("equal", key))
+				elif isinstance(value, (dict, list)):
+					opcodes.append(self._diff_structure(value, origin_structure[key], structure_key=key))
+				else:
+					opcodes.append(("replace", key, value))
+
+			return ("dict", structure_key, opcodes)
+		elif isinstance(structure, list) and isinstance(origin_structure, list):
+			# Compare lists (does NOT support nested dictionaries yet!)
+			return ("list", structure_key, SequenceMatcher(a=origin_structure, b=structure, autojunk=False).get_opcodes())
+		else:
+			# Type changed, or a plain value: replace wholesale. The key is kept
+			# for nested values so the opcode still says which entry to replace.
+			return self._replace_opcode(structure, structure_key)
+
+	def _apply_structure(self, structure, diff_data, intermediate_diffs):
+		"""Apply `diff_data` to `structure` in place. Not implemented yet."""
+		pass
+		# for every key
+			# if list
+				# calculate_offsets (TODO)
+				# apply structure
+			# if dict
+				# apply structure
+			# else
+				# apply diff data
+		# return key
+
+	def apply(self, data, intermediate_diffs=None):
+		"""Apply this diff to a deep copy of `data` and return the copy.
+
+		`intermediate_diffs` is the list of diffs applied since this diff's
+		origin revision; it defaults to an empty list.
+
+		NOTE: _apply_structure is still a stub, so the copy currently comes back
+		unmodified.
+		"""
+		if intermediate_diffs is None:
+			intermediate_diffs = []
+
+		# This will apply the diff against the specified source data
+		data = copy.deepcopy(data)
+		self._apply_structure(data, self._diff_data, intermediate_diffs)
+		return data
+		
+		
+		
+# Sample document used by the scratch tests below.
+origin = {
+	"type": "message",
+	"data": {
+		"title": "Sample title",
+		"author": "Sample author",
+		"isbn": ["a0", "a1", "a2", "a3"],
+		"description": ["test one", "test two"],
+		"eq": ["a", "b", "c"]
+	}
+}
+
+# Disabled scratch test for update()/autodiff(), kept as a bare string literal.
+"""
+revdict = RevisionedDict(origin)
+origin = revdict._get_last_revision()
+
+origin["herp"] = "derp"
+origin["data"]["isbn"].remove("a2")
+origin["data"]["isbn"].insert(0, "a4")
+origin["data"]["author"] = "Other author"
+
+#import json
+#print json.dumps(revdict.autodiff(origin)._diff_data, indent=4)
+
+revdict.update(origin)
+
+"""
+
+# Scratch test for _diffs_since(): register ten dummy revisions, each paired
+# with a stand-in "diff" (just the loop index), and remember the revision
+# created by the fifth one.
+revdict = RevisionedDict(origin)
+
+for i in xrange(0, 10):
+	x = revdict._add_revision("blah")
+	revdict._applied_diffs.append((x, i))
+
+	if i == 4:
+		# Checkpoint: everything registered after this point counts as "since".
+		base_rev = revdict._last_revision
+
+# Prints the stand-in diffs recorded after the checkpoint.
+print revdict._diffs_since(base_rev)
--- a/newcrawler/scrape/calibre.py
+++ b/newcrawler/scrape/calibre.py
@ -0,0 +1,103 @@
+import lxml.html, requests, urlparse, re
+from lxml import etree
+from datetime import datetime
+
+endpoint = "http://caltsardragon.com:8080"
+
+def get_date(string):
+	"""Parse a "<Mon> <year>" string such as "Jan 2013" into a (year, month) tuple.
+
+	Raises ValueError if the string does not split into exactly two fields or
+	the year is not an integer, and KeyError for an unknown month abbreviation.
+	"""
+	# Parsed by hand because, for whatever reason, strptime doesn't work here.
+	month_numbers = dict(
+		(name, number) for number, name in enumerate(
+			("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
+			1
+		)
+	)
+
+	month_name, year_text = string.split()
+	year = int(year_text)
+	return (year, month_numbers[month_name])
+
+	
+# We'll retrieve a list of all book IDs for this installation
+# The "allbooks" category page is fetched once; every load_data chunk found on
+# it is then requested separately from the booklist_page endpoint.
+response = requests.get("%s/browse/category/allbooks" % endpoint)
+xml = lxml.html.fromstring(response.text.encode("utf-8"))
+book_ids = {} # Maps book ID (last path segment of the details link) -> title xpath result
+
+for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
+	# `item` is the title attribute of a load_data div; it is posted back as the "ids" field.
+	response = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
+	# The decoded JSON body is parsed as HTML (presumably a JSON-encoded HTML string - TODO confirm).
+	xml_titles = lxml.html.fromstring(response.json().encode("utf-8"))
+	title_map = {} # NOTE(review): assigned but never used in this script
+	
+	for subitem in xml_titles.xpath("//div[@class='summary']"):
+		#print str(etree.tostring(subitem))
+		id_ = subitem.xpath("div[@class='details-href']/@title")[0].split("/")[-1]
+		title = subitem.xpath("div/div[@class='title']/strong/text()") # xpath() result, i.e. a list of text nodes
+		book_ids[id_] = title
+	print "Done %s..." % item
+
+# Fetch the details page of every book collected above and print one metadata dict per book.
+for id_, title in book_ids.iteritems():
+	details_url = "/browse/details/%s" % id_
+	cover_url = "/get/cover/%s" % id_ # Relative path; unlike the download URLs it is not prefixed with the endpoint
+	
+	response = requests.get(endpoint + details_url)
+	# As with the book list, the decoded JSON body is parsed as HTML.
+	xml = lxml.html.fromstring(response.json().encode("utf-8"))
+	#print etree.tostring(xml)
+	
+	downloads = {} # Maps lowercased link title (used as the file type) -> absolute download URL
+	
+	for item in xml.xpath("//div[@class='field formats']/a"):
+		filetype = item.get("title")
+		url = endpoint + item.get("href")
+		downloads[filetype.lower()] = url
+		
+	# Identifier links are told apart by the prefix of their title attribute.
+	# Every xpath() call below returns a list, so all of these values are lists.
+	isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
+	amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
+	google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")
+	
+	tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
+	# Each matching text node is converted to a (year, month) tuple by get_date().
+	publish_date = [get_date(date) for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")]
+	language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
+	publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
+	authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")
+	
+	series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")
+	if len(series) > 0:
+		try:
+			# Expected form is "<title> [<item>]"; only the first series entry is used.
+			series_title, series_id = re.match("(.+) \[(.+)\]$", series[0]).groups(1)
+		except AttributeError, e:
+			# re.match() returned None: the text did not have the expected form.
+			series_title, series_id = (None, None)
+	else:
+		series_title, series_id = (None, None)
+		
+	print "%s: %s" % (series_title, series_id)
+	
+	obj = {
+		"ids": {
+			"isbn": isbn,
+			"amazon": amazon,
+			"google": google,
+		},
+		"title": title,
+		"authors": authors,
+		"publishers": publishers,
+		"publish_date": publish_date,
+		"language": language,
+		"tags": tags,
+		"urls": downloads,
+		"cover_url": cover_url,
+		# A single series entry is always emitted; both fields are None when no series was found.
+		"series": [
+			{
+				"title": series_title,
+				"item": series_id
+			}
+		]
+	}
+	
+	print obj
--- a/newcrawler/scrape/find-calibre.py
+++ b/newcrawler/scrape/find-calibre.py
@ -0,0 +1,65 @@
+import requests, lxml.html, urlparse, time
+
+# Session carrying a desktop Chrome User-Agent, used for the Google result pages.
+sess = requests.Session()
+sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})
+
+base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
+uri = base_uri
+all_results = []
+
+# Walk the result pages by following the "pnnext" link until there is none,
+# collecting every result link on the way.
+while True:
+	response = sess.get(uri)
+	xml = lxml.html.fromstring(response.text)
+
+	results = xml.xpath("//h3[@class='r']/a/@href")
+	next_ = xml.xpath("//a[@id='pnnext']/@href")
+	all_results.extend(results)
+
+	if len(next_) == 0:
+		break
+
+	uri = urlparse.urljoin(uri, next_[0])
+	time.sleep(1) # Pause before requesting the next result page
+	
+import socket # Needed for the socket.timeout handler below; it was referenced without being imported
+
+# Probe every search result and keep the de-duplicated base URIs of real Calibre servers.
+unique_results = []
+
+for result in all_results:
+	print "Testing %s..." % result
+	try:
+		response = requests.get(result, timeout=10)
+	except (requests.exceptions.RequestException, socket.timeout):
+		# Dead, skip. socket.timeout might be thrown instead of a RequestException
+		# (see https://github.com/kennethreitz/requests/issues/1797)
+		continue
+
+	if "Donate to support the development of calibre" not in response.text:
+		# Fake...
+		continue
+
+	# Find base URI for this Calibre
+	xml = lxml.html.fromstring(response.text.encode("utf-8"))
+
+	try:
+		base_path = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")[0]
+	except IndexError:
+		# Not found... probably not a Calibre, just a very good fake?
+		continue
+
+	result = urlparse.urljoin(result, base_path).rstrip("/")
+
+	# Strip a trailing "/browse" so only the server root remains.
+	if result.endswith("/browse"):
+		result = result[:-len("/browse")]
+
+	if result not in unique_results:
+		print result
+		unique_results.append(result)
+
+for result in unique_results:
+	pass#print result
--- a/newcrawler/tasks.py
+++ b/newcrawler/tasks.py
@ -0,0 +1,156 @@
+import uuid, os, json, shutil, re
+from collections import defaultdict, deque
+
+class TaskManager(object):
+	"""Keep task data in memory and mirror it to disk under task_data/<task_id>/."""
+
+	def __init__(self):
+		self.tasks = {}
+
+	def _task_dir(self, task_id):
+		"""Return the on-disk directory used for `task_id`."""
+		return "task_data/%s" % task_id
+
+	def get(self, task_id):
+		"""Return the in-memory data for `task_id`; raises KeyError if unknown."""
+		return self.tasks[task_id]
+
+	def put(self, task_id, task_data):
+		"""Persist `task_data` as JSON on disk and store it in memory.
+
+		`task_data` must be JSON-serializable. The file is written to
+		task_data/<task_id>/task.json.
+		"""
+		# Persist on disk
+		task_dir = self._task_dir(task_id)
+
+		try:
+			os.makedirs(task_dir)
+		except OSError:
+			# An already existing directory is fine; anything else is a real error.
+			if not os.path.isdir(task_dir):
+				raise
+
+		with open(os.path.join(task_dir, "task.json"), "w") as task_file:
+			task_file.write(json.dumps(task_data))
+
+		# Store in RAM
+		self.tasks[task_id] = task_data
+
+	def delete(self, task_id):
+		"""Remove the task from disk and from memory; raises KeyError if unknown."""
+		shutil.rmtree(self._task_dir(task_id), ignore_errors=True)
+		del self.tasks[task_id]
+	
+class TaskDistributor(object):
+	"""Per-pool FIFO queues of task IDs, plus per-pool lists of tasks being processed."""
+
+	def __init__(self):
+		self.pools = defaultdict(deque)
+		self.processing = defaultdict(list)
+
+	def put(self, pool, task_id):
+		"""Append `task_id` to the end of `pool`'s queue."""
+		self.pools[pool].append(task_id)
+
+	def get(self, pool):
+		"""Take the oldest queued task from `pool` and mark it as processing.
+
+		Raises IndexError if the pool has no queued tasks.
+		"""
+		task_id = self.pools[pool].popleft()
+		self.processing[pool].append(task_id)
+		return task_id
+
+	def get_pools(self, task_id):
+		"""Return the names of all pools in which `task_id` is currently queued."""
+		return [pool for pool, queue in self.pools.items() if task_id in queue]
+
+	def done(self, pool, task_id):
+		"""Mark `task_id` as finished in `pool`; raises ValueError if it was not processing."""
+		self.processing[pool].remove(task_id)
+
+	def fail(self, pool, task_id):
+		"""Move `task_id` from processing back to the end of `pool`'s queue."""
+		# Re-add
+		self.processing[pool].remove(task_id)
+		self.put(pool, task_id)
+
+class IsbnProcessor(object):
+	"""Normalize and validate ISBN-10 / ISBN-13 strings (9-digit SBNs are accepted too)."""
+
+	_DIGITS = "0123456789"
+
+	def clean(self, isbn):
+		"""Uppercase `isbn`, strip dashes and spaces, and zero-pad a 9-character SBN."""
+		isbn = isbn.upper().replace("-", "").replace(" ", "")
+
+		if len(isbn) == 9: # 9 digit SBN
+			isbn = "0" + isbn
+
+		return isbn
+
+	def _all_digits(self, text):
+		"""Return True if every character of `text` is an ASCII digit."""
+		return all(char in self._DIGITS for char in text)
+
+	def _validate_isbn10(self, isbn):
+		"""Check the ISBN-10 check character (mod 11, with "X" standing for 10)."""
+		body = isbn[:9]
+		if not self._all_digits(body):
+			return False
+
+		# The first nine digits are weighted 10 down to 2.
+		total = sum(int(digit) * weight for digit, weight in zip(body, range(10, 1, -1)))
+
+		# The outer modulo maps a remainder of 0 to check digit 0 instead of 11.
+		check_value = (11 - (total % 11)) % 11
+		check_digit = "X" if check_value == 10 else str(check_value)
+
+		return check_digit == isbn[9]
+
+	def _validate_isbn13(self, isbn):
+		"""Check the ISBN-13 check digit (mod 10)."""
+		body = isbn[:12]
+		if not self._all_digits(body):
+			return False
+
+		# Weights alternate 1, 3, 1, 3, ... starting with 1 for the first digit.
+		total = sum(int(digit) * (1 if index % 2 == 0 else 3) for index, digit in enumerate(body))
+		check_digit = str((10 - (total % 10)) % 10)
+
+		return check_digit == isbn[12]
+
+	def validate(self, isbn):
+		"""Return True if `isbn` is a well-formed ISBN-10 or ISBN-13 with a correct check digit.
+
+		The input is passed through clean() first. Anything of another length,
+		or containing non-digit characters in the number body, yields False.
+		"""
+		isbn = self.clean(isbn)
+
+		if len(isbn) == 10:
+			return self._validate_isbn10(isbn)
+		elif len(isbn) == 13:
+			return self._validate_isbn13(isbn)
+		else:
+			return False
+
+class BookTaskClassifier(object):
+	"""Decide which processing pools a book task is still eligible for."""
+
+	def __init__(self):
+		self.isbn = IsbnProcessor()
+
+	def get_pools(self, task_data):
+		"""Return the list of pool names `task_data` should be queued in.
+
+		A pool is eligible when the matching identifier is present under
+		task_data["book_data"]["ids"] and the pool is not listed in
+		task_data["pools_done"]. The "title" pool is only used when no
+		identifier pool is eligible. Each pool appears at most once.
+
+		Raises KeyError if task_data has no "pools_done" entry.
+		"""
+		pools_done = task_data["pools_done"]
+		eligible_pools = []
+
+		try:
+			ids = task_data["book_data"]["ids"]
+		except KeyError:
+			# No identifiers at all; only the title pool can apply.
+			ids = {}
+
+		# One valid ISBN is enough; the pool must not be added once per ISBN.
+		if "isbn" not in pools_done and any(self.isbn.validate(isbn.strip()) for isbn in ids.get("isbn", [])):
+			eligible_pools.append("isbn")
+
+		for identifier in ("amazon", "google"):
+			if identifier in pools_done:
+				continue
+
+			# NOTE(review): these identifiers are treated as single strings here - confirm against the producer of book_data.
+			value = ids.get(identifier)
+			if value is not None and value.strip() != "":
+				eligible_pools.append(identifier)
+
+		if len(eligible_pools) == 0 and "title" not in pools_done:
+			eligible_pools.append("title")
+
+		return eligible_pools
+
+class BookTaskManager(object):
+	"""Tie together task storage, pool classification and pool distribution for book tasks."""
+
+	def __init__(self):
+		self.manager = TaskManager()
+		self.distributor = TaskDistributor()
+		self.classifier = BookTaskClassifier()
+
+	def new(self, book_data):
+		"""Create, store and enqueue a task for `book_data`; returns the new task ID."""
+		# Stored as a string: a uuid.UUID object cannot be serialized by json.dumps.
+		task_id = str(uuid.uuid4())
+		task_data = {
+			"id": task_id,
+			"book_data": book_data,
+			"pools_done": [],
+			"logs": [],
+			"flags": []
+		}
+
+		self.manager.put(task_id, task_data)
+		self.enqueue(task_id)
+		return task_id
+
+	def enqueue(self, task_id):
+		"""Queue `task_id` in every pool the classifier still considers eligible."""
+		task_data = self.manager.get(task_id)
+		pools = self.classifier.get_pools(task_data)
+
+		# When `pools` is empty there are no more pools to put this into...
+		# this is the best we have! Nothing is queued in that case.
+		for pool in pools:
+			self.distributor.put(pool, task_id)
+
+	def get(self, pool):
+		"""Return the ID of the next task queued in `pool`."""
+		task_id = self.distributor.get(pool)
+		return task_id
+
+	def done(self, pool, task_data):
+		"""Mark the task as finished in `pool` and store its updated data."""
+		self.distributor.done(pool, task_data["id"])
+		self.manager.put(task_data["id"], task_data) # conflicts..
Author	SHA1	Message	Date
Sven Slootweg	1f32cac481	In the process of redoing RevisionedDict	10 years ago
Sven Slootweg	4927b5e7a3	Bits and pieces of a new scraper and task distribution mechanism, and a (probably over-engineered) revisioned dict, also some ISBN scraping stuff	11 years ago