From a11bf165a4622305fa9c55e1ff03a315bff1614a Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Wed, 17 Oct 2012 02:01:43 +0200
Subject: [PATCH] Rewrite crawler

---
 crawler/calibre.py | 162 +++++++++++++++++++++++++++++++++++++++++++++
 crawler/crawler.py | 161 --------------------------------------------
 crawler/daemon.py  |  62 +++++++++++++++++
 3 files changed, 224 insertions(+), 161 deletions(-)
 create mode 100644 crawler/calibre.py
 delete mode 100644 crawler/crawler.py
 create mode 100644 crawler/daemon.py

diff --git a/crawler/calibre.py b/crawler/calibre.py
new file mode 100644
index 0000000..82dcccc
--- /dev/null
+++ b/crawler/calibre.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+import os, time, sys, json, urllib, urllib2, threading, re
+from collections import deque
+from BeautifulSoup import BeautifulSoup
+
+STOP = False
+
+pipe_name = 'pipe_books'
+
+calibre_urls = deque([])
+
+def add_book(title, authors, description, thumbnail, urls):
+    global pipe_name
+
+    print "[libcal] Submitted %s" % title
+
+    pipeout = os.open(pipe_name, os.O_WRONLY)
+    os.write(pipeout, json.dumps({
+        'type': "add",
+        'data': {
+            'title': title,
+            'authors': authors,
+            'description': description,
+            'thumbnail': thumbnail,
+            'urls': urls
+        }
+    }) + "\n")
+    os.close(pipeout)
+
+
+class GoogleCrawler (threading.Thread):
+    def run(self):
+        self.current_start = 0
+        self.crawl_page(self.base_url)
+
+    def crawl_page(self, url):
+        global calibre_urls, STOP
+
+        if STOP == True:
+            return None
+
+        opener = urllib2.build_opener()
+        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
+        response = opener.open(url)
+        page_contents = response.read()
+
+        print "[google] == FIND CALIBRE LINKS"
+        soup = BeautifulSoup(page_contents)
+        for subsoup in soup.findAll("li", "g"):
+            url = subsoup.find("a", "l")['href']
+            url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
+
+            try:
+                url = url_matcher.match(url).group(1)
+            except AttributeError:
+                continue
+
+            if url not in calibre_urls:
+                print "[google] Found Calibre at %s" % url
+                calibre_urls.append(url)
+
+        print "[google] == FIND NEXT PAGE"
+        next_url = ""
+        subsoup = soup.find("table", attrs={'id':'nav'})
+        for item in subsoup.findAll("a", "fl"):
+            new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
+            if new_start == self.current_start + 10:
+                self.current_start = new_start
+                next_url = item['href']
+
+        if next_url == "":
+            print "[google] No next pages found... Done spidering Google!"
+        else:
+
+            print "[google] == SLEEPING..."
+            time.sleep(4)
+            #self.crawl_page("http://www.google.com" + next_url)
+
+class CalibreCrawler(threading.Thread):
+    def run(self):
+        global calibre_urls, STOP
+
+        while STOP == False:
+            if len(calibre_urls) > 0:
+                current_url = calibre_urls.popleft()
+                self.crawl_page(current_url)
+
+            time.sleep(1)
+
+    def crawl_page(self, url):
+        url_matcher = re.compile("(https?:\/\/[^/]*).*")
+        base = url_matcher.match(url).group(1)
+
+        print "[calibr] Starting crawl on %s ..." % url
+
+        response = urllib2.urlopen(url + "browse/category/allbooks")
+        page_contents = response.read()
+
+        matcher = re.compile("")
+
+        for result in matcher.finditer(page_contents):
+            try:
+                query = result.group(1)
+
+                list_url = url + "browse/booklist_page"
+                data = urllib.urlencode({'ids': query})
+                req = urllib2.Request(list_url, data)
+                response = urllib2.urlopen(req)
+                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
+
+                soup = BeautifulSoup(page_contents)
+
+                for subsoup in soup.findAll("div", "summary"):
+                    try:
+                        title = subsoup.find("div", "title").strong.string
+                        authors = subsoup.find("div", "authors").string
+                        description = subsoup.find("div", "comments").prettify()
+
+                        try:
+                            thumbnail = base + subsoup.find("div", "left").img['src']
+                        except:
+                            thumbnail = ""
+
+                        urls = []
+                        urls.append(subsoup.find("a", "read")['href'])
+
+                        formats_list = subsoup.find("div", "formats")
+                        for format_url in formats_list.findAll("a"):
+                            urls.append(format_url['href'])
+
+                        final_urls = []
+
+                        for format_url in urls:
+                            url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
+                            m = url_matcher.match(format_url)
+                            filetype = m.group(1).lower()
+                            download_url = base + format_url
+                            final_urls.append({
+                                'filetype': filetype,
+                                'url': download_url
+                            })
+
+                        add_book(title, authors, description, thumbnail, final_urls)
+
+                    except Exception, e:
+                        print "[calibr] FAILED: '%s' by '%s', error: %s" % (title, authors, str(e))
+
+                time.sleep(2)
+            except:
+                pass
+
+try:
+    google = GoogleCrawler()
+    google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
+    #google.start()
+
+    calibre_urls = deque(["http://caltsardragon.com:8080/"])
+
+    calibre = CalibreCrawler()
+    calibre.start()
+except KeyboardInterrupt:
+    STOP = True
diff --git a/crawler/crawler.py b/crawler/crawler.py
deleted file mode 100644
index 8082a52..0000000
--- a/crawler/crawler.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import urllib, urllib2, re, _mysql, time, math, threading, htmlentitydefs
-from collections import deque
-from BeautifulSoup import BeautifulSoup
-
-# Don't forget to configure the database login at the bottom of the code!
-
-class GoogleCrawler (threading.Thread):
-    def run(self):
-        google_start()
-
-class CalibreCrawler (threading.Thread):
-    def run(self):
-        calibre_loop()
-
-def crawl_page(url):
-    try:
-        opener = urllib2.build_opener()
-        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
-        response = opener.open(url)
-        page_contents = response.read()
-        find_links(page_contents)
-        time.sleep(4)
-        next_url = find_next(page_contents)
-        if next_url == "":
-            print "Done."
-        else:
-            crawl_page("http://www.google.com" + next_url)
-    except:
-        crawl_page(url)
-
-def find_links(page):
-    soup = BeautifulSoup(page)
-    for subsoup in soup.findAll("li", "g"):
-        url = subsoup.find("a", "l")['href']
-        url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
-        url = url_matcher.match(url).group(1)
-        if url not in calibre_urls:
-            print url
-            calibre_urls.append(url)
-
-def find_next(page):
-    global current_start
-    soup = BeautifulSoup(page)
-    subsoup = soup.find("table", attrs={'id':'nav'})
-    for item in subsoup.findAll("a", "pn"):
-        new_start = find_start(item['href'])
-        if new_start == current_start + 10:
-            current_start = new_start
-            return item['href']
-    return ""
-
-def find_start(url):
-    url_match = re.search("start=([0-9]+)", url)
-    if url_match is None:
-        return 0
-    else:
-        return int(url_match.group(1))
-
-def google_start():
-    crawl_page(base_url)
-
-def calibre_crawl(url):
-    try:
-        url_matcher = re.compile("(https?:\/\/[^/]*).*")
-        base = url_matcher.match(url).group(1)
-
-        print "Starting crawl on %s ..." % url
-
-        response = urllib2.urlopen(url + "browse/category/allbooks")
-        page_contents = response.read()
-
-        matcher = re.compile("")
-
-        for result in matcher.finditer(page_contents):
-            try:
-                query = result.group(1)
-
-                list_url = url + "browse/booklist_page"
-                data = urllib.urlencode({'ids': query})
-                req = urllib2.Request(list_url, data)
-                response = urllib2.urlopen(req)
-                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
-
-                soup = BeautifulSoup(page_contents)
-
-                for subsoup in soup.findAll("div", "summary"):
-                    try:
-                        title = subsoup.find("div", "title").strong.string
-                        authors = subsoup.find("div", "authors").string
-                        description = subsoup.find("div", "comments").prettify()
-
-                        try:
-                            thumbnail = base + subsoup.find("div", "left").img['src']
-                        except:
-                            thumbnail = ""
-
-                        urls = []
-                        urls.append(subsoup.find("a", "read")['href'])
-
-                        formats_list = subsoup.find("div", "formats")
-                        for format_url in formats_list.findAll("a"):
-                            urls.append(format_url['href'])
-
-                        s_title = db.escape_string(title)
-                        s_authors = db.escape_string(authors)
-                        s_description = db.escape_string(description)
-                        s_thumbnail = db.escape_string(thumbnail)
-
-                        sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
-                        db.query(sql_query)
-                        sql_result = db.store_result()
-
-                        if sql_result.num_rows() == 0:
-                            sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
-                            #print sql_query
-
-                            db.query(sql_query)
-                            ins_id = db.insert_id()
-
-                            for format_url in urls:
-                                url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
-                                m = url_matcher.match(format_url)
-                                filetype = m.group(1).lower()
-                                download_url = db.escape_string(base + format_url)
-
-                                s_filetype = db.escape_string(filetype)
-                                s_url = db.escape_string(download_url)
-
-                                sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (ins_id, s_filetype, s_url)
-                                db.query(sql_query)
-
-                            print "SUCCESS: %s" % s_title
-                        else:
-                            print "SKIP: %s" % title
-
-                        time.sleep(0.1)
-
-                    except:
-                        print "FAIL: %s" % title
-
-                time.sleep(2)
-            except:
-                pass
-    except:
-        pass
-
-def calibre_loop():
-    global calibre_urls
-    while True:
-        if len(calibre_urls) > 0:
-            current_url = calibre_urls.popleft()
-            calibre_crawl(current_url)
-        time.sleep(1)
-
-calibre_urls = deque([])
-current_start = 0
-base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
-db = _mysql.connect("localhost", "root", "", "ebooks")
-
-GoogleCrawler().start()
-CalibreCrawler().start()
\ No newline at end of file
diff --git a/crawler/daemon.py b/crawler/daemon.py
new file mode 100644
index 0000000..5aef1cc
--- /dev/null
+++ b/crawler/daemon.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+import os, time, sys, json, _mysql
+
+def stringdammit(input_string):
+    if isinstance(input_string, str):
+        return input_string
+    else:
+        return input_string.encode('utf-8')
+
+pipe_name = 'pipe_books'
+
+if not os.path.exists(pipe_name):
+    os.mkfifo(pipe_name)
+
+pipein = open(pipe_name, 'r')
+buff = ""
+db = _mysql.connect("localhost", "root", "", "ebooks")
+
+while True:
+    data = pipein.read()
+    buff += data
+    stack = buff.replace("\r", "").split("\n")
+    buff = stack.pop()
+
+    for line in stack:
+        try:
+            obj = json.loads(line)
+        except:
+            print line
+            sys.stderr.write("ERROR: Could not decode message: %s" % line)
+            continue
+
+        message_type = obj['type']
+        message_data = obj['data']
+
+        if message_type == "add":
+            s_title = db.escape_string(stringdammit(message_data['title']))
+            s_description = db.escape_string(stringdammit(message_data['description']))
+            s_authors = db.escape_string(stringdammit(message_data['authors']))
+            s_thumbnail = db.escape_string(stringdammit(message_data['thumbnail']))
+
+            sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
+            db.query(sql_query)
+            sql_result = db.store_result()
+
+            if sql_result.num_rows() == 0:
+                sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
+                db.query(sql_query)
+                book_id = db.insert_id()
+
+                for format_url in message_data['urls']:
+                    s_filetype = db.escape_string(stringdammit(format_url['filetype'].lower()))
+                    s_url = db.escape_string(stringdammit(format_url['url']))
+
+                    sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (book_id, s_filetype, s_url)
+                    db.query(sql_query)
+
+                print "Received and inserted '%s' by '%s'" % (s_title, s_authors)
+            else:
+                print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
+        else:
+            print "Unrecognized command: %s" % message_type