Rewrite crawler

12 years ago · a11bf165a4
parent 8c255202c4
commit a11bf165a4
3 changed files with 224 additions and 161 deletions
--- a/crawler/calibre.py
+++ b/crawler/calibre.py
@ -0,0 +1,162 @@
 #!/usr/bin/python
 import os, time, sys, json, urllib, urllib2, threading, re
 from collections import deque
 from BeautifulSoup import BeautifulSoup
 # Shutdown flag: both crawler threads check it and stop working once it is True.
 STOP = False
 # Path that add_book() opens for writing; daemon.py creates a FIFO with this name.
 pipe_name = 'pipe_books'
 # Queue of Calibre server base URLs: GoogleCrawler appends, CalibreCrawler pops from the left.
 calibre_urls = deque([])
 def add_book(title, authors, description, thumbnail, urls):
 	global pipe_name
 	print "[libcal] Submitted %s" % title
 	pipeout = os.open(pipe_name, os.O_WRONLY)
 	os.write(pipeout, json.dumps({
 		'type': "add",
 		'data': {
 			'title': title,
 			'authors': authors,
 			'description': description,
 			'thumbnail': thumbnail,
 			'urls': urls
 		}
 	}) + "\n")
 	os.close(pipeout)
 class GoogleCrawler (threading.Thread):
 	def run(self):
 		self.current_start = 0
 		self.crawl_page(self.base_url)
 	def crawl_page(self, url):
 		global calibre_urls, STOP
 		if STOP == True:
 			return None
 		opener = urllib2.build_opener()
 		opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
 		response = opener.open(url)
 		page_contents = response.read()
 		print "[google] == FIND CALIBRE LINKS"
 		soup = BeautifulSoup(page_contents)
 		for subsoup in soup.findAll("li", "g"):
 			url = subsoup.find("a", "l")['href']
 			url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
 			try:
 				url = url_matcher.match(url).group(1)
 			except AttributeError:
 				continue
 			if url not in calibre_urls:
 				print "[google] Found Calibre at %s" % url
 				calibre_urls.append(url)
 		print "[google] == FIND NEXT PAGE"
 		next_url = ""
 		subsoup = soup.find("table", attrs={'id':'nav'})
 		for item in subsoup.findAll("a", "fl"):
 			new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
 			if new_start == self.current_start + 10:
 				self.current_start = new_start
 				next_url = item['href']
 		if next_url == "":
 			print "[google] No next pages found... Done spidering Google!"
 		else:
 			print "[google] == SLEEPING..."
 			time.sleep(4)
 			#self.crawl_page("http://www.google.com" + next_url)
 class CalibreCrawler(threading.Thread):
 	def run(self):
 		global calibre_urls, STOP
 		while STOP == False:
 			if len(calibre_urls) > 0:
 				current_url = calibre_urls.popleft()
 				self.crawl_page(current_url)
 			time.sleep(1)
 	def crawl_page(self, url):
 		url_matcher = re.compile("(https?:\/\/[^/]*).*")
 		base = url_matcher.match(url).group(1)
 		print "[calibr] Starting crawl on %s ..." % url
 		response = urllib2.urlopen(url + "browse/category/allbooks")
 		page_contents = response.read()
 		matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
 		for result in matcher.finditer(page_contents):
 			try:
 				query = result.group(1)
 				list_url = url + "browse/booklist_page"
 				data = urllib.urlencode({'ids': query})
 				req = urllib2.Request(list_url, data)
 				response = urllib2.urlopen(req)
 				page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
 				soup = BeautifulSoup(page_contents)
 				for subsoup in soup.findAll("div", "summary"):
 					try:
 						title = subsoup.find("div", "title").strong.string
 						authors = subsoup.find("div", "authors").string
 						description = subsoup.find("div", "comments").prettify()
 						try:
 							thumbnail = base + subsoup.find("div", "left").img['src']
 						except:
 							thumbnail = ""
 						urls = []
 						urls.append(subsoup.find("a", "read")['href'])
 						formats_list = subsoup.find("div", "formats")
 						for format_url in formats_list.findAll("a"):
 							urls.append(format_url['href'])
 						final_urls = []
 						for format_url in urls:
 							url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
 							m = url_matcher.match(format_url)
 							filetype = m.group(1).lower()
 							download_url = base + format_url
 							final_urls.append({
 								'filetype': filetype,
 								'url': download_url
 							})
 						add_book(title, authors, description, thumbnail, final_urls)
 					except Exception, e:
 						print "[calibr] FAILED: '%s' by '%s', error: %s" % (title, authors, str(e))
 				time.sleep(2)
 			except:
 				pass
 try:
 	google = GoogleCrawler()
 	google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
 	#google.start()
 	# Seed the queue with one known server while the Google crawler is disabled.
 	calibre_urls = deque(["http://caltsardragon.com:8080/"])
 	calibre = CalibreCrawler()
 	calibre.start()
 	# start() returns immediately, so keep the main thread inside the try block;
 	# otherwise Ctrl-C can never reach the handler that sets STOP.
 	while calibre.is_alive():
 		time.sleep(1)
 except KeyboardInterrupt:
 	# Ask the crawler threads to finish; they check STOP between pages.
 	STOP = True
--- a/crawler/crawler.py
+++ b/crawler/crawler.py
@ -1,161 +0,0 @@
 import urllib, urllib2, re, _mysql, time, math, threading, htmlentitydefs
 from collections import deque
 from BeautifulSoup import BeautifulSoup
 # Don't forget to configure the database login at the bottom of the code!
 class GoogleCrawler (threading.Thread):
 	"""Thread whose body simply calls google_start()."""
 	def run(self):
 		google_start()
 class CalibreCrawler (threading.Thread):
 	"""Thread whose body simply calls calibre_loop()."""
 	def run(self):
 		calibre_loop()
 def crawl_page(url):
 	try:
 		opener = urllib2.build_opener()
 		opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
 		response = opener.open(url)
 		page_contents = response.read()
 		find_links(page_contents)
 		time.sleep(4)
 		next_url = find_next(page_contents)
 		if next_url == "":
 			print "Done."
 		else:
 			crawl_page("http://www.google.com" + next_url)
 	except:
 		crawl_page(url)
 def find_links(page):
 	soup = BeautifulSoup(page)
 	for subsoup in soup.findAll("li", "g"):
 		url = subsoup.find("a", "l")['href']
 		url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
 		url = url_matcher.match(url).group(1)
 		if url not in calibre_urls:
 			print url
 			calibre_urls.append(url)
 def find_next(page):
 	"""Return the href of the next result page, or "" when there is none.
 	The next page is the "pn" link in the table with id "nav" whose start
 	offset is exactly current_start + 10; current_start is advanced to that
 	offset when such a link is found.
 	"""
 	global current_start
 	soup = BeautifulSoup(page)
 	subsoup = soup.find("table", attrs={'id':'nav'})
 	if subsoup is None:
 		# No pagination table on this page: treat it as the last page.
 		return ""
 	for item in subsoup.findAll("a", "pn"):
 		new_start = find_start(item['href'])
 		if new_start == current_start + 10:
 			current_start = new_start
 			return item['href']
 	return ""
 def find_start(url):
 	"""Return the number after the first "start=" in url, or 0 when there is none."""
 	found = re.search("start=([0-9]+)", url)
 	return int(found.group(1)) if found is not None else 0
 def google_start():
 	"""Start the Google crawl at the module-level base_url."""
 	crawl_page(base_url)
 def calibre_crawl(url):
 	try:
 		url_matcher = re.compile("(https?:\/\/[^/]*).*")
 		base = url_matcher.match(url).group(1)
 		print "Starting crawl on %s ..." % url
 		response = urllib2.urlopen(url + "browse/category/allbooks")
 		page_contents = response.read()
 		matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
 		for result in matcher.finditer(page_contents):
 			try:
 				query = result.group(1)
 				list_url = url + "browse/booklist_page"
 				data = urllib.urlencode({'ids': query})
 				req = urllib2.Request(list_url, data)
 				response = urllib2.urlopen(req)
 				page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
 				soup = BeautifulSoup(page_contents)
 				for subsoup in soup.findAll("div", "summary"):
 					try:
 						title = subsoup.find("div", "title").strong.string
 						authors = subsoup.find("div", "authors").string
 						description = subsoup.find("div", "comments").prettify()
 						try:
 							thumbnail = base + subsoup.find("div", "left").img['src']
 						except:
 							thumbnail = ""
 						urls = []
 						urls.append(subsoup.find("a", "read")['href'])
 						formats_list = subsoup.find("div", "formats")
 						for format_url in formats_list.findAll("a"):
 							urls.append(format_url['href'])
 						s_title = db.escape_string(title)
 						s_authors = db.escape_string(authors)
 						s_description = db.escape_string(description)
 						s_thumbnail = db.escape_string(thumbnail)
 						sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
 						db.query(sql_query)
 						sql_result = db.store_result()
 						if sql_result.num_rows() == 0:
 							sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
 							#print sql_query
 							db.query(sql_query)
 							ins_id = db.insert_id()
 							for format_url in urls:
 								url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
 								m = url_matcher.match(format_url)
 								filetype = m.group(1).lower()
 								download_url = db.escape_string(base + format_url)
 								s_filetype = db.escape_string(filetype)
 								s_url = db.escape_string(download_url)
 								sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (ins_id, s_filetype, s_url)
 								db.query(sql_query)
 							print "SUCCESS: %s" % s_title
 						else:
 							print "SKIP: %s" % title
 						time.sleep(0.1)
 					except:
 						print "FAIL: %s" % title
 				time.sleep(2)
 			except:
 				pass
 	except:
 		pass
 def calibre_loop():
 	"""Loop forever: crawl the next queued Calibre URL, if any, then pause one second."""
 	while True:
 		if calibre_urls:
 			calibre_crawl(calibre_urls.popleft())
 		time.sleep(1)
 # Queue of Calibre base URLs: filled by find_links(), drained by calibre_loop().
 calibre_urls = deque([])
 # Start offset of the result page most recently accepted by find_next().
 current_start = 0
 # Google search URL that the crawl starts from.
 base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
 # Database login -- configure for your setup (presumably host, user, password, database; confirm against _mysql.connect).
 db = _mysql.connect("localhost", "root", "", "ebooks")
 # Both threads start as soon as this module is run.
 GoogleCrawler().start()
 CalibreCrawler().start()
--- a/crawler/daemon.py
+++ b/crawler/daemon.py
@ -0,0 +1,62 @@
 #!/usr/bin/python
 import os, time, sys, json, _mysql
 def stringdammit(input_string):
 	"""Return input_string as a str, encoding it as UTF-8 if it is not one already.
 	None (a JSON null, e.g. a missing author) becomes the empty string
 	instead of raising AttributeError and ending the daemon.
 	"""
 	if input_string is None:
 		return ""
 	if isinstance(input_string, str):
 		return input_string
 	return input_string.encode('utf-8')
 pipe_name = 'pipe_books'
 if not os.path.exists(pipe_name):
 	os.mkfifo(pipe_name)
 pipein = open(pipe_name, 'r')
 buff = ""
 db = _mysql.connect("localhost", "root", "", "ebooks")
 while True:
 	data = pipein.read()
 	buff += data
 	stack = buff.replace("\r", "").split("\n")
 	buff = stack.pop()
 	for line in stack:
 		try:
 			obj = json.loads(line)
 		except:
 			print line
 			sys.stderr.write("ERROR: Could not decode message: %s" % line)
 			continue
 		message_type = obj['type']
 		message_data = obj['data']
 		if message_type == "add":
 			s_title = db.escape_string(stringdammit(message_data['title']))
 			s_description = db.escape_string(stringdammit(message_data['description']))
 			s_authors = db.escape_string(stringdammit(message_data['authors']))
 			s_thumbnail = db.escape_string(stringdammit(message_data['thumbnail']))
 			sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
 			db.query(sql_query)
 			sql_result = db.store_result()
 			if sql_result.num_rows() == 0:
 				sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
 				db.query(sql_query)
 				book_id = db.insert_id()
 				for format_url in message_data['urls']:
 					s_filetype = db.escape_string(stringdammit(format_url['filetype'].lower()))
 					s_url = db.escape_string(stringdammit(format_url['url']))
 					sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (book_id, s_filetype, s_url)
 					db.query(sql_query)
 				print "Received and inserted '%s' by '%s'" % (s_title, s_authors)
 			else:
 				print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
 		else:
 			print "Unrecognized command: %s" % message_type