From a11bf165a4622305fa9c55e1ff03a315bff1614a Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Wed, 17 Oct 2012 02:01:43 +0200
Subject: [PATCH] Rewrite crawler

---
 crawler/calibre.py | 162 +++++++++++++++++++++++++++++++++++++++++++++
 crawler/crawler.py | 161 --------------------------------------------
 crawler/daemon.py  |  62 +++++++++++++++++
 3 files changed, 224 insertions(+), 161 deletions(-)
 create mode 100644 crawler/calibre.py
 delete mode 100644 crawler/crawler.py
 create mode 100644 crawler/daemon.py

diff --git a/crawler/calibre.py b/crawler/calibre.py
new file mode 100644
index 0000000..82dcccc
--- /dev/null
+++ b/crawler/calibre.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+import os, time, sys, json, urllib, urllib2, threading, re
+from collections import deque
+from BeautifulSoup import BeautifulSoup
+
+STOP = False
+
+pipe_name = 'pipe_books'
+
+calibre_urls = deque([])
+
+def add_book(title, authors, description, thumbnail, urls):
+    global pipe_name
+
+    print "[libcal] Submitted %s" % title
+
+    pipeout = os.open(pipe_name, os.O_WRONLY)
+    os.write(pipeout, json.dumps({
+        'type': "add",
+        'data': {
+            'title': title,
+            'authors': authors,
+            'description': description,
+            'thumbnail': thumbnail,
+            'urls': urls
+        }
+    }) + "\n")
+    os.close(pipeout)
+
+
+class GoogleCrawler (threading.Thread):
+    def run(self):
+        self.current_start = 0
+        self.crawl_page(self.base_url)
+
+    def crawl_page(self, url):
+        global calibre_urls, STOP
+
+        if STOP == True:
+            return None
+
+        opener = urllib2.build_opener()
+        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
+        response = opener.open(url)
+        page_contents = response.read()
+
+        print "[google] == FIND CALIBRE LINKS"
+        soup = BeautifulSoup(page_contents)
+        for subsoup in soup.findAll("li", "g"):
+            url = subsoup.find("a", "l")['href']
+            url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
+
+            try:
+                url = url_matcher.match(url).group(1)
+            except AttributeError:
+                continue
+
+            if url not in calibre_urls:
+                print "[google] Found Calibre at %s" % url
+                calibre_urls.append(url)
+
+        print "[google] == FIND NEXT PAGE"
+        next_url = ""
+        subsoup = soup.find("table", attrs={'id':'nav'})
+        for item in subsoup.findAll("a", "fl"):
+            new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
+            if new_start == self.current_start + 10:
+                self.current_start = new_start
+                next_url = item['href']
+
+        if next_url == "":
+            print "[google] No next pages found... Done spidering Google!"
+        else:
+
+            print "[google] == SLEEPING..."
+            time.sleep(4)
+            #self.crawl_page("http://www.google.com" + next_url)
+
+class CalibreCrawler(threading.Thread):
+    def run(self):
+        global calibre_urls, STOP
+
+        while STOP == False:
+            if len(calibre_urls) > 0:
+                current_url = calibre_urls.popleft()
+                self.crawl_page(current_url)
+
+            time.sleep(1)
+
+    def crawl_page(self, url):
+        url_matcher = re.compile("(https?:\/\/[^/]*).*")
+        base = url_matcher.match(url).group(1)
+
+        print "[calibr] Starting crawl on %s ..." % url
+
+        response = urllib2.urlopen(url + "browse/category/allbooks")
+        page_contents = response.read()
+
+        matcher = re.compile("")
+
+        for result in matcher.finditer(page_contents):
+            try:
+                query = result.group(1)
+
+                list_url = url + "browse/booklist_page"
+                data = urllib.urlencode({'ids': query})
+                req = urllib2.Request(list_url, data)
+                response = urllib2.urlopen(req)
+                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
+
+                soup = BeautifulSoup(page_contents)
+
+                for subsoup in soup.findAll("div", "summary"):
+                    try:
+                        title = subsoup.find("div", "title").strong.string
+                        authors = subsoup.find("div", "authors").string
+                        description = subsoup.find("div", "comments").prettify()
+
+                        try:
+                            thumbnail = base + subsoup.find("div", "left").img['src']
+                        except:
+                            thumbnail = ""
+
+                        urls = []
+                        urls.append(subsoup.find("a", "read")['href'])
+
+                        formats_list = subsoup.find("div", "formats")
+                        for format_url in formats_list.findAll("a"):
+                            urls.append(format_url['href'])
+
+                        final_urls = []
+
+                        for format_url in urls:
+                            url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
+                            m = url_matcher.match(format_url)
+                            filetype = m.group(1).lower()
+                            download_url = base + format_url
+                            final_urls.append({
+                                'filetype': filetype,
+                                'url': download_url
+                            })
+
+                        add_book(title, authors, description, thumbnail, final_urls)
+
+                    except Exception, e:
+                        print "[calibr] FAILED: '%s' by '%s', error: %s" % (title, authors, str(e))
+
+                time.sleep(2)
+            except:
+                pass
+
+try:
+    google = GoogleCrawler()
+    google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
+    #google.start()
+
+    calibre_urls = deque(["http://caltsardragon.com:8080/"])
+
+    calibre = CalibreCrawler()
+    calibre.start()
+except KeyboardInterrupt:
+    STOP = True
diff --git a/crawler/crawler.py b/crawler/crawler.py
deleted file mode 100644
index 8082a52..0000000
--- a/crawler/crawler.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import urllib, urllib2, re, _mysql, time, math, threading, htmlentitydefs
-from collections import deque
-from BeautifulSoup import BeautifulSoup
-
-# Don't forget to configure the database login at the bottom of the code!
-
-class GoogleCrawler (threading.Thread):
-    def run(self):
-        google_start()
-
-class CalibreCrawler (threading.Thread):
-    def run(self):
-        calibre_loop()
-
-def crawl_page(url):
-    try:
-        opener = urllib2.build_opener()
-        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
-        response = opener.open(url)
-        page_contents = response.read()
-        find_links(page_contents)
-        time.sleep(4)
-        next_url = find_next(page_contents)
-        if next_url == "":
-            print "Done."
-        else:
-            crawl_page("http://www.google.com" + next_url)
-    except:
-        crawl_page(url)
-
-def find_links(page):
-    soup = BeautifulSoup(page)
-    for subsoup in soup.findAll("li", "g"):
-        url = subsoup.find("a", "l")['href']
-        url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
-        url = url_matcher.match(url).group(1)
-        if url not in calibre_urls:
-            print url
-            calibre_urls.append(url)
-
-def find_next(page):
-    global current_start
-    soup = BeautifulSoup(page)
-    subsoup = soup.find("table", attrs={'id':'nav'})
-    for item in subsoup.findAll("a", "pn"):
-        new_start = find_start(item['href'])
-        if new_start == current_start + 10:
-            current_start = new_start
-            return item['href']
-    return ""
-
-def find_start(url):
-    url_match = re.search("start=([0-9]+)", url)
-    if url_match is None:
-        return 0
-    else:
-        return int(url_match.group(1))
-
-def google_start():
-    crawl_page(base_url)
-
-def calibre_crawl(url):
-    try:
-        url_matcher = re.compile("(https?:\/\/[^/]*).*")
-        base = url_matcher.match(url).group(1)
-
-        print "Starting crawl on %s ..." % url
-
-        response = urllib2.urlopen(url + "browse/category/allbooks")
-        page_contents = response.read()
-
-        matcher = re.compile("")
-
-        for result in matcher.finditer(page_contents):
-            try:
-                query = result.group(1)
-
-                list_url = url + "browse/booklist_page"
-                data = urllib.urlencode({'ids': query})
-                req = urllib2.Request(list_url, data)
-                response = urllib2.urlopen(req)
-                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
-
-                soup = BeautifulSoup(page_contents)
-
-                for subsoup in soup.findAll("div", "summary"):
-                    try:
-                        title = subsoup.find("div", "title").strong.string
-                        authors = subsoup.find("div", "authors").string
-                        description = subsoup.find("div", "comments").prettify()
-
-                        try:
-                            thumbnail = base + subsoup.find("div", "left").img['src']
-                        except:
-                            thumbnail = ""
-
-                        urls = []
-                        urls.append(subsoup.find("a", "read")['href'])
-
-                        formats_list = subsoup.find("div", "formats")
-                        for format_url in formats_list.findAll("a"):
-                            urls.append(format_url['href'])
-
-                        s_title = db.escape_string(title)
-                        s_authors = db.escape_string(authors)
-                        s_description = db.escape_string(description)
-                        s_thumbnail = db.escape_string(thumbnail)
-
-                        sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
-                        db.query(sql_query)
-                        sql_result = db.store_result()
-
-                        if sql_result.num_rows() == 0:
-                            sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
-                            #print sql_query
-
-                            db.query(sql_query)
-                            ins_id = db.insert_id()
-
-                            for format_url in urls:
-                                url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
-                                m = url_matcher.match(format_url)
-                                filetype = m.group(1).lower()
-                                download_url = db.escape_string(base + format_url)
-
-                                s_filetype = db.escape_string(filetype)
-                                s_url = db.escape_string(download_url)
-
-                                sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (ins_id, s_filetype, s_url)
-                                db.query(sql_query)
-
-                            print "SUCCESS: %s" % s_title
-                        else:
-                            print "SKIP: %s" % title
-
-                        time.sleep(0.1)
-
-                    except:
-                        print "FAIL: %s" % title
-
-                time.sleep(2)
-            except:
-                pass
-    except:
-        pass
-
-def calibre_loop():
-    global calibre_urls
-    while True:
-        if len(calibre_urls) > 0:
-            current_url = calibre_urls.popleft()
-            calibre_crawl(current_url)
-        time.sleep(1)
-
-calibre_urls = deque([])
-current_start = 0
-base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
-db = _mysql.connect("localhost", "root", "", "ebooks")
-
-GoogleCrawler().start()
-CalibreCrawler().start()
\ No newline at end of file
diff --git a/crawler/daemon.py b/crawler/daemon.py
new file mode 100644
index 0000000..5aef1cc
--- /dev/null
+++ b/crawler/daemon.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+import os, time, sys, json, _mysql
+
+def stringdammit(input_string):
+    if isinstance(input_string, str):
+        return input_string
+    else:
+        return input_string.encode('utf-8')
+
+pipe_name = 'pipe_books'
+
+if not os.path.exists(pipe_name):
+    os.mkfifo(pipe_name)
+
+pipein = open(pipe_name, 'r')
+buff = ""
+db = _mysql.connect("localhost", "root", "", "ebooks")
+
+while True:
+    data = pipein.read()
+    buff += data
+    stack = buff.replace("\r", "").split("\n")
+    buff = stack.pop()
+
+    for line in stack:
+        try:
+            obj = json.loads(line)
+        except:
+            print line
+            sys.stderr.write("ERROR: Could not decode message: %s" % line)
+            continue
+
+        message_type = obj['type']
+        message_data = obj['data']
+
+        if message_type == "add":
+            s_title = db.escape_string(stringdammit(message_data['title']))
+            s_description = db.escape_string(stringdammit(message_data['description']))
+            s_authors = db.escape_string(stringdammit(message_data['authors']))
+            s_thumbnail = db.escape_string(stringdammit(message_data['thumbnail']))
+
+            sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
+            db.query(sql_query)
+            sql_result = db.store_result()
+
+            if sql_result.num_rows() == 0:
+                sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
+                db.query(sql_query)
+                book_id = db.insert_id()
+
+                for format_url in message_data['urls']:
+                    s_filetype = db.escape_string(stringdammit(format_url['filetype'].lower()))
+                    s_url = db.escape_string(stringdammit(format_url['url']))
+
+                    sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (book_id, s_filetype, s_url)
+                    db.query(sql_query)
+
+                print "Received and inserted '%s' by '%s'" % (s_title, s_authors)
+            else:
+                print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
+        else:
+            print "Unrecognized command: %s" % message_type