#!/usr/bin/python
# Crawls Google for publicly exposed Calibre servers, then crawls each
# server's book list and hands the results to a named pipe as JSON.
import os, time, sys, urllib, urllib2, threading, re
from collections import deque
from BeautifulSoup import BeautifulSoup
try:
    import json
except ImportError:
    import simplejson as json

STOP = False
pipe_name = 'pipe_books'
calibre_urls = deque([])


def add_book(title, authors, description, thumbnail, urls):
    """Serialize one book record as JSON and write it to the named pipe."""
    global pipe_name
    print "[libcal] Submitted %s" % title
    pipeout = os.open(pipe_name, os.O_WRONLY)
    os.write(pipeout, json.dumps({
        'type': "add",
        'data': {
            'title': title,
            'authors': authors,
            'description': description,
            'thumbnail': thumbnail,
            'urls': urls
        }
    }) + "\n")
    os.close(pipeout)


class GoogleCrawler(threading.Thread):
    """Spiders Google result pages for links to public Calibre servers."""

    def run(self):
        # self.base_url is expected to be set by the caller before start().
        self.current_start = 0
        self.crawl_page(self.base_url)

    def crawl_page(self, url):
        global calibre_urls, STOP
        if STOP:
            return None
        # Use a browser-like User-Agent so Google serves the normal results page.
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
        response = opener.open(url)
        page_contents = response.read()

        print "[google] == FIND CALIBRE LINKS"
        # Each organic result is an <li class="g"> with its link in <a class="l">.
        # Keep only URLs whose path contains /browse/ (Calibre's web UI) and
        # reduce each hit to the server's root URL.
        soup = BeautifulSoup(page_contents)
        url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
        for subsoup in soup.findAll("li", "g"):
            url = subsoup.find("a", "l")['href']
            try:
                url = url_matcher.match(url).group(1)
            except AttributeError:
                continue
            if url not in calibre_urls:
                print "[google] Found Calibre at %s" % url
                calibre_urls.append(url)

        print "[google] == FIND NEXT PAGE"
        # Pagination links carry a start= offset; the link whose offset is 10
        # past the current one is the "next" page.
        next_url = ""
        subsoup = soup.find("table", attrs={'id': 'nav'})
        for item in subsoup.findAll("a", "fl"):
            new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
            if new_start == self.current_start + 10:
                self.current_start = new_start
                next_url = item['href']
        if next_url == "":
            print "[google] No next pages found... Done spidering Google!"
        else:
            print "[google] == SLEEPING..."
            time.sleep(4)  # throttle so Google doesn't block the crawler
            self.crawl_page("http://www.google.com" + next_url)


class CalibreCrawler(threading.Thread):
    """Consumes discovered Calibre URLs and crawls each server's book list."""

    def run(self):
        global calibre_urls, STOP
        while not STOP:
            if len(calibre_urls) > 0:
                current_url = calibre_urls.popleft()
                self.crawl_page(current_url)
            time.sleep(1)  # poll the shared queue once per second

    def crawl_page(self, url):
        url_matcher = re.compile("(https?:\/\/[^/]*).*")
        base = url_matcher.match(url).group(1)
        print "[calibr] Starting crawl on %s ..." % url
        try:
            response = urllib2.urlopen(url + "browse/category/allbooks")
        except urllib2.URLError:
            print "[calibr] Skipping %s, as the server could not be successfully reached." % url
            return None
        page_contents = response.read()
        matcher = re.compile("
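
# ------------------------------------------------------------------------
# NOTE: the original source is truncated above, mid-expression, at
# 'matcher = re.compile("'.  The helper below is a hypothetical sketch of
# what the rest of CalibreCrawler.crawl_page might do, not the original
# code: it assumes the allbooks listing links each book under a path like
# /browse/book/<id> (the real regex and page markup are unknown) and it
# reuses the add_book() pipe writer defined above.

def extract_books_sketch(base, page_contents):
    """Hypothetical: pull book links out of a Calibre allbooks page."""
    soup = BeautifulSoup(page_contents)
    # Assumed URL shape; Calibre's actual markup may differ by version.
    for link in soup.findAll("a", href=re.compile("/browse/book/[0-9]+")):
        title = "".join(link.findAll(text=True)).strip()
        # The real script presumably fetched each detail page for authors,
        # description, and a cover thumbnail; this sketch submits placeholders.
        add_book(title, [], "", "", [base + link['href']])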