#!/usr/bin/python
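#
# Spiders Google for publicly reachable Calibre content servers
# (query: intitle:"calibre library" inurl:"browse"), scrapes each
# server's book listings, and writes the metadata as one JSON object
# per line to the named pipe 'pipe_books' for another process to read.
#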
import os, time, sys, urllib, urllib2, threading, re
from collections import deque
from BeautifulSoup import BeautifulSoup

try:
    import json
except ImportError:
    import simplejson as json

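# Shared state for the two crawler threads: GoogleCrawler appends
# discovered server URLs to calibre_urls and CalibreCrawler pops them;
# setting STOP asks both threads to wind down.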
STOP = False

pipe_name = 'pipe_books'

calibre_urls = deque([])

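# Serialize one book record as an "add" message and write it,
# newline-terminated, to the named pipe for the consumer to pick up.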
def add_book(title, authors, description, thumbnail, urls):
    print "[libcal] Submitted %s" % title

    pipeout = os.open(pipe_name, os.O_WRONLY)
    os.write(pipeout, json.dumps({
        'type': "add",
        'data': {
            'title': title,
            'authors': authors,
            'description': description,
            'thumbnail': thumbnail,
            'urls': urls
        }
    }) + "\n")
    os.close(pipeout)

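# Walks Google's result pages for the dork query, reduces each hit to
# the Calibre server's root URL, and queues new servers for the
# CalibreCrawler thread.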
class GoogleCrawler(threading.Thread):
    def run(self):
        self.current_start = 0
        self.crawl_page(self.base_url)

    def crawl_page(self, url):
        if STOP:
            return

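        # Fetch the results page with a spoofed desktop browser
        # User-Agent rather than the default urllib2 one.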
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
        response = opener.open(url)
        page_contents = response.read()

print "[google] == FIND CALIBRE LINKS"
|
|
soup = BeautifulSoup(page_contents)
|
|
for subsoup in soup.findAll("li", "g"):
|
|
url = subsoup.find("a", "l")['href']
|
|
url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
|
|
|
|
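            # Keep only the part of the link before /browse/, i.e. the
            # server root; links that don't match aren't Calibre servers.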
            try:
                url = url_matcher.match(url).group(1)
            except AttributeError:
                continue

            if url not in calibre_urls:
                print "[google] Found Calibre at %s" % url
                calibre_urls.append(url)

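        # Google's pagination links encode their offset in a start=N
        # parameter; follow only the link exactly one page (10 results)
        # ahead. Single-page results have no nav table at all.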
print "[google] == FIND NEXT PAGE"
|
|
next_url = ""
|
|
subsoup = soup.find("table", attrs={'id':'nav'})
|
|
for item in subsoup.findAll("a", "fl"):
|
|
new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
|
|
if new_start == self.current_start + 10:
|
|
self.current_start = new_start
|
|
next_url = item['href']
|
|
|
|
        if next_url == "":
            print "[google] No next pages found... Done spidering Google!"
        else:
            print "[google] == SLEEPING..."
            time.sleep(4)
            self.crawl_page("http://www.google.com" + next_url)

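# Pops server URLs queued by GoogleCrawler and scrapes each server's
# "all books" listing, submitting every book found via add_book().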
class CalibreCrawler(threading.Thread):
    def run(self):
        while not STOP:
            if calibre_urls:
                current_url = calibre_urls.popleft()
                self.crawl_page(current_url)

            time.sleep(1)

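    # base is the scheme://host[:port] prefix of the server, used to
    # absolutize the relative thumbnail and download links on the page.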
    def crawl_page(self, url):
        url_matcher = re.compile(r"(https?://[^/]*).*")
        base = url_matcher.match(url).group(1)

        print "[calibr] Starting crawl on %s ..." % url

        try:
            response = urllib2.urlopen(url + "browse/category/allbooks")
        except urllib2.URLError:
            print "[calibr] Skipping %s: the server could not be reached." % url
            return

        page_contents = response.read()

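        # Each "load_data" div on the listing page carries a list of book
        # ids; POSTing those ids to browse/booklist_page returns the
        # rendered HTML for that batch of books.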
        matcher = re.compile(r'<div class="load_data" title="([\[\]0-9\s,]*)">')

        for result in matcher.finditer(page_contents):
            try:
                query = result.group(1)

                list_url = url + "browse/booklist_page"
                data = urllib.urlencode({'ids': query})
                req = urllib2.Request(list_url, data)
                response = urllib2.urlopen(req)
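                # The endpoint returns a JSON string literal: strip the
                # surrounding quotes and unescape it by hand before handing
                # the embedded HTML to BeautifulSoup.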
                list_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")

                soup = BeautifulSoup(list_contents)

for subsoup in soup.findAll("div", "summary"):
|
|
try:
|
|
title = subsoup.find("div", "title").strong.string
|
|
authors = subsoup.find("div", "authors").string
|
|
description = subsoup.find("div", "comments").prettify()
|
|
|
|
try:
|
|
thumbnail = base + subsoup.find("div", "left").img['src']
|
|
except:
|
|
thumbnail = ""
|
|
|
|
                        urls = []
                        urls.append(subsoup.find("a", "read")['href'])

                        formats_list = subsoup.find("div", "formats")
                        for format_url in formats_list.findAll("a"):
                            urls.append(format_url['href'])

                        final_urls = []

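                        # Download links look like .../get/<FORMAT>/<id>;
                        # pull the format out of the path and absolutize
                        # the URL against the server root.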
                        url_matcher = re.compile(r".*/get/([^/]+)/.*")
                        for format_url in urls:
                            m = url_matcher.match(format_url)
                            filetype = m.group(1).lower()
                            download_url = base + format_url
                            final_urls.append({
                                'filetype': filetype,
                                'url': download_url
                            })

                        add_book(title, authors, description, thumbnail, final_urls)

                    except Exception, e:
                        print "[calibr] FAILED: '%s' by '%s', error: %s" % (title, authors, str(e))

                time.sleep(2)
            except Exception, e:
                print "[calibr] Skipping a booklist page on %s, error: %s" % (url, str(e))

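# Start both crawlers, then keep the main thread alive so that Ctrl-C
# lands here and can be turned into a STOP signal for the threads.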
try:
    google = GoogleCrawler()
    google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
    google.start()

    calibre = CalibreCrawler()
    calibre.start()

    while google.isAlive() or calibre.isAlive():
        time.sleep(1)
except KeyboardInterrupt:
    STOP = True