Rewrite crawler
parent 8c255202c4
commit a11bf165a4
@@ -0,0 +1,162 @@
#!/usr/bin/python
import os, time, sys, json, urllib, urllib2, threading, re
from collections import deque
from BeautifulSoup import BeautifulSoup
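# GoogleCrawler searches Google for public Calibre "browse" pages and queues each
# server's base URL in calibre_urls. CalibreCrawler pops URLs off that queue,
# scrapes book metadata from every server, and passes each book to add_book(),
# which writes a JSON message to the 'pipe_books' named pipe consumed by the
# separate database writer script.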

STOP = False

pipe_name = 'pipe_books'

calibre_urls = deque([])

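# add_book(): serialize one scraped book as a JSON "add" message and write it,
# newline-terminated, to the named pipe read by the database writer script.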
def add_book(title, authors, description, thumbnail, urls):
    global pipe_name

    print "[libcal] Submitted %s" % title

    pipeout = os.open(pipe_name, os.O_WRONLY)
    os.write(pipeout, json.dumps({
        'type': "add",
        'data': {
            'title': title,
            'authors': authors,
            'description': description,
            'thumbnail': thumbnail,
            'urls': urls
        }
    }) + "\n")
    os.close(pipeout)

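# GoogleCrawler: fetches Google results for the dork query in base_url, pulls the
# server base URL out of every hit that links to a Calibre /browse/ page, and
# appends new servers to the shared calibre_urls queue. Following the next results
# page is currently commented out, so only the first page is spidered.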
class GoogleCrawler (threading.Thread):
    def run(self):
        self.current_start = 0
        self.crawl_page(self.base_url)

    def crawl_page(self, url):
        global calibre_urls, STOP

        if STOP == True:
            return None

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
        response = opener.open(url)
        page_contents = response.read()

        print "[google] == FIND CALIBRE LINKS"
        soup = BeautifulSoup(page_contents)
        for subsoup in soup.findAll("li", "g"):
            url = subsoup.find("a", "l")['href']
            url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")

            try:
                url = url_matcher.match(url).group(1)
            except AttributeError:
                continue

            if url not in calibre_urls:
                print "[google] Found Calibre at %s" % url
                calibre_urls.append(url)

        print "[google] == FIND NEXT PAGE"
        next_url = ""
        subsoup = soup.find("table", attrs={'id':'nav'})
        for item in subsoup.findAll("a", "fl"):
            new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
            if new_start == self.current_start + 10:
                self.current_start = new_start
                next_url = item['href']

        if next_url == "":
            print "[google] No next pages found... Done spidering Google!"
        else:
            print "[google] == SLEEPING..."
            time.sleep(4)
            #self.crawl_page("http://www.google.com" + next_url)


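# CalibreCrawler: drains the calibre_urls queue. For each server it fetches the
# "allbooks" listing, posts the embedded book-id lists back to browse/booklist_page,
# parses the returned summaries, and submits every book through add_book().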
class CalibreCrawler(threading.Thread):
    def run(self):
        global calibre_urls, STOP

        while STOP == False:
            if len(calibre_urls) > 0:
                current_url = calibre_urls.popleft()
                self.crawl_page(current_url)

            time.sleep(1)

    def crawl_page(self, url):
        url_matcher = re.compile("(https?:\/\/[^/]*).*")
        base = url_matcher.match(url).group(1)

        print "[calibr] Starting crawl on %s ..." % url

        response = urllib2.urlopen(url + "browse/category/allbooks")
        page_contents = response.read()

        matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")

        for result in matcher.finditer(page_contents):
            try:
                query = result.group(1)

                list_url = url + "browse/booklist_page"
                data = urllib.urlencode({'ids': query})
                req = urllib2.Request(list_url, data)
                response = urllib2.urlopen(req)
                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")

                soup = BeautifulSoup(page_contents)

                for subsoup in soup.findAll("div", "summary"):
                    try:
                        title = subsoup.find("div", "title").strong.string
                        authors = subsoup.find("div", "authors").string
                        description = subsoup.find("div", "comments").prettify()

                        try:
                            thumbnail = base + subsoup.find("div", "left").img['src']
                        except:
                            thumbnail = ""

                        urls = []
                        urls.append(subsoup.find("a", "read")['href'])

                        formats_list = subsoup.find("div", "formats")
                        for format_url in formats_list.findAll("a"):
                            urls.append(format_url['href'])

                        final_urls = []

                        for format_url in urls:
                            url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
                            m = url_matcher.match(format_url)
                            filetype = m.group(1).lower()
                            download_url = base + format_url
                            final_urls.append({
                                'filetype': filetype,
                                'url': download_url
                            })

                        add_book(title, authors, description, thumbnail, final_urls)

                    except Exception, e:
                        print "[calibr] FAILED: '%s' by '%s', error: %s" % (title, authors, str(e))

                time.sleep(2)
            except:
                pass


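# Main: the Google spider is left disabled (google.start() is commented out); a
# single known Calibre server is seeded into the queue and only the
# CalibreCrawler thread is started.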
try:
    google = GoogleCrawler()
    google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
    #google.start()

    calibre_urls = deque(["http://caltsardragon.com:8080/"])

    calibre = CalibreCrawler()
    calibre.start()
except KeyboardInterrupt:
    STOP = True
@@ -1,161 +0,0 @@
import urllib, urllib2, re, _mysql, time, math, threading, htmlentitydefs
from collections import deque
from BeautifulSoup import BeautifulSoup

# Don't forget to configure the database login at the bottom of the code!

class GoogleCrawler (threading.Thread):
    def run(self):
        google_start()

class CalibreCrawler (threading.Thread):
    def run(self):
        calibre_loop()

def crawl_page(url):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
        response = opener.open(url)
        page_contents = response.read()
        find_links(page_contents)
        time.sleep(4)
        next_url = find_next(page_contents)
        if next_url == "":
            print "Done."
        else:
            crawl_page("http://www.google.com" + next_url)
    except:
        crawl_page(url)

def find_links(page):
    soup = BeautifulSoup(page)
    for subsoup in soup.findAll("li", "g"):
        url = subsoup.find("a", "l")['href']
        url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
        url = url_matcher.match(url).group(1)
        if url not in calibre_urls:
            print url
            calibre_urls.append(url)

def find_next(page):
    global current_start
    soup = BeautifulSoup(page)
    subsoup = soup.find("table", attrs={'id':'nav'})
    for item in subsoup.findAll("a", "pn"):
        new_start = find_start(item['href'])
        if new_start == current_start + 10:
            current_start = new_start
            return item['href']
    return ""

def find_start(url):
    url_match = re.search("start=([0-9]+)", url)
    if url_match is None:
        return 0
    else:
        return int(url_match.group(1))

def google_start():
    crawl_page(base_url)

def calibre_crawl(url):
    try:
        url_matcher = re.compile("(https?:\/\/[^/]*).*")
        base = url_matcher.match(url).group(1)

        print "Starting crawl on %s ..." % url

        response = urllib2.urlopen(url + "browse/category/allbooks")
        page_contents = response.read()

        matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")

        for result in matcher.finditer(page_contents):
            try:
                query = result.group(1)

                list_url = url + "browse/booklist_page"
                data = urllib.urlencode({'ids': query})
                req = urllib2.Request(list_url, data)
                response = urllib2.urlopen(req)
                page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")

                soup = BeautifulSoup(page_contents)

                for subsoup in soup.findAll("div", "summary"):
                    try:
                        title = subsoup.find("div", "title").strong.string
                        authors = subsoup.find("div", "authors").string
                        description = subsoup.find("div", "comments").prettify()

                        try:
                            thumbnail = base + subsoup.find("div", "left").img['src']
                        except:
                            thumbnail = ""

                        urls = []
                        urls.append(subsoup.find("a", "read")['href'])

                        formats_list = subsoup.find("div", "formats")
                        for format_url in formats_list.findAll("a"):
                            urls.append(format_url['href'])

                        s_title = db.escape_string(title)
                        s_authors = db.escape_string(authors)
                        s_description = db.escape_string(description)
                        s_thumbnail = db.escape_string(thumbnail)

                        sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
                        db.query(sql_query)
                        sql_result = db.store_result()

                        if sql_result.num_rows() == 0:
                            sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
                            #print sql_query

                            db.query(sql_query)
                            ins_id = db.insert_id()

                            for format_url in urls:
                                url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
                                m = url_matcher.match(format_url)
                                filetype = m.group(1).lower()
                                download_url = db.escape_string(base + format_url)

                                s_filetype = db.escape_string(filetype)
                                s_url = db.escape_string(download_url)

                                sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (ins_id, s_filetype, s_url)
                                db.query(sql_query)

                            print "SUCCESS: %s" % s_title
                        else:
                            print "SKIP: %s" % title

                        time.sleep(0.1)

                    except:
                        print "FAIL: %s" % title

                time.sleep(2)
            except:
                pass
    except:
        pass

def calibre_loop():
    global calibre_urls
    while True:
        if len(calibre_urls) > 0:
            current_url = calibre_urls.popleft()
            calibre_crawl(current_url)
        time.sleep(1)

calibre_urls = deque([])
current_start = 0
base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
db = _mysql.connect("localhost", "root", "", "ebooks")

GoogleCrawler().start()
CalibreCrawler().start()
@@ -0,0 +1,62 @@
#!/usr/bin/python
import os, time, sys, json, _mysql

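# Database writer: creates the 'pipe_books' FIFO if needed, then reads
# newline-delimited JSON messages from the crawler. Each "add" message is
# deduplicated on the Thumbnail column before inserting a row into `books`
# and one row per download URL into `files`.

# stringdammit(): coerce unicode values from json.loads into UTF-8 byte
# strings so db.escape_string accepts them.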
def stringdammit(input_string):
    if isinstance(input_string, str):
        return input_string
    else:
        return input_string.encode('utf-8')

pipe_name = 'pipe_books'

if not os.path.exists(pipe_name):
    os.mkfifo(pipe_name)

pipein = open(pipe_name, 'r')
buff = ""
db = _mysql.connect("localhost", "root", "", "ebooks")

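# Main loop: accumulate raw pipe data in buff, split complete lines off on "\n",
# keep any trailing partial line in buff, and process each complete JSON message.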
while True:
    data = pipein.read()
    buff += data
    stack = buff.replace("\r", "").split("\n")
    buff = stack.pop()

    for line in stack:
        try:
            obj = json.loads(line)
        except:
            print line
            sys.stderr.write("ERROR: Could not decode message: %s" % line)
            continue

        message_type = obj['type']
        message_data = obj['data']

        if message_type == "add":
            s_title = db.escape_string(stringdammit(message_data['title']))
            s_description = db.escape_string(stringdammit(message_data['description']))
            s_authors = db.escape_string(stringdammit(message_data['authors']))
            s_thumbnail = db.escape_string(stringdammit(message_data['thumbnail']))

            sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
            db.query(sql_query)
            sql_result = db.store_result()

            if sql_result.num_rows() == 0:
                sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
                db.query(sql_query)
                book_id = db.insert_id()

                for format_url in message_data['urls']:
                    s_filetype = db.escape_string(stringdammit(format_url['filetype'].lower()))
                    s_url = db.escape_string(stringdammit(format_url['url']))

                    sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (book_id, s_filetype, s_url)
                    db.query(sql_query)

                print "Received and inserted '%s' by '%s'" % (s_title, s_authors)
            else:
                print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
        else:
            print "Unrecognized command: %s" % message_type