You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
crytobooks/newcrawler/scrape/find-calibre.py

66 lines
1.7 KiB
Python

import socket
import time
import urlparse

import lxml.html
import requests
# Page through the Google results for the query below and gather every
# result link into all_results. A browser-like User-Agent is sent with each
# search request.
sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})

# Query: intitle:"calibre library" inurl:"browse"
base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
uri = base_uri
all_results = []

while True:
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = sess.get(uri, timeout=10)
    except requests.exceptions.RequestException as exc:
        # Keep whatever was gathered from earlier pages instead of aborting.
        print("Search request for %s failed (%s); stopping pagination." % (uri, exc))
        break

    xml = lxml.html.fromstring(response.text)

    # Result links are the anchors inside <h3 class="r"> headings.
    results = xml.xpath("//h3[@class='r']/a/@href")
    all_results.extend(results)

    # The "next page" anchor carries id="pnnext"; no such anchor means this
    # was the last page.
    next_ = xml.xpath("//a[@id='pnnext']/@href")
    if not next_:
        break

    # The href may be relative, so resolve it against the current page URI.
    uri = urlparse.urljoin(uri, next_[0])

    # Pause between page fetches.
    time.sleep(1)
# Probe every collected search hit, keep only pages that look like a real
# Calibre server, reduce each one to its base URI and collect the distinct
# base URIs in unique_results (in first-seen order).
unique_results = []
seen_results = set()  # constant-time duplicate check; the list keeps the order
browse_suffix = "/browse"

for result in all_results:
    print("Testing %s..." % result)

    try:
        response = requests.get(result, timeout=10)
    except (requests.exceptions.RequestException, socket.timeout):
        # Dead host, skip it. socket.timeout might be thrown instead of a
        # RequestException
        # (see https://github.com/kennethreitz/requests/issues/1797).
        continue

    if "Donate to support the development of calibre" not in response.text:
        # Marker text is missing, so this is not a genuine Calibre page.
        continue

    # Find the base URI for this Calibre: the first link inside the header
    # "bubble" element.
    # NOTE(review): the body is re-encoded to UTF-8 bytes before parsing here,
    # unlike the search-page parsing earlier in the script; presumably
    # deliberate, confirm before changing.
    xml = lxml.html.fromstring(response.text.encode("utf-8"))
    header_links = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")
    if not header_links:
        # Not found... probably not a Calibre, just a very good fake.
        continue
    base_path = header_links[0]

    # Resolve the (possibly relative) link against the hit URL and normalise
    # it: drop trailing slashes, then a trailing "/browse" path segment.
    result = urlparse.urljoin(result, base_path).rstrip("/")
    if result.endswith(browse_suffix):
        result = result[:-len(browse_suffix)]

    if result not in seen_results:
        # Each newly discovered server is reported as soon as it is found.
        print(result)
        seen_results.add(result)
        unique_results.append(result)