You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.7 KiB
Python
66 lines
1.7 KiB
Python
import socket
import time
import urlparse

import lxml.html
import requests
|
|
|
|
# Shared HTTP session for the search requests; it identifies itself with a
# desktop Chrome User-Agent string.
sess = requests.Session()
sess.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2"})

# Google search for: intitle:"calibre library" inurl:"browse"
base_uri = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
uri = base_uri

# Every result link collected across all result pages (may contain duplicates).
all_results = []

# Walk the result pages by following the "next page" link until there is none.
while True:
    response = sess.get(uri)
    xml = lxml.html.fromstring(response.text)

    results = xml.xpath("//h3[@class='r']/a/@href")
    next_ = xml.xpath("//a[@id='pnnext']/@href")

    all_results.extend(results)

    if not next_:
        # No "next" link on this page: this was the last page of results.
        break

    # The link may be relative, so resolve it against the current page URI.
    uri = urlparse.urljoin(uri, next_[0])

    # Pause for a second between consecutive page requests.
    time.sleep(1)

# Base URIs of the pages that passed every check below, in discovery order.
unique_results = []
# Same contents as unique_results, kept as a set for constant-time
# duplicate checks.
seen_results = set()

# Text the script requires in a page body before treating it as Calibre.
calibre_marker = "Donate to support the development of calibre"
# Path suffix stripped from the resolved base URI.
browse_suffix = "/browse"

# Fetch every search hit, discard dead or fake ones, and reduce the rest to
# a de-duplicated list of base URIs.
for result in all_results:
    print("Testing %s..." % result)

    try:
        response = requests.get(result, timeout=10)
    except (requests.exceptions.RequestException, socket.timeout):
        # Dead, skip. socket.timeout might be thrown instead of a
        # RequestException
        # (see https://github.com/kennethreitz/requests/issues/1797).
        continue

    if calibre_marker not in response.text:
        # Fake...
        continue

    # Find base URI for this Calibre: the first link inside the header
    # bubble of the page.
    xml = lxml.html.fromstring(response.text.encode("utf-8"))
    header_links = xml.xpath("//div[@id='header']//div[@class='bubble']//a/@href")

    if not header_links:
        # Not found... probably not a Calibre, just a very good fake?
        continue

    base_path = header_links[0]

    # The link may be relative; resolve it against the URL that was fetched
    # and normalise away any trailing slash.
    library_uri = urlparse.urljoin(result, base_path).rstrip("/")

    if library_uri.endswith(browse_suffix):
        library_uri = library_uri[:-len(browse_suffix)]

    if library_uri not in seen_results:
        print(library_uri)
        seen_results.add(library_uri)
        unique_results.append(library_uri)

# Each unique base URI is already printed above when it is first seen, so
# this final pass over the results is deliberately a no-op.
for result in unique_results:
    pass  # print result