Compare commits

...

5 Commits

@@ -1,8 +1,13 @@
 #!/usr/bin/python
-import os, time, sys, json, urllib, urllib2, threading, re
+import os, time, sys, urllib, urllib2, threading, re
 from collections import deque
 from BeautifulSoup import BeautifulSoup
+try:
+    import json
+except:
+    import simplejson as json
 STOP = False
 pipe_name = 'pipe_books'
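
The new import block above is a compatibility shim: json entered the standard library in Python 2.6, so older interpreters fall back to the third-party simplejson package. Below is a minimal sketch (not part of the diff) of why the rest of the script is unaffected, since both modules are bound to the same name and expose the same dumps/loads API; catching ImportError is my assumption, slightly narrower than the bare except used in the change:

try:
    import json                    # standard library on Python 2.6+
except ImportError:
    import simplejson as json      # third-party backport on older interpreters

record = {"title": "Example Book", "authors": ["A. Author"]}
assert json.loads(json.dumps(record)) == record   # same calls either way
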
@@ -74,7 +79,7 @@ class GoogleCrawler (threading.Thread):
         print "[google] == SLEEPING..."
         time.sleep(4)
-        #self.crawl_page("http://www.google.com" + next_url)
+        self.crawl_page("http://www.google.com" + next_url)
 class CalibreCrawler(threading.Thread):
     def run(self):
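
The change above re-enables the call that follows Google's next results page after the 4-second delay. If that call sits inside crawl_page itself (the surrounding context is not shown in this hunk), each results page adds a stack frame; here is a hedged sketch, with assumed helper names, of the same pagination written as a loop instead:

import time

def crawl_all_pages(fetch_page, first_path, delay=4):
    """Assumed helper: fetch_page(url) returns (results, next_path_or_None)."""
    path = first_path
    while path:
        results, next_path = fetch_page("http://www.google.com" + path)
        for item in results:
            yield item
        if next_path:
            time.sleep(delay)      # keep the same polite delay between requests
        path = next_path
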
@@ -92,8 +97,13 @@ class CalibreCrawler(threading.Thread):
         base = url_matcher.match(url).group(1)
         print "[calibr] Starting crawl on %s ..." % url
-        response = urllib2.urlopen(url + "browse/category/allbooks")
+        try:
+            response = urllib2.urlopen(url + "browse/category/allbooks")
+        except urllib2.URLError:
+            print "Skipping %s, as the server could not be successfully reached." % url
+            return None
         page_contents = response.read()
         matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
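
The guard added above keeps a single unreachable Calibre server from raising out of the thread and aborting the crawl. Below is a sketch of the same pattern with an assumed timeout added (urllib2.urlopen accepts a timeout argument since Python 2.6) so the connection attempt cannot hang indefinitely; the function name and timeout value are illustrative, not part of the diff:

import urllib2

def fetch_allbooks(base_url, timeout=10):
    """Return the allbooks listing HTML, or None if the server cannot be reached."""
    try:
        response = urllib2.urlopen(base_url + "browse/category/allbooks",
                                   timeout=timeout)
    except urllib2.URLError, err:  # urllib2.HTTPError is a subclass, so it is caught too
        print "Skipping %s (%s)" % (base_url, err)
        return None
    return response.read()
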

@@ -1,5 +1,10 @@
 #!/usr/bin/python
-import os, time, sys, json, _mysql
+import os, time, sys, _mysql
+try:
+    import json
+except:
+    import simplejson as json
 def stringdammit(input_string):
     if isinstance(input_string, str):
@@ -62,3 +67,5 @@ while True:
             print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
     else:
         print "Unrecognized command: %s" % message_type
+
+    time.sleep(0.05)
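
The added time.sleep(0.05) keeps the while True consumer from spinning at full speed between pipe reads. A minimal sketch of the assumed loop shape (helper names are placeholders, not taken from the script) showing where the sleep sits:

import time

def poll_loop(read_message, handle_message):
    """Assumed helpers: read_message() returns '' when no message is waiting."""
    while True:
        message = read_message()
        if message:
            handle_message(message)
        time.sleep(0.05)           # roughly 20 polls per second instead of a busy-wait
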
