|
|
@ -1,13 +1,8 @@
|
|
|
|
#!/usr/bin/python
|
|
|
|
#!/usr/bin/python
|
|
|
|
import os, time, sys, urllib, urllib2, threading, re
|
|
|
|
import os, time, sys, json, urllib, urllib2, threading, re
|
|
|
|
from collections import deque
|
|
|
|
from collections import deque
|
|
|
|
from BeautifulSoup import BeautifulSoup
|
|
|
|
from BeautifulSoup import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
except:
|
|
|
|
|
|
|
|
import simplejson as json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STOP = False
|
|
|
|
STOP = False
|
|
|
|
|
|
|
|
|
|
|
|
pipe_name = 'pipe_books'
|
|
|
|
pipe_name = 'pipe_books'
|
|
|
@ -79,7 +74,7 @@ class GoogleCrawler (threading.Thread):
|
|
|
|
|
|
|
|
|
|
|
|
print "[google] == SLEEPING..."
|
|
|
|
print "[google] == SLEEPING..."
|
|
|
|
time.sleep(4)
|
|
|
|
time.sleep(4)
|
|
|
|
self.crawl_page("http://www.google.com" + next_url)
|
|
|
|
#self.crawl_page("http://www.google.com" + next_url)
|
|
|
|
|
|
|
|
|
|
|
|
class CalibreCrawler(threading.Thread):
|
|
|
|
class CalibreCrawler(threading.Thread):
|
|
|
|
def run(self):
|
|
|
|
def run(self):
|
|
|
@ -98,12 +93,7 @@ class CalibreCrawler(threading.Thread):
|
|
|
|
|
|
|
|
|
|
|
|
print "[calibr] Starting crawl on %s ..." % url
|
|
|
|
print "[calibr] Starting crawl on %s ..." % url
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
|
|
response = urllib2.urlopen(url + "browse/category/allbooks")
|
|
|
|
response = urllib2.urlopen(url + "browse/category/allbooks")
|
|
|
|
except urllib2.URLError:
|
|
|
|
|
|
|
|
print "Skipping %s, as the server could not be successfully reached." % url
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
page_contents = response.read()
|
|
|
|
page_contents = response.read()
|
|
|
|
|
|
|
|
|
|
|
|
matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
|
|
|
|
matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
|
|
|
|