diff --git a/crawler/calibre.py b/crawler/calibre.py
new file mode 100644
index 0000000..82dcccc
--- /dev/null
+++ b/crawler/calibre.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+import os, time, json, urllib, urllib2, threading, re
+from collections import deque
+from BeautifulSoup import BeautifulSoup
+
+STOP = False
+
+pipe_name = 'pipe_books'
+
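+# Library roots discovered by GoogleCrawler and consumed by CalibreCrawler.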
+calibre_urls = deque([])
+
+def add_book(title, authors, description, thumbnail, urls):
+ global pipe_name
+
+ print "[libcal] Submitted %s" % title
+
+ pipeout = os.open(pipe_name, os.O_WRONLY)
+ os.write(pipeout, json.dumps({
+ 'type': "add",
+ 'data': {
+ 'title': title,
+ 'authors': authors,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'urls': urls
+ }
+ }) + "\n")
+ os.close(pipeout)
+
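+# add_book() writes one JSON object per line to the FIFO; the os.open() above
+# blocks until a reader (crawler/daemon.py) has the pipe open. A
+# representative message (field values here are illustrative) looks like:
+#
+#   {"type": "add", "data": {"title": "Example", "authors": "A. Writer",
+#    "description": "<div>...</div>", "thumbnail": "http://host:8080/get/cover/1",
+#    "urls": [{"filetype": "epub", "url": "http://host:8080/get/EPUB/1"}]}}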
+
+class GoogleCrawler (threading.Thread):
+ def run(self):
+ self.current_start = 0
+ self.crawl_page(self.base_url)
+
+ def crawl_page(self, url):
+ global calibre_urls, STOP
+
+        if STOP:
+            return
+
+ opener = urllib2.build_opener()
+ opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
+ response = opener.open(url)
+ page_contents = response.read()
+
+ print "[google] == FIND CALIBRE LINKS"
+ soup = BeautifulSoup(page_contents)
+        # Reduce each result link to the library root (everything up to and
+        # including the path segment before /browse/).
+        url_matcher = re.compile(r"(https?://[^/]*.*/)browse/.*")
+        for subsoup in soup.findAll("li", "g"):
+            result_url = subsoup.find("a", "l")['href']
+
+            try:
+                result_url = url_matcher.match(result_url).group(1)
+            except AttributeError:
+                continue
+
+            if result_url not in calibre_urls:
+                print "[google] Found Calibre at %s" % result_url
+                calibre_urls.append(result_url)
+
+ print "[google] == FIND NEXT PAGE"
+ next_url = ""
+        subsoup = soup.find("table", attrs={'id': 'nav'})
+        if subsoup is not None:
+            for item in subsoup.findAll("a", "fl"):
+                new_start = int(re.search("start=([0-9]+)", item['href']).group(1))
+                if new_start == self.current_start + 10:
+                    self.current_start = new_start
+                    next_url = item['href']
+
+ if next_url == "":
+ print "[google] No next pages found... Done spidering Google!"
+ else:
+
+ print "[google] == SLEEPING..."
+ time.sleep(4)
+ #self.crawl_page("http://www.google.com" + next_url)
+
+class CalibreCrawler(threading.Thread):
+ def run(self):
+ global calibre_urls, STOP
+
+        while not STOP:
+ if len(calibre_urls) > 0:
+ current_url = calibre_urls.popleft()
+ self.crawl_page(current_url)
+
+ time.sleep(1)
+
+    def crawl_page(self, url):
+        # Keep scheme + host for building absolute download/thumbnail URLs.
+        url_matcher = re.compile(r"(https?://[^/]*).*")
+        base = url_matcher.match(url).group(1)
+
+ print "[calibr] Starting crawl on %s ..." % url
+
+ response = urllib2.urlopen(url + "browse/category/allbooks")
+ page_contents = response.read()
+
+        # NOTE: assumed pattern -- the original was garbled; the category page
+        # embeds comma-separated book-id lists, and each captured list is
+        # POSTed back to /browse/booklist_page as the 'ids' parameter below.
+        matcher = re.compile(r"'([0-9][0-9,]*)'")
+
+ for result in matcher.finditer(page_contents):
+ try:
+ query = result.group(1)
+
+ list_url = url + "browse/booklist_page"
+ data = urllib.urlencode({'ids': query})
+ req = urllib2.Request(list_url, data)
+ response = urllib2.urlopen(req)
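+                # booklist_page returns the rendered HTML as a JSON-encoded
+                # string; strip the surrounding quotes and undo the escaping
+                # before parsing.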
+ page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
+
+ soup = BeautifulSoup(page_contents)
+
+ for subsoup in soup.findAll("div", "summary"):
+ try:
+ title = subsoup.find("div", "title").strong.string
+ authors = subsoup.find("div", "authors").string
+ description = subsoup.find("div", "comments").prettify()
+
+                        try:
+                            thumbnail = base + subsoup.find("div", "left").img['src']
+                        except (AttributeError, KeyError, TypeError):
+                            # Book has no cover image.
+                            thumbnail = ""
+
+ urls = []
+ urls.append(subsoup.find("a", "read")['href'])
+
+ formats_list = subsoup.find("div", "formats")
+ for format_url in formats_list.findAll("a"):
+ urls.append(format_url['href'])
+
+                        final_urls = []
+                        get_matcher = re.compile(r".*/get/([^/]+)/.*")
+
+                        for format_url in urls:
+                            m = get_matcher.match(format_url)
+                            filetype = m.group(1).lower()
+                            download_url = base + format_url
+                            final_urls.append({
+                                'filetype': filetype,
+                                'url': download_url
+                            })
+
+ add_book(title, authors, description, thumbnail, final_urls)
+
+                    except Exception, e:
+                        # title/authors may not be bound if parsing failed
+                        # early, so keep them out of the error message.
+                        print "[calibr] FAILED to parse a book summary: %s" % str(e)
+
+ time.sleep(2)
+            except Exception, e:
+                print "[calibr] booklist_page request failed: %s" % str(e)
+
+try:
+    google = GoogleCrawler()
+    google.base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
+    #google.start()
+
+    calibre_urls = deque(["http://caltsardragon.com:8080/"])
+
+    calibre = CalibreCrawler()
+    calibre.start()
+
+    # Keep the main thread alive; otherwise the KeyboardInterrupt below can
+    # never be caught and STOP would never be set.
+    while calibre.isAlive():
+        time.sleep(1)
+except KeyboardInterrupt:
+    STOP = True
diff --git a/crawler/crawler.py b/crawler/crawler.py
deleted file mode 100644
index 8082a52..0000000
--- a/crawler/crawler.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import urllib, urllib2, re, _mysql, time, math, threading, htmlentitydefs
-from collections import deque
-from BeautifulSoup import BeautifulSoup
-
-# Don't forget to configure the database login at the bottom of the code!
-
-class GoogleCrawler (threading.Thread):
- def run(self):
- google_start()
-
-class CalibreCrawler (threading.Thread):
- def run(self):
- calibre_loop()
-
-def crawl_page(url):
- try:
- opener = urllib2.build_opener()
- opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2")]
- response = opener.open(url)
- page_contents = response.read()
- find_links(page_contents)
- time.sleep(4)
- next_url = find_next(page_contents)
- if next_url == "":
- print "Done."
- else:
- crawl_page("http://www.google.com" + next_url)
- except:
- crawl_page(url)
-
-def find_links(page):
- soup = BeautifulSoup(page)
- for subsoup in soup.findAll("li", "g"):
- url = subsoup.find("a", "l")['href']
- url_matcher = re.compile("(https?:\/\/[^/]*.*\/)browse\/.*")
- url = url_matcher.match(url).group(1)
- if url not in calibre_urls:
- print url
- calibre_urls.append(url)
-
-def find_next(page):
- global current_start
- soup = BeautifulSoup(page)
- subsoup = soup.find("table", attrs={'id':'nav'})
- for item in subsoup.findAll("a", "pn"):
- new_start = find_start(item['href'])
- if new_start == current_start + 10:
- current_start = new_start
- return item['href']
- return ""
-
-def find_start(url):
- url_match = re.search("start=([0-9]+)", url)
- if url_match is None:
- return 0
- else:
- return int(url_match.group(1))
-
-def google_start():
- crawl_page(base_url)
-
-def calibre_crawl(url):
- try:
- url_matcher = re.compile("(https?:\/\/[^/]*).*")
- base = url_matcher.match(url).group(1)
-
- print "Starting crawl on %s ..." % url
-
- response = urllib2.urlopen(url + "browse/category/allbooks")
- page_contents = response.read()
-
-        matcher = re.compile(r"'([0-9][0-9,]*)'")
-
- for result in matcher.finditer(page_contents):
- try:
- query = result.group(1)
-
- list_url = url + "browse/booklist_page"
- data = urllib.urlencode({'ids': query})
- req = urllib2.Request(list_url, data)
- response = urllib2.urlopen(req)
- page_contents = response.read()[1:-1].replace("\\n", "").replace("\\\"", "\"")
-
- soup = BeautifulSoup(page_contents)
-
- for subsoup in soup.findAll("div", "summary"):
- try:
- title = subsoup.find("div", "title").strong.string
- authors = subsoup.find("div", "authors").string
- description = subsoup.find("div", "comments").prettify()
-
- try:
- thumbnail = base + subsoup.find("div", "left").img['src']
- except:
- thumbnail = ""
-
- urls = []
- urls.append(subsoup.find("a", "read")['href'])
-
- formats_list = subsoup.find("div", "formats")
- for format_url in formats_list.findAll("a"):
- urls.append(format_url['href'])
-
- s_title = db.escape_string(title)
- s_authors = db.escape_string(authors)
- s_description = db.escape_string(description)
- s_thumbnail = db.escape_string(thumbnail)
-
- sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
- db.query(sql_query)
- sql_result = db.store_result()
-
- if sql_result.num_rows() == 0:
- sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
- #print sql_query
-
- db.query(sql_query)
- ins_id = db.insert_id()
-
- for format_url in urls:
- url_matcher = re.compile(".*\/get\/([^/]+)\/.*")
- m = url_matcher.match(format_url)
- filetype = m.group(1).lower()
- download_url = db.escape_string(base + format_url)
-
- s_filetype = db.escape_string(filetype)
- s_url = db.escape_string(download_url)
-
- sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (ins_id, s_filetype, s_url)
- db.query(sql_query)
-
- print "SUCCESS: %s" % s_title
- else:
- print "SKIP: %s" % title
-
- time.sleep(0.1)
-
- except:
- print "FAIL: %s" % title
-
- time.sleep(2)
- except:
- pass
- except:
- pass
-
-def calibre_loop():
- global calibre_urls
- while True:
- if len(calibre_urls) > 0:
- current_url = calibre_urls.popleft()
- calibre_crawl(current_url)
- time.sleep(1)
-
-calibre_urls = deque([])
-current_start = 0
-base_url = "http://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=intitle%3A%22calibre+library%22+inurl%3A%22browse%22"
-db = _mysql.connect("localhost", "root", "", "ebooks")
-
-GoogleCrawler().start()
-CalibreCrawler().start()
\ No newline at end of file
diff --git a/crawler/daemon.py b/crawler/daemon.py
new file mode 100644
index 0000000..5aef1cc
--- /dev/null
+++ b/crawler/daemon.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+import os, time, sys, json, _mysql
+
+def stringdammit(input_string):
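+    # json.loads hands back unicode; MySQL wants utf-8 encoded bytes.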
+ if isinstance(input_string, str):
+ return input_string
+ else:
+ return input_string.encode('utf-8')
+
+pipe_name = 'pipe_books'
+
+if not os.path.exists(pipe_name):
+ os.mkfifo(pipe_name)
+
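+# This open() blocks until a writer (the crawler) opens the other end.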
+pipein = open(pipe_name, 'r')
+buff = ""
+db = _mysql.connect("localhost", "root", "", "ebooks")
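+
+# Assumed schema for the two tables used below -- a sketch inferred from the
+# queries, not taken from the repo:
+#
+#   CREATE TABLE books (Id INT AUTO_INCREMENT PRIMARY KEY, Title TEXT,
+#                       Authors TEXT, Description TEXT, Thumbnail TEXT);
+#   CREATE TABLE files (BookId INT, Format VARCHAR(16), Url TEXT);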
+
+while True:
+    data = pipein.read()
+    if data == "":
+        # All writers have closed and the pipe is at EOF; back off instead
+        # of spinning at 100% CPU until the next writer connects.
+        time.sleep(0.5)
+        continue
+
+    buff += data
+    stack = buff.replace("\r", "").split("\n")
+    buff = stack.pop()
+
+ for line in stack:
+ try:
+ obj = json.loads(line)
+        except ValueError:
+            sys.stderr.write("ERROR: Could not decode message: %s\n" % line)
+            continue
+
+ message_type = obj['type']
+ message_data = obj['data']
+
+ if message_type == "add":
+ s_title = db.escape_string(stringdammit(message_data['title']))
+ s_description = db.escape_string(stringdammit(message_data['description']))
+ s_authors = db.escape_string(stringdammit(message_data['authors']))
+ s_thumbnail = db.escape_string(stringdammit(message_data['thumbnail']))
+
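+            # The thumbnail URL doubles as the dedup key: it embeds the host
+            # and the book id, so it is unique per book per library.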
+ sql_query = "SELECT * FROM books WHERE `Thumbnail` = '%s'" % s_thumbnail
+ db.query(sql_query)
+ sql_result = db.store_result()
+
+ if sql_result.num_rows() == 0:
+ sql_query = "INSERT INTO books (`Title`, `Authors`, `Description`, `Thumbnail`) VALUES ('%s', '%s', '%s', '%s')" % (s_title, s_authors, s_description, s_thumbnail)
+ db.query(sql_query)
+ book_id = db.insert_id()
+
+ for format_url in message_data['urls']:
+ s_filetype = db.escape_string(stringdammit(format_url['filetype'].lower()))
+ s_url = db.escape_string(stringdammit(format_url['url']))
+
+ sql_query = "INSERT INTO files (`BookId`, `Format`, `Url`) VALUES ('%d', '%s', '%s')" % (book_id, s_filetype, s_url)
+ db.query(sql_query)
+
+ print "Received and inserted '%s' by '%s'" % (s_title, s_authors)
+ else:
+ print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
+ else:
+ print "Unrecognized command: %s" % message_type