Compare commits

...

5 Commits

@@ -1,8 +1,13 @@
 #!/usr/bin/python
-import os, time, sys, json, urllib, urllib2, threading, re
+import os, time, sys, urllib, urllib2, threading, re
 from collections import deque
 from BeautifulSoup import BeautifulSoup
+try:
+    import json
+except:
+    import simplejson as json
 STOP = False
 pipe_name = 'pipe_books'
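
The new import block above is a compatibility shim: json entered the standard library in Python 2.6, so older interpreters fall back to the third-party simplejson package. Below is a minimal sketch (not part of the diff) of why the rest of the script is unaffected, since both modules are bound to the same name and expose the same dumps/loads API; catching ImportError is my assumption, slightly narrower than the bare except used in the change:

try:
    import json                    # standard library on Python 2.6+
except ImportError:
    import simplejson as json      # third-party backport on older interpreters

record = {"title": "Example Book", "authors": ["A. Author"]}
assert json.loads(json.dumps(record)) == record   # same calls either way
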
@@ -74,7 +79,7 @@ class GoogleCrawler (threading.Thread):
         print "[google] == SLEEPING..."
         time.sleep(4)
-        #self.crawl_page("http://www.google.com" + next_url)
+        self.crawl_page("http://www.google.com" + next_url)
 class CalibreCrawler(threading.Thread):
     def run(self):
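
The change above re-enables the call that follows Google's next results page after the 4-second delay. If that call sits inside crawl_page itself (the surrounding context is not shown in this hunk), each results page adds a stack frame; here is a hedged sketch, with assumed helper names, of the same pagination written as a loop instead:

import time

def crawl_all_pages(fetch_page, first_path, delay=4):
    """Assumed helper: fetch_page(url) returns (results, next_path_or_None)."""
    path = first_path
    while path:
        results, next_path = fetch_page("http://www.google.com" + path)
        for item in results:
            yield item
        if next_path:
            time.sleep(delay)      # keep the same polite delay between requests
        path = next_path
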
@@ -92,8 +97,13 @@ class CalibreCrawler(threading.Thread):
         base = url_matcher.match(url).group(1)
         print "[calibr] Starting crawl on %s ..." % url
-        response = urllib2.urlopen(url + "browse/category/allbooks")
+        try:
+            response = urllib2.urlopen(url + "browse/category/allbooks")
+        except urllib2.URLError:
+            print "Skipping %s, as the server could not be successfully reached." % url
+            return None
         page_contents = response.read()
         matcher = re.compile("<div class=\"load_data\" title=\"([\[\]0-9\s,]*)\">")
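
The guard added above keeps a single unreachable Calibre server from raising out of the thread and aborting the crawl. Below is a sketch of the same pattern with an assumed timeout added (urllib2.urlopen accepts a timeout argument since Python 2.6) so the connection attempt cannot hang indefinitely; the function name and timeout value are illustrative, not part of the diff:

import urllib2

def fetch_allbooks(base_url, timeout=10):
    """Return the allbooks listing HTML, or None if the server cannot be reached."""
    try:
        response = urllib2.urlopen(base_url + "browse/category/allbooks",
                                   timeout=timeout)
    except urllib2.URLError, err:  # urllib2.HTTPError is a subclass, so it is caught too
        print "Skipping %s (%s)" % (base_url, err)
        return None
    return response.read()
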

@@ -1,5 +1,10 @@
 #!/usr/bin/python
-import os, time, sys, json, _mysql
+import os, time, sys, _mysql
+try:
+    import json
+except:
+    import simplejson as json
 def stringdammit(input_string):
     if isinstance(input_string, str):
@@ -62,3 +67,5 @@ while True:
             print "Skipped '%s' by '%s' (already exists)" % (s_title, s_authors)
     else:
         print "Unrecognized command: %s" % message_type
+
+    time.sleep(0.05)
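
The added time.sleep(0.05) keeps the while True consumer from spinning at full speed between pipe reads. A minimal sketch of the assumed loop shape (helper names are placeholders, not taken from the script) showing where the sleep sits:

import time

def poll_loop(read_message, handle_message):
    """Assumed helpers: read_message() returns '' when no message is waiting."""
    while True:
        message = read_message()
        if message:
            handle_message(message)
        time.sleep(0.05)           # roughly 20 polls per second instead of a busy-wait
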
