import zmq , msgpack , requests , time
# Paste-retrieval worker: pulls msgpack-encoded items from a ZMQ PULL socket,
# downloads the raw paste body for each item's id over HTTP, and pushes the
# item (with the body and a retrieval timestamp added) to a ZMQ PUSH socket.
# Progress and error records are pushed to a separate log socket.
#
# NOTE(review): every string literal below (endpoints, URL, dict keys, header
# names, log messages) carries leading/trailing spaces exactly as found in the
# source. That padding looks like an extraction artifact -- confirm and strip
# it if so; it is reproduced unchanged here because it is runtime text.

# Seconds to wait on a single HTTP request before giving up and retrying;
# without a timeout one stalled connection would block the worker forever.
_REQUEST_TIMEOUT = 30

context = zmq.Context()

# Source of items to retrieve.
receiver = context.socket(zmq.PULL)
receiver.connect(" ipc:///tmp/pbscrape-tasks ")

# Destination for items once their paste body has been downloaded.
sender = context.socket(zmq.PUSH)
sender.connect(" ipc:///tmp/pbscrape-results ")

# Destination for log records.
logger = context.socket(zmq.PUSH)
logger.connect(" ipc:///tmp/pbscrape-log ")


def _log(message):
    """Push one msgpack-encoded log record for this component to `logger`."""
    logger.send(msgpack.packb({
        " component ": " retrieve ",
        " timestamp ": int(time.time()),
        " message ": message,
    }))


def _fetch_paste(item):
    """Download the raw paste body for `item`, retrying until it resolves.

    Returns the response text, or None when the server answers 404 (the
    paste is gone). Every other outcome (request error, 403 throttling, an
    HTML page instead of the raw body) sleeps and retries.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get(
                " http://pastebin.com/raw.php?i= %s " % item[" id "],
                timeout=_REQUEST_TIMEOUT,
            )
            status = response.status_code

            if status == 404:
                # Gone...
                return None

            # Status codes are checked before the content-type heuristic so
            # that a throttling response served as HTML still gets the long
            # back-off instead of the short "overloaded" retry.
            if status == 403:
                delay = 600
                reason = " Got throttled, sleeping... "
            elif " text/html " in response.headers.get(" Content-Type ", ""):
                # We most likely got an "under heavy load" message or
                # similar; sleep a while and retry. A missing header is
                # treated as "not HTML" rather than raising KeyError.
                delay = 10
                reason = " Hit a text/html response for raw.php, servers most likely overloaded, sleeping... "
            else:
                return response.text
        except Exception as e:
            # Broad on purpose: this is the worker's retry boundary, so any
            # failure (network error, timeout, decoding problem) is reported
            # and retried rather than killing the process.
            print(e)
            _log("Request failed, retrying: %r" % (e,))
            time.sleep(5)
            continue  # Retry

        _log(reason)
        time.sleep(delay)
        # Falls through to the next loop iteration: retry.


while True:
    item = msgpack.unpackb(receiver.recv())

    paste = _fetch_paste(item)

    if paste is None:
        _log(" Paste %s gone, skipping... " % item[" id "])
        continue  # Next!

    item[" retrieval_time "] = int(time.time())
    item[" paste "] = paste

    _log(" Downloaded paste body for %s . " % item[" id "])
    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Pause 1.3 seconds between successful paste retrievals.