You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

53 lines
1.7 KiB
Python

import zmq, msgpack, requests, time
def _open_socket(ctx, socket_type, endpoint):
    """Create a socket of `socket_type` on `ctx` and connect it to `endpoint`."""
    sock = ctx.socket(socket_type)
    sock.connect(endpoint)
    return sock


# One shared ZeroMQ context for all three IPC channels used by this worker.
context = zmq.Context()
# Tasks are pulled from here; each message is a msgpack-encoded item.
receiver = _open_socket(context, zmq.PULL, "ipc:///tmp/pbscrape-tasks")
# Items with the downloaded paste attached are pushed here.
sender = _open_socket(context, zmq.PUSH, "ipc:///tmp/pbscrape-results")
# msgpack-encoded log records are pushed here.
logger = _open_socket(context, zmq.PUSH, "ipc:///tmp/pbscrape-log")
# Seconds to wait on pastebin before giving up on a single request. Without a
# timeout requests.get() may block indefinitely and stall the whole worker.
REQUEST_TIMEOUT = 30


def _log(message):
    """Push one log record for the "retrieve" component onto the log socket."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def _fetch_paste(paste_id):
    """Download the raw body of a paste, retrying until there is a final answer.

    Returns the paste text, or None when the server answers 404 (paste gone).
    Request errors, throttling (403) and text/html responses are retried after
    a sleep, so this call only returns once the paste was fetched or is gone.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id, timeout=REQUEST_TIMEOUT)
            status = response.status_code
            # .get() so a response without the header is not mistaken for a
            # request failure (a KeyError here used to cause endless retries).
            content_type = response.headers.get("Content-Type", "")
            body = response.text
        except Exception as e:
            print(e)
            _log("Request for paste %s failed (%r), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if status == 404:
            # Gone...
            return None
        if status == 403:
            # Checked before the content-type test: a throttle response that
            # is itself an HTML page must trigger the long back-off, not the
            # short "overloaded" retry.
            _log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry
        if "text/html" in content_type:
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            _log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry
        return body  # Done


# Main worker loop: take a task, fetch its paste body, push the enriched item.
while True:
    item = msgpack.unpackb(receiver.recv())
    paste = _fetch_paste(item["id"])
    if paste is None:
        _log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!
    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    _log("Downloaded paste body for %s." % item["id"])
    sender.send(msgpack.packb(item))
    time.sleep(1.3)  # Wait a second between each paste retrieval...