You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
import zmq, msgpack, requests, time
|
|
|
|
def _connect(zmq_context, socket_type, endpoint):
    """Create a socket of ``socket_type`` on ``zmq_context`` and connect it to ``endpoint``."""
    sock = zmq_context.socket(socket_type)
    sock.connect(endpoint)
    return sock


# One ZeroMQ context shared by all three sockets of this worker.
context = zmq.Context()

# PULL socket: the main loop receives msgpack-encoded task items from here.
receiver = _connect(context, zmq.PULL, "ipc:///tmp/pbscrape-tasks")

# PUSH socket: items with the downloaded paste body attached are sent out here.
sender = _connect(context, zmq.PUSH, "ipc:///tmp/pbscrape-results")

# PUSH socket: msgpack-encoded log records are sent out here.
logger = _connect(context, zmq.PUSH, "ipc:///tmp/pbscrape-log")
|
|
|
|
# Seconds to wait for pastebin before giving up on a single request; without a
# timeout a stalled connection would block this worker forever.
REQUEST_TIMEOUT = 30


def _log(message):
    """Send one log record for the "retrieve" component over the log socket."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def _fetch_paste(paste_id):
    """Download the raw body of a paste, retrying until there is a definite answer.

    Returns the paste text, or ``None`` when the server answers 404 (paste gone).
    Every other failure (request error, 403, HTML response) is logged and
    retried after a pause, so this call only returns once the paste was
    either downloaded or found to be gone.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id, timeout=REQUEST_TIMEOUT)
            body = response.text
        except Exception as e:
            # Deliberately broad: the worker must survive any request failure
            # and simply try again, but the failure is now reported.
            _log("Request for paste %s failed (%s), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if response.status_code == 404:
            # Gone...
            return None

        # Checked before the content-type test: a 403 delivered as an HTML page
        # must get the long back-off, not the short "overloaded" one.
        if response.status_code == 403:
            _log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry

        # .get() with a default: a response without a Content-Type header must
        # not raise and send us into an endless retry loop.
        if "text/html" in response.headers.get("Content-Type", ""):
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            _log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry

        return body  # Done


while True:
    item = msgpack.unpackb(receiver.recv())

    paste = _fetch_paste(item["id"])

    if paste is None:
        # NOTE(review): this path skips the delay at the bottom of the loop, so
        # consecutive gone pastes are requested back to back - confirm intended.
        _log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    _log("Downloaded paste body for %s." % item["id"])

    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Wait a second between each paste retrieval...
|