Compare commits
13 Commits
Author | SHA1 | Date |
---|---|---|
|
d6995b5030 | 9 years ago |
|
16a5bea805 | 9 years ago |
|
389a49e6c3 | 9 years ago |
|
68e82f8ac8 | 9 years ago |
|
73fa91c953 | 9 years ago |
|
e49a9bdef6 | 9 years ago |
|
53b4111c59 | 9 years ago |
|
8a079d7090 | 9 years ago |
|
047f08fdf6 | 9 years ago |
|
4994c1baa7 | 9 years ago |
|
7d5736c28e | 9 years ago |
|
d35311f6db | 9 years ago |
|
b3fa3fba4a | 9 years ago |
6 changed files with 186 additions and 0 deletions
@ -0,0 +1,4 @@ |
|||
# Launch every pastebin-scrape component as a background job, in the same
# order as before: logger first, then collector, retriever and scraper.
for component in log collect retrieve scrape; do
    python "$component.py" &
done
@ -0,0 +1,34 @@ |
|||
# collect.py -- receives retrieved pastes over ZeroMQ and stores each one on
# disk as pastes/<YYYY-MM-DD>/<id>.txt (body) plus <id>.json (metadata).
import zmq, msgpack, json, os, time, errno


def _ensure_dir(path):
    """Create `path` (and parents) unless it already exists as a directory.

    Raises OSError for every failure other than "directory already exists",
    so permission problems, a full disk, or a regular file occupying the
    path surface here instead of as a confusing error on the later open().
    """
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise


context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

_ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())

    # Pastes are grouped per day, based on the local date at storage time.
    target_dir = time.strftime("%Y-%m-%d")
    _ensure_dir("pastes/%s" % target_dir)

    # `with` closes the handle even when the write fails halfway.
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])

    del item["paste"]  # To prevent writing the paste to the metadata file as well

    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)

    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))
@ -0,0 +1,12 @@ |
|||
# log.py -- central log sink: every component pushes msgpack-encoded entries
# to this socket, and each entry is appended to scrape.log as one line.
import zmq, msgpack

context = zmq.Context()
inbox = context.socket(zmq.PULL)
inbox.bind("ipc:///tmp/pbscrape-log")

while True:
    record = msgpack.unpackb(inbox.recv())

    # The log file is reopened in append mode for every entry and closed
    # again right after the line has been written.
    with open("scrape.log", "a") as log_file:
        log_file.write("[%(component)s] %(timestamp)s : %(message)s\n" % record)
@ -0,0 +1,52 @@ |
|||
# retrieve.py -- worker that pulls paste metadata from the task queue,
# downloads the raw paste body, and pushes the combined item to the collector.
import zmq, msgpack, requests, time

# Seconds to wait for pastebin before giving up on a request. Without a
# timeout a stalled connection would block this worker forever.
REQUEST_TIMEOUT = 30

context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def _log(message):
    """Send one log entry for this component to the central logger."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def _fetch_paste(paste_id):
    """Download the raw body of `paste_id`, retrying until a definitive answer.

    Returns the paste text, or None when the paste no longer exists (HTTP 404).
    Throttling, overload pages and request errors are retried indefinitely
    after a sleep.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id, timeout=REQUEST_TIMEOUT)
            status = response.status_code
            # A missing Content-Type header must not raise; treat it as "not HTML".
            content_type = response.headers.get("Content-Type", "")
            paste = response.text
        except Exception as e:
            # Deliberately broad: this worker must survive any request failure.
            # The error is logged instead of only being printed to stdout.
            _log("Request for %s failed (%r), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if status == 404:
            return None  # Gone...

        # Checked before the content-type test, because a throttle response
        # is presumably served as text/html as well and would otherwise only
        # get the short "overloaded" sleep. NOTE(review): confirm against a
        # real 403 response.
        if status == 403:
            _log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry

        if "text/html" in content_type:
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            _log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry

        return paste  # Done


while True:
    item = msgpack.unpackb(receiver.recv())

    paste = _fetch_paste(item["id"])

    if paste is None:
        _log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    _log("Downloaded paste body for %s." % item["id"])

    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Wait a second between each paste retrieval...
@ -0,0 +1,51 @@ |
|||
# scrape.py -- polls the pastebin archive page once a minute and pushes the
# metadata of every paste not seen in the previous poll onto the task queue.
import zmq, time, requests, lxml.html, msgpack

# Seconds to wait for pastebin before giving up on a request. Without a
# timeout a stalled connection would block the scraper forever.
REQUEST_TIMEOUT = 30

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def _log(message):
    """Send one log entry for this component to the central logger."""
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": message}))


# IDs listed on the previous poll; a set keeps the membership test O(1).
last_list = set()

while True:
    try:
        page = requests.get("http://pastebin.com/archive", timeout=REQUEST_TIMEOUT).text
    except Exception as e:
        # Deliberately broad: the scraper must survive any request failure.
        _log("Failed to fetch the archive page (%r), retrying in 30 seconds..." % (e,))
        time.sleep(30)
        continue

    if "temporarily blocked your computer" in page:
        _log("Got throttled, sleeping...")
        time.sleep(600)
        continue

    # Reference time for the relative "ago" value scraped from each row.
    basetime = int(time.time())

    xml = lxml.html.fromstring(page)

    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_list = set()
    found = 0

    for paste in pastes:
        try:
            title, filetype = paste.xpath("td/a/text()")
            paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # Strip the first character of the href
            ago = paste.xpath("td[2]/text()")[0]
        except (ValueError, IndexError):
            # Header rows and rows with an unexpected layout are skipped
            # rather than crashing the whole scraper.
            continue  # Not a valid entry

        new_list.add(paste_id)

        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    _log("Scraped metadata for %d new pastes." % found)

    last_list = new_list

    time.sleep(1 * 60)
@ -0,0 +1,33 @@ |
|||
#!/usr/bin/env python2
"""Check that all third-party dependencies are installed, then start pastebin-scrape.

Every missing dependency is reported on stderr. The components are only
launched (through _start.sh) when all of them can be imported; otherwise
the script exits with status 1.
"""

import sys, subprocess

# (module to import, message shown when it is missing). Each check imports
# its own module; previously the msgpack and requests checks imported zmq
# again, so those two packages were never actually verified.
REQUIRED_MODULES = [
    ("zmq", "You are missing ZeroMQ; `pip install pyzmq`\n"),
    ("msgpack", "You are missing msgpack; `pip install msgpack-python`\n"),
    ("requests", "You are missing requests; `pip install requests`\n"),
    ("lxml.html", "You are missing lxml (needs 2.0 or higher); `pip install lxml`\n"),
]

errors = False

for module_name, hint in REQUIRED_MODULES:
    try:
        __import__(module_name)
    except ImportError:
        sys.stderr.write(hint)
        errors = True

if errors:
    # Nothing was started, so do not claim that it is running.
    sys.exit(1)

subprocess.call(["/bin/sh", "_start.sh"])

sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")
Loading…
Reference in new issue