Compare commits
13 commits
Author | SHA1 | Date | |
---|---|---|---|
d6995b5030 | |||
16a5bea805 | |||
389a49e6c3 | |||
68e82f8ac8 | |||
73fa91c953 | |||
e49a9bdef6 | |||
53b4111c59 | |||
8a079d7090 | |||
047f08fdf6 | |||
4994c1baa7 | |||
7d5736c28e | |||
d35311f6db | |||
b3fa3fba4a |
4
_start.sh
Executable file
4
_start.sh
Executable file
|
@ -0,0 +1,4 @@
|
||||||
|
# Launch every pipeline component as a background job.
# The log sink comes first, in the same order as before, so it is started
# ahead of the components that push entries to it.
for component in log collect retrieve scrape; do
    python "$component.py" &
done
|
34
collect.py
Normal file
34
collect.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import zmq, msgpack, json, os, time
|
||||||
|
|
||||||
|
# collect.py - receives finished items from the retrieval workers and stores
# each one on disk as <root>/<YYYY-MM-DD>/<id>.txt (paste body) plus
# <root>/<YYYY-MM-DD>/<id>.json (metadata).


def ensure_dir(path):
    """Create `path` (including parents) unless it already is a directory.

    Raises OSError when the directory cannot be created for any reason other
    than it already existing (permissions, a regular file in the way, ...).
    """
    try:
        os.makedirs(path)
    except OSError:
        # "Already exists" is the only harmless failure; re-raise the rest so
        # the real cause is reported instead of a later, confusing open() error.
        if not os.path.isdir(path):
            raise


def store_item(item, root="pastes", day=None):
    """Write one retrieved paste and its metadata below `root`.

    item -- dict with at least the keys "id" and "paste".
    root -- base directory for stored pastes.
    day  -- name of the per-day subdirectory; defaults to today's date
            formatted as YYYY-MM-DD.

    Returns the metadata dict that was written (the item without "paste").
    """
    if day is None:
        day = time.strftime("%Y-%m-%d")

    target_dir = "%s/%s" % (root, day)
    ensure_dir(target_dir)

    # The id originates from scraped HTML; keep only its final path component
    # so it cannot point outside of target_dir.
    paste_id = os.path.basename("%s" % item["id"])

    body = item["paste"]
    if not isinstance(body, bytes):
        # The body file is opened in binary mode, so text must be encoded
        # explicitly instead of relying on an implicit ASCII conversion.
        body = body.encode("utf-8")

    with open("%s/%s.txt" % (target_dir, paste_id), "wb") as f:
        f.write(body)

    # To prevent writing the paste to the metadata file as well
    metadata = {key: value for key, value in item.items() if key != "paste"}

    with open("%s/%s.json" % (target_dir, paste_id), "w") as f:
        json.dump(metadata, f)

    return metadata


def main():
    """Bind the results socket and store every item that arrives, forever."""
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.bind("ipc:///tmp/pbscrape-results")
    logger = context.socket(zmq.PUSH)
    logger.connect("ipc:///tmp/pbscrape-log")

    ensure_dir("pastes")

    while True:
        item = msgpack.unpackb(socket.recv())
        store_item(item)
        logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))


if __name__ == "__main__":
    main()
|
12
log.py
Normal file
12
log.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
import zmq, msgpack
|
||||||
|
|
||||||
|
# log.py - sink for the log entries pushed by the other components.
# Each entry is a msgpack-encoded mapping with the keys "component",
# "timestamp" and "message"; it is appended to scrape.log as one line.
zmq_context = zmq.Context()
sink = zmq_context.socket(zmq.PULL)
sink.bind("ipc:///tmp/pbscrape-log")

while True:
    record = msgpack.unpackb(sink.recv())

    # The file is reopened for every entry; closing it right away flushes
    # the line to disk.
    with open("scrape.log", "a") as log_file:
        log_file.write("[%(component)s] %(timestamp)s : %(message)s\n" % record)
|
52
retrieve.py
Normal file
52
retrieve.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
import zmq, msgpack, requests, time
|
||||||
|
|
||||||
|
# retrieve.py - retrieval worker: pulls paste metadata from the task queue,
# downloads the raw paste body and pushes the completed item to the collector.
context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block this worker indefinitely.
REQUEST_TIMEOUT = 30


def log(message):
    """Push one log entry for this component to the log process."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def fetch_paste(paste_id):
    """Download the raw body of a paste, retrying until there is a final answer.

    Returns the paste text, or None when the server answers 404 (paste gone).
    Request errors, throttling (403) and text/html responses are logged and
    retried after a delay.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id, timeout=REQUEST_TIMEOUT)
        except Exception as e:
            # Worker boundary: report the failure and retry, never die.
            log("Request for paste %s failed (%s), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if response.status_code == 404:
            return None  # Gone...

        # Status codes are checked before the content type: an error page sent
        # with a 403 may itself be text/html, and must be handled as throttling
        # (long sleep) rather than as a transient overload (short sleep).
        if response.status_code == 403:
            log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry

        # .get() so that a response without a Content-Type header is not
        # turned into an endless retry loop by a KeyError.
        if "text/html" in response.headers.get("Content-Type", ""):
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry

        return response.text  # Done


while True:
    item = msgpack.unpackb(receiver.recv())

    paste = fetch_paste(item["id"])

    if paste is None:
        log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    log("Downloaded paste body for %s." % item["id"])

    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Wait a second between each paste retrieval...
|
51
scrape.py
Normal file
51
scrape.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
import zmq, time, requests, lxml.html, msgpack
|
||||||
|
|
||||||
|
# scrape.py - polls the pastebin archive page once a minute and pushes the
# metadata of every paste not seen in the previous poll onto the task queue.
context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

# Seconds to wait on the archive request; without a timeout a stalled
# connection would block the scraper indefinitely.
REQUEST_TIMEOUT = 30


def log(message):
    """Push one log entry for this component to the log process."""
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": message}))


# Ids listed on the previous successfully parsed archive page; a set keeps
# the "already seen?" test cheap.
last_ids = set()

while True:
    try:
        page = requests.get("http://pastebin.com/archive", timeout=REQUEST_TIMEOUT).text
    except Exception as e:
        # Scraper boundary: report the failure and try again later.
        log("Failed to fetch the archive page (%s), retrying in 30 seconds..." % e)
        time.sleep(30)
        continue

    if "temporarily blocked your computer" in page:
        log("Got throttled, sleeping...")
        time.sleep(600)
        continue

    basetime = int(time.time())

    xml = lxml.html.fromstring(page)

    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_ids = set()
    found = 0

    for paste in pastes:
        try:
            # A valid row has exactly two links: the title and the file type.
            title, filetype = paste.xpath("td/a/text()")
        except ValueError:
            continue  # Not a valid entry

        # The href has the form "/<id>"; strip the leading slash.
        paste_id = paste.xpath("td[1]/a/@href")[0][1:]
        ago = paste.xpath("td[2]/text()")[0]

        new_ids.add(paste_id)

        if paste_id not in last_ids:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    log("Scraped metadata for %d new pastes." % found)

    # Keep the previous ids when nothing could be parsed (e.g. an unexpected
    # page was served); otherwise every paste on the next good page would be
    # queued again as "new".
    if new_ids:
        last_ids = new_ids

    time.sleep(1 * 60)
|
33
start.py
Executable file
33
start.py
Executable file
|
@ -0,0 +1,33 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import sys, subprocess
|
||||||
|
# start.py - checks that every third-party dependency can be imported and,
# if so, launches all components through _start.sh.

# (module to import, message shown when that import fails)
REQUIREMENTS = (
    ("zmq", "You are missing ZeroMQ; `pip install pyzmq`\n"),
    ("msgpack", "You are missing msgpack; `pip install msgpack-python`\n"),
    ("requests", "You are missing requests; `pip install requests`\n"),
    ("lxml.html", "You are missing lxml (needs 2.0 or higher); `pip install lxml`\n"),
)


def missing_requirements(requirements=REQUIREMENTS):
    """Return the messages of all requirements that cannot be imported.

    requirements -- iterable of (module_name, message) pairs.

    Every entry is checked against its own module; previously the msgpack
    and requests checks imported zmq again and therefore never fired.
    """
    messages = []
    for module_name, message in requirements:
        try:
            __import__(module_name)
        except ImportError:
            messages.append(message)
    return messages


def main():
    """Report missing dependencies, or start the components when none are missing."""
    errors = missing_requirements()

    for message in errors:
        sys.stderr.write(message)

    if not errors:
        subprocess.call(["/bin/sh", "_start.sh"])

        # Only announced when the components were actually launched.
        sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue