Compare commits
13 commits
Author | SHA1 | Date | |
---|---|---|---|
d6995b5030 | |||
16a5bea805 | |||
389a49e6c3 | |||
68e82f8ac8 | |||
73fa91c953 | |||
e49a9bdef6 | |||
53b4111c59 | |||
8a079d7090 | |||
047f08fdf6 | |||
4994c1baa7 | |||
7d5736c28e | |||
d35311f6db | |||
b3fa3fba4a |
4
_start.sh
Executable file
4
_start.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Launch every pipeline component as a background job, in the fixed order
# log, collect, retrieve, scrape.
for component in log collect retrieve scrape; do
    python "$component.py" &
done
|
34
collect.py
Normal file
34
collect.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
"""Result collector: stores retrieved pastes and their metadata on disk.

Pulls msgpack-encoded items from the results socket, writes the paste body to
pastes/<date>/<id>.txt and the remaining fields to pastes/<date>/<id>.json,
then reports each stored paste to the log socket.
"""
import zmq, msgpack, json, os, time


def ensure_dir(path):
    """Create `path` (including parents) unless it is already a directory.

    Any other failure (permissions, a regular file in the way, ...) is
    re-raised instead of being silently ignored.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())

    # One sub-directory per day, named after the local date at storage time.
    target_dir = time.strftime("%Y-%m-%d")
    ensure_dir("pastes/%s" % target_dir)

    # Take the body out of the item so it is not written to the metadata
    # file as well.
    paste = item.pop("paste")
    if not isinstance(paste, bytes):
        # The .txt file is opened in binary mode, so a text (unicode) body is
        # encoded explicitly; writing it as-is would fail on non-ASCII pastes.
        # NOTE(review): whether msgpack hands back bytes or text here depends
        # on the installed msgpack version - confirm.
        paste = paste.encode("utf-8")

    # `with` guarantees the handles are closed even if a write fails.
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(paste)

    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)

    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))
|
12
log.py
Normal file
12
log.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
"""Log sink: receives entries on the log socket and appends them to scrape.log."""
import zmq
import msgpack

zmq_context = zmq.Context()
inbox = zmq_context.socket(zmq.PULL)
inbox.bind("ipc:///tmp/pbscrape-log")

while True:
    record = msgpack.unpackb(inbox.recv())

    # The file is reopened in append mode for every entry and closed again
    # right after the line has been written.
    with open("scrape.log", "a") as logfile:
        logfile.write("[%(component)s] %(timestamp)s : %(message)s\n" % record)
|
52
retrieve.py
Normal file
52
retrieve.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
"""Retrieval worker: downloads the body of each paste announced by the scraper.

Pulls tasks from the task socket, fetches the raw paste from pastebin, and
pushes the task (extended with "retrieval_time" and "paste") to the results
socket. Progress and problems are reported to the log socket.
"""
import zmq, msgpack, requests, time

context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def log(message):
    """Send one log entry for this component to the log socket."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def fetch_paste(paste_id):
    """Download the raw body of `paste_id`, retrying until there is an answer.

    Returns the body text, or None when the server answers 404 (paste gone).
    Every other outcome (request error, throttling, HTML error page) sleeps
    and retries indefinitely.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id)
            body = response.text
        except Exception as e:
            # Deliberately broad: any failure here is treated as transient.
            log("Request for paste %s failed (%r), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if response.status_code == 404:
            return None  # Gone...

        # Checked before the content type so that a throttling response gets
        # the long back-off even when it is served as an HTML page.
        # NOTE(review): presumably pastebin's 403 page is text/html - confirm.
        if response.status_code == 403:
            log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry

        # .get() with a default: a response without a Content-Type header is
        # treated as a normal paste body rather than raising KeyError.
        if "text/html" in response.headers.get("Content-Type", ""):
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry

        return body


while True:
    item = msgpack.unpackb(receiver.recv())

    if "id" not in item:
        # Without an id there is nothing to download; skip the task instead
        # of retrying it forever.
        log("Ignoring task without a paste id.")
        continue

    paste = fetch_paste(item["id"])

    if paste is None:
        log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    log("Downloaded paste body for %s." % item["id"])

    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Pause 1.3 seconds between successful paste retrievals
|
51
scrape.py
Normal file
51
scrape.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
"""Archive scraper: finds newly listed pastes and queues them for retrieval.

Fetches the pastebin archive page, extracts one entry per table row, and
pushes a task (id, type, title, base_time, ago) to the task socket for every
paste id that was not present in the previously scraped listing. Sleeps 60
seconds between passes.
"""
import zmq, time, requests, lxml.html, msgpack

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def log(message):
    """Send one log entry for this component to the log socket."""
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": message}))


# Paste ids seen in the most recent listing that contained any entries.
last_list = []

while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception as e:
        # Deliberately broad: any failure to fetch is treated as transient.
        log("Failed to fetch the archive page (%r), retrying in 30 seconds..." % (e,))
        time.sleep(30)
        continue

    if "temporarily blocked your computer" in page:
        log("Got throttled, sleeping...")
        time.sleep(600)
        continue

    # Time of this scrape; stored with each task next to the row's "ago" text.
    basetime = int(time.time())

    xml = lxml.html.fromstring(page)

    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_list = []
    found = 0

    for paste in pastes:
        try:
            # Unpacking fails with ValueError unless the row has exactly two
            # link texts; the [0] lookups fail with IndexError when the link
            # or the time cell is missing.
            title, filetype = paste.xpath("td/a/text()")
            paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # drop the first character of the href
            ago = paste.xpath("td[2]/text()")[0]
        except (ValueError, IndexError):
            continue  # Not a valid entry

        new_list.append(paste_id)

        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    log("Scraped metadata for %d new pastes." % found)

    # Keep the previous listing when this page yielded no entries at all
    # (e.g. an unexpected error page); otherwise the next good page would
    # re-queue every paste as new.
    if new_list:
        last_list = new_list

    time.sleep(1 * 60)
|
33
start.py
Executable file
33
start.py
Executable file
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env python2
"""Dependency check and launcher for pastebin-scrape.

Verifies that every third-party module used by the components can be
imported, then starts the components through _start.sh.
"""

import sys, subprocess, importlib

# (module to import, hint written to stderr when that import fails)
REQUIREMENTS = (
    ("zmq", "You are missing ZeroMQ; `pip install pyzmq`\n"),
    ("msgpack", "You are missing msgpack; `pip install msgpack-python`\n"),
    ("requests", "You are missing requests; `pip install requests`\n"),
    ("lxml.html", "You are missing lxml (needs 2.0 or higher); `pip install lxml`\n"),
)


def missing_dependencies(requirements=REQUIREMENTS):
    """Return the hints of all requirements whose module cannot be imported.

    `requirements` is an iterable of (module_name, hint) pairs. The result
    keeps the order of `requirements` and is empty when everything imports.
    """
    hints = []
    for module_name, hint in requirements:
        try:
            importlib.import_module(module_name)
        except ImportError:
            hints.append(hint)
    return hints


def main():
    """Check dependencies and launch the components.

    Returns the process exit status: 1 when a dependency is missing (nothing
    is started in that case), 0 otherwise.
    """
    hints = missing_dependencies()
    for hint in hints:
        sys.stderr.write(hint)

    if hints:
        return 1

    subprocess.call(["/bin/sh", "_start.sh"])

    sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
Loading…
Reference in a new issue