Compare commits

..

No commits in common. 'develop' and 'master' have entirely different histories.

@@ -1,4 +0,0 @@
# Launch all four pipeline components as background processes.
# The logger goes first so the other components have a log socket
# to connect to; then the storage collector, the download worker,
# and the archive scraper.
python log.py &
python collect.py &
python retrieve.py &
python scrape.py &

@@ -1,34 +0,0 @@
# collect.py -- storage sink of the scraper pipeline.
#
# Pulls retrieved pastes (msgpack-encoded dicts with at least "id" and
# "paste" keys) from the results socket and stores each one on disk under
# pastes/<YYYY-MM-DD>/: the raw paste body as <id>.txt and the remaining
# metadata as <id>.json. Each stored paste is reported to the central
# logger socket.
import zmq, msgpack, json, os, time
context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")
try:
    os.makedirs("pastes")
except OSError:
    pass  # Directory already exists; best-effort creation
while True:
    item = msgpack.unpackb(socket.recv())
    # One subdirectory per day of collection
    target_dir = time.strftime("%Y-%m-%d")
    try:
        os.makedirs("pastes/%s" % target_dir)
    except OSError:
        pass  # Directory already exists; best-effort creation
    # 'with' guarantees the file handle is closed even if a write fails
    # (the originals leaked the handle on error).
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])
    del item["paste"] # To prevent writing the paste to the metadata file as well
    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)
    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))

@@ -1,12 +0,0 @@
# log.py -- central log writer for the scraper pipeline.
#
# All components push msgpack-encoded log entries (dicts with "component",
# "timestamp" and "message" keys) to this socket; each entry is appended
# to scrape.log as one formatted line.
import zmq, msgpack
context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-log")
while True:
    entry = msgpack.unpackb(socket.recv())
    # Re-opened per entry, so the log file can be rotated or truncated
    # externally without restarting this process; 'with' ensures the
    # handle is closed even if the write raises (the original leaked it).
    with open("scrape.log", "a") as f:
        f.write("[%(component)s] %(timestamp)s : %(message)s\n" % entry)

@@ -1,52 +0,0 @@
# retrieve.py -- download worker of the scraper pipeline.
#
# Pulls paste-metadata tasks from the tasks socket, downloads each paste's
# raw body from pastebin.com (retrying on network errors, overload pages
# and throttling), then pushes the completed item to the results socket
# for collect.py to store. Multiple copies may run in parallel (PULL
# sockets fair-share tasks between connected workers).
#
# NOTE(review): Python 2 syntax ("except Exception, e", "print e") --
# this file will not run under Python 3 as-is.
import zmq, msgpack, requests, time
context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")
while True:
    item = msgpack.unpackb(receiver.recv())
    gone = False
    while True: # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % item["id"])
            if response.status_code == 404:
                # Gone...
                # Breaks out of the retry loop with gone=True; the paste
                # is then skipped below.
                gone = True
                break
            elif "text/html" in response.headers["Content-Type"]:
                # We most likely got an "under heavy load" message or similar; sleep a while and retry
                logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Hit a text/html response for raw.php, servers most likely overloaded, sleeping..."}))
                time.sleep(10)
                continue # Retry
            paste = response.text
        except Exception, e:
            # TODO: Log error
            # Broad best-effort catch: any network/parse failure just
            # waits a few seconds and retries the same paste.
            print e
            time.sleep(5)
            continue # Retry
        # Reached only after a successful, non-404, non-HTML response.
        if response.status_code == 403:
            logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
            time.sleep(600)
            continue # Retry
        break # Done
    if gone:
        logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Paste %s gone, skipping..." % item["id"]}))
        continue # Next!
    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Downloaded paste body for %s." % item["id"]}))
    sender.send(msgpack.packb(item))
    time.sleep(1.3) # Wait a second between each paste retrieval...

@@ -1,51 +0,0 @@
# scrape.py -- archive poller / task producer of the scraper pipeline.
#
# Fetches the pastebin.com archive page about once a minute, extracts
# every listed paste's metadata, and pushes each paste not seen on the
# previous poll as a task for retrieve.py. Backs off on fetch errors and
# on pastebin's throttling page.
import zmq, time, requests, lxml.html, msgpack
context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")
last_list = []  # Paste IDs seen on the previous poll
while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception:
        # TODO: Log HTTP error
        # Deliberately best-effort: any fetch failure just retries shortly.
        time.sleep(30)
        continue
    if "temporarily blocked your computer" in page:
        logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
        time.sleep(600)
        continue
    # Reference time that the relative "ago" values were read at
    basetime = int(time.time())
    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_list = []
    found = 0
    for paste in pastes:
        try:
            # Real entries have exactly two link texts: title and filetype
            title, filetype = paste.xpath("td/a/text()")
        except ValueError:
            continue # Not a valid entry
        paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # Strip the leading "/"
        ago = paste.xpath("td[2]/text()")[0]
        new_list.append(paste_id)
        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Scraped metadata for %d new pastes." % found}))
    last_list = new_list
    time.sleep(1 * 60)

@@ -1,33 +0,0 @@
#!/usr/bin/env python2
# Dependency checker and launcher for pastebin-scrape.
#
# Verifies that every third-party dependency can be imported, printing an
# installation hint to stderr for each one that is missing; only if all
# are present does it launch the pipeline via _start.sh.
import sys, subprocess
errors = False
try:
    import zmq
except ImportError:
    sys.stderr.write("You are missing ZeroMQ; `pip install pyzmq`\n")
    errors = True
try:
    # BUG FIX: this check previously re-imported zmq instead of msgpack,
    # so a missing msgpack was never detected.
    import msgpack
except ImportError:
    sys.stderr.write("You are missing msgpack; `pip install msgpack-python`\n")
    errors = True
try:
    # BUG FIX: this check previously re-imported zmq instead of requests.
    import requests
except ImportError:
    sys.stderr.write("You are missing requests; `pip install requests`\n")
    errors = True
try:
    import lxml.html
except ImportError:
    sys.stderr.write("You are missing lxml (needs 2.0 or higher); `pip install lxml`\n")
    errors = True
if not errors:
    subprocess.call(["/bin/sh", "_start.sh"])
    sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")
Loading…
Cancel
Save