Compare commits

...

13 Commits

Author SHA1 Message Date
Sven Slootweg d6995b5030 Deal with 404 properly 10 years ago
Sven Slootweg 16a5bea805 Deal with overloaded servers... 10 years ago
Sven Slootweg 389a49e6c3 Wait a bit longer between pastes, to avoid throttling 11 years ago
Sven Slootweg 68e82f8ac8 Deal with throttling properly 11 years ago
Sven Slootweg 73fa91c953 Change scrape checking interval, to deal with traffic peaks 11 years ago
Sven Slootweg e49a9bdef6 Logging 11 years ago
Sven Slootweg 53b4111c59 Make collector bind instead of retriever 11 years ago
Sven Slootweg 8a079d7090 Also check for lxml 11 years ago
Sven Slootweg 047f08fdf6 Fixed start script; also, print is evil 11 years ago
Sven Slootweg 4994c1baa7 Shebang 11 years ago
Sven Slootweg 7d5736c28e Fix permissions 11 years ago
Sven Slootweg d35311f6db Start scripts 11 years ago
Sven Slootweg b3fa3fba4a First working version! 11 years ago

@ -0,0 +1,4 @@
# _start.sh -- launch every pipeline component as a background process,
# in dependency order: log sink first, then collector, retriever, scraper.
for component in log collect retrieve scrape; do
    python "$component.py" &
done

@ -0,0 +1,34 @@
# collect.py -- storage sink of the pipeline. Receives retrieved pastes
# over ZeroMQ and writes each one to disk as a raw .txt body plus a .json
# metadata file, grouped into one directory per day.
import zmq, msgpack, json, os, errno, time

context = zmq.Context()

# Results arrive here from the retrieve worker(s); the collector owns
# (binds) this endpoint so any number of retrievers can connect to it.
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")

# Log entries are pushed to the central log.py process.
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def _ensure_dir(path):
    """Create `path` if it does not exist yet.

    Only an already-existing directory is ignored; any other OSError
    (permissions, disk full, ...) is re-raised instead of being silently
    swallowed like the original blanket `except OSError: pass` did.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


_ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())
    target_dir = time.strftime("%Y-%m-%d")  # one directory per calendar day
    _ensure_dir("pastes/%s" % target_dir)
    # `with` guarantees the handles are closed even if a write fails.
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])
    del item["paste"]  # To prevent writing the paste to the metadata file as well
    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)
    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))

@ -0,0 +1,12 @@
# log.py -- central logging sink. Every component pushes msgpack-encoded
# log entries ({"component", "timestamp", "message"}) to this socket, and
# each entry is appended as one line to scrape.log.
import zmq, msgpack

context = zmq.Context()
log_socket = context.socket(zmq.PULL)
log_socket.bind("ipc:///tmp/pbscrape-log")

while True:
    entry = msgpack.unpackb(log_socket.recv())
    # The file is re-opened for every entry, so writes land immediately.
    with open("scrape.log", "a") as logfile:
        logfile.write("[%(component)s] %(timestamp)s : %(message)s\n" % entry)

@ -0,0 +1,52 @@
# retrieve.py -- download worker. Pulls paste-metadata tasks from
# scrape.py, fetches the raw paste body from pastebin (retrying through
# overload, throttling and network errors), and pushes the completed item
# to collect.py for storage. Multiple instances may run concurrently.
import zmq, msgpack, requests, time

context = zmq.Context()

receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")

sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def _log(message):
    """Push a log entry for this component to the central logger."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


while True:
    item = msgpack.unpackb(receiver.recv())
    gone = False
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % item["id"])
            if response.status_code == 404:
                # Paste no longer exists; give up on this item.
                gone = True
                break
            elif "text/html" in response.headers["Content-Type"]:
                # We most likely got an "under heavy load" message or similar; sleep a while and retry
                _log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
                time.sleep(10)
                continue  # Retry
            paste = response.text
        except Exception as e:
            # Transient network/HTTP failure; log it (was a bare `print`
            # with a "TODO: Log error") and retry after a short pause.
            _log("Error retrieving %s: %r, retrying..." % (item["id"], e))
            time.sleep(5)
            continue  # Retry
        if response.status_code == 403:
            # 403 here indicates we are being rate-limited; back off.
            _log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry
        break  # Done
    if gone:
        _log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!
    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    _log("Downloaded paste body for %s." % item["id"])
    sender.send(msgpack.packb(item))
    time.sleep(1.3)  # Wait a bit between paste retrievals, to avoid throttling

@ -0,0 +1,51 @@
# scrape.py -- archive poller. Fetches the pastebin archive page roughly
# once a minute, extracts metadata for each listed paste, and pushes any
# paste not seen on the previous poll to the retrieve worker(s).
import zmq, time, requests, lxml.html, msgpack

context = zmq.Context()

# Task queue: the scraper owns (binds) it; retrieve workers connect.
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")


def _log(message):
    """Push a log entry for this component to the central logger."""
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": message}))


last_list = []  # Paste IDs seen on the previous poll, to skip duplicates

while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception as e:
        # Transient network/HTTP failure; log it (was a silent TODO) and
        # retry after a pause.
        _log("Error fetching archive page: %r, retrying..." % (e,))
        time.sleep(30)
        continue
    if "temporarily blocked your computer" in page:
        _log("Got throttled, sleeping...")
        time.sleep(600)
        continue
    basetime = int(time.time())  # shared scrape timestamp for this batch
    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_list = []
    found = 0
    for paste in pastes:
        try:
            title, filetype = paste.xpath("td/a/text()")
        except ValueError:
            continue  # Row doesn't unpack into exactly (title, filetype); not a valid entry
        paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # drop the leading character (presumably the "/" prefix)
        ago = paste.xpath("td[2]/text()")[0]  # human-readable age string
        new_list.append(paste_id)
        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))
    _log("Scraped metadata for %d new pastes." % found)
    last_list = new_list
    time.sleep(1 * 60)  # poll roughly once a minute

@ -0,0 +1,33 @@
#!/usr/bin/env python2
# start.py -- dependency check and launcher. Verifies that every required
# third-party module is importable (printing an install hint for each one
# that is missing), and only if all are present runs _start.sh to spawn
# the component processes.
import sys, subprocess

errors = False
try:
    import zmq
except ImportError:
    sys.stderr.write("You are missing ZeroMQ; `pip install pyzmq`\n")
    errors = True
try:
    # BUG FIX: this block erroneously imported zmq again, so a missing
    # msgpack was never detected.
    import msgpack
except ImportError:
    sys.stderr.write("You are missing msgpack; `pip install msgpack-python`\n")
    errors = True
try:
    # BUG FIX: this block also erroneously imported zmq, so a missing
    # requests was never detected.
    import requests
except ImportError:
    sys.stderr.write("You are missing requests; `pip install requests`\n")
    errors = True
try:
    import lxml.html
except ImportError:
    sys.stderr.write("You are missing lxml (needs 2.0 or higher); `pip install lxml`\n")
    errors = True

if not errors:
    subprocess.call(["/bin/sh", "_start.sh"])
    sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")
Loading…
Cancel
Save