First working version!

This commit is contained in:
Sven Slootweg 2013-10-30 15:25:17 +01:00
parent ed38503695
commit b3fa3fba4a
3 changed files with 97 additions and 0 deletions

30
collect.py Normal file
View file

@ -0,0 +1,30 @@
import zmq, msgpack, json, os, time
# Result collector: pulls finished pastes off the results queue and stores
# each one on disk as pastes/<YYYY-MM-DD>/<id>.txt (the paste body) plus
# pastes/<YYYY-MM-DD>/<id>.json (the remaining metadata).


def ensure_dir(path):
    """Create directory *path* (including parents) unless it already exists.

    Only the "directory is already there" failure is ignored. Any other
    OSError (permission denied, a regular file in the way, ...) is re-raised
    here instead of surfacing later as a confusing open() failure.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.connect("ipc:///tmp/pbscrape-results")

ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())

    # Pastes are grouped by the local date on which they are written.
    target_dir = "pastes/%s" % time.strftime("%Y-%m-%d")
    ensure_dir(target_dir)

    # The id is used as a file name; keep only its last path component so a
    # crafted id (e.g. containing "../") cannot write outside target_dir.
    paste_id = os.path.basename(item["id"])

    # Take the body out of the dict so it is not written to the metadata
    # file as well.
    paste = item.pop("paste")

    # "with" guarantees the handles are closed even when a write fails.
    with open("%s/%s.txt" % (target_dir, paste_id), "wb") as f:
        f.write(paste)

    with open("%s/%s.json" % (target_dir, paste_id), "wb") as f:
        json.dump(item, f)

27
retrieve.py Normal file
View file

@ -0,0 +1,27 @@
import zmq, msgpack, requests, time
# Retrieval worker: takes paste descriptions from the task queue, downloads
# the raw paste body for each and pushes the enriched item to the results
# queue.

REQUEST_TIMEOUT = 30  # seconds; without one a stalled connection hangs forever
RETRY_DELAY = 5       # seconds to wait after a failed download
FETCH_INTERVAL = 1    # seconds to wait between two paste retrievals

context = zmq.Context()

receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")

sender = context.socket(zmq.PUSH)
sender.bind("ipc:///tmp/pbscrape-results")

while True:
    item = msgpack.unpackb(receiver.recv())

    # Build the URL outside the retry loop: a task without an "id" is a
    # programming error and must not be retried forever as if it were a
    # network failure.
    url = "http://pastebin.com/raw.php?i=%s" % item["id"]

    while True:  # We want to keep trying until it succeeds...
        try:
            paste = requests.get(url, timeout=REQUEST_TIMEOUT).text
        except Exception as e:
            # TODO: Log error
            print(e)
            time.sleep(RETRY_DELAY)
        else:
            break  # Done

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    sender.send(msgpack.packb(item))
    print(item)

    time.sleep(FETCH_INTERVAL)  # Wait a second between each paste retrieval...

40
scrape.py Normal file
View file

@ -0,0 +1,40 @@
import zmq, time, requests, lxml.html, msgpack
# Archive scraper: polls the pastebin archive page and pushes one task per
# paste that was not present on the previously scraped page.

REQUEST_TIMEOUT = 30     # seconds; without one a stalled connection hangs forever
ERROR_DELAY = 30         # seconds to wait after a failed archive download
POLL_INTERVAL = 5 * 60   # seconds between two archive polls

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

# Ids seen on the last successfully parsed archive page.
last_ids = set()

while True:
    try:
        page = requests.get("http://pastebin.com/archive",
                            timeout=REQUEST_TIMEOUT).text
    except Exception as e:
        # TODO: Log HTTP error
        print(e)
        time.sleep(ERROR_DELAY)
        continue

    # Reference timestamp for the relative "ago" strings on this page.
    basetime = int(time.time())

    xml = lxml.html.fromstring(page)
    new_ids = set()

    for paste in xml.xpath("//table[@class='maintable']/tr"):
        try:
            title, filetype = paste.xpath("td/a/text()")
            # The href looks like "/<id>"; drop the leading slash.
            paste_id = paste.xpath("td[1]/a/@href")[0][1:]
            ago = paste.xpath("td[2]/text()")[0]
        except (ValueError, IndexError):
            continue  # Not a valid entry (header row or unexpected markup)

        new_ids.add(paste_id)

        if paste_id not in last_ids:
            socket.send(msgpack.packb({
                "id": paste_id,
                "type": filetype,
                "title": title,
                "base_time": basetime,
                "ago": ago,
            }))

    # Keep the previous snapshot when nothing could be parsed (error page,
    # changed layout, ...); otherwise the next good page would be re-queued
    # in full as "new".
    if new_ids:
        last_ids = new_ids

    time.sleep(POLL_INTERVAL)