First working version!
This commit is contained in:
parent
ed38503695
commit
b3fa3fba4a
30
collect.py
Normal file
30
collect.py
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
import zmq, msgpack, json, os, time
|
||||||
|
|
||||||
|
# Collector: pulls finished pastes from the results socket and writes each one
# to disk as a raw paste file plus a JSON metadata file.

RESULTS_ENDPOINT = "ipc:///tmp/pbscrape-results"
PASTE_ROOT = "pastes"


def ensure_dir(path):
    """Create directory *path* (and any missing parents) if it is not there.

    Raises:
        OSError: if *path* still is not a directory after the attempt, e.g.
            because of missing permissions or a regular file in the way.
    """
    try:
        os.makedirs(path)
    except OSError:
        # "Already exists" is the expected, harmless case; anything that
        # leaves us without a directory is a real error and must surface.
        if not os.path.isdir(path):
            raise


def store_item(item, root=PASTE_ROOT):
    """Write one received paste and its metadata below *root*.

    The value of ``item["paste"]`` is written to
    ``<root>/<YYYY-MM-DD>/<id>.txt`` and all remaining keys are dumped as JSON
    to ``<root>/<YYYY-MM-DD>/<id>.json``. The date is the local date at the
    time of the call. *item* itself is not modified.

    Raises:
        KeyError: if *item* has no ``"id"`` or ``"paste"`` key.
        OSError: if the target directory cannot be created.
    """
    target_dir = "%s/%s" % (root, time.strftime("%Y-%m-%d"))
    ensure_dir(target_dir)

    paste = item["paste"]
    if not isinstance(paste, bytes):
        # The file is opened in binary mode, so text must be encoded
        # explicitly instead of relying on an implicit ASCII conversion.
        paste = paste.encode("utf-8")

    with open("%s/%s.txt" % (target_dir, item["id"]), "wb") as paste_file:
        paste_file.write(paste)

    # Leave the paste body out of the metadata file; it is already stored
    # in the .txt file next to it.
    metadata = dict(
        (key, value) for key, value in item.items() if key != "paste"
    )

    # json.dump emits text, so the metadata file is opened in text mode.
    with open("%s/%s.json" % (target_dir, item["id"]), "w") as metadata_file:
        json.dump(metadata, metadata_file)


def main():
    """Receive msgpack-encoded items forever and store each one on disk."""
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.connect(RESULTS_ENDPOINT)

    # Fail early if the storage root cannot be created at all.
    ensure_dir(PASTE_ROOT)

    while True:
        store_item(msgpack.unpackb(socket.recv()))


if __name__ == "__main__":
    main()
|
27
retrieve.py
Normal file
27
retrieve.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
import zmq, msgpack, requests, time
|
||||||
|
|
||||||
|
# Retriever: pulls paste tasks from the task socket, downloads the raw paste
# for each task and pushes the enriched item to the results socket.

TASKS_ENDPOINT = "ipc:///tmp/pbscrape-tasks"
RESULTS_ENDPOINT = "ipc:///tmp/pbscrape-results"
RAW_URL_TEMPLATE = "http://pastebin.com/raw.php?i=%s"

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks forever
RETRY_DELAY = 5       # seconds to wait after a failed download
PASTE_INTERVAL = 1    # seconds to wait between two paste retrievals


def paste_url(paste_id):
    """Return the raw-download URL for the paste with id *paste_id*."""
    return RAW_URL_TEMPLATE % paste_id


def fetch_paste(paste_id):
    """Download the raw paste text, retrying until the request succeeds.

    Only the HTTP request itself is retried. The URL is built before the
    retry loop, so a bad *paste_id* raises immediately instead of being
    retried forever.

    NOTE(review): the HTTP status code is not checked, so the body of an
    error response is returned like a normal paste - same as before.
    """
    url = paste_url(paste_id)

    while True:  # We want to keep trying until it succeeds...
        try:
            return requests.get(url, timeout=REQUEST_TIMEOUT).text
        except Exception as e:
            # Deliberately broad: this worker is meant to survive any
            # download failure and simply try again.
            # TODO: Log error
            print(e)
            time.sleep(RETRY_DELAY)


def main():
    """Process tasks forever: receive, download, annotate, forward."""
    context = zmq.Context()
    receiver = context.socket(zmq.PULL)
    receiver.connect(TASKS_ENDPOINT)
    sender = context.socket(zmq.PUSH)
    sender.bind(RESULTS_ENDPOINT)

    while True:
        item = msgpack.unpackb(receiver.recv())

        # A task without an "id" raises KeyError here, outside the retry
        # loop, rather than spinning in it forever.
        paste = fetch_paste(item["id"])

        item["retrieval_time"] = int(time.time())
        item["paste"] = paste
        sender.send(msgpack.packb(item))
        print(item)

        time.sleep(PASTE_INTERVAL)  # Wait a second between each paste retrieval...


if __name__ == "__main__":
    main()
|
40
scrape.py
Normal file
40
scrape.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
import zmq, time, requests, lxml.html, msgpack
|
||||||
|
|
||||||
|
# Scraper: polls the pastebin archive page and pushes one task per paste that
# was not present in the previous successful poll.

TASKS_ENDPOINT = "ipc:///tmp/pbscrape-tasks"
ARCHIVE_URL = "http://pastebin.com/archive"

REQUEST_TIMEOUT = 30      # seconds; without a timeout a stalled request blocks forever
ERROR_DELAY = 30          # seconds to wait after a failed archive download
SCRAPE_INTERVAL = 5 * 60  # seconds between two archive polls


def parse_archive(page):
    """Extract the paste entries from the archive page HTML.

    Every row of the ``maintable`` table whose cells contain exactly two link
    texts (taken as title and file type) yields one dict with the keys
    ``id``, ``type``, ``title`` and ``ago``. ``id`` is the href of the link
    in the first cell minus its first character; ``ago`` is the first text
    node of the second cell (presumably the paste's relative age - confirm
    against the live page). Rows that do not match this shape are skipped.

    Returns an empty list for an empty page.
    """
    if not page.strip():
        # Nothing to parse; avoid handing an empty document to lxml.
        return []

    rows = lxml.html.fromstring(page).xpath("//table[@class='maintable']/tr")
    entries = []

    for row in rows:
        try:
            title, filetype = row.xpath("td/a/text()")
            paste_id = row.xpath("td[1]/a/@href")[0][1:]
            ago = row.xpath("td[2]/text()")[0]
        except (ValueError, IndexError):
            continue  # Not a valid entry

        entries.append(
            {"id": paste_id, "type": filetype, "title": title, "ago": ago}
        )

    return entries


def update_seen(previous_ids, current_ids):
    """Return the id set to compare the next poll against.

    A poll that produced no ids at all (error page, changed layout) must not
    wipe the memory of what was already queued; otherwise the next good poll
    would re-queue every paste on the page. In that case *previous_ids* is
    returned unchanged.
    """
    if not current_ids:
        return previous_ids
    return set(current_ids)


def main():
    """Poll the archive forever and push a task for every unseen paste."""
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.bind(TASKS_ENDPOINT)

    seen_ids = set()

    while True:
        try:
            page = requests.get(ARCHIVE_URL, timeout=REQUEST_TIMEOUT).text
        except Exception as e:
            # Deliberately broad: the scraper should survive any download
            # failure and try again later.
            # TODO: Log HTTP error
            print(e)
            time.sleep(ERROR_DELAY)
            continue

        basetime = int(time.time())
        entries = parse_archive(page)

        for entry in entries:
            if entry["id"] not in seen_ids:
                socket.send(msgpack.packb(dict(entry, base_time=basetime)))

        seen_ids = update_seen(seen_ids, [entry["id"] for entry in entries])

        time.sleep(SCRAPE_INTERVAL)


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue