First working version!
This commit is contained in:
parent
ed38503695
commit
b3fa3fba4a
30
collect.py
Normal file
30
collect.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
import zmq, msgpack, json, os, time
|
||||
|
||||
# Result collector: pulls finished items off the results socket and stores
# each one as pastes/<date>/<id>.txt (paste body) and pastes/<date>/<id>.json
# (the remaining metadata).


def _ensure_dir(path):
    """Create `path` (including parents) unless it already exists.

    os.makedirs raises OSError when the directory is already present; that
    case is ignored. Any other failure (permissions, a regular file in the
    way, ...) is re-raised instead of being swallowed, so it does not surface
    later as a confusing error from open().
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.connect("ipc:///tmp/pbscrape-results")

_ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())

    # Items are grouped by the date (time.strftime, i.e. local time) on which
    # they were collected, not by the date they were posted.
    target_dir = time.strftime("%Y-%m-%d")
    _ensure_dir("pastes/%s" % target_dir)

    # NOTE(review): item["id"] is interpolated into the path as-is. It
    # presumably is a plain pastebin ID; confirm the producer can never emit
    # path separators here.
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])

    del item["paste"]  # To prevent writing the paste to the metadata file as well

    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)
|
27
retrieve.py
Normal file
27
retrieve.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import zmq, msgpack, requests, time
|
||||
|
||||
# Paste retriever: takes tasks from the task socket, downloads the raw paste
# for each task's ID, and pushes the task (with the paste body and retrieval
# time added) onto the results socket.

context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.bind("ipc:///tmp/pbscrape-results")

# Seconds to wait on the HTTP request. Without a timeout, requests can block
# indefinitely on a stalled connection and the retry loop below would never
# get a chance to run.
REQUEST_TIMEOUT = 30

while True:
    item = msgpack.unpackb(receiver.recv())

    while True:  # We want to keep trying until it succeeds...
        try:
            paste = requests.get(
                "http://pastebin.com/raw.php?i=%s" % item["id"],
                timeout=REQUEST_TIMEOUT,
            ).text
        except Exception as e:
            # TODO: Log error
            # Deliberately broad: any failure (including a timeout) is
            # reported and retried after a short pause.
            print(e)
            time.sleep(5)
            continue  # Retry

        break  # Done

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    sender.send(msgpack.packb(item))
    print(item)

    time.sleep(1)  # Wait a second between each paste retrieval...
|
40
scrape.py
Normal file
40
scrape.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
import zmq, time, requests, lxml.html, msgpack
|
||||
|
||||
# Archive scraper: polls the pastebin archive page every five minutes and
# pushes a task onto the task socket for every paste ID that was not present
# in the previous successful listing.

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

# Seconds to wait on the HTTP request; without a timeout a stalled connection
# would block the scraper indefinitely.
REQUEST_TIMEOUT = 30

# IDs seen in the previous listing. A set, because it is only used for
# membership tests.
last_ids = set()

while True:
    try:
        page = requests.get("http://pastebin.com/archive", timeout=REQUEST_TIMEOUT).text
    except Exception as e:
        # TODO: Log HTTP error
        print(e)  # At least surface the failure instead of dropping it silently
        time.sleep(30)
        continue

    # Reference timestamp for the relative "ago" strings scraped below.
    basetime = int(time.time())

    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_ids = set()

    for paste in pastes:
        try:
            # Unpacking raises ValueError unless the row has exactly two link
            # texts; the [0] lookups raise IndexError when a cell is missing.
            title, filetype = paste.xpath("td/a/text()")
            paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # Strip the leading "/"
            ago = paste.xpath("td[2]/text()")[0]
        except (ValueError, IndexError):
            continue  # Not a valid entry

        new_ids.add(paste_id)

        if paste_id not in last_ids:
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    # Only replace the reference listing when this fetch produced entries.
    # Otherwise a page without any valid rows (e.g. an error page) would wipe
    # it and make the next good fetch re-queue every paste as "new".
    if new_ids:
        last_ids = new_ids

    time.sleep(5 * 60)
|
Loading…
Reference in a new issue