diff --git a/collect.py b/collect.py
new file mode 100644
index 0000000..bf184f6
--- /dev/null
+++ b/collect.py
@@ -0,0 +1,50 @@
+import errno
+import json
+import os
+import time
+
+import msgpack
+import zmq
+
+# ZeroMQ endpoint on which retrieve.py publishes fetched pastes.
+RESULTS_ENDPOINT = "ipc:///tmp/pbscrape-results"
+# Root directory for stored pastes; one sub-directory per day goes below it.
+OUTPUT_ROOT = "pastes"
+
+
+def ensure_dir(path):
+    """Create `path` if needed; an already existing path is not an error."""
+    try:
+        os.makedirs(path)
+    except OSError as e:
+        # Only "already exists" is expected here. Anything else (permissions,
+        # read-only filesystem, ...) must not be silently swallowed.
+        if e.errno != errno.EEXIST:
+            raise
+
+
+context = zmq.Context()
+socket = context.socket(zmq.PULL)
+socket.connect(RESULTS_ENDPOINT)
+
+ensure_dir(OUTPUT_ROOT)
+
+while True:
+    item = msgpack.unpackb(socket.recv())
+
+    # Pastes are grouped by the local date on which they were collected.
+    target_dir = os.path.join(OUTPUT_ROOT, time.strftime("%Y-%m-%d"))
+    ensure_dir(target_dir)
+
+    # The ID originates from scraped HTML, so drop any directory components
+    # before using it as a file name.
+    paste_id = os.path.basename(item["id"])
+
+    # Take the body out of the item so the metadata file does not repeat it.
+    paste = item.pop("paste")
+
+    with open(os.path.join(target_dir, "%s.txt" % paste_id), "wb") as f:
+        f.write(paste)
+
+    with open(os.path.join(target_dir, "%s.json" % paste_id), "wb") as f:
+        json.dump(item, f)
diff --git a/retrieve.py b/retrieve.py
new file mode 100644
index 0000000..01bdf17
--- /dev/null
+++ b/retrieve.py
@@ -0,0 +1,41 @@
+import time
+
+import msgpack
+import requests
+import zmq
+
+# Seconds to wait for pastebin before a request counts as failed. Without a
+# timeout, requests may block forever on a stalled connection.
+REQUEST_TIMEOUT = 30
+# Seconds to wait before retrying a failed download.
+RETRY_DELAY = 5
+
+context = zmq.Context()
+receiver = context.socket(zmq.PULL)
+receiver.connect("ipc:///tmp/pbscrape-tasks")
+sender = context.socket(zmq.PUSH)
+sender.bind("ipc:///tmp/pbscrape-results")
+
+while True:
+    item = msgpack.unpackb(receiver.recv())
+
+    # Built outside the retry loop so that a malformed task (no "id") fails
+    # loudly instead of being retried forever.
+    url = "http://pastebin.com/raw.php?i=%s" % item["id"]
+
+    while True:  # We want to keep trying until it succeeds...
+        try:
+            paste = requests.get(url, timeout=REQUEST_TIMEOUT).text
+        except Exception as e:
+            # TODO: Log error
+            print(e)
+            time.sleep(RETRY_DELAY)
+            continue  # Retry
+        break  # Done
+
+    item["retrieval_time"] = int(time.time())
+    item["paste"] = paste
+    sender.send(msgpack.packb(item))
+    print(item)
+
+    time.sleep(1)  # Wait a second between each paste retrieval...
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..b488665
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,61 @@
+import time
+
+import lxml.html
+import msgpack
+import requests
+import zmq
+
+ARCHIVE_URL = "http://pastebin.com/archive"
+# Seconds to wait for pastebin before a request counts as failed. Without a
+# timeout, requests may block forever on a stalled connection.
+REQUEST_TIMEOUT = 30
+
+context = zmq.Context()
+socket = context.socket(zmq.PUSH)
+socket.bind("ipc:///tmp/pbscrape-tasks")
+
+# IDs seen on the previous non-empty poll, used to skip already queued pastes.
+last_seen = set()
+
+while True:
+    try:
+        page = requests.get(ARCHIVE_URL, timeout=REQUEST_TIMEOUT).text
+    except Exception:
+        # TODO: Log HTTP error
+        time.sleep(30)
+        continue
+
+    basetime = int(time.time())
+
+    xml = lxml.html.fromstring(page)
+
+    pastes = xml.xpath("//table[@class='maintable']/tr")
+    seen_now = set()
+
+    for paste in pastes:
+        try:
+            title, filetype = paste.xpath("td/a/text()")
+            paste_id = paste.xpath("td[1]/a/@href")[0][1:]
+            ago = paste.xpath("td[2]/text()")[0]
+        except (ValueError, IndexError):
+            # Rows that do not have the expected cells are skipped instead
+            # of aborting the whole scraper.
+            continue
+
+        seen_now.add(paste_id)
+
+        if paste_id not in last_seen:
+            socket.send(msgpack.packb({
+                "id": paste_id,
+                "type": filetype,
+                "title": title,
+                "base_time": basetime,
+                "ago": ago,
+            }))
+
+    # Keep the previous IDs when a poll yields nothing (e.g. an error page);
+    # otherwise the next good poll would queue every paste again.
+    if seen_now:
+        last_seen = seen_now
+
+    time.sleep(5 * 60)