diff --git a/_start.sh b/_start.sh index 8e6f333..dcaaa4b 100755 --- a/_start.sh +++ b/_start.sh @@ -1,3 +1,4 @@ +python log.py & python collect.py & python retrieve.py & python scrape.py & diff --git a/collect.py b/collect.py index a4da229..b22eff6 100644 --- a/collect.py +++ b/collect.py @@ -3,6 +3,8 @@ import zmq, msgpack, json, os, time context = zmq.Context() socket = context.socket(zmq.PULL) socket.bind("ipc:///tmp/pbscrape-results") +logger = context.socket(zmq.PUSH) +logger.connect("ipc:///tmp/pbscrape-log") try: os.makedirs("pastes") @@ -28,3 +30,5 @@ while True: f = open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") json.dump(item, f) f.close() + + logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]})) diff --git a/log.py b/log.py new file mode 100644 index 0000000..7f9bdcd --- /dev/null +++ b/log.py @@ -0,0 +1,12 @@ +import zmq, msgpack + +context = zmq.Context() +socket = context.socket(zmq.PULL) +socket.bind("ipc:///tmp/pbscrape-log") + +while True: + entry = msgpack.unpackb(socket.recv()) + + f = open("scrape.log", "a") + f.write("[%(component)s] %(timestamp)s : %(message)s\n" % entry) + f.close() diff --git a/retrieve.py b/retrieve.py index 92438ae..398ab5a 100644 --- a/retrieve.py +++ b/retrieve.py @@ -5,6 +5,8 @@ receiver = context.socket(zmq.PULL) receiver.connect("ipc:///tmp/pbscrape-tasks") sender = context.socket(zmq.PUSH) sender.connect("ipc:///tmp/pbscrape-results") +logger = context.socket(zmq.PUSH) +logger.connect("ipc:///tmp/pbscrape-log") while True: item = msgpack.unpackb(receiver.recv()) @@ -21,6 +23,9 @@ while True: item["retrieval_time"] = int(time.time()) item["paste"] = paste + + logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Downloaded paste body for %s." % item["id"]})) + sender.send(msgpack.packb(item)) time.sleep(1) # Wait a second between each paste retrieval... diff --git a/scrape.py b/scrape.py index b488665..a29431e 100644 --- a/scrape.py +++ b/scrape.py @@ -3,6 +3,8 @@ import zmq, time, requests, lxml.html, msgpack context = zmq.Context() socket = context.socket(zmq.PUSH) socket.bind("ipc:///tmp/pbscrape-tasks") +logger = context.socket(zmq.PUSH) +logger.connect("ipc:///tmp/pbscrape-log") last_list = [] @@ -20,6 +22,7 @@ while True: pastes = xml.xpath("//table[@class='maintable']/tr") new_list = [] + found = 0 for paste in pastes: try: @@ -33,8 +36,11 @@ while True: new_list.append(paste_id) if paste_id not in last_list: + found += 1 socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago})) + logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Scraped metadata for %d new pastes." % found})) + last_list = new_list time.sleep(5 * 60)