"""Poll the Pastebin archive page and push metadata for newly seen pastes.

The script runs forever.  On every cycle it downloads the archive listing,
extracts one entry per table row and, for every paste id that was not present
in the previous listing, pushes a msgpack-encoded task message on a ZeroMQ
PUSH socket bound to ``ipc:///tmp/pbscrape-tasks``.  Status messages are
pushed (also msgpack-encoded) to ``ipc:///tmp/pbscrape-log``.

Task message keys: ``id``, ``type``, ``title``, ``base_time``, ``ago``.
Log message keys: ``component``, ``timestamp``, ``message``.
"""
import time

import zmq
import requests
import lxml.html
import msgpack

ARCHIVE_URL = "http://pastebin.com/archive"
TASK_ENDPOINT = "ipc:///tmp/pbscrape-tasks"
LOG_ENDPOINT = "ipc:///tmp/pbscrape-log"

HTTP_TIMEOUT = 30      # seconds; without it a stalled connection blocks forever
ERROR_DELAY = 30       # seconds to wait after a failed HTTP request
THROTTLE_DELAY = 600   # seconds to wait after the site reports a block
POLL_INTERVAL = 1 * 60  # seconds between regular polls

context = zmq.Context()

# Outgoing work items for whatever consumes the scraped metadata.
socket = context.socket(zmq.PUSH)
socket.bind(TASK_ENDPOINT)

# Outgoing status messages; this side connects, so the log sink must bind.
logger = context.socket(zmq.PUSH)
logger.connect(LOG_ENDPOINT)


def _log(message):
    """Push one status message for the "scrape" component to the log socket."""
    logger.send(msgpack.packb({
        "component": "scrape",
        "timestamp": int(time.time()),
        "message": message,
    }))


# Paste ids seen in the most recent successfully parsed listing.
last_list = []

while True:
    try:
        page = requests.get(ARCHIVE_URL, timeout=HTTP_TIMEOUT).text
    except Exception as e:
        # Top-level boundary of a long-running loop: report and retry later
        # instead of dying.  %r avoids encoding problems on Python 2.
        _log("Failed to fetch archive page: %r" % (e,))
        time.sleep(ERROR_DELAY)
        continue

    if "temporarily blocked your computer" in page:
        _log("Got throttled, sleeping...")
        time.sleep(THROTTLE_DELAY)
        continue

    # Reference time for this listing; sent together with the "ago" text so
    # a consumer can relate the two.
    basetime = int(time.time())

    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")

    new_list = []
    found = 0

    for paste in pastes:
        try:
            # A usable row has exactly two link texts: title and file type.
            title, filetype = paste.xpath("td/a/text()")
        except ValueError:
            continue  # Not a valid entry

        # The href minus its first character (presumably the leading "/").
        paste_id = paste.xpath("td[1]/a/@href")[0][1:]
        ago = paste.xpath("td[2]/text()")[0]

        new_list.append(paste_id)

        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({
                "id": paste_id,
                "type": filetype,
                "title": title,
                "base_time": basetime,
                "ago": ago,
            }))

    _log("Scraped metadata for %d new pastes." % found)

    # Keep the previous snapshot when nothing could be parsed (e.g. an error
    # page without the archive table); otherwise the next good poll would
    # re-send every paste as "new".
    if new_list:
        last_list = new_list

    time.sleep(POLL_INTERVAL)