Compare commits
No commits in common. 'develop' and 'master' have entirely different histories.
@ -1,4 +0,0 @@
|
|||||||
# Launch every pipeline component as a background job, starting with the logger.
for component in log collect retrieve scrape; do
    python "$component.py" &
done
|
|
@ -1,34 +0,0 @@
|
|||||||
import zmq, msgpack, json, os, time
|
|
||||||
|
|
||||||
def ensure_dir(path):
    """Create directory `path` (including parents) unless it already exists.

    Raises OSError when the directory could not be created for any reason
    other than it already being there (e.g. permissions, a file in the way).
    """
    try:
        os.makedirs(path)
    except OSError:
        # Only "already exists" is expected here; anything else must surface.
        if not os.path.isdir(path):
            raise


def log(message):
    """Push a log entry tagged with this component's name onto the log socket."""
    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": message}))


context = zmq.Context()

# Sink for finished pastes (body + metadata) pushed to the results endpoint.
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

ensure_dir("pastes")

while True:
    item = msgpack.unpackb(socket.recv())

    # Pastes are grouped in one directory per day (local time of storage).
    target_dir = "pastes/%s" % time.strftime("%Y-%m-%d")
    ensure_dir(target_dir)

    # `with` guarantees the handle is closed even if the write fails.
    with open("%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])

    del item["paste"]  # To prevent writing the paste to the metadata file as well

    with open("%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)

    log("Stored %s." % item["id"])
|
|
@ -1,12 +0,0 @@
|
|||||||
import zmq, msgpack
|
|
||||||
|
|
||||||
# Central log sink: receives msgpack-encoded entries and appends them to scrape.log.
ctx = zmq.Context()
inbox = ctx.socket(zmq.PULL)
inbox.bind("ipc:///tmp/pbscrape-log")

while True:
    record = msgpack.unpackb(inbox.recv())

    # The file is reopened in append mode and closed again for every entry.
    with open("scrape.log", "a") as log_file:
        log_file.write("[%(component)s] %(timestamp)s : %(message)s\n" % record)
|
|
@ -1,52 +0,0 @@
|
|||||||
import zmq, msgpack, requests, time
|
|
||||||
|
|
||||||
def log(message):
    """Push a log entry tagged with this component's name onto the log socket."""
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": message}))


def fetch_paste(paste_id):
    """Download the raw body of one paste, retrying until there is a definitive answer.

    Returns the paste text, or None when the server answers 404 (paste gone).
    Network errors, throttling (403) and HTML responses are retried after a
    pause, so this call blocks until one of the two outcomes is reached.
    """
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % paste_id)
        except Exception as e:
            log("Request for paste %s failed (%r), retrying..." % (paste_id, e))
            time.sleep(5)
            continue  # Retry

        if response.status_code == 404:
            return None  # Gone...

        # Checked before the content type: a throttling response that carries an
        # HTML body would otherwise be mistaken for an "overloaded" page and
        # retried after only 10 seconds.
        if response.status_code == 403:
            log("Got throttled, sleeping...")
            time.sleep(600)
            continue  # Retry

        # .get() so that a response without a Content-Type header is treated as
        # a paste body instead of raising KeyError.
        if "text/html" in response.headers.get("Content-Type", ""):
            # We most likely got an "under heavy load" message or similar; sleep a while and retry
            log("Hit a text/html response for raw.php, servers most likely overloaded, sleeping...")
            time.sleep(10)
            continue  # Retry

        return response.text


context = zmq.Context()

# Tasks (paste metadata) come in on `receiver`; completed pastes leave via `sender`.
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

while True:
    item = msgpack.unpackb(receiver.recv())

    paste = fetch_paste(item["id"])

    if paste is None:
        log("Paste %s gone, skipping..." % item["id"])
        continue  # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    log("Downloaded paste body for %s." % item["id"])

    sender.send(msgpack.packb(item))

    time.sleep(1.3)  # Pause between paste retrievals
|
|
@ -1,51 +0,0 @@
|
|||||||
import zmq, time, requests, lxml.html, msgpack
|
|
||||||
|
|
||||||
def log(message):
    """Push a log entry tagged with this component's name onto the log socket."""
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": message}))


context = zmq.Context()

# Newly discovered pastes are pushed to the tasks endpoint.
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

# IDs listed on the previously scraped archive page; only IDs that are not in
# here get dispatched, so a paste is not queued again on consecutive polls.
last_seen = set()

while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception as e:
        log("Failed to fetch the archive page (%r), retrying in 30 seconds..." % (e,))
        time.sleep(30)
        continue

    if "temporarily blocked your computer" in page:
        log("Got throttled, sleeping...")
        time.sleep(600)
        continue

    basetime = int(time.time())

    rows = lxml.html.fromstring(page).xpath("//table[@class='maintable']/tr")
    seen_now = set()
    found = 0

    for row in rows:
        try:
            # A valid entry has exactly two links: the title and the file type.
            title, filetype = row.xpath("td/a/text()")
        except ValueError:
            continue  # Not a valid entry

        paste_id = row.xpath("td[1]/a/@href")[0][1:]  # Strip the first character of the href
        ago = row.xpath("td[2]/text()")[0]

        seen_now.add(paste_id)

        if paste_id not in last_seen:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    log("Scraped metadata for %d new pastes." % found)

    last_seen = seen_now

    time.sleep(1 * 60)
|
|
@ -1,33 +0,0 @@
|
|||||||
#!/usr/bin/env python2
"""Verify that all third-party dependencies are importable, then launch the scraper."""

import sys
import subprocess

# (module to import, hint written to stderr when that import fails)
REQUIREMENTS = [
    ("zmq", "You are missing ZeroMQ; `pip install pyzmq`\n"),
    ("msgpack", "You are missing msgpack; `pip install msgpack-python`\n"),
    ("requests", "You are missing requests; `pip install requests`\n"),
    ("lxml.html", "You are missing lxml (needs 2.0 or higher); `pip install lxml`\n"),
]


def missing_hints(requirements):
    """Return the hints of every requirement whose module cannot be imported.

    `requirements` is an iterable of (module_name, hint) pairs; the result
    keeps the order of the input and is empty when everything imports.
    """
    hints = []
    for module_name, hint in requirements:
        try:
            __import__(module_name)
        except ImportError:
            hints.append(hint)
    return hints


def main():
    """Report missing dependencies, or start the components via _start.sh."""
    hints = missing_hints(REQUIREMENTS)

    for hint in hints:
        sys.stderr.write(hint)

    if not hints:
        subprocess.call(["/bin/sh", "_start.sh"])

        # Only claim to be running when the components were actually launched.
        sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")


if __name__ == "__main__":
    main()
|
|
Loading…
Reference in New Issue