Compare commits

...

13 commits

Author SHA1 Message Date
Sven Slootweg d6995b5030 Deal with 404 properly 2013-12-16 17:05:56 +01:00
Sven Slootweg 16a5bea805 Deal with overloaded servers... 2013-12-14 04:30:18 +01:00
Sven Slootweg 389a49e6c3 Wait a bit longer between pastes, to avoid throttling 2013-11-02 22:01:22 +01:00
Sven Slootweg 68e82f8ac8 Deal with throttling properly 2013-11-02 22:00:29 +01:00
Sven Slootweg 73fa91c953 Change scrape checking interval, to deal with traffic peaks 2013-10-30 20:08:06 +01:00
Sven Slootweg e49a9bdef6 Logging 2013-10-30 16:27:50 +01:00
Sven Slootweg 53b4111c59 Make collector bind instead of retriever 2013-10-30 16:04:45 +01:00
Sven Slootweg 8a079d7090 Also check for lxml 2013-10-30 16:02:22 +01:00
Sven Slootweg 047f08fdf6 Fixed start script; also, print is evil 2013-10-30 15:34:05 +01:00
Sven Slootweg 4994c1baa7 Shebang 2013-10-30 15:31:55 +01:00
Sven Slootweg 7d5736c28e Fix permissions 2013-10-30 15:31:18 +01:00
Sven Slootweg d35311f6db Start scripts 2013-10-30 15:30:36 +01:00
Sven Slootweg b3fa3fba4a First working version! 2013-10-30 15:25:17 +01:00
6 changed files with 186 additions and 0 deletions

4
_start.sh Executable file
View file

@ -0,0 +1,4 @@
# Launch every pbscrape worker process in the background:
# logger first, then collector, retriever and scraper.
for worker in log collect retrieve scrape; do
    python "$worker.py" &
done

34
collect.py Normal file
View file

@ -0,0 +1,34 @@
# collect.py -- sink worker: receives downloaded pastes over ZeroMQ and stores
# each one on disk as a raw text body plus a JSON metadata file, bucketed into
# one directory per day (pastes/YYYY-MM-DD/).
import zmq, msgpack, json, os, time

context = zmq.Context()

# Completed download jobs are pushed here by the retriever worker(s).
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")

# Log lines go to the central logger process (log.py).
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

try:
    os.makedirs("pastes")
except OSError:
    pass  # Directory already exists

while True:
    item = msgpack.unpackb(socket.recv())
    target_dir = time.strftime("%Y-%m-%d")
    try:
        os.makedirs("pastes/%s" % target_dir)
    except OSError:
        pass  # Directory already exists
    # 'with' guarantees the handle is closed even if the write fails.
    with open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb") as f:
        f.write(item["paste"])
    del item["paste"]  # To prevent writing the paste to the metadata file as well
    with open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb") as f:
        json.dump(item, f)
    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))

12
log.py Normal file
View file

@ -0,0 +1,12 @@
# log.py -- central logger: receives log entries from all worker processes over
# ZeroMQ and appends them to scrape.log, one line per entry.
import zmq, msgpack

context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-log")

while True:
    entry = msgpack.unpackb(socket.recv())
    # Re-open per entry so the file can be rotated or truncated externally
    # while the logger keeps running; 'with' guarantees the handle is closed.
    with open("scrape.log", "a") as f:
        f.write("[%(component)s] %(timestamp)s : %(message)s\n" % entry)

52
retrieve.py Normal file
View file

@ -0,0 +1,52 @@
# retrieve.py -- download worker: pulls paste IDs scraped by scrape.py, fetches
# the raw paste body from pastebin.com (retrying on errors, overload and
# throttling), and pushes the result to the collector. Multiple instances of
# this script may run in parallel against the same task queue.
import zmq, msgpack, requests, time

context = zmq.Context()

# Work queue filled by scrape.py.
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")

# Finished downloads go to collect.py.
sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")

# Log lines go to the central logger (log.py).
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

while True:
    item = msgpack.unpackb(receiver.recv())
    gone = False
    while True:  # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % item["id"])
            if response.status_code == 404:
                # Paste was deleted or expired; give up on this one.
                gone = True
                break
            elif "text/html" in response.headers["Content-Type"]:
                # We most likely got an "under heavy load" message or similar; sleep a while and retry
                logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Hit a text/html response for raw.php, servers most likely overloaded, sleeping..."}))
                time.sleep(10)
                continue  # Retry
            paste = response.text
        except Exception as e:
            # Network failure etc.: log it centrally (was a bare Py2 'print')
            # and retry after a short delay.
            logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Error retrieving %s: %r, retrying..." % (item["id"], e)}))
            time.sleep(5)
            continue  # Retry
        if response.status_code == 403:
            logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
            time.sleep(600)
            continue  # Retry
        break  # Done
    if gone:
        logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Paste %s gone, skipping..." % item["id"]}))
        continue  # Next!
    item["retrieval_time"] = int(time.time())
    item["paste"] = paste
    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Downloaded paste body for %s." % item["id"]}))
    sender.send(msgpack.packb(item))
    time.sleep(1.3)  # Wait a bit between paste retrievals, to avoid throttling

51
scrape.py Normal file
View file

@ -0,0 +1,51 @@
# scrape.py -- producer: polls the pastebin.com archive page once a minute,
# extracts paste metadata from the listing table, and pushes every paste ID not
# seen in the previous scrape onto the retriever work queue.
import zmq, time, requests, lxml.html, msgpack

context = zmq.Context()

# Work queue consumed by retrieve.py worker(s).
socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

# Log lines go to the central logger (log.py).
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

last_list = []  # Paste IDs seen on the previous scrape, used for dedup

while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception as e:
        # HTTP failure: log it centrally (was a silent TODO) and retry later.
        logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Error fetching archive page: %r, retrying..." % e}))
        time.sleep(30)
        continue
    if "temporarily blocked your computer" in page:
        logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
        time.sleep(600)
        continue
    basetime = int(time.time())  # Reference point for the relative "ago" column
    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")
    new_list = []
    found = 0
    for paste in pastes:
        try:
            title, filetype = paste.xpath("td/a/text()")
        except ValueError:
            continue  # Not a valid entry (e.g. the table header row)
        paste_id = paste.xpath("td[1]/a/@href")[0][1:]  # Strip the leading "/"
        ago = paste.xpath("td[2]/text()")[0]
        new_list.append(paste_id)
        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))
    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Scraped metadata for %d new pastes." % found}))
    last_list = new_list
    time.sleep(1 * 60)  # Re-check the archive once a minute

33
start.py Executable file
View file

@ -0,0 +1,33 @@
#!/usr/bin/env python2
import sys, subprocess
errors = False
try:
import zmq
except ImportError, e:
sys.stderr.write("You are missing ZeroMQ; `pip install pyzmq`\n")
errors = True
try:
import zmq
except ImportError, e:
sys.stderr.write("You are missing msgpack; `pip install msgpack-python`\n")
errors = True
try:
import zmq
except ImportError, e:
sys.stderr.write("You are missing requests; `pip install requests`\n")
errors = True
try:
import lxml.html
except ImportError, e:
sys.stderr.write("You are missing lxml (needs 2.0 or higher); `pip install lxml`\n")
errors = True
if errors == False:
subprocess.call(["/bin/sh", "_start.sh"])
sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")