Compare commits


13 Commits

Author SHA1 Message Date
Sven Slootweg d6995b5030 Deal with 404 properly 9 years ago
Sven Slootweg 16a5bea805 Deal with overloaded servers... 9 years ago
Sven Slootweg 389a49e6c3 Wait a bit longer between pastes, to avoid throttling 9 years ago
Sven Slootweg 68e82f8ac8 Deal with throttling properly 9 years ago
Sven Slootweg 73fa91c953 Change scrape checking interval, to deal with traffic peaks 9 years ago
Sven Slootweg e49a9bdef6 Logging 9 years ago
Sven Slootweg 53b4111c59 Make collector bind instead of retriever 9 years ago
Sven Slootweg 8a079d7090 Also check for lxml 9 years ago
Sven Slootweg 047f08fdf6 Fixed start script; also, print is evil 9 years ago
Sven Slootweg 4994c1baa7 Shebang 9 years ago
Sven Slootweg 7d5736c28e Fix permissions 9 years ago
Sven Slootweg d35311f6db Start scripts 9 years ago
Sven Slootweg b3fa3fba4a First working version! 9 years ago
6 changed files with 186 additions:

    _start.sh    +4
    collect.py   +34
    log.py       +12
    retrieve.py  +52
    scrape.py    +51
    start.py     +33

_start.sh  +4

@@ -0,0 +1,4 @@
python log.py &
python collect.py &
python retrieve.py &
python scrape.py &

collect.py  +34

@@ -0,0 +1,34 @@
import zmq, msgpack, json, os, time

context = zmq.Context()

socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-results")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

try:
    os.makedirs("pastes")
except OSError, e:
    pass

while True:
    item = msgpack.unpackb(socket.recv())

    target_dir = time.strftime("%Y-%m-%d")

    try:
        os.makedirs("pastes/%s" % target_dir)
    except OSError, e:
        pass

    f = open("pastes/%s/%s.txt" % (target_dir, item["id"]), "wb")
    f.write(item["paste"])
    f.close()

    del item["paste"] # To prevent writing the paste to the metadata file as well

    f = open("pastes/%s/%s.json" % (target_dir, item["id"]), "wb")
    json.dump(item, f)
    f.close()

    logger.send(msgpack.packb({"component": "collect", "timestamp": int(time.time()), "message": "Stored %s." % item["id"]}))

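collect.py writes each paste body to pastes/<date>/<id>.txt and the remaining metadata to a matching .json file in the same directory. A minimal sketch of reading a stored paste back, using a hypothetical date directory and paste ID:

# Minimal sketch: read back a stored paste and its metadata.
# The date directory and paste ID below are placeholders, not real data.
import json

date_dir = "2013-04-01"  # as produced by time.strftime("%Y-%m-%d")
paste_id = "abcd1234"    # hypothetical paste ID

with open("pastes/%s/%s.json" % (date_dir, paste_id)) as f:
    metadata = json.load(f)  # id, type, title, base_time, ago, retrieval_time

with open("pastes/%s/%s.txt" % (date_dir, paste_id)) as f:
    body = f.read()

print("%s (%d bytes)" % (metadata["title"], len(body)))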
log.py  +12

@@ -0,0 +1,12 @@
import zmq, msgpack

context = zmq.Context()

socket = context.socket(zmq.PULL)
socket.bind("ipc:///tmp/pbscrape-log")

while True:
    entry = msgpack.unpackb(socket.recv())
    f = open("scrape.log", "a")
    f.write("[%(component)s] %(timestamp)s : %(message)s\n" % entry)
    f.close()

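log.py expects msgpack-encoded dicts with component, timestamp and message keys on ipc:///tmp/pbscrape-log, the endpoint the other components connect to. A minimal sketch of a standalone test producer (not part of this changeset) that pushes a single entry, which should then show up in scrape.log:

# Minimal sketch (not part of this changeset): push one test entry to log.py.
import time
import zmq
import msgpack

context = zmq.Context()
logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")  # endpoint bound by log.py

logger.send(msgpack.packb({
    "component": "test",  # hypothetical component name
    "timestamp": int(time.time()),
    "message": "Hello from a test producer.",
}))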
retrieve.py  +52

@@ -0,0 +1,52 @@
import zmq, msgpack, requests, time

context = zmq.Context()

receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")

sender = context.socket(zmq.PUSH)
sender.connect("ipc:///tmp/pbscrape-results")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

while True:
    item = msgpack.unpackb(receiver.recv())
    gone = False

    while True: # We want to keep trying until it succeeds...
        try:
            response = requests.get("http://pastebin.com/raw.php?i=%s" % item["id"])

            if response.status_code == 404:
                # Gone...
                gone = True
                break
            elif "text/html" in response.headers["Content-Type"]:
                # We most likely got an "under heavy load" message or similar; sleep a while and retry
                logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Hit a text/html response for raw.php, servers most likely overloaded, sleeping..."}))
                time.sleep(10)
                continue # Retry

            paste = response.text
        except Exception, e:
            # TODO: Log error
            print e
            time.sleep(5)
            continue # Retry

        if response.status_code == 403:
            logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
            time.sleep(600)
            continue # Retry

        break # Done

    if gone:
        logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Paste %s gone, skipping..." % item["id"]}))
        continue # Next!

    item["retrieval_time"] = int(time.time())
    item["paste"] = paste

    logger.send(msgpack.packb({"component": "retrieve", "timestamp": int(time.time()), "message": "Downloaded paste body for %s." % item["id"]}))
    sender.send(msgpack.packb(item))

    time.sleep(1.3) # Wait a bit between each paste retrieval...

scrape.py  +51

@@ -0,0 +1,51 @@
import zmq, time, requests, lxml.html, msgpack

context = zmq.Context()

socket = context.socket(zmq.PUSH)
socket.bind("ipc:///tmp/pbscrape-tasks")

logger = context.socket(zmq.PUSH)
logger.connect("ipc:///tmp/pbscrape-log")

last_list = []

while True:
    try:
        page = requests.get("http://pastebin.com/archive").text
    except Exception, e:
        # TODO: Log HTTP error
        time.sleep(30)
        continue

    if "temporarily blocked your computer" in page:
        logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Got throttled, sleeping..."}))
        time.sleep(600)
        continue

    basetime = int(time.time())

    xml = lxml.html.fromstring(page)
    pastes = xml.xpath("//table[@class='maintable']/tr")

    new_list = []
    found = 0

    for paste in pastes:
        try:
            title, filetype = paste.xpath("td/a/text()")
        except ValueError, e:
            continue # Not a valid entry

        paste_id = paste.xpath("td[1]/a/@href")[0][1:]
        ago = paste.xpath("td[2]/text()")[0]

        new_list.append(paste_id)

        if paste_id not in last_list:
            found += 1
            socket.send(msgpack.packb({"id": paste_id, "type": filetype, "title": title, "base_time": basetime, "ago": ago}))

    logger.send(msgpack.packb({"component": "scrape", "timestamp": int(time.time()), "message": "Scraped metadata for %d new pastes." % found}))

    last_list = new_list
    time.sleep(1 * 60)

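scrape.py pushes one msgpack-encoded task per new paste onto ipc:///tmp/pbscrape-tasks, carrying the id, type, title, base_time and ago fields that retrieve.py consumes. A minimal sketch of a debugging consumer (not part of this changeset) that simply prints incoming tasks; since ZeroMQ load-balances PUSH messages across connected PULL sockets, run it with retrieve.py stopped to see every task:

# Minimal sketch (debugging aid, not part of this changeset): print tasks from scrape.py.
import zmq
import msgpack

context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect("ipc:///tmp/pbscrape-tasks")  # endpoint bound by scrape.py

while True:
    task = msgpack.unpackb(receiver.recv())
    print("%s  %s  %s" % (task["id"], task["type"], task["title"]))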
start.py  +33

@@ -0,0 +1,33 @@
#!/usr/bin/env python2
import sys, subprocess

errors = False

try:
    import zmq
except ImportError, e:
    sys.stderr.write("You are missing ZeroMQ; `pip install pyzmq`\n")
    errors = True

try:
    import msgpack
except ImportError, e:
    sys.stderr.write("You are missing msgpack; `pip install msgpack-python`\n")
    errors = True

try:
    import requests
except ImportError, e:
    sys.stderr.write("You are missing requests; `pip install requests`\n")
    errors = True

try:
    import lxml.html
except ImportError, e:
    sys.stderr.write("You are missing lxml (needs 2.0 or higher); `pip install lxml`\n")
    errors = True

if errors == False:
    subprocess.call(["/bin/sh", "_start.sh"])
    sys.stdout.write("pastebin-scrape is now running. Run `python retrieve.py` to add additional retrieval workers.\n")