#!/usr/bin/python

import re, urllib2, datetime, argparse, os, json, sys, time

from bs4 import BeautifulSoup

start_page = "http://www.devilskitchen.me.uk/"

# Send browser-like headers: some servers refuse or degrade responses for the
# default urllib2 User-Agent string.
default_headers = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13",
    'Referer': start_page
}

parser = argparse.ArgumentParser(description='Archive posts from devilskitchen.me.uk in JSON format')

parser.add_argument('-O', dest='output_dir', action='store', default='.',
                    help='output directory for archived posts')

args = parser.parse_args()
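
# Typical invocation (the filename is arbitrary; "dk_archive.py" is assumed here):
#     python dk_archive.py -O dk-archive
# Posts land in <output_dir>/<year>-<month>/<y>-<m>-<d>-<H>-<M>-<S>.json.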
options = vars(args)

# Create the output directory if it does not already exist.
try:
    os.mkdir(options['output_dir'])
except OSError:
    pass


def fetch_page_headers(url, headers):
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    response = opener.open(request)
    return (response.code, response.headers, response.read())
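
# fetch_page_headers returns the full response as a (status_code, headers,
# body) tuple, for example:
#     code, hdrs, html = fetch_page_headers(start_page, default_headers)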


def fetch_archives():
    status_code, headers, response = fetch_page_headers(start_page, default_headers)

    if status_code == 200:
        # Monthly archive pages (YYYY_MM_DD_archive.html) are linked from the
        # front page; capture the full URL plus its year and month.
        archive_pages = re.findall(r"(http://www\.devilskitchen\.me\.uk/([0-9]{4})_([0-9]{2})_[0-9]{2}_archive\.html)", response)

        for page in archive_pages:
            if os.path.exists("%s/%s-%d" % (options['output_dir'], page[1], int(page[2]))):
                print "%s-%s already exists, skipping..." % (page[1], page[2])
            else:
                print "Scraping %s..." % page[0]
                fetch_articles(page[0])
                time.sleep(5)  # pause between archive pages to be polite to the server
    else:
        print "ERROR: Failed to retrieve archive index! Exiting..."
        sys.exit(1)
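
# Because a month is skipped whenever its output directory already exists, an
# interrupted run can be resumed by simply running the script again.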


def fetch_articles(url):
    try:
        status_code, headers, response = fetch_page_headers(url, default_headers)
    except urllib2.HTTPError, e:
        print "ERROR: HTTP %d encountered on %s" % (e.code, url)
        return False

    if status_code == 200:
        soup = BeautifulSoup(response, "html.parser")
        posts = soup.find_all("div", class_="post")

        for post in posts:
            # Posts occasionally lack an <h5> title element.
            try:
                post_title = post.h5.string
            except AttributeError:
                print "WARNING: Article with missing title"
                post_title = ""

            author_details = post.find_all("p", class_="author-details")[0]
            author_name = author_details.find_all("span", class_="author-name")[0].string
            post_date = author_details.find_all("a")[0].string
            post_body = post.find_all("div", class_="post-body")[0].div.prettify()

            # Timestamps are US-style, e.g. "05/21/2009 10:15:00 PM".
            actual_date = datetime.datetime.strptime(post_date, "%m/%d/%Y %I:%M:%S %p")

            # One directory per month of posts, e.g. <output_dir>/2009-5/.
            try:
                os.mkdir("%s/%d-%d" % (options['output_dir'], actual_date.year, actual_date.month))
            except OSError:
                pass

            # One JSON document per post, named after its full timestamp.
            json_file = open("%s/%d-%d/%d-%d-%d-%d-%d-%d.json" % (options['output_dir'], actual_date.year, actual_date.month, actual_date.year,
                             actual_date.month, actual_date.day, actual_date.hour, actual_date.minute, actual_date.second), 'w')

            json.dump({
                'title': post_title,
                'date': actual_date.isoformat(),
                'author': author_name,
                'body': post_body
            }, json_file)

            json_file.close()

            print "Archived '%s', posted at %s by %s" % (post_title, actual_date.isoformat(), author_name)
    else:
        print "ERROR: Failed to retrieve %s! Status code was %d" % (url, status_code)


fetch_archives()
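
# Each archive file holds one JSON object per post; illustrative example:
#     {"title": "Example post", "date": "2009-05-21T22:15:00",
#      "author": "Devil's Kitchen", "body": "<div>\n ...\n</div>"}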