# wpcom-to-jekyll/convert.py

import html2text, os, sys, email, urllib, re, shutil
from lxml import etree
from datetime import datetime

# https://gist.github.com/robertklep/2928188
def rfc822(timestamp):
	"""Convert an RFC 822 date string into a naive ``datetime``.

	The string is parsed together with its UTC offset, turned into a POSIX
	timestamp, and then converted with ``datetime.fromtimestamp``, so the
	result is expressed in the local time zone of the machine running the
	script and carries no tzinfo.

	Args:
		timestamp: A date string such as ``"Thu, 01 Jan 2015 12:30:00 +0000"``.

	Returns:
		A naive ``datetime`` in local time.

	Raises:
		ValueError: If the string cannot be parsed as an RFC 822 date.
	"""
	# Use the lowercase module name: the capitalised ``email.Utils`` alias is a
	# Python 2 leftover, while ``email.utils`` exists on Python 2.5+ and 3.
	from email.utils import mktime_tz, parsedate_tz

	parsed = parsedate_tz(timestamp)
	if parsed is None:
		# parsedate_tz signals failure with None; fail with a clear message
		# instead of letting mktime_tz choke on it.
		raise ValueError("Unparseable RFC 822 date: %r" % (timestamp,))
	return datetime.fromtimestamp(mktime_tz(parsed))

def chomp_indent(text):
	"""Return *text* with the leading whitespace removed from every line.

	Lines are delimited by "\\n" only; trailing whitespace and blank lines are
	kept as they are. Used to write indented multi-line templates in the
	source while emitting them flush-left.
	"""
	flush_left = []
	for raw_line in text.split(u"\n"):
		flush_left.append(raw_line.lstrip())
	return u"\n".join(flush_left)

def markdownize(text):
	"""Convert the HTML body of a WordPress post into Markdown.

	Bodies that contain no literal ``<p>`` tag are first split into paragraphs
	on runs of two or more newlines and wrapped in ``<p>`` tags. The HTML is
	then converted with html2text, WordPress ``[caption]`` shortcodes are
	reduced to an italicised line on a best-effort basis, and the result is
	passed through ``shift_headers``.
	"""
	has_paragraph_tags = "<p>" in text
	if not has_paragraph_tags:
		# Old-style posts carry no explicit paragraph markup. Put a blank line
		# in front of tags that start a line so they do not get swallowed by
		# the preceding paragraph, then wrap every chunk in <p>...</p>.
		spaced = text.replace(u"\n<", u"\n\n<")
		wrapped = [u"<p>%s</p>" % chunk for chunk in re.split(u"\n{2,}", spaced)]
		text = u"\n\n".join(wrapped)

	# A fresh converter per call: the original author notes that sharing one
	# HTML2Text instance across all posts ends in a MemoryError.
	converter = html2text.HTML2Text()
	converter.body_width = 0
	markdown = converter.handle(text)

	# Best effort: text between a closing ")" and "[/caption]" becomes an
	# italicised line in its own paragraph.
	markdown = re.sub(u"(?<=\))\s*(.+?)(?=\[\/caption\])", u"\n\n*\\1*", markdown)
	# Drop the opening [caption ...] shortcode ...
	markdown = re.sub(u"\[caption [^\]]+\]", u"", markdown)
	# ... and the closing one.
	markdown = markdown.replace(u"[/caption]", u"")

	# Demote the headers so they sit below the ones used by the Jekyll theme.
	return shift_headers(markdown)
	
def shift_headers(text):
	"""Demote every Markdown ATX header in *text* by two levels.

	A header is a line that starts with one to six ``#`` characters followed
	by a space; two extra ``#`` are prepended to it. Lines with seven or more
	leading ``#`` and ``#`` that are not at the start of a line are left
	alone.

	Unlike a search for "\\n#... ", the line-anchored pattern also catches a
	header on the very first line of the text, which has no newline before it.

	Note: levels 5 and 6 end up with 7 and 8 ``#``, as they always have.

	Args:
		text: Markdown text.

	Returns:
		The text with its headers demoted.
	"""
	# (?m) makes ^ match at the start of the string and after every newline.
	# A single pass means an already shifted header cannot be shifted twice.
	return re.sub(u"(?m)^(#{1,6}) ", u"##\\1 ", text)

# Command line: <script> <wordpress-export.xml> [target-directory]
if len(sys.argv) < 2:
	sys.exit("Usage: %s <wordpress-export.xml> [target-directory]" % sys.argv[0])

xml_path = sys.argv[1]

if len(sys.argv) > 2:
	target_path = sys.argv[2]
	layouts_path = os.path.join(target_path, "_layouts")
	# shutil.copy only copies *into* its destination when that is an existing
	# directory; otherwise it would create a file called "_layouts" (or fail
	# if the target directory is missing). Make sure the directory exists.
	if not os.path.isdir(layouts_path):
		os.makedirs(layouts_path)
	# The layout is looked up relative to the current working directory.
	shutil.copy("_layouts/page.html", layouts_path)
else:
	# No target given: generate into the current directory, no layout copy.
	target_path = "."

drafts_path = os.path.join(target_path, "_drafts")
posts_path = os.path.join(target_path, "_posts")
attachments_path = os.path.join(target_path, "attachments")

for path in (posts_path, drafts_path, attachments_path):
	# Create only what is missing, so that genuine failures (permissions, a
	# file in the way) are reported here rather than silently ignored.
	if not os.path.isdir(path):
		os.makedirs(path)
	
# Namespace prefixes used by the XPath queries against the WordPress export.
xmlns = dict(
	excerpt="http://wordpress.org/export/1.2/excerpt/",
	content="http://purl.org/rss/1.0/modules/content/",
	wfw="http://wellformedweb.org/CommentAPI/",
	dc="http://purl.org/dc/elements/1.1/",
	wp="http://wordpress.org/export/1.2/",
)

# Items collected from the export, grouped by kind.
attachments, posts, pages = [], [], []

# Running number used to name items that come without a slug.
draft_counter = 1

with open(xml_path, "r") as xml_file:
	xml = etree.parse(xml_file)

# The channel link is the address of the original site; it is used later to
# rewrite links that point back at it.
site_url = xml.xpath("/rss/channel/link/text()")[0]

print("Site URL: %s\n" % site_url)

print("Parsing XML...")

def _first_text(node, query, default):
	"""Return the first result of XPath *query* under *node*, or *default* if there is none."""
	matches = node.xpath(query, namespaces=xmlns)
	return matches[0] if matches else default

# Walk over every exported item and sort it into attachments, pages or posts.
for item in xml.xpath("/rss/channel/item"):
	post_type = item.xpath("wp:post_type/text()", namespaces=xmlns)[0]
	post_title = _first_text(item, "title/text()", "Untitled")

	post_slug = _first_text(item, "wp:post_name/text()", None)
	if post_slug is None:
		# Drafts do not have slugs, so number them instead.
		post_slug = "draft-%s" % str(draft_counter).zfill(3)
		draft_counter += 1

	if post_type == "attachment":
		attachment_url = item.xpath("wp:attachment_url/text()", namespaces=xmlns)[0]

		attachments.append({
			"url": attachment_url,
			# Last path segment of the URL, without any query string.
			"filename": attachment_url.split("/")[-1].split("?")[0]
		})

		print("   attachment: %s" % attachment_url)
	elif post_type == "page":
		# An empty body yields no text node at all; fall back to an empty
		# string instead of failing with an IndexError.
		post_body = markdownize(_first_text(item, "content:encoded/text()", u""))

		pages.append({
			"title": post_title,
			"body": post_body,
			"slug": post_slug
		})

		print("   page: %s" % post_title)
	elif post_type == "post":
		post_status = item.xpath("wp:status/text()", namespaces=xmlns)[0]
		# Same fallback as for pages: a post without content becomes an empty body.
		post_body = markdownize(_first_text(item, "content:encoded/text()", u""))
		post_date = rfc822(item.xpath("pubDate/text()", namespaces=xmlns)[0])
		post_tags = [tag for tag in item.xpath("category/text()") if tag != "Uncategorized"]
		if not post_tags:
			post_tags = ["untagged"]

		posts.append({
			"title": post_title,
			"body": post_body,
			"slug": post_slug,
			"status": post_status,
			"date": post_date,
			"tags": post_tags
		})

		# Posts with any other status are collected but not reported here.
		if post_status == "draft":
			print("   draft: %s (%s)" % (post_title, post_date))
		elif post_status == "publish":
			print("   post: %s (%s)" % (post_title, post_date))

print("Replacing image URLs...")

# Point every reference to an exported attachment at the local copy that is
# stored in the site's "attachments" directory.
for post in posts + pages:
	for attachment in attachments:
		# Build the URL with a literal "/": os.path.join would insert the
		# platform's path separator, which is a backslash on Windows.
		local_url = "{{ site.url }}/attachments/" + attachment["filename"]
		post["body"] = post["body"].replace(attachment["url"], local_url)

print("Fixing internal hyperlinks...")

# Links back to the original site become relative links, four levels up.
for post in posts + pages:
	post["body"] = post["body"].replace(site_url, "../../../..")

print("Downloading attachments...")

# Fetch each attachment into the attachments directory, reporting its URL
# once the download has finished.
for attachment in attachments:
	source_url = attachment["url"]
	destination = os.path.join(attachments_path, attachment["filename"])
	urllib.urlretrieve(source_url, destination)
	print("   %s" % source_url)
	
print("Generating Jekyll posts...")

# Published posts go to _posts as YYYY-MM-DD-slug.md, drafts to _drafts as
# slug.md. Each file is a YAML front matter block followed by the body.
for post in posts:
	# The title is emitted inside a double-quoted YAML scalar, so backslashes
	# and double quotes have to be escaped to keep the front matter valid.
	safe_title = post["title"].replace(u"\\", u"\\\\").replace(u"\"", u"\\\"")
	tag_line = ", ".join(post["tags"])

	if post["status"] == "publish":
		post_date = post["date"]
		year = post_date.year
		month = unicode(post_date.month).zfill(2)
		day = unicode(post_date.day).zfill(2)

		output_path = os.path.join(posts_path, "%s-%s-%s-%s.md" % (year, month, day, post["slug"]))
		front_matter = chomp_indent(u"""---
			layout: post
			title: "%(title)s"
			permalink: %(year)s/%(month)s/%(day)s/%(slug)s
			postday: %(year)s/%(month)s/%(day)s
			posttime: %(hour)s_%(minute)s
			tags: %(tags)s
		---\n""" % {
			"title": safe_title,
			"slug": post["slug"],
			"tags": tag_line,
			"year": year,
			"month": month,
			"day": day,
			"hour": unicode(post_date.hour).zfill(2),
			"minute": unicode(post_date.minute).zfill(2)
		})
	elif post["status"] == "draft":
		output_path = os.path.join(drafts_path, "%s.md" % post["slug"])
		front_matter = chomp_indent(u"""---
			layout: post
			title: "%(title)s"
			tags: %(tags)s
		---\n""" % {
			"title": safe_title,
			"tags": tag_line
		})
	else:
		# Posts that are neither published nor drafts are not written out.
		continue

	with open(output_path, "w") as f:
		f.write(front_matter.encode("utf-8"))
		f.write(post["body"].encode("utf-8"))
			
print("Generating static pages...")

# Every page becomes <target>/<slug>/index.md: a YAML front matter block
# followed by the converted body.
for page in pages:
	page_dir = os.path.join(target_path, page["slug"])

	# Create the directory only when it is missing, so that a real failure
	# (permissions, a file in the way) is reported instead of swallowed.
	if not os.path.isdir(page_dir):
		os.makedirs(page_dir)

	# The title is emitted inside a double-quoted YAML scalar, so backslashes
	# and double quotes have to be escaped to keep the front matter valid.
	safe_title = page["title"].replace(u"\\", u"\\\\").replace(u"\"", u"\\\"")
	front_matter = chomp_indent(u"""---
		layout: page
		title: "%(title)s"
	---\n""" % {
		"title": safe_title
	})

	with open(os.path.join(page_dir, "index.md"), "w") as f:
		f.write(front_matter.encode("utf-8"))
		f.write(page["body"].encode("utf-8"))

print("Done!")
Initial commit 10 years ago			`import html2text, os, sys, email, urllib, re, shutil`
			`from lxml import etree`
			`from datetime import datetime`

			`# https://gist.github.com/robertklep/2928188`
			`def rfc822(timestamp):`
			`return datetime.fromtimestamp(email.Utils.mktime_tz(email.Utils.parsedate_tz(timestamp)))`

			`def chomp_indent(text):`
			`return u"\n".join(line.lstrip() for line in text.split(u"\n"))`

			`def markdownize(text):`
			`if "<p>" not in text:`
			`# Old-style posts, these don't have explicit HTML paragraph tags. Let's break it up into paragraphs.`
			`text = text.replace(u"\n<", u"\n\n<") # Add an extra newline before HTML tags at the start of a line, to prevent them from getting caught in a paragraph`
			`text = u"\n\n".join(u"<p>%s</p>" % paragraph for paragraph in re.split(u"\n{2,}", text))`
			`# Don't try to instantiate a single HTML2Text and use it for every post; it will break with a MemoryError`
			`htmlparser = html2text.HTML2Text()`
			`htmlparser.body_width = 0`
			`text = htmlparser.handle(text) # Parse HTML and convert to Markdown`
			`text = re.sub(u"(?<=\))\s(.+?)(?=\[\/caption\])", u"\n\n\\1*", text) # Replace captions with italicized text in a new paragraph... best-effort`
			`text = re.sub(u"\[caption [^\]]+\]", u"", text) # Remove WordPress [caption] prefix tags`
			`text = text.replace(u"[/caption]", u"") # ... and the postfix.`
			`return shift_headers(text) # Make every header one level lower, so as to fit in with the headers in the default Jekyll theme`

			`def shift_headers(text):`
			`for x in xrange(6, 0, -1):`
			`text = text.replace(u"\n" + (u"#" * x) + u" ", u"\n" + (u"#" * (x + 2)) + u" ")`
			`return text`

			`xml_path = sys.argv[1]`

			`try:`
			`target_path = sys.argv[2]`
			`shutil.copy("_layouts/page.html", os.path.join(target_path, "_layouts"))`
			`except IndexError, e:`
			`target_path = "."`

			`drafts_path = os.path.join(target_path, "_drafts")`
			`posts_path = os.path.join(target_path, "_posts")`
			`attachments_path = os.path.join(target_path, "attachments")`

			`for path in (posts_path, drafts_path, attachments_path):`
			`try:`
			`os.makedirs(path)`
			`except OSError, e:`
			`pass`

			`with open(xml_path, "r") as xml_file:`
			`xml = etree.parse(xml_file)`

			`xmlns = {`
			`"excerpt": "http://wordpress.org/export/1.2/excerpt/",`
			`"content": "http://purl.org/rss/1.0/modules/content/",`
			`"wfw": "http://wellformedweb.org/CommentAPI/",`
			`"dc": "http://purl.org/dc/elements/1.1/",`
			`"wp": "http://wordpress.org/export/1.2/",`
			`}`

			`draft_counter = 1`
			`attachments = []`
			`posts = []`
			`pages = []`

			`site_url = xml.xpath("/rss/channel/link/text()")[0]`

			`print "Site URL: %s\n" % site_url`

			`print "Parsing XML..."`

			`for item in xml.xpath("/rss/channel/item"):`
			`post_type = item.xpath("wp:post_type/text()", namespaces=xmlns)[0]`
Bugfix 10 years ago
			`try:`
			`post_title = item.xpath("title/text()", namespaces=xmlns)[0]`
			`except IndexError, e:`
			`post_title = "Untitled"`
Initial commit 10 years ago
			`try:`
			`post_slug = item.xpath("wp:post_name/text()", namespaces=xmlns)[0]`
			`except IndexError, e:`
			`post_slug = "draft-%s" % str(draft_counter).zfill(3) # Drafts do not have slugs...`
			`draft_counter += 1`

			`if post_type == "attachment":`
			`attachment_url = item.xpath("wp:attachment_url/text()", namespaces=xmlns)[0]`

			`attachments.append({`
			`"url": attachment_url,`
			`"filename": attachment_url.split("/")[-1].split("?")[0]`
			`})`

			`print " attachment: %s" % attachment_url`
			`elif post_type == "page":`
			`post_body = markdownize(item.xpath("content:encoded/text()", namespaces=xmlns)[0])`

			`pages.append({`
			`"title": post_title,`
			`"body": post_body,`
			`"slug": post_slug`
			`})`

			`print " page: %s" % post_title`
			`elif post_type == "post":`
			`post_status = item.xpath("wp:status/text()", namespaces=xmlns)[0]`
			`post_body = markdownize(item.xpath("content:encoded/text()", namespaces=xmlns)[0])`
			`post_date = rfc822(item.xpath("pubDate/text()", namespaces=xmlns)[0])`
			`post_tags = [tag for tag in item.xpath("category/text()") if tag != "Uncategorized"]`
			`if len(post_tags) == 0:`
			`post_tags = ["untagged"]`

			`posts.append({`
			`"title": post_title,`
			`"body": post_body,`
			`"slug": post_slug,`
			`"status": post_status,`
			`"date": post_date,`
			`"tags": post_tags`
			`})`

			`if post_status == "draft":`
			`print " draft: %s (%s)" % (post_title, post_date)`
			`elif post_status == "publish":`
			`print " post: %s (%s)" % (post_title, post_date)`

			`print "Replacing image URLs..."`

			`for post in posts + pages:`
			`for attachment in attachments:`
			`post["body"] = post["body"].replace(attachment["url"], os.path.join("{{ site.url }}/attachments", attachment["filename"]))`

			`print "Fixing internal hyperlinks..."`

			`for post in posts + pages:`
			`post["body"] = post["body"].replace(site_url, "../../../..")`

			`print "Downloading attachments..."`

			`for attachment in attachments:`
			`urllib.urlretrieve(attachment["url"], os.path.join(attachments_path, attachment["filename"]))`
			`print " %s" % attachment["url"]`

			`print "Generating Jekyll posts..."`

			`for post in posts:`
			`if post["status"] == "publish":`
			`post_date = post["date"]`

			`with open(os.path.join(posts_path, "%s-%s-%s-%s.md" % (post_date.year, unicode(post_date.month).zfill(2), unicode(post_date.day).zfill(2), post["slug"])), "w") as f:`
			`f.write(chomp_indent(u"""---`
			`layout: post`
			`title: "%(title)s"`
			`permalink: %(year)s/%(month)s/%(day)s/%(slug)s`
			`postday: %(year)s/%(month)s/%(day)s`
			`posttime: %(hour)s_%(minute)s`
			`tags: %(tags)s`
			`---\n""" % {`
			`"title": post["title"],`
			`"slug": post["slug"],`
			`"tags": ", ".join(post["tags"]),`
			`"year": post_date.year,`
			`"month": unicode(post_date.month).zfill(2),`
			`"day": unicode(post_date.day).zfill(2),`
			`"hour": unicode(post_date.hour).zfill(2),`
			`"minute": unicode(post_date.minute).zfill(2)`
			`}).encode("utf-8"))`
			`f.write(post["body"].encode("utf-8"))`
			`elif post["status"] == "draft":`
			`post_date = post["date"]`

			`with open(os.path.join(drafts_path, "%s.md" % post["slug"]), "w") as f:`
			`f.write(chomp_indent(u"""---`
			`layout: post`
			`title: "%(title)s"`
			`tags: %(tags)s`
			`---\n""" % {`
			`"title": post["title"],`
			`"tags": ", ".join(post["tags"])`
			`}).encode("utf-8"))`
			`f.write(post["body"].encode("utf-8"))`

			`print "Generating static pages..."`

			`for page in pages:`
			`page_dir = os.path.join(target_path, page["slug"])`

			`try:`
			`os.makedirs(page_dir)`
			`except OSError, e:`
			`pass`

			`with open(os.path.join(page_dir, "index.md"), "w") as f:`
			`f.write(chomp_indent(u"""---`
			`layout: page`
			`title: "%(title)s"`
			`---\n""" % {`
			`"title": page["title"]`
			`}).encode("utf-8"))`
			`f.write(page["body"].encode("utf-8"))`

			`print "Done!"`