You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
3.1 KiB
Python
104 lines
3.1 KiB
Python
import lxml.html, requests, urlparse, re
|
|
from lxml import etree
|
|
from datetime import datetime
|
|
|
|
# Base URL of the server being scraped; every request path in this script is
# appended to it. NOTE(review): the /browse/... paths suggest a Calibre
# content server - confirm.
endpoint = "http://caltsardragon.com:8080"
|
|
|
|
# English month abbreviation -> month number. Built once at import time
# instead of on every get_date() call.
_MONTH_NUMBERS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


def get_date(string):
    """Convert a "<month> <year>" string such as "Jan 2010" to (year, month).

    The month is resolved through a fixed English lookup table instead of
    ``strptime``. The original author noted that ``strptime`` did not work
    here; its ``%b`` directive is locale-dependent, which may be the reason.

    Only the first three letters of the month are significant and case is
    ignored, so "Jan", "jan" and "January" all resolve to 1.

    Args:
        string: Text holding exactly two whitespace-separated tokens, the
            month name followed by the year.

    Returns:
        A ``(year, month)`` tuple of ints, with month in the range 1-12.

    Raises:
        ValueError: If ``string`` does not split into exactly two tokens, or
            the year token is not an integer.
        KeyError: If the month token is not a recognised English month.
    """
    month, year = string.split()
    # Normalise to the table's "Xxx" key form before the lookup.
    return (int(year), _MONTH_NUMBERS[month[:3].title()])
|
|
|
|
|
|
# Retrieve the ID and title of every book this installation lists.
#
# The "allbooks" page is scanned for `load_data` elements; the `title`
# attribute of each one is posted back as `ids` to `booklist_page`, whose
# response contains the actual book summaries for that batch.
response = requests.get("%s/browse/category/allbooks" % endpoint)
# Stop on an HTTP error here rather than parsing an error page below.
response.raise_for_status()
xml = lxml.html.fromstring(response.text.encode("utf-8"))
book_ids = {}

for item in xml.xpath("//*[@id='booklist']/div[@class='page']/div[@class='load_data']/@title"):
    response = requests.post("%s/browse/booklist_page" % endpoint, data={"ids": item})
    response.raise_for_status()
    # The decoded JSON body is itself a string of HTML markup.
    xml_titles = lxml.html.fromstring(response.json().encode("utf-8"))

    for subitem in xml_titles.xpath("//div[@class='summary']"):
        # The book ID is the last path segment of the details link.
        id_ = subitem.xpath("div[@class='details-href']/@title")[0].split("/")[-1]
        # xpath() returns a list, so each title is stored as a list of text nodes.
        title = subitem.xpath("div/div[@class='title']/strong/text()")
        book_ids[id_] = title

    # NOTE(review): indentation was lost in the copy this was restored from;
    # the progress message is assumed to be printed once per batch - confirm.
    print("Done %s..." % item)
|
|
|
|
# Splits a series link text of the form "<title> [<item>]" into its two parts.
# Compiled once, outside the per-book loop; raw string so "\[" is not treated
# as a string escape.
series_pattern = re.compile(r"(.+) \[(.+)\]$")

# Fetch the details page of every collected book, extract its metadata and
# print the resulting record.
for id_, title in book_ids.items():
    details_url = "/browse/details/%s" % id_
    # Kept relative to `endpoint`, unlike the download URLs built below.
    cover_url = "/get/cover/%s" % id_

    response = requests.get(endpoint + details_url)
    # Stop on an HTTP error here rather than parsing an error page below.
    response.raise_for_status()
    # The decoded JSON body is itself a string of HTML markup.
    xml = lxml.html.fromstring(response.json().encode("utf-8"))

    # Lower-cased format name -> absolute download URL.
    downloads = {}
    for item in xml.xpath("//div[@class='field formats']/a"):
        filetype = item.get("title")
        downloads[filetype.lower()] = endpoint + item.get("href")

    # External identifiers. Every xpath() call returns a list, so each of
    # these fields is a (possibly empty) list.
    isbn = xml.xpath("//div[@class='field']/a[starts-with(@title,'isbn:')]/text()")
    amazon = xml.xpath("//div[@class='field']/a[starts-with(@title,'amazon:')]/@href")
    google = xml.xpath("//div[@class='field']/a[starts-with(@title,'google:')]/@href")

    tags = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by tags:')]/text()")
    # Whitespace-only text nodes are skipped; get_date() cannot unpack them
    # and would raise ValueError.
    publish_date = [
        get_date(date)
        for date in xml.xpath("//div[@class='field' and strong/text() = 'Published: ']/text()")
        if date.strip()
    ]
    language = xml.xpath("//div[@class='field' and strong/text() = 'Languages: ']/text()")
    publishers = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by publisher:')]/text()")
    authors = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by authors:')]/text()")

    series = xml.xpath("//div[@class='field']/a[starts-with(@title,'Browse books by series:')]/text()")
    # Only the first series link is considered. A missing link and a link
    # that does not match the pattern both yield (None, None).
    series_match = series_pattern.match(series[0]) if series else None
    if series_match:
        series_title, series_id = series_match.groups()
    else:
        series_title, series_id = (None, None)

    print("%s: %s" % (series_title, series_id))

    obj = {
        "ids": {
            "isbn": isbn,
            "amazon": amazon,
            "google": google,
        },
        "title": title,
        "authors": authors,
        "publishers": publishers,
        "publish_date": publish_date,
        "language": language,
        "tags": tags,
        "urls": downloads,
        "cover_url": cover_url,
        # Always a single entry; both values are None when the book has no
        # recognisable series.
        "series": [
            {
                "title": series_title,
                "item": series_id
            }
        ]
    }

    print(obj)
|