
141 lines
5 KiB
Raw Normal View History

2012-06-03 05:37:23 +02:00
import urllib, gzip, sys, argparse, sqlite3, datetime, time
2012-06-03 03:10:59 +02:00
from lxml.etree import iterparse
2012-06-03 01:30:31 +02:00
xml_url = "http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz"
def update_progress(count, blocksize, totalsize):
    """Print the download progress as a percentage on a single console line.

    Intended to be passed as the ``reporthook`` callback of ``urlretrieve``.

    Args:
        count: Number of blocks transferred so far.
        blocksize: Size of one block.
        totalsize: Total size of the download; zero or negative when the
            size is not known.

    Nothing is written when the ``no_output`` option is set.
    """
    if options['no_output']:
        return
    if totalsize > 0:
        # The last block is usually only partially filled, so the raw
        # product can overshoot the total; cap the display at 100%.
        percent = min(100, count * blocksize * 100 // totalsize)
    else:
        # Size unknown: a percentage cannot be computed (and dividing by
        # zero would abort the download), so show 0% instead.
        percent = 0
    sys.stdout.write("\rRetrieving Jamendo database... %2d%%" % percent)
    # The line ends with no newline, so flush explicitly to make the
    # progress visible while the download is still running.
    sys.stdout.flush()
2012-06-03 03:10:59 +02:00
def get_attribute(element, tagname):
    """Return the text of the first ``tagname`` child of ``element``.

    Args:
        element: An XML element exposing ``find()``.
        tagname: Tag name of the child element to look up.

    Returns:
        The child's text, or ``""`` when the child is missing, has no text
        at all, or contains the literal string ``"None"``.

    Raises:
        AttributeError: If ``element`` is ``None``.
    """
    child = element.find(tagname)
    if child is None:
        return ""
    text = child.text
    # An empty element has ``text is None``; treat it like a missing value
    # so callers always receive a string.
    if text is None or text == "None":
        return ""
    return text
2012-06-03 01:30:31 +02:00
2012-06-03 03:23:37 +02:00
# Command-line interface:
#   -D  reuse an existing XML dump instead of downloading a new one
#   -N  suppress all console output
#   -d  path of the SQLite database file
#   -x  path of the (gzipped) XML dump
parser = argparse.ArgumentParser(
    description="Downloads and parses the Jamendo XML dump, and creates an SQLite database with all artist, album, track, and tag data."
)
parser.add_argument(
    "-D",
    dest="no_download",
    action="store_true",
    help="don't download the XML dump and use an existing XML dump instead",
)
parser.add_argument(
    "-N",
    dest="no_output",
    action="store_true",
    help="prevents the application from outputting anything",
)
parser.add_argument(
    "-d",
    dest="database",
    action="store",
    default="jamendo.db",
    help="path of the database that should be used to store the data (will be created if it does not exist yet)",
)
parser.add_argument(
    "-x",
    dest="xml_path",
    action="store",
    default="jamendo.xml.gz",
    help="path to the Jamendo XML dump (this is the file that will be created when a new dump is downloaded)",
)
args = parser.parse_args()
# The rest of the script reads the parsed options through a plain dict.
options = vars(args)
2012-06-03 03:24:43 +02:00
# Location of the gzipped XML dump; it is (re)downloaded unless -D was given.
xml_file = options['xml_path']
if not options['no_download']:
    urllib.urlretrieve(xml_url, xml_file, reporthook=update_progress)
    if not options['no_output']:
        # End the progress line that the report hook keeps rewriting.
        print("")
2012-06-03 01:30:31 +02:00
# Open the SQLite database and make sure the four tables used by the
# importer exist.
database = sqlite3.connect(options['database'])
cursor = database.cursor()

# IF NOT EXISTS keeps the setup re-runnable against an existing database
# without having to swallow sqlite3.OperationalError, which would also hide
# unrelated failures.
cursor.execute("CREATE TABLE IF NOT EXISTS artists (`id`, `name`, `url`, `image`, `mbgid`, `location`)")
cursor.execute("CREATE TABLE IF NOT EXISTS albums (`id`, `artist_id`, `name`, `url`, `releasedate`, `filename`, `mbgid`, `license_artwork`)")
cursor.execute("CREATE TABLE IF NOT EXISTS tracks (`id`, `artist_id`, `album_id`, `name`, `filename`, `mbgid`, `tracknumber`, `genre`, `license`)")
cursor.execute("CREATE TABLE IF NOT EXISTS tags (`track_id`, `name`, `weight`)")
2012-06-03 03:10:59 +02:00
# Stream the gzipped dump one <artist> element at a time and insert every
# artist together with its albums, tracks and tags.
total = 0

with gzip.open(xml_file) as xml:
    for event, element in iterparse(xml, tag="artist"):
        # Artist children: id, name, url, image, mbgid, location, Albums
        artistid = get_attribute(element, 'id')
        name = get_attribute(element, 'name')
        url = get_attribute(element, 'url')
        image = get_attribute(element, 'image')
        mbgid = get_attribute(element, 'mbgid')

        # An artist without a <location> element gets an empty country.
        location_element = element.find('location')
        if location_element is None:
            country = ""
        else:
            country = get_attribute(location_element, 'country')

        cursor.execute("INSERT INTO artists VALUES (?, ?, ?, ?, ?, ?)", (artistid, name, url, image, mbgid, country))

        # Treat a missing <Albums> container like an empty one, the same
        # way a missing <Tags> container is handled further down.
        albums = element.find('Albums')
        if albums is None:
            albums = ()

        for album in albums:
            # Album children: id, name, url, releasedate, filename, mbgid,
            # license_artwork, Tracks
            albumid = get_attribute(album, 'id')
            albumname = get_attribute(album, 'name')
            albumurl = get_attribute(album, 'url')

            # Cut the value at the first '+' (a "+HH:MM" suffix), parse the
            # remainder and store it as an integer timestamp. time.mktime()
            # interprets the parsed time as local time.
            release_text = get_attribute(album, 'releasedate').split('+')[0]
            release_time = datetime.datetime.strptime(release_text, '%Y-%m-%dT%H:%M:%S')
            albumrelease = int(time.mktime(release_time.timetuple()))

            albumfilename = get_attribute(album, 'filename')
            albummbgid = get_attribute(album, 'mbgid')
            albumartworklicense = get_attribute(album, 'license_artwork')
            cursor.execute("INSERT INTO albums VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (albumid, artistid, albumname, albumurl, albumrelease, albumfilename, albummbgid, albumartworklicense))

            tracks = album.find('Tracks')
            if tracks is None:
                tracks = ()

            for track in tracks:
                # Track children: id, name, filename, mbgid, numalbum,
                # id3genre, license, Tags
                trackid = get_attribute(track, 'id')
                trackname = get_attribute(track, 'name')
                trackfilename = get_attribute(track, 'filename')
                trackmbgid = get_attribute(track, 'mbgid')
                tracknumber = get_attribute(track, 'numalbum')
                trackgenre = get_attribute(track, 'id3genre')
                tracklicense = get_attribute(track, 'license')
                cursor.execute("INSERT INTO tracks VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (trackid, artistid, albumid, trackname, trackfilename, trackmbgid, tracknumber, trackgenre, tracklicense))

                taglist = track.find('Tags')
                if taglist is not None:
                    for tag in taglist:
                        # Tag children: idstr, weight
                        tagid = get_attribute(tag, 'idstr')
                        tagweight = get_attribute(tag, 'weight')
                        cursor.execute("INSERT INTO tags VALUES (?, ?, ?)", (trackid, tagid, tagweight))

        total += 1
        if not options['no_output']:
            sys.stdout.write("\rInserting artists... %6d done" % total)
            # No newline is written, so flush to keep the counter live.
            sys.stdout.flush()

        # Everything needed from this artist has been inserted; drop its
        # subtree so memory use does not grow with the size of the dump.
        element.clear()
2012-06-03 03:10:59 +02:00
2012-06-03 05:37:23 +02:00
2013-04-25 19:10:08 +02:00
# Final summary. print() with a single parenthesised argument produces the
# same output under Python 2 and Python 3.
if not options['no_output']:
    print("")
    print("Parsed and inserted a total of %d artists." % total)

# The inserts run inside an implicit sqlite3 transaction; without an
# explicit commit they are rolled back when the connection goes away.
database.commit()

if not options['no_output']:
    print("Changes committed to database.")