jamendoparser/jamendoparser.py

import urllib, gzip, sys, argparse
from lxml.etree import iterparse

xml_url = "http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz"

def update_progress(count, blocksize, totalsize):
	percent = int(count * blocksize * 100 / totalsize)
	sys.stdout.write("\r%2d%%" % percent)
	sys.stdout.flush()
	
def get_attribute(element, tagname):
	val = element.find(tagname)
	
	if val is None:
		return ""
	else:
		if val.text == "None":
			return ""
		else:
			return val.text

database = sqlite3.connect(options['database'])
cursor = database.cursor()

parser = argparse.ArgumentParser(description='Downloads and parses the Jamendo XML dump, and creates an SQLite database with all artist, album, track, and tag data.')

parser.add_argument('-D', dest='no_download', action='store_true',
                   help='don\'t download the XML dump and use an existing XML dump instead')
                   
parser.add_argument('-d', dest='database', action='store', default='jamendo.db',
                   help='path of the database that should be used to store the data (will be created if it does not exist yet)')
                   
parser.add_argument('-x', dest='xml_path', action='store', default='jamendo.xml.gz',
                   help='path to the Jamendo XML dump (this is the file that will be created when a new dump is downloaded)')

args = parser.parse_args()
options = vars(args)

if options['no_download'] == False:
	print "Retrieving Jamendo database..."
	urllib.urlretrieve(xml_url, xml_file, reporthook=update_progress)
	print ""

xml = gzip.open(xml_file)

for event, element in iterparse(xml, tag="artist"):
	# id, name, url, image, mbgid, location, Albums
	artistid = get_attribute(element, 'id')
	name = get_attribute(element, 'name')
	image = get_attribute(element, 'image')
	mbgid = get_attribute(element, 'mbgid')
	location = get_attribute(element, 'location')
	
	print "[%s] %s from %s (image: %s)" % (artistid, name, location, image)
	
	for album in element.find('Albums'):
		# id, name, url, releasedate, filename, mbgid, license_artwork, Tracks
		albumname = get_attribute(album, 'name')
		albumurl = get_attribute(album, 'url')
		albumrelease = get_attribute(album, 'releasedate')
		albumfilename = get_attribute(album, 'filename')
		albummbgid = get_attribute(album, 'mbgid')
		albumartworklicense = get_attribute(album, 'license_artwork')
		
		print "    -> Album: %s (%s) at %s" % (albumname, albumrelease, albumurl)
		
		for track in album.find('Tracks'):
			# id, name, filename, mbgid, numalbum, id3genre, license, Tags
			trackid = get_attribute(track, 'id')
			trackname = get_attribute(track, 'name')
			trackfilename = get_attribute(track, 'filename')
			trackmbgid = get_attribute(track, 'mbgid')
			tracknumber = get_attribute(track, 'numalbum')
			trackgenre = get_attribute(track, 'id3genre')
			tracklicense = get_attribute(track, 'license')
			
			print "        [%3d] %s (ID: %s)" % (int(tracknumber), trackname, trackid)
			
			alltags = []
			taglist = track.find('Tags')
			
			if taglist is not None:
				for tag in taglist:
					# idstr, weight
					tagid = get_attribute(tag, 'idstr')
					tagweight = get_attribute(tag, 'weight')
					alltags.append("%s (weight %s)" % (tagid, tagweight))
				
				print "                  %s" % '   '.join(alltags)
			else:
				print "                  No tags."
		
	print ""
	element.clear()
Add argparse block 12 years ago			`import urllib, gzip, sys, argparse`
Implemented a functional parser 12 years ago			`from lxml.etree import iterparse`
Initial commit 12 years ago
			`xml_url = "http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz"`

			`def update_progress(count, blocksize, totalsize):`
			`percent = int(count * blocksize * 100 / totalsize)`
			`sys.stdout.write("\r%2d%%" % percent)`
			`sys.stdout.flush()`
Implemented a functional parser 12 years ago
			`def get_attribute(element, tagname):`
			`val = element.find(tagname)`

			`if val is None:`
			`return ""`
			`else:`
			`if val.text == "None":`
			`return ""`
			`else:`
			`return val.text`
Initial commit 12 years ago
Connect to a SQLite database 12 years ago			`database = sqlite3.connect(options['database'])`
			`cursor = database.cursor()`

Add argparse block 12 years ago			`parser = argparse.ArgumentParser(description='Downloads and parses the Jamendo XML dump, and creates an SQLite database with all artist, album, track, and tag data.')`

			`parser.add_argument('-D', dest='no_download', action='store_true',`
			`help='don\'t download the XML dump and use an existing XML dump instead')`

			`parser.add_argument('-d', dest='database', action='store', default='jamendo.db',`
			`help='path of the database that should be used to store the data (will be created if it does not exist yet)')`

			`parser.add_argument('-x', dest='xml_path', action='store', default='jamendo.xml.gz',`
			`help='path to the Jamendo XML dump (this is the file that will be created when a new dump is downloaded)')`

			`args = parser.parse_args()`
			`options = vars(args)`

Download XML dump when -D is not specified 12 years ago			`if options['no_download'] == False:`
			`print "Retrieving Jamendo database..."`
			`urllib.urlretrieve(xml_url, xml_file, reporthook=update_progress)`
			`print ""`
Initial commit 12 years ago
Implemented a functional parser 12 years ago			`xml = gzip.open(xml_file)`
Initial commit 12 years ago
Implemented a functional parser 12 years ago			`for event, element in iterparse(xml, tag="artist"):`
			`# id, name, url, image, mbgid, location, Albums`
			`artistid = get_attribute(element, 'id')`
			`name = get_attribute(element, 'name')`
			`image = get_attribute(element, 'image')`
			`mbgid = get_attribute(element, 'mbgid')`
			`location = get_attribute(element, 'location')`

			`print "[%s] %s from %s (image: %s)" % (artistid, name, location, image)`

			`for album in element.find('Albums'):`
			`# id, name, url, releasedate, filename, mbgid, license_artwork, Tracks`
			`albumname = get_attribute(album, 'name')`
			`albumurl = get_attribute(album, 'url')`
			`albumrelease = get_attribute(album, 'releasedate')`
			`albumfilename = get_attribute(album, 'filename')`
			`albummbgid = get_attribute(album, 'mbgid')`
			`albumartworklicense = get_attribute(album, 'license_artwork')`

			`print " -> Album: %s (%s) at %s" % (albumname, albumrelease, albumurl)`

			`for track in album.find('Tracks'):`
			`# id, name, filename, mbgid, numalbum, id3genre, license, Tags`
			`trackid = get_attribute(track, 'id')`
			`trackname = get_attribute(track, 'name')`
			`trackfilename = get_attribute(track, 'filename')`
			`trackmbgid = get_attribute(track, 'mbgid')`
			`tracknumber = get_attribute(track, 'numalbum')`
			`trackgenre = get_attribute(track, 'id3genre')`
			`tracklicense = get_attribute(track, 'license')`

			`print " [%3d] %s (ID: %s)" % (int(tracknumber), trackname, trackid)`

			`alltags = []`
			`taglist = track.find('Tags')`

			`if taglist is not None:`
			`for tag in taglist:`
			`# idstr, weight`
			`tagid = get_attribute(tag, 'idstr')`
			`tagweight = get_attribute(tag, 'weight')`
			`alltags.append("%s (weight %s)" % (tagid, tagweight))`

			`print " %s" % ' '.join(alltags)`
			`else:`
			`print " No tags."`

			`print ""`
			`element.clear()`