From 0864499ad57740626873001812ed067da6a098d7 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 3 Jun 2012 03:10:59 +0200 Subject: [PATCH] Implemented a functional parser --- jamendoparser.py | 83 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/jamendoparser.py b/jamendoparser.py index f8a1e50..c786cb4 100644 --- a/jamendoparser.py +++ b/jamendoparser.py @@ -1,5 +1,5 @@ import urllib, gzip, sys -from lxml import etree +from lxml.etree import iterparse xml_url = "http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz" xml_file = "dump.xml.gz" @@ -8,23 +8,70 @@ def update_progress(count, blocksize, totalsize): percent = int(count * blocksize * 100 / totalsize) sys.stdout.write("\r%2d%%" % percent) sys.stdout.flush() + +def get_attribute(element, tagname): + val = element.find(tagname) + + if val is None: + return "" + else: + if val.text == "None": + return "" + else: + return val.text -def fast_iter(context, func): - # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ - # Author: Liza Daly - for event, elem in context: - func(elem) - elem.clear() - while elem.getprevious() is not None: - del elem.getparent()[0] - del context +#print "Retrieving Jamendo database..." +#urllib.urlretrieve(xml_url, xml_file, reporthook=update_progress) +#print "" -def process_element(elem): - print elem.xpath( 'description/text( )' ) +xml = gzip.open(xml_file) -print "Retrieving Jamendo database..." -urllib.urlretrieve(xml_url, xml_file, reporthook=update_progress) - -exit(0) -context = etree.iterparse( MYFILE, tag='item' ) -fast_iter(context,process_element) +for event, element in iterparse(xml, tag="artist"): + # id, name, url, image, mbgid, location, Albums + artistid = get_attribute(element, 'id') + name = get_attribute(element, 'name') + image = get_attribute(element, 'image') + mbgid = get_attribute(element, 'mbgid') + location = get_attribute(element, 'location') + + print "[%s] %s from %s (image: %s)" % (artistid, name, location, image) + + for album in element.find('Albums'): + # id, name, url, releasedate, filename, mbgid, license_artwork, Tracks + albumname = get_attribute(album, 'name') + albumurl = get_attribute(album, 'url') + albumrelease = get_attribute(album, 'releasedate') + albumfilename = get_attribute(album, 'filename') + albummbgid = get_attribute(album, 'mbgid') + albumartworklicense = get_attribute(album, 'license_artwork') + + print " -> Album: %s (%s) at %s" % (albumname, albumrelease, albumurl) + + for track in album.find('Tracks'): + # id, name, filename, mbgid, numalbum, id3genre, license, Tags + trackid = get_attribute(track, 'id') + trackname = get_attribute(track, 'name') + trackfilename = get_attribute(track, 'filename') + trackmbgid = get_attribute(track, 'mbgid') + tracknumber = get_attribute(track, 'numalbum') + trackgenre = get_attribute(track, 'id3genre') + tracklicense = get_attribute(track, 'license') + + print " [%3d] %s (ID: %s)" % (int(tracknumber), trackname, trackid) + + alltags = [] + taglist = track.find('Tags') + + if taglist is not None: + for tag in taglist: + # idstr, weight + tagid = get_attribute(tag, 'idstr') + tagweight = get_attribute(tag, 'weight') + alltags.append("%s (weight %s)" % (tagid, tagweight)) + + print " %s" % ' '.join(alltags) + else: + print " No tags." + + print "" + element.clear()