2012-06-03 05:37:23 +02:00
import urllib , gzip , sys , argparse , sqlite3 , datetime , time
2012-06-03 03:10:59 +02:00
from lxml . etree import iterparse
2012-06-03 01:30:31 +02:00
# Download location of the gzip-compressed Jamendo artist/album/track dump.
# The literal must not carry surrounding whitespace: it is handed to the URL
# opener verbatim.
xml_url = "http://img.jamendo.com/data/dbdump_artistalbumtrack.xml.gz"
def update_progress(count, blocksize, totalsize):
    """Report download progress on stdout; used as a ``urlretrieve`` reporthook.

    Args:
        count: Number of blocks transferred so far.
        blocksize: Size of one block in bytes.
        totalsize: Total size of the download in bytes; zero or negative when
            the server did not announce a size.

    Nothing is computed or written when the module-level
    ``options['no_output']`` flag is set.
    """
    if options['no_output']:
        return
    received = count * blocksize
    if totalsize > 0:
        # The final block is usually only partly filled, so count * blocksize
        # can overshoot the real size; cap the figure at 100.
        percent = min(100, int(received * 100 / totalsize))
        message = "\rRetrieving Jamendo database... %2d%%" % percent
    else:
        # Unknown size: a percentage cannot be computed (dividing by zero
        # would raise), so show the number of bytes received instead.
        message = "\rRetrieving Jamendo database... %d bytes" % received
    sys.stdout.write(message)
    sys.stdout.flush()
2012-06-03 03:10:59 +02:00
def get_attribute(element, tagname):
    """Return the text of the first ``tagname`` child of ``element``.

    Args:
        element: Parsed XML element to search, or ``None``.
        tagname: Tag name of the child whose text is wanted.

    Returns:
        The child's text. An empty string is returned when ``element`` is
        ``None``, when the child is absent, when the child has no text, or
        when its text is the literal placeholder ``"None"``; callers can
        therefore always treat the result as a string.
    """
    if element is None:
        return ""
    child = element.find(tagname)
    # An empty element (<tag/>) has text None rather than the string "None";
    # both are treated as "no value".
    if child is None or child.text is None or child.text == "None":
        return ""
    return child.text
2012-06-03 01:30:31 +02:00
2012-06-03 03:23:37 +02:00
# Command-line interface: every flag is optional and has a usable default.
parser = argparse.ArgumentParser(
    description='Downloads and parses the Jamendo XML dump, and creates an SQLite database with all artist, album, track, and tag data.')
parser.add_argument('-D', dest='no_download', action='store_true',
                    help="don't download the XML dump and use an existing XML dump instead")
parser.add_argument('-N', dest='no_output', action='store_true',
                    help='prevents the application from outputting anything')
parser.add_argument('-d', dest='database', action='store', default='jamendo.db',
                    help='path of the database that should be used to store the data (will be created if it does not exist yet)')
parser.add_argument('-x', dest='xml_path', action='store', default='jamendo.xml.gz',
                    help='path to the Jamendo XML dump (this is the file that will be created when a new dump is downloaded)')
args = parser.parse_args()
# Plain dict view of the parsed arguments; the rest of the script reads its
# settings through options[...].
options = vars(args)

xml_file = options['xml_path']

if not options['no_download']:
    # urlretrieve lives in urllib on Python 2 and in urllib.request on
    # Python 3; resolve it locally so the download works on either.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    urlretrieve(xml_url, xml_file, reporthook=update_progress)
    if not options['no_output']:
        # The progress line is redrawn in place with "\r"; finish it with a
        # newline before anything else is printed.
        print("")
2012-06-03 01:30:31 +02:00
2012-06-03 03:25:47 +02:00
# Open (or create) the SQLite database and make sure the four tables exist.
database = sqlite3.connect(options['database'])
cursor = database.cursor()

# IF NOT EXISTS makes table creation a no-op when the database already has the
# tables, while genuine failures (unwritable path, locked or corrupt file)
# still raise sqlite3.OperationalError instead of being silently ignored.
cursor.execute("CREATE TABLE IF NOT EXISTS artists (`id`, `name`, `url`, `image`, `mbgid`, `location`)")
cursor.execute("CREATE TABLE IF NOT EXISTS albums (`id`, `artist_id`, `name`, `url`, `releasedate`, `filename`, `mbgid`, `license_artwork`)")
cursor.execute("CREATE TABLE IF NOT EXISTS tracks (`id`, `artist_id`, `album_id`, `name`, `filename`, `mbgid`, `tracknumber`, `genre`, `license`)")
cursor.execute("CREATE TABLE IF NOT EXISTS tags (`track_id`, `name`, `weight`)")
2012-06-03 03:10:59 +02:00
# Stream the gzip-compressed dump one <artist> element at a time and copy each
# artist together with its albums, tracks and tags into the database.
total = 0

# The with-block guarantees the gzip handle is closed, also on errors.
with gzip.open(xml_file) as xml:
    for event, element in iterparse(xml, tag="artist"):
        # id, name, url, image, mbgid, location, Albums
        artistid = get_attribute(element, 'id')
        name = get_attribute(element, 'name')
        url = get_attribute(element, 'url')
        image = get_attribute(element, 'image')
        mbgid = get_attribute(element, 'mbgid')
        location_element = element.find('location')
        if location_element is not None:
            country = get_attribute(location_element, 'country')
        else:
            country = ""
        cursor.execute("INSERT INTO artists VALUES (?, ?, ?, ?, ?, ?)",
                       (artistid, name, url, image, mbgid, country))

        # A missing container is treated as "no children", the same way the
        # Tags container is handled further down.
        albums = element.find('Albums')
        if albums is None:
            albums = ()
        for album in albums:
            # id, name, url, releasedate, filename, mbgid, license_artwork, Tracks
            albumid = get_attribute(album, 'id')
            albumname = get_attribute(album, 'name')
            albumurl = get_attribute(album, 'url')
            release_text = get_attribute(album, 'releasedate')
            if release_text:
                # Everything from the first "+" on (the UTC offset) is
                # dropped; the remainder is parsed as a naive timestamp and
                # converted to epoch seconds using the local timezone.
                release_time = datetime.datetime.strptime(
                    release_text.split('+')[0], '%Y-%m-%dT%H:%M:%S')
                albumrelease = int(time.mktime(release_time.timetuple()))
            else:
                # No release date in the dump: store an empty value rather
                # than aborting the whole import on a parse error.
                albumrelease = ""
            albumfilename = get_attribute(album, 'filename')
            albummbgid = get_attribute(album, 'mbgid')
            albumartworklicense = get_attribute(album, 'license_artwork')
            cursor.execute("INSERT INTO albums VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                           (albumid, artistid, albumname, albumurl, albumrelease,
                            albumfilename, albummbgid, albumartworklicense))

            tracks = album.find('Tracks')
            if tracks is None:
                tracks = ()
            for track in tracks:
                # id, name, filename, mbgid, numalbum, id3genre, license, Tags
                trackid = get_attribute(track, 'id')
                trackname = get_attribute(track, 'name')
                trackfilename = get_attribute(track, 'filename')
                trackmbgid = get_attribute(track, 'mbgid')
                tracknumber = get_attribute(track, 'numalbum')
                trackgenre = get_attribute(track, 'id3genre')
                tracklicense = get_attribute(track, 'license')
                cursor.execute("INSERT INTO tracks VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                               (trackid, artistid, albumid, trackname, trackfilename,
                                trackmbgid, tracknumber, trackgenre, tracklicense))

                taglist = track.find('Tags')
                if taglist is not None:
                    for tag in taglist:
                        # idstr, weight
                        tagid = get_attribute(tag, 'idstr')
                        tagweight = get_attribute(tag, 'weight')
                        cursor.execute("INSERT INTO tags VALUES (?, ?, ?)",
                                       (trackid, tagid, tagweight))

        if not options['no_output']:
            # "\r" redraws the counter in place on the same terminal line.
            sys.stdout.write("\rInserting artists... %6d done" % (total + 1))
            sys.stdout.flush()

        total += 1

        # Release the finished subtree. clear() empties the element but the
        # parent still references it, so also detach the already-processed
        # siblings; otherwise memory grows with the size of the dump.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

if not options['no_output']:
    print("")
    print("Parsed and inserted a total of %d artists." % total)

# All inserts above run in a single transaction; nothing is persisted until
# this commit.
database.commit()

if not options['no_output']:
    print("Changes committed to database.")