#!/usr/bin/perl -w # # This script is intended to assist you in renaming/retagging MP3 # files you've downl^Wripped from your legally-purchased CDs but # neglected to tag/name correctly. It uses FreeDB to check what your # file's supposed to be tagged as based on information gleaned from # the file. If you've got a griprc, it uses that to determine what the # file should be named. # # Basically, it's part of my ongoing quest to have my computer do damn # near everything for me, # # 22/06/2003 Created # use lib "$ENV{HOME}/src/perl"; use LWP::UserAgent; use HTML::TokeParser; use URI::Escape; use CDDB 1.15; # 1.15 required for file parsing use MP3::Info; use Getopt::Long; use File::Basename; use MP3::ID3Lib; use bytes; # because CDDB is ISO8859-1, everyone is. Tough luck. use strict; my $searchurl = 'http://www.freedb.org/freedb_search.php?'; $| = 1; my $debug = 1; # These are the current mirrors (January 2003) # June 2005: not responding? # freedb.freedb.de # de.freedb.org my @mirrorsites = qw( at.freedb.org au.freedb.org bg.freedb.org ca.freedb.org es.freedb.org fi.freedb.org lu.freedb.org no.freedb.org uk.freedb.org us.freedb.org ); my $id; my $force = 0; my $track_override = 0; GetOptions( "id=s" => \$id, "force!" => \$force, "track=i" => \$track_override ); # from Gronk's config.pl my $force_lowercase = 0; my $force_underscore = 0; my $safechars = "a-z0-9" . # alphanumerics "_()"; # punctuation # lowercase ISO chars for my $i ( 0xe0..0xff ) { $safechars .= pack( "C", $i ); } # make sure we don't cripple ourselves in cddb.pl::string_to_file if ( $force_underscore ) { $safechars .= "_"; } else { $safechars .= " "; } # Open a connection for each mirror my @mirrors; for my $mirror ( @mirrorsites) { my $cddbp = new CDDB( Host => $mirror, Port => 888 ); if ( defined( $cddbp )) { push @mirrors, $cddbp; } else { print STDERR "Failed to connect to $mirror\n"; } } my $cddbp = $mirrors[ 0 ]; my $ua = new LWP::UserAgent; $ua->env_proxy(); my @fullmatch; if ( my @discids = grep /discid/, @ARGV ) { my $discid = shift @discids; open( DISCID, $discid ) or die "$discid: $!"; my $newid = ; close( DISCID ); chomp( $newid ); if ( !defined( $id )) { $id = $newid; } } my ( $artist, $album, $track, $length, $tnum ); while ( my $file = shift ) { next if $file =~ /discid/; if ( -d $file ) { $file =~ s/\/$//; # canonicalise if ( opendir( DIR, $file )) { push @ARGV, map { "$file/$_" } grep !/^\.\.?$/, readdir( DIR ); closedir( DIR ); next; } else { die "can't open $file: $!\n"; } } if ( ! -e $file ) { print "$file does not exist!\n"; next; } print "$file\n"; $#fullmatch = -1; my $tag = get_mp3tag( $file ); if ( !defined( $tag )) { print " No MP3 tags found!\n"; my $filetype = `file '$file'`; chomp( $filetype ); if ( $filetype !~ "MP3|MPEG Layer 3" ) { print " doesn't appear to be an MP3 file, even\n"; next; } } $artist = $tag->{'ARTIST'} || ""; $album = $tag->{'ALBUM'} || ""; $track = $tag->{'TITLE'} || ""; $tnum = $tag->{'TRACKNUM'} || 0; # ARGH. Stupid API change demands stupid fix. for my $var qw( artist album track tnum ) { eval "if ( ref \$$var ) { \$$var = \$${var}->[-1]; \$$var =~ s/\\0\$//; };"; } if ( !$track ) { $track = basename( $file ); # attempt to strip out useless bits $track =~ s/\.mp3|\.m4a$//i; # extension if ( $tnum ) { print STDERR "discarding $tnum from $track\n"; $track =~ s/\b$tnum\b(\s*-\s*)//; } if ( title_compare( $album, $track )) { $track = title_compare( $album, $track ); } if ( title_compare( $artist, $track )) { $track = title_compare( $artist, $track ); } } if ( !$artist ) { my ( $a, $t ) = split( /\s*-\s*/, $track ); if ( $a and $t ) { $artist = $a; $track = $t; } } if ( defined( $tnum ) and $tnum !~ /^\d+$/ ) { $tnum =~ s/\/.*$//; # more muppets } if ( defined( $tnum ) and $tnum =~ /([^0-9])/ ) { print STDERR "Muppet alert, discarding $tnum.\n"; $tnum = 0; } if ( $tnum == 0 ) { if ( $track =~ /Track.*?(\d+)/ ) { $tnum = $1; } elsif ( $track =~ /^(\d+[ -]+)/ ) { $tnum = $1; } else { print STDERR "uhoh, can't get a tnum\n"; } } if ( defined( $track_override ) and $track_override ) { $tnum = $track_override; } my $mp3info = get_mp3info( $file ); $length = $mp3info->{SECS} if $mp3info; $length ||= 0; if ( $debug ) { print "Extracted info:\n"; print "Artist: $artist\n"; print "Album: $album\n"; print "Track: $track\n"; print "Track number: $tnum Track Time: $length seconds\n"; } my %guessed = ( "artist" => $artist, "album" => $album, "track" => $track, "tnum" => $tnum, "length" => $length, ); if ( defined( $id )) { my ( $g, $i ) = split( /[\/ ]/, $id ); $g ||= ""; $i ||= $id; # CDDB file or ID specified on the command line. my $disc_details; if ( open( XMCD, "<$id" )) { print " Reading from file\n"; binmode( XMCD ); my @lines = ; $disc_details = CDDB::parse_xmcd_file( \@lines ); } else { print " Reading from cache '$g' '$i'\n"; $disc_details = getcddbfile_cache( $g, $i ); } if ( !defined( $disc_details )) { die "erm. Can't figure out what to do with $id!\n" } # now we need to identify which track this is... my @bits; # genre, id, disc title, track title, duration, tracknum push @bits, ""; # we don't know the genre. push @bits, $disc_details->{discid}; push @bits, $disc_details->{dtitle}; $tnum = find_track_in_album( $disc_details, $file, \%guessed ); if ( !defined( $tnum )) { print STDERR "Can't find this track on the album you specified...\n"; if ( defined( $guessed{tnum})) { print STDERR "Going by existing track number, it should be:\n"; print STDERR $disc_details->{ttitles}->[$guessed{tnum} - 1]; print STDERR "\n"; } if ( !$force ) { die "Stopped"; } else { $tnum = $guessed{tnum}; } } push @bits, $disc_details->{ttitles}->[$tnum - 1]; push @bits, $disc_details->{seconds}->[$tnum - 1]; push @bits, $tnum - 1; push @fullmatch, \@bits; } else { # try successively more desperate searches get_cddb_matches( $artist, $album, $track, $length ); if ( !@fullmatch ) { get_cddb_matches( $artist, "", $track, $length ); if ( !@fullmatch ) { get_cddb_matches( "", "", $track, $length ); if ( !@fullmatch ) { # at this point, it's likely that the trackname is # hosed. try and build it from the filename. my $strack = $track; $track = $file; # attempt to strip out useless bits $track =~ s/\.mp3|\.m4a$//i; # extension if ( $tnum ) { $track =~ s/\b$tnum\b(\s*-\s*)//; } if ( title_compare( $album, $track )) { $track = title_compare( $album, $track ); } if ( title_compare( $artist, $track )) { $track = title_compare( $artist, $track ); } if ( $track ne $strack ) { get_cddb_matches( "", "", $track, $length ); } } } } # prune fullmatch my %fullmatch; for my $match ( @fullmatch ) { my $dtitle = $match->[2]; my $rdtitle = quotemeta( $dtitle ); my @already = grep /^$rdtitle$/i, keys %fullmatch; if ( !@already ) { $fullmatch{$dtitle} = $match; } my $contender = $dtitle; for my $already ( @already ) { if ( $contender lt $already ) { delete $fullmatch{$contender}; $fullmatch{$already} = $match; $contender = $already; } } } @fullmatch = values %fullmatch; } if ( scalar ( @fullmatch ) == 1 ) { # dump out command line my $dtitle = $fullmatch[0]->[2]; my $ttitle = $fullmatch[0]->[3]; my ( $artist ) = split( '/', $dtitle ); if ( $artist =~ /^various/i ) { ( $artist, $ttitle ) = split( '/', $ttitle, 2 ); } ( undef, $dtitle ) = split( '/', $dtitle, 2 ); # slop $dtitle =~ s/^\s+//; $dtitle =~ s/\s+$//; $artist =~ s/^\s+//; $artist =~ s/\s+$//; $ttitle =~ s/^\s+//; $ttitle =~ s/\s+$//; # string escapes $dtitle =~ s/"/\\"/g; $artist =~ s/"/\\"/g; $ttitle =~ s/"/\\"/g; $file =~ s/"/\\"/g; # fixme: include genre print "CDDB ID : " . $fullmatch[0]->[1] . "\n"; if ( abs( $fullmatch[0]->[4] - $length ) > 1 ) { print "length mismatch: $length should be " . $fullmatch[0]->[4] . "\n"; } print "mp3name.pl --nodry-run"; print " --album \""; print $dtitle; print "\" --artist \""; print $artist; print "\" --title \""; print $ttitle; print "\" --track "; print $fullmatch[0]->[5] + 1; print " \"$file\""; print "\n"; } else { print scalar(@fullmatch) . " match(es) found\n"; for my $details ( @fullmatch ) { print " =================================\n"; print " | file details:\n | " . $details->[2] . " / " . $details->[3] . "\n"; print " | track number: " . ($details->[5] + 1) . "\n"; print " | length: " . $details->[4] . "\n"; print " | discid file should contain " . $details->[1] . "\n"; print " =================================\n"; print "\n"; } } } # avoid stupid warning message. while ( @mirrors ) { my $cddbp = shift @mirrors; $cddbp->disconnect(); } # get a single CDDB file matching GENRE, ID and check it for TITLE, LENGTH sub getcddbfile { my ( $genre, $id ) = split( ' ', $_[0] ); my $title = $_[1]; my $length = $_[2]; my $artist = $_[3]; # Do the lookup my $disc_details = getcddbfile_cache( $genre, $id ); if ( defined( $disc_details )) { my @ttitles = @{$disc_details->{ttitles}}; my $offset = 0; for my $ttitle ( 0 .. $#ttitles ) { $ttitles[$ttitle] ||= "blank"; my $ltext = lc( $ttitles[$ttitle] ); my $ltitle = lc( $title ); printf( "%2d. %s %ds", ( $ttitle + 1 ), $ttitles[$ttitle], $disc_details->{seconds}->[$ttitle] ); $ltext = title_compare( $ltitle, $ltext ); $ltext =~ s/^CD$//i; # acceptable lossage if ( $ltext ) { $ltext = title_compare( lc( $artist ), $ltext ); if ( $ltext ) { print "\n -> post-fuzzy track match: $ltext "; } } if ( !$ltext ) { # gronk allows one second of slop if ( $length and abs( $disc_details->{seconds}->[$ttitle] - $length ) > 1 ) { printf( " -> MATCHED, BUT WRONG LENGTH! (%f vs %f)", $disc_details->{seconds}->[$ttitle], $length ); } else { print " -> MATCH!"; push @fullmatch, [ $genre, $id, $disc_details->{dtitle}, $ttitles[$ttitle], $disc_details->{seconds}->[$ttitle], $ttitle ]; } } elsif ( index( lc( $ttitles[$ttitle]), lc( $title ), 0 ) != -1 ) { print " -> PARTIAL MATCH!"; push @fullmatch, [ $genre, $id, $disc_details->{dtitle}, $ttitles[$ttitle], $disc_details->{seconds}->[$ttitle], $ttitle ]; } else { print " -> didn't match $title"; print " ${length}s" if $length; } print "\n"; } } else { print STDERR "Can't get $genre $id\n"; } } # search for possible CDDB matches sub get_cddb_matches { my ( $artist, $album, $track, $length ) = @_; my $searchstring = "$artist $album $track"; my $save = $searchstring; $searchstring =~ s/\bthe\b//gi; if ( !$searchstring ) { $searchstring = $save; } $searchstring =~ s/[\(\)]//g; $searchstring =~ s/^\s+//; $searchstring =~ s/\s+$//; $searchstring =~ s/\s+/ /g; print " -> Web search for $searchstring\n"; $searchstring = uri_escape( $searchstring ); my $fields = ""; $fields .= "fields=artist" if $artist; $fields .= "&" if $fields; $fields .= "fields=title" if $album; $fields .= "&" if $fields and substr( $fields, -1 ) ne "&"; $fields .= "fields=track" if $track; my $req = new HTTP::Request GET => $searchurl . "grouping=none&allcats=YES&allfields=NO&$fields&words=$searchstring"; my $res = $ua->request( $req ); my @match; if ( $res->is_success ) { my $content = $res->content; my $parser = new HTML::TokeParser( \$content ); my $previous_dtitle = ""; if ( $debug ) { open( DUMP, ">$ENV{HOME}/tmp/cdthingdebug.html" ); print DUMP $content; close( DUMP ); } while ( my $tag = $parser->get_tag( "a" )) { next unless $tag->[1]->{class}; if ( $tag->[1]->{class} =~ m@searchResultTopLinkA@i ) { my $text = $parser->get_trimmed_text( "/a" ); if ( $text =~ /^\d+$/ ) { $text = $previous_dtitle; } $previous_dtitle = $text; print "Found>: $text"; while ( $tag = $parser->get_tag( "b" )) { if ( $parser->get_trimmed_text( "/b" ) eq "Disc-ID:" ) { $id = $parser->get_trimmed_text( "/a" ); $id =~ s/\///; $id =~ s/\s+/ /; last; } } print " ($id)"; if ( lc($text) eq lc( "$artist / $album" )) { print " -> match!\n"; push @match, $id; getcddbfile( $id, $track, $length, $artist ); } else { # try fuzzy matching my $ltext = lc( $text ); my $lartist = lc( $artist ); my $lalbum = lc( $album ); # high stupidity in action. if ( $text =~ /^various(\s+artists)*\s*(\(.*?\))/i ) { $ltext =~ s/$2//i; } if ( !$lartist ) { $ltext =~ s@^.*/\s*@@; } elsif ( !$lalbum ) { $ltext =~ s@/\s*.*$@@; } if ( $lartist or $lalbum ) { $ltext = title_compare( $lartist, $ltext ); $ltext = title_compare( $lalbum, $ltext ); $ltext =~ s/\bCD\b//i; # acceptable lossage $ltext = title_compare( "various artists", $ltext ); $ltext = title_compare( "various", $ltext ); } else { $ltext = ""; } if ( $ltext ) { if ( $text =~ /soundtrack/ ) { $ltext = title_compare( "original", $ltext ); $ltext = title_compare( "motion picture", $ltext ); } } if ( $ltext ) { if ( $ltext =~ /remixed by/i ) { $ltext =~ s/\s*remixed by\s*//i; } } if ( $ltext ) { print "\n -> post-fuzzy title match: [$ltext]"; } else { print "\n"; push @match, $id; getcddbfile( $id, $track, $length, $artist ); } print "\n"; } } } } else { warn $res->code; next; } } # This is from Gronk's cddb.pl # Given a piece of text (a song title or band name) converts it to something # usable as a file name using the same algorithm as Grip: downcase, delete # all non-alphanumerics, and map space to underscore. # sub string_to_file { local $_ = shift; # protect $_ # note: do not localize this: Grip doesn't. tr/A-Z/a-z/ if $force_lowercase; # downcase s@ @_@g if $force_underscore; # now map space to underscore s/[^$safechars]//gi; # delete unsafe chars return $_; } # fuzzy match for movie titles, will use for album title comparison sub title_compare { my @args = @_; map { $_ = uc( $_ ); s/\(\d{4}\)//; # year s/[^A-Z0-9]/ /g; s/\s+/ /g; s/^\s+//; s/\s+$//; } @args; my @twords1 = split( /\s/, $args[0] ); my $t2 = $args[1]; for my $word ( @twords1 ) { $t2 =~ s/\b$word\b//; } $t2 =~ s/\s+/ /g; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//; # if there's anything left in t2 at this point, we've probably got # a mismatch. return $t2; } # caching CDDB files sub getcddbfile_cache { my ( $gen, $id ) = @_; my $site = shift @mirrors; push @mirrors, $site; my @genres; # if genre is undefined, cycle over the lot of 'em. if ( !$gen ) { @genres = $site->get_genres(); } else { push @genres, $gen; } my @results; for my $genre ( @genres ) { my $disc_details; if ( $#genres > 0 ) { print STDERR " trying $genre/$id\n" if $debug; } $site = shift @mirrors; push @mirrors, $site; my @cache = ( "$ENV{HOME}/.cddb/$id", "$ENV{HOME}/.cddb/$genre/$id", "$ENV{HOME}/.cddb-web/$genre/$id", ); for my $cachefile ( @cache ) { if ( open( XMCD, "<$cachefile" )) { binmode( XMCD ); my @lines = ; my $lines = \@lines; $disc_details = CDDB::parse_xmcd_file( \@lines ); close( XMCD ); print " unable to parse $cachefile!\n" unless $disc_details; last if $disc_details; } else { print " unable to open $cachefile ($!)\n" if -f $cachefile; } } if ( !defined( $disc_details )) { print " fetching $id from " . $site->{host} . "..."; $disc_details = $site->get_disc_details( $genre, $id ); print "done\n"; # now cache for future use if ( defined( $disc_details )) { print STDERR " saving $genre/$id\n" if $debug; -d "$ENV{HOME}/.cddb-web" or mkdir "$ENV{HOME}/.cddb-web", 0755; -d "$ENV{HOME}/.cddb-web/$genre" or mkdir "$ENV{HOME}/.cddb-web/$genre", 0755; if ( open( XMCD, ">$ENV{HOME}/.cddb-web/$genre/$id" )) { print XMCD $disc_details->{xmcd_record}; close( XMCD ); } } } push @results, $disc_details if $disc_details; } if ( $#results > 0 ) { die "Multiple results found for $id\n"; } else { return $results[0]; } } # used when we know for certain that the track we want is in the # specified album. returns the track number. sub find_track_in_album { my $disc_details = shift; my $file = shift; my $guessed = shift; my $mp3info = get_mp3info( $file ); my $length = $mp3info->{SECS}; my $id3 = new MP3::ID3Lib( $file ); for my $ttitle ( 0 .. $#{$disc_details->{ttitles}}) { my $ondisc_name = $disc_details->{ttitles}->[$ttitle]; my $ondisc_len = $disc_details->{seconds}->[$ttitle]; # if there's an exact match, but the lengths don't match, # maybe there's another track with the same title? Really need # to keep a list of possible matches and then postprocess it. XXX if ( $ondisc_name eq $guessed->{track} ) { print "Name match (track " . ( $ttitle + 1 ) . ")..."; if ( $ondisc_len != $length ) { print "length mismatch ($ondisc_len vs $length), throwing it back\n"; # next; } print "\n"; } my $ltext = title_compare( $guessed->{track}, $ondisc_name ); if ( $ltext ) { if ( title_compare( $ltext, $ondisc_name )) { # cool, partial match. let's try tossing the artist. $ltext = title_compare( $guessed->{artist}, $ltext ); if ( !$ltext ) { return $ttitle + 1; } # stupid, but it happens if ( $ltext =~ /\bmix(ed)?\b/i and $ondisc_name =~ /\bremix(ed)?\b/i ) { return $ttitle + 1; } if ( $ltext =~ /\bremix(ed)?\b/i and $ondisc_name =~ /\bmix(ed)?\b/i ) { return $ttitle + 1; } # close enough for government work if ( abs( $ondisc_len - $guessed->{length}) <=1 ) { return $ttitle + 1; } # substrings, woo. this is actually bad, and stupid, # and whatnot, but will do for now. my $glc = quotemeta( lc( $guessed->{track})); my $olc = lc( $ondisc_name ); if ( $olc =~ /^$glc/ ) { return $ttitle + 1; } } } else { # exact match return $ttitle + 1; } } return undef; }