#!/usr/bin/perl -w # Thread a bunch of mail messages # Based on jwz's threading algorithm # Created: Waider August 29 2000 # Modified: # Waider |2001/03/14| clean up and comment code, add HTMLifyer # Waider |2001/03/15| tweak threading algorithm, date sorting # Waider |2001/03/20| adding navigation # Waider |2001/03/24| discarded nav, trying to improve threading further # ----------------------------------------------------------------------------- package Message; use Date::Parse; # Message object, consisting of: # author # date # subject # msgid # references/in-reply-to # reference to message itself # Constructor sub new { my ( $type, $subject, $id, $msg ) = @_; my $self = bless {}, $type; $self->{'subject'} = $subject; $self->{'message_id'} = $id; $self->{'message'} = $msg; $self; } # Author accessor (read/write) sub author { my ( $self, $auth ) = @_; if ( defined( $auth )) { $self->{'author'} = $auth; } $self->{'author'}; } # Date accessor (read/write) sub date { my ( $self, $date ) = @_; if ( defined( $date )) { $self->{'date'} = str2time( $date ) || die "Can't parse $date\n"; } $self->{'date'}; } # Subject accessor (read-only) sub subject { my $self = shift; $self->{'subject'}; } # Message Id accessor (read-only) sub message_id { my $self = shift; $self->{'message_id'}; } # References accessor (read-only) sub references { my $self = shift; $self->{'references'}; } # Add a reference sub add_reference { my ( $self, $ref ) = @_; $self->{'references'} = push @{$self->{'references'}}, $ref; } # Message accessor (read/write) sub message { my ( $self, $msg ) = @_; if ( defined( $msg )) { if ( defined( $self->{'message'})) { die "Duplicate message ids"; } else { $self->{'message'} = $msg; } } $self->{'message'}; } # ----------------------------------------------------------------------------- package Container; # Container object, consisting of: # message (Message object) # parent (Container object) # children (array of Container objects) # date (date of message or youngest child) sub new { my ( $type, $msg ) = @_; my $self = bless {}, $type; $self->message( $msg ); $self; } # Message accesor (read/write) sub message { my ( $self, $msg ) = @_; if ( defined( $msg )) { $self->{'message'} = $msg; $self->date( $msg->date ); } $self->{'message'}; } # Message ID accessor (read-only) sub message_id { my ( $self ) = @_; if ( !defined( $self->{'message'})) { return undef; } return $self->{'message'}->message_id; } # Date accesor (read/write) sub date { my ( $self, $date ) = @_; if ( defined( $date )) { $self->{'date'} = $date if ( !defined( $self->{'date' }) || ( $date < $self->{'date'})); } $self->{'date'}; } # Parent accessor (read/write) sub parent { my ( $self, $cont ) = @_; if ( defined( $cont )) { $self->{'parent'} = $cont; if ( ref( $cont ) eq "Container" ) { $cont->date( $self->{'date'} ); } } $self->{'parent'} || ""; } # Child accessor (read/write) sub child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; my @cref; if ( defined( $cont )) { if ( defined( $cref )) { @cref = @{$cref}; } if ( !$self->is_child( $cont )) { # hmm. I maybe shouldn't need this. push @cref, $cont; $self->{'child'} = \@cref; } } if ( defined( $cref )) { @{$cref}; } else { @cref; } } # Is the specified container a child of this container? sub is_child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; if ( defined( $cref )) { for my $c ( @{$cref} ) { if ( $c eq $cont ) { return 1; } } } return 0; } # Return a string representing the Container: # message id if available, # [no message id] if no message id # [empty container] if no message sub as_string { my ( $self ) = @_; if ( defined( $self->message )) { if ( defined( $self->message->message_id )) { return $self->message->message_id; } else { return "[no message id]"; } } else { return "[empty container]"; } } # Delete the specified child sub delete_child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; if ( defined( $cref )) { my @cref; for my $c ( @{$cref} ) { if ( $c ne $cont ) { push @cref, $c; } } $self->{'child'} = \@cref; # store } } # ------------------------------ package main; my %id_table; my @roots; # regexp for pruning/matching RE: DSPsrv: etc. AW is german, I # think. FW is for forwards. my $re_exp = "^\\s*((\\[?dspsrv[:\\s\\]]*)?((re|aw|fw)(\\[\\d+\\])*: *))+"; # Parse the text of an In-Reply-To header into a valid message ID. sub parse_inreplyto { my ( $rep ) = @_; my $ref; # We could do smarter stuff here, like parsing "FOO's message of BAR" ( $ref ) = $rep =~ m|.*?(\<.*?\>)|; warn "$rep" if $rep =~ /message of/i && !defined($ref); $ref = msgid_clean( $ref ) if defined( $ref ); $ref; } # Parse a References header into a list of message IDs. sub parse_references { my ( $ref ) = @_; my @refs; for my $r ( split( /\s+/, $ref )) { push @refs, msgid_clean( $r ); } # fixme validate @refs; } # Run a loop check on adding the second parameter as a child of the # first, both Containers. sub not_a_loop { my ( $a, $b ) = @_; my ( $rnext, @next ); die "Wrong arg types" unless ref( $a ) eq "Container" and ref( $b ) eq "Container"; return 0 if $a eq $b; @next = $b->child; return 1 if !( @next ); # recurse! for my $next ( @next ) { return 1 if !defined( $next ); return 0 unless not_a_loop( $a, $next ); } return 1; } use lib "$ENV{HOME}/src/perl"; use MboxHack; # because Mail::Folder::Mbox blows goats use MIME::Parser; use Storable; use Date::Format; use diagnostics; select( STDERR ); $| = 1; select( STDOUT ); # Iterate over supplied folders while ( @ARGV ) { my $file = shift;; if ( !( $folder = new MboxHack $file )) { print STDERR "$file is not a mbox folder\n"; next; } $folder->set_readonly(); $count = $folder->qty(); # Make folder directory my $htmldir = "$ {file}-html"; -d $htmldir or mkdir $htmldir, 0755; # Load the id_table if ( -f "idtable" ) { print STDERR "Loading idtable..."; $idref = retrieve( "idtable" ); %id_table = %{$idref}; print STDERR "done.\n"; } # Iterate over messages in the folder print STDERR "Processing $count messages"; for my $i ( 1 .. $count ) { # For each message: my $message = $folder->get_message( $i ); print STDERR "."; # Clean up the message here $message = clean_message( $message ); # HTMLify the message, preserving any navigation stuff that's # already there. my @current_html; if ( open( HTML, "<$ {htmldir}/$ {i}.html" )) { @current_html = grep /^\<\!-- nav /, ; close( HTML ); } open( HTML, ">$ {htmldir}/$ {i}.html" ); print HTML join( "", @current_html ) if @current_html; print HTML message_to_html( $message, $file, $i, $count ); close( HTML ); # Get required head bits my $head = $message->head(); my $author = $head->get( "From" ); my $subj = $head->get( "Subject" ) || ""; my $msgid = $head->get( "Message-ID" ) || ""; my $date = $head->get( "Date" ); my $refs = $head->get( "References" ) || ""; my $inrep = $head->get( "In-Reply-To" ) || ""; # ARGH. chomp( $author ); chomp( $subj ); $msgid = msgid_clean( $msgid ); chomp( $date ); chomp( $refs ); chomp( $inrep ); my @refs = parse_references( $refs ); if (!@refs && $inrep ) { my $r = parse_inreplyto( $inrep ); defined( $r ) and push @refs, $r; } # 1 A. Check if it's in the table; if it is, stash the message, if # not, make an empty container and stash the message. my $cont = $id_table{ $msgid } || new Container; my $msg = new Message( $subj, $msgid, $folder->get_message_hash( $i )); $msg->author( $author ); $msg->date( $date ); $cont->message( $msg ); $id_table{ $msgid } = $cont; # No references? Skip to the next message. if ( !@refs ) { next; } # 1 B. For each reference: for my $i ( 0..$#refs ) { my $ref = $refs[ $i ]; # find a container my $rcont = $id_table{ $ref }; # if there isn't one, make one with a null message. if ( !defined( $rcont )) { $rcont = new Container( new Message( undef, $ref, undef )); $id_table{ $ref } = $rcont; } # 1 B 2 Link the messages together in the order implied by the # References header, except # 1. don't break existing links and # 2. don't make loops. if ( $i > 0 ) { # can't link zeroth element to anything above it my $parent_ref = $refs[ $i - 1 ]; my $parent = $id_table{ $parent_ref }; # Do we have a parent? if ( !$rcont->parent ) { # loop check if ( not_a_loop( $parent, $rcont )) { $rcont->parent( $parent ); $parent->child( $rcont ); } } } } # 1 C # Set parent of this message = $refs[ -1 ]; if ( $cont->parent ) { my $old = $cont->parent; if ( defined( $old )) { $old->delete_child( $cont ); } } my $parent = $id_table{ $refs[ -1 ]}; # Has to exist! if ( defined( $parent )) { if ( not_a_loop( $cont, $parent )) { $cont->parent( $parent ); $parent->child( $cont ); } } } # end of message iteration loop, i.e. we're done reading the mailbox. # Safe (and prudent) to close the folder now. $folder->close(); print STDERR "done.\n"; print STDERR "Saving idtable..."; store \%id_table, "idtable"; print STDERR "done.\n"; # 2 Find root set, i.e. all messages with no parents for my $m ( keys %id_table ) { if ( $id_table{ $m }->parent) { } else { push @roots, $id_table{$m}; } # 3 Discard id_table # delete $id_table{ $m } unless @ARGV; # let's not. let's keep it for message output below. } # 4 prune empties print STDERR "Pruning empties..."; @roots = map { prune_empties( $_ ) } @roots; @roots = grep { defined( $_ )} @roots; print STDERR "done.\n"; # 5 group root set by subject print STDERR "Grouping roots by subject..."; # 5 A new hashtable my %subject_table; for my $cont ( @roots ) { # 5 B Find subject of this subtree my $subject; if ( defined( $cont->message )) { $subject = $cont->message->subject || ""; } else { $subject = ($cont->child)[0]->message->subject; } # Strip out rubbish $subject = subject_clean( $subject ); $subject or next; # give up if subject is now effectively useless. if ( !defined( $subject_table{ $subject })) { $subject_table{ $subject } = $cont; } else { my $oldm = $subject_table{ $subject }->message; my $newm = $cont->message; # empty supersedes non-empty if ( defined( $oldm ) && !defined( $newm )) { $subject_table{ $subject } = $cont; } else { if ( defined( $newm )) { # non-RE: supersedes RE: if ( $oldm->subject =~ /$re_exp/i && $newm !~ /$re_exp/i ) { $subject_table{ $subject } = $cont; } } } } } print STDERR "done.\n"; my @old_roots = map { defined( $_ )?$_:() } @roots; @roots = (); # 5 C # Gather together the difference (what?) print STDERR "Threading remainder..."; for my $cont ( @old_roots ) { # Find subject of this subtree. my $subject; if ( defined( $cont->message )) { $subject = $cont->message->subject || ""; } else { $subject = ($cont->child)[0]->message->subject; # whee! } # Strip out rubbish $subject = subject_clean( $subject ); # give up loop if subject is now effectively useless, but save it # as a root. if ( !$subject ) { push @roots, $cont; next; } # skip it if the hash is this container or a null container my $orig = $subject_table{ $subject }; if ( $orig eq $cont ) { push @roots, $cont unless grep { $_ eq $orig } @roots; next; } if ( !defined( $orig->message )) { push @roots, $orig unless grep { $_ eq $orig } @roots; next; } # Now all hell breaks loose. # If they're both empties, merge the kids. if ( !defined( $orig->message ) && ( !defined( $cont->message ))) { for my $c ( $cont->child ) { $orig->child( $c ); # $cont->delete_child( $c ); no need, we're discarding it anyway. } # Stack it if it's not already in the list. push @roots, $orig unless grep { $_ eq $orig } @roots; $cont = undef; # trash the original next; } # If one's empty and the other's not, make the non-empty be a # child of the empty. # If one is non-empty, non-re, and this one is not empty and # re, make the re a child of the non-re. if (( !defined( $orig->message ) && defined( $cont->message )) or ( defined( $orig->message ) && defined( $cont->message ) && $orig->message->subject !~ /$re_exp/i && $cont->message->subject =~ /$re_exp/i )){ # original is empty, current is not. $orig->child( $cont ); $cont->parent( $orig ); @roots = map { $_ eq $cont ? () : $_ } @roots; push @roots, $orig unless grep { $_ eq $orig } @roots; next; } if (( defined( $orig->message ) && !defined( $cont->message )) or ( defined( $orig->message ) && defined( $cont->message ) && ( $cont->message->subject !~ /$re_exp/i ) && ( $orig->message->subject =~ /$re_exp/i ))) { # original is not empty, current is. $cont->child( $orig ); $orig->parent( $cont ); @roots = map { $_ eq $orig ? () : $_ } @roots; push @roots, $cont unless grep { $_ eq $cont } @roots; next; } # If all else fails, make a new empty and make both of these # siblings, but only if these are both Re: messages. # The latter prevents false trees from being created. if ( defined( $orig->message ) && defined( $cont->message ) && ( $cont->message->subject !~ /$re_exp/i ) && ( $orig->message->subject !~ /$re_exp/i )) { push @roots, $cont; } else { my $ncont = new Container; $ncont->child( $orig ); $orig->parent( $ncont ); $ncont->child( $cont ); $cont->parent( $ncont ); $subject_table{ $subject } = $ncont; # Clean up roots list @roots = map { $_ eq $orig ? () : $_ } @roots; push @roots, $ncont; } } print STDERR "done.\n"; # 6 we could now discard the parent links, really # 7 order the siblings by date (or whatever) } # Dump out threading information print STDERR "Dumping messages..."; print "\n"; print STDERR "done.\n"; # Sort containers by date sub container_date_sort { my ( $c1, $c2 ) = ( $a, $b ); my $d1 = defined( $c1->date ) ? $c1->date : 0; my $d2 = defined( $c2->date ) ? $c2->date : 0; $d1 <=> $d2; } # DIAGNOSTIC, KINDA sub dump_message { my ( $container, $i ) = @_; my $message = $container->message; my $subject = "[no subject]"; my $author = "[no author]"; my $date = "[no date]"; my $msgid = "[no message]"; if ( defined( $message )) { $subject = $message->subject; $subject ||= "[no subject]"; $msgid = $message->message_id; $msgid ||= "[no msgid]"; $date = $message->date; $date ||= 0; $author = $message->author; $author ||= "[no author]"; $author =~ s/\message ) { $subject = "[EMPTY MESSAGE]"; } } my $m; if ( $msgid eq "[no message]" ) { print "[PARENT NOT FOUND]" } else { if ( defined( $msgid ) && ( $m = find_ref( $msgid ))) { $m =~ s|/|-html/|; $m.= ".html"; print qq($subject); } else { print "$subject"; } print qq( $author ); print time2str( "%b %d", $date ); } if ( $container->child ) { print "\n"; } } # Clean up a message id sub msgid_clean { my $msgid = shift; chomp( $msgid ); $msgid =~ s/[\<\>]//g; $msgid; } # Look up a message ID and see if we can link it. sub find_ref { my $msgid = msgid_clean( $_[ 0 ] ); if ( defined( $id_table{ $msgid })) { my $msg = $id_table{ $msgid }->message; if ( defined( $msg )) { my $href = $msg->message; if ( defined( $href )) { return $href->[0] . "/" . $href->[2]; } } } return undef; } # Return a possible link for a message ID. sub maybe_link_msgid { my $l = shift; my $li = find_ref( $l ); if ( defined( $li )) { $li =~ s|/|-html/|; "<$l>"; } else { "<$l;>"; } } # Convert a message to HTML. sub message_to_html { my ( $message, $file, $i, $count, $nottop ) = @_; my $parser = new MIME::Parser; my $output_dir = "$ {file}-html/$ {i}"; my $ret = ""; $nottop ||= 0; # Stock goop. if ( !$nottop ) { $ret .= "$file, message $i\n"; $ret .= "\n"; } # Clean up output dir before we parse if ( -d $output_dir && !$nottop ) { if ( opendir( DIR, $output_dir )) { my @files = grep !/^\.\.?$/, readdir( DIR ); closedir( DIR ); for my $f ( @files ) { unlink "$output_dir/$f"; } } } else { mkdir $output_dir, 0755; } $parser->output_dir( $output_dir ); $parser->output_to_core( 0 ); my @lines = ( @{$message->header}, "\n", @{$message->body}); my $entity; if ( $entity = $parser->parse_data( \@lines )) { if ( !$entity->is_multipart ) { $entity->make_multipart; # force mail to always be multipart. easier. } # Dump out headers my $head = $message->head; $ret .= "\n"; # for my $h ( qw( Date From Subject Message-Id References In-Reply-To )) { for my $h ( $head->tags ) { # Skip some headers that we couldn't throw away next if ( $h =~ /^(Content-Type|Content-Transfer-Encoding|Content-Length|MIME-Version|From )$/i ); my $text = $head->get( $h ); $text ||= ""; chomp( $text ); # lose lose lose if ( $h =~ /^(Message-Id|References|In-Reply-To)$/ ) { $text =~ s|\<([^\n>]+)\>|maybe_link_msgid( $1 )|eig; } else { $text =~ s/\\n\  \  \ /g; # wrap a la Mail $ret .= qq(\n) if $text; } $ret .= "
$h: $text
\n
\n"; # Now deal with the body. my $content_type = $entity->effective_type; $content_type ||= "text/plain"; if ( $content_type eq "multipart/alternative" ) { my @parts; for my $p ( $entity->parts ) { if ( $p->effective_type eq "text/html" ) { # selecting HTML push @parts, $p; } else { if ( defined( $p->bodyhandle )) { unlink( $p->bodyhandle->path ); # discard text } } } $entity->parts( \@parts ); } for my $p ( $entity->parts ) { my $ctype = lc( $p->effective_type ); # deal with bogosity $ctype = "text/plain" if $ctype eq "text"; if ( $ctype =~ m|^text/| ) { my $fh = $p->bodyhandle->open( "r" ); my $line; if ( $ctype eq "text/plain" ) { $ret .= "
\n";
		}

		# have to use this interface to get the mimedecoded version
		while ( defined( $line = $fh->getline )) {
		  if ( $ctype eq "text/plain" ) {
			$line =~ s/\/\>/g;
			#$line =~ s/\&/\&/g;
		  }
		  $ret .= $line;
		}

		# close the file, thanks.
		$fh->close;

		if ( $ctype eq "text/plain" ) {
		  $ret .= "
\n"; } # now discard the inhaled bodypart unlink( $p->bodyhandle->path ); } else { # not a text type. my $b = $p->bodyhandle; if ( defined( $b )) { my $href = $b->path; $href =~ s|^$file-html/||; my $desc = ""; my $parthead = $p->head; if ( defined( $parthead )) { $desc = $parthead->get( "Content-Description" ); } $desc ||= $href; # If it's an image, inline it. if ( $ctype =~ m|^image/| ) { $ret .= qq(); } else { $ret .= qq($desc [$ctype]); } } else { if (( $ctype eq "message/rfc822" ) || ( $ctype eq "multipart/alternative" )) { $ret .= "
\n"; # sillywalking my @msg = map { $_ .= "\n" } split( /\n/, $p->stringify_body ); $ret .= message_to_html( new Mail::Internet( \@msg ), $file, $i, $count, 1 ); $ret .= "
\n"; } else { die "

Part: " . $ctype . "

\n"; } } } rmdir( $output_dir ); # may fail, may not. } } else { # PARSE FAILED die "PARSE FAILED: $!\n"; } # Trailer if ( !$nottop ) { $ret .= "\n"; } $ret; } # Attempt to figure out the REAL from address, discard junk headers, etc. sub clean_message { my ( $message ) = shift; my $head = $message->head; my $body = $message->body; $head->replace( "To", "The DSPsrv" ); # Successive versions of DSPsrv my $from = $head->get( "X-Orig-From" ); $from ||= $head->get( "X-Original-From" ); $from ||= $head->get( "From" ); # Last resort! $head->replace( "From", $from ); # Delete senders; we're not getting anything useful there. $head->delete( "Sender" ); # Other things we don't need. $head->delete( "Content-MD5" ); $head->delete( "Content-MD5-Origin" ); $head->delete( "Precedence" ); $head->delete( "Priority" ); $head->delete( "Received" ); $head->delete( "Reply-To" ); $head->delete( "Return-Path" ); $head->delete( "Status" ); $head->delete( "X-Accept-Language" ); $head->delete( "X-Authentication-Warning" ); $head->delete( "X-Brought-To-You-By" ); $head->delete( "X-Charset" ); # could use? $head->delete( "X-Cc" ); $head->delete( "X-Envelope-To" ); $head->delete( "X-Filtered-By" ); $head->delete( "X-Mailer" ); $head->delete( "X-Msmail-Conversation-Id" ); $head->delete( "X-Msmail-Fixed-Font" ); $head->delete( "X-Msmail-Message-Id" ); $head->delete( "X-No-Archive" ); $head->delete( "X-Orig-From" ); $head->delete( "X-Original-From" ); $head->delete( "X-Orig-To" ); $head->delete( "X-Pop3-RCPT" ); $head->delete( "X-Priority" ); $head->delete( "X-Sender" ); $head->delete( "X-Sponsored-By" ); $head->delete( "X-Uidl" ); $head->delete( "X-VM-v5-data" ); new Mail::Internet( Header=>$head, Body=>$body ); } sub subject_clean { my $subject = shift; # BIG hackety: convert "Re: foo (was Re: bar)" to "bar" #$subject =~ s/^.*\(was re:\s*(.*)\)$/$1/i; # Hmm. Not sure about using this. # hackety $subject =~ s/\[?dspsrv[:\s\]]*//ig; # Remove RE: etc. $subject =~ s/$re_exp//i; $subject =~ s/\s?\(fwd\)$//i; # Do these last, in case the other ops leave trailing or leading space. $subject =~ s/^\s+//; $subject =~ s/\s+$//; $subject; } # STEP 4 sub prune_empties { my ( $container ) = @_; my $message = $container->message; my $empty = 1; my $parent = $container->parent; my @children; # 4 A - Empty with no children, nuke. # 4 B - Empty with children, promote unless it makes them root, unless # there's only one child. # Check for emptiness if ( defined( $message ) && defined( $message->message )) { $empty = 0; } for my $c ( $container->child ) { if ( !defined( $c ) || !$container->is_child( $c )) { next; } else { if ( !prune_empties( $c )) { $container->delete_child( $c ); } } } @children = $container->child; # reload if ( $empty && (( $#children == 0 ) || defined( $parent ))) { for my $c ( @children ) { if ( $parent ) { $parent->child( $c ); $container->delete_child( $c ); } # hack to promote to root $c->parent( $parent ); } } else { $empty = 0; # leave a branch for the non-empties to hang off } @children = $container->child; # once more for the road if ( $empty ) { if ( $#children == 0 ) { return $children[ 0 ]; # hack to promote to root. } else { return undef; } } else { return $container; } }