#!/usr/bin/perl -w # Thread a bunch of mail messages # Based on jwz's threading algorithm # Created: Waider August 29 2000 # Modified: # Waider |2001/03/14| clean up and comment code, add HTMLifyer # Waider |2001/03/15| tweak threading algorithm, date sorting # Waider |2001/03/20| adding navigation # Waider |2001/03/24| discarded nav, trying to improve threading further # ----------------------------------------------------------------------------- package Message; use Date::Parse; # Message object, consisting of: # author # date # subject # msgid # references/in-reply-to # reference to message itself # Constructor sub new { my ( $type, $subject, $id, $msg ) = @_; my $self = bless {}, $type; $self->{'subject'} = $subject; $self->{'message_id'} = $id; $self->{'message'} = $msg; $self; } # Author accessor (read/write) sub author { my ( $self, $auth ) = @_; if ( defined( $auth )) { $self->{'author'} = $auth; } $self->{'author'}; } # Date accessor (read/write) sub date { my ( $self, $date ) = @_; if ( defined( $date )) { $self->{'date'} = str2time( $date ) || die "Can't parse $date\n"; } $self->{'date'}; } # Subject accessor (read-only) sub subject { my $self = shift; $self->{'subject'}; } # Message Id accessor (read-only) sub message_id { my $self = shift; $self->{'message_id'}; } # References accessor (read-only) sub references { my $self = shift; $self->{'references'}; } # Add a reference sub add_reference { my ( $self, $ref ) = @_; $self->{'references'} = push @{$self->{'references'}}, $ref; } # Message accessor (read/write) sub message { my ( $self, $msg ) = @_; if ( defined( $msg )) { if ( defined( $self->{'message'})) { die "Duplicate message ids"; } else { $self->{'message'} = $msg; } } $self->{'message'}; } # ----------------------------------------------------------------------------- package Container; # Container object, consisting of: # message (Message object) # parent (Container object) # children (array of Container objects) # date (date of message or youngest child) sub new { my ( $type, $msg ) = @_; my $self = bless {}, $type; $self->message( $msg ); $self; } # Message accesor (read/write) sub message { my ( $self, $msg ) = @_; if ( defined( $msg )) { $self->{'message'} = $msg; $self->date( $msg->date ); } $self->{'message'}; } # Message ID accessor (read-only) sub message_id { my ( $self ) = @_; if ( !defined( $self->{'message'})) { return undef; } return $self->{'message'}->message_id; } # Date accesor (read/write) sub date { my ( $self, $date ) = @_; if ( defined( $date )) { $self->{'date'} = $date if ( !defined( $self->{'date' }) || ( $date < $self->{'date'})); } $self->{'date'}; } # Parent accessor (read/write) sub parent { my ( $self, $cont ) = @_; if ( defined( $cont )) { $self->{'parent'} = $cont; if ( ref( $cont ) eq "Container" ) { $cont->date( $self->{'date'} ); } } $self->{'parent'} || ""; } # Child accessor (read/write) sub child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; my @cref; if ( defined( $cont )) { if ( defined( $cref )) { @cref = @{$cref}; } if ( !$self->is_child( $cont )) { # hmm. I maybe shouldn't need this. push @cref, $cont; $self->{'child'} = \@cref; } } if ( defined( $cref )) { @{$cref}; } else { @cref; } } # Is the specified container a child of this container? sub is_child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; if ( defined( $cref )) { for my $c ( @{$cref} ) { if ( $c eq $cont ) { return 1; } } } return 0; } # Return a string representing the Container: # message id if available, # [no message id] if no message id # [empty container] if no message sub as_string { my ( $self ) = @_; if ( defined( $self->message )) { if ( defined( $self->message->message_id )) { return $self->message->message_id; } else { return "[no message id]"; } } else { return "[empty container]"; } } # Delete the specified child sub delete_child { my ( $self, $cont ) = @_; my $cref = $self->{'child'}; if ( defined( $cref )) { my @cref; for my $c ( @{$cref} ) { if ( $c ne $cont ) { push @cref, $c; } } $self->{'child'} = \@cref; # store } } # ------------------------------ package main; my %id_table; my @roots; # regexp for pruning/matching RE: DSPsrv: etc. AW is german, I # think. FW is for forwards. my $re_exp = "^\\s*((\\[?dspsrv[:\\s\\]]*)?((re|aw|fw)(\\[\\d+\\])*: *))+"; # Parse the text of an In-Reply-To header into a valid message ID. sub parse_inreplyto { my ( $rep ) = @_; my $ref; # We could do smarter stuff here, like parsing "FOO's message of BAR" ( $ref ) = $rep =~ m|.*?(\<.*?\>)|; warn "$rep" if $rep =~ /message of/i && !defined($ref); $ref = msgid_clean( $ref ) if defined( $ref ); $ref; } # Parse a References header into a list of message IDs. sub parse_references { my ( $ref ) = @_; my @refs; for my $r ( split( /\s+/, $ref )) { push @refs, msgid_clean( $r ); } # fixme validate @refs; } # Run a loop check on adding the second parameter as a child of the # first, both Containers. sub not_a_loop { my ( $a, $b ) = @_; my ( $rnext, @next ); die "Wrong arg types" unless ref( $a ) eq "Container" and ref( $b ) eq "Container"; return 0 if $a eq $b; @next = $b->child; return 1 if !( @next ); # recurse! for my $next ( @next ) { return 1 if !defined( $next ); return 0 unless not_a_loop( $a, $next ); } return 1; } use lib "$ENV{HOME}/src/perl"; use MboxHack; # because Mail::Folder::Mbox blows goats use MIME::Parser; use Storable; use Date::Format; use diagnostics; select( STDERR ); $| = 1; select( STDOUT ); # Iterate over supplied folders while ( @ARGV ) { my $file = shift;; if ( !( $folder = new MboxHack $file )) { print STDERR "$file is not a mbox folder\n"; next; } $folder->set_readonly(); $count = $folder->qty(); # Make folder directory my $htmldir = "$ {file}-html"; -d $htmldir or mkdir $htmldir, 0755; # Load the id_table if ( -f "idtable" ) { print STDERR "Loading idtable..."; $idref = retrieve( "idtable" ); %id_table = %{$idref}; print STDERR "done.\n"; } # Iterate over messages in the folder print STDERR "Processing $count messages"; for my $i ( 1 .. $count ) { # For each message: my $message = $folder->get_message( $i ); print STDERR "."; # Clean up the message here $message = clean_message( $message ); # HTMLify the message, preserving any navigation stuff that's # already there. my @current_html; if ( open( HTML, "<$ {htmldir}/$ {i}.html" )) { @current_html = grep /^\<\!-- nav /, ; close( HTML ); } open( HTML, ">$ {htmldir}/$ {i}.html" ); print HTML join( "", @current_html ) if @current_html; print HTML message_to_html( $message, $file, $i, $count ); close( HTML ); # Get required head bits my $head = $message->head(); my $author = $head->get( "From" ); my $subj = $head->get( "Subject" ) || ""; my $msgid = $head->get( "Message-ID" ) || ""; my $date = $head->get( "Date" ); my $refs = $head->get( "References" ) || ""; my $inrep = $head->get( "In-Reply-To" ) || ""; # ARGH. chomp( $author ); chomp( $subj ); $msgid = msgid_clean( $msgid ); chomp( $date ); chomp( $refs ); chomp( $inrep ); my @refs = parse_references( $refs ); if (!@refs && $inrep ) { my $r = parse_inreplyto( $inrep ); defined( $r ) and push @refs, $r; } # 1 A. Check if it's in the table; if it is, stash the message, if # not, make an empty container and stash the message. my $cont = $id_table{ $msgid } || new Container; my $msg = new Message( $subj, $msgid, $folder->get_message_hash( $i )); $msg->author( $author ); $msg->date( $date ); $cont->message( $msg ); $id_table{ $msgid } = $cont; # No references? Skip to the next message. if ( !@refs ) { next; } # 1 B. For each reference: for my $i ( 0..$#refs ) { my $ref = $refs[ $i ]; # find a container my $rcont = $id_table{ $ref }; # if there isn't one, make one with a null message. if ( !defined( $rcont )) { $rcont = new Container( new Message( undef, $ref, undef )); $id_table{ $ref } = $rcont; } # 1 B 2 Link the messages together in the order implied by the # References header, except # 1. don't break existing links and # 2. don't make loops. if ( $i > 0 ) { # can't link zeroth element to anything above it my $parent_ref = $refs[ $i - 1 ]; my $parent = $id_table{ $parent_ref }; # Do we have a parent? if ( !$rcont->parent ) { # loop check if ( not_a_loop( $parent, $rcont )) { $rcont->parent( $parent ); $parent->child( $rcont ); } } } } # 1 C # Set parent of this message = $refs[ -1 ]; if ( $cont->parent ) { my $old = $cont->parent; if ( defined( $old )) { $old->delete_child( $cont ); } } my $parent = $id_table{ $refs[ -1 ]}; # Has to exist! if ( defined( $parent )) { if ( not_a_loop( $cont, $parent )) { $cont->parent( $parent ); $parent->child( $cont ); } } } # end of message iteration loop, i.e. we're done reading the mailbox. # Safe (and prudent) to close the folder now. $folder->close(); print STDERR "done.\n"; print STDERR "Saving idtable..."; store \%id_table, "idtable"; print STDERR "done.\n"; # 2 Find root set, i.e. all messages with no parents for my $m ( keys %id_table ) { if ( $id_table{ $m }->parent) { } else { push @roots, $id_table{$m}; } # 3 Discard id_table # delete $id_table{ $m } unless @ARGV; # let's not. let's keep it for message output below. } # 4 prune empties print STDERR "Pruning empties..."; @roots = map { prune_empties( $_ ) } @roots; @roots = grep { defined( $_ )} @roots; print STDERR "done.\n"; # 5 group root set by subject print STDERR "Grouping roots by subject..."; # 5 A new hashtable my %subject_table; for my $cont ( @roots ) { # 5 B Find subject of this subtree my $subject; if ( defined( $cont->message )) { $subject = $cont->message->subject || ""; } else { $subject = ($cont->child)[0]->message->subject; } # Strip out rubbish $subject = subject_clean( $subject ); $subject or next; # give up if subject is now effectively useless. if ( !defined( $subject_table{ $subject })) { $subject_table{ $subject } = $cont; } else { my $oldm = $subject_table{ $subject }->message; my $newm = $cont->message; # empty supersedes non-empty if ( defined( $oldm ) && !defined( $newm )) { $subject_table{ $subject } = $cont; } else { if ( defined( $newm )) { # non-RE: supersedes RE: if ( $oldm->subject =~ /$re_exp/i && $newm !~ /$re_exp/i ) { $subject_table{ $subject } = $cont; } } } } } print STDERR "done.\n"; my @old_roots = map { defined( $_ )?$_:() } @roots; @roots = (); # 5 C # Gather together the difference (what?) print STDERR "Threading remainder..."; for my $cont ( @old_roots ) { # Find subject of this subtree. my $subject; if ( defined( $cont->message )) { $subject = $cont->message->subject || ""; } else { $subject = ($cont->child)[0]->message->subject; # whee! } # Strip out rubbish $subject = subject_clean( $subject ); # give up loop if subject is now effectively useless, but save it # as a root. if ( !$subject ) { push @roots, $cont; next; } # skip it if the hash is this container or a null container my $orig = $subject_table{ $subject }; if ( $orig eq $cont ) { push @roots, $cont unless grep { $_ eq $orig } @roots; next; } if ( !defined( $orig->message )) { push @roots, $orig unless grep { $_ eq $orig } @roots; next; } # Now all hell breaks loose. # If they're both empties, merge the kids. if ( !defined( $orig->message ) && ( !defined( $cont->message ))) { for my $c ( $cont->child ) { $orig->child( $c ); # $cont->delete_child( $c ); no need, we're discarding it anyway. } # Stack it if it's not already in the list. push @roots, $orig unless grep { $_ eq $orig } @roots; $cont = undef; # trash the original next; } # If one's empty and the other's not, make the non-empty be a # child of the empty. # If one is non-empty, non-re, and this one is not empty and # re, make the re a child of the non-re. if (( !defined( $orig->message ) && defined( $cont->message )) or ( defined( $orig->message ) && defined( $cont->message ) && $orig->message->subject !~ /$re_exp/i && $cont->message->subject =~ /$re_exp/i )){ # original is empty, current is not. $orig->child( $cont ); $cont->parent( $orig ); @roots = map { $_ eq $cont ? () : $_ } @roots; push @roots, $orig unless grep { $_ eq $orig } @roots; next; } if (( defined( $orig->message ) && !defined( $cont->message )) or ( defined( $orig->message ) && defined( $cont->message ) && ( $cont->message->subject !~ /$re_exp/i ) && ( $orig->message->subject =~ /$re_exp/i ))) { # original is not empty, current is. $cont->child( $orig ); $orig->parent( $cont ); @roots = map { $_ eq $orig ? () : $_ } @roots; push @roots, $cont unless grep { $_ eq $cont } @roots; next; } # If all else fails, make a new empty and make both of these # siblings, but only if these are both Re: messages. # The latter prevents false trees from being created. if ( defined( $orig->message ) && defined( $cont->message ) && ( $cont->message->subject !~ /$re_exp/i ) && ( $orig->message->subject !~ /$re_exp/i )) { push @roots, $cont; } else { my $ncont = new Container; $ncont->child( $orig ); $orig->parent( $ncont ); $ncont->child( $cont ); $cont->parent( $ncont ); $subject_table{ $subject } = $ncont; # Clean up roots list @roots = map { $_ eq $orig ? () : $_ } @roots; push @roots, $ncont; } } print STDERR "done.\n"; # 6 we could now discard the parent links, really # 7 order the siblings by date (or whatever) } # Dump out threading information print STDERR "Dumping messages..."; print "
| $h: | $text |
|---|
\n";
}
# have to use this interface to get the mimedecoded version
while ( defined( $line = $fh->getline )) {
if ( $ctype eq "text/plain" ) {
$line =~ s/\\</g; # this is all you need?
#$line =~ s/\>/\>/g;
#$line =~ s/\&/\&/g;
}
$ret .= $line;
}
# close the file, thanks.
$fh->close;
if ( $ctype eq "text/plain" ) {
$ret .= "\n";
}
# now discard the inhaled bodypart
unlink( $p->bodyhandle->path );
} else {
# not a text type.
my $b = $p->bodyhandle;
if ( defined( $b )) {
my $href = $b->path;
$href =~ s|^$file-html/||;
my $desc = "";
my $parthead = $p->head;
if ( defined( $parthead )) {
$desc = $parthead->get( "Content-Description" );
}
$desc ||= $href;
# If it's an image, inline it.
if ( $ctype =~ m|^image/| ) {
$ret .= qq(\n"; # sillywalking my @msg = map { $_ .= "\n" } split( /\n/, $p->stringify_body ); $ret .= message_to_html( new Mail::Internet( \@msg ), $file, $i, $count, 1 ); $ret .= "\n"; } else { die "
Part: " . $ctype . "
\n"; } } } rmdir( $output_dir ); # may fail, may not. } } else { # PARSE FAILED die "PARSE FAILED: $!\n"; } # Trailer if ( !$nottop ) { $ret .= "\n"; } $ret; } # Attempt to figure out the REAL from address, discard junk headers, etc. sub clean_message { my ( $message ) = shift; my $head = $message->head; my $body = $message->body; $head->replace( "To", "The DSPsrv" ); # Successive versions of DSPsrv my $from = $head->get( "X-Orig-From" ); $from ||= $head->get( "X-Original-From" ); $from ||= $head->get( "From" ); # Last resort! $head->replace( "From", $from ); # Delete senders; we're not getting anything useful there. $head->delete( "Sender" ); # Other things we don't need. $head->delete( "Content-MD5" ); $head->delete( "Content-MD5-Origin" ); $head->delete( "Precedence" ); $head->delete( "Priority" ); $head->delete( "Received" ); $head->delete( "Reply-To" ); $head->delete( "Return-Path" ); $head->delete( "Status" ); $head->delete( "X-Accept-Language" ); $head->delete( "X-Authentication-Warning" ); $head->delete( "X-Brought-To-You-By" ); $head->delete( "X-Charset" ); # could use? $head->delete( "X-Cc" ); $head->delete( "X-Envelope-To" ); $head->delete( "X-Filtered-By" ); $head->delete( "X-Mailer" ); $head->delete( "X-Msmail-Conversation-Id" ); $head->delete( "X-Msmail-Fixed-Font" ); $head->delete( "X-Msmail-Message-Id" ); $head->delete( "X-No-Archive" ); $head->delete( "X-Orig-From" ); $head->delete( "X-Original-From" ); $head->delete( "X-Orig-To" ); $head->delete( "X-Pop3-RCPT" ); $head->delete( "X-Priority" ); $head->delete( "X-Sender" ); $head->delete( "X-Sponsored-By" ); $head->delete( "X-Uidl" ); $head->delete( "X-VM-v5-data" ); new Mail::Internet( Header=>$head, Body=>$body ); } sub subject_clean { my $subject = shift; # BIG hackety: convert "Re: foo (was Re: bar)" to "bar" #$subject =~ s/^.*\(was re:\s*(.*)\)$/$1/i; # Hmm. Not sure about using this. # hackety $subject =~ s/\[?dspsrv[:\s\]]*//ig; # Remove RE: etc. $subject =~ s/$re_exp//i; $subject =~ s/\s?\(fwd\)$//i; # Do these last, in case the other ops leave trailing or leading space. $subject =~ s/^\s+//; $subject =~ s/\s+$//; $subject; } # STEP 4 sub prune_empties { my ( $container ) = @_; my $message = $container->message; my $empty = 1; my $parent = $container->parent; my @children; # 4 A - Empty with no children, nuke. # 4 B - Empty with children, promote unless it makes them root, unless # there's only one child. # Check for emptiness if ( defined( $message ) && defined( $message->message )) { $empty = 0; } for my $c ( $container->child ) { if ( !defined( $c ) || !$container->is_child( $c )) { next; } else { if ( !prune_empties( $c )) { $container->delete_child( $c ); } } } @children = $container->child; # reload if ( $empty && (( $#children == 0 ) || defined( $parent ))) { for my $c ( @children ) { if ( $parent ) { $parent->child( $c ); $container->delete_child( $c ); } # hack to promote to root $c->parent( $parent ); } } else { $empty = 0; # leave a branch for the non-empties to hang off } @children = $container->child; # once more for the road if ( $empty ) { if ( $#children == 0 ) { return $children[ 0 ]; # hack to promote to root. } else { return undef; } } else { return $container; } }