#!/usr/bin/perl -w # # MAILMAN BLOWS GOATS. # You can't block mails to mailman, you can only redirect them to the # admin address. So my attempts to stop up some of the spam on the # BBDB list results in me getting a steady stream of crap at the admin # address instead. This script is designed to go in and rip them # out. If there's a real admin request in there, tough. I might notice # it in the mailshots I get, and then again I might not. # 22/05/2003 Rewrote to use HTML::TokeParser and also Mail::SpamAssassin # # 22/08/2003 HTA: Added some stuff to make Mailman 2.1 easier # (mailman 2.1 has to have details=all in order to get # the messages, but fails if you add that into the URL # when blipping them) # Also added reporting of nonspam. # # 23/08/2003 Add support for custom SpamAssassin config file (--saconfig) # # 15/04/2003 Add learning support (Ted T'so) # # 24/04/2003 Merge SpamAssassin 3.0.0 and MailMan 2.1.4 support (Justin Mason) # # 27/05/2006 Switch to using WWW::Mechanize # # TODO: # * cgi mode, where it generates a Mailman admin page, but with # defaults set sanely (i.e. if it's spam, set the default option to # 'discard') # use WWW::Mechanize; use Getopt::Long; use HTML::TokeParser; use Mail::SpamAssassin; package main; $| = 1; my $listurl = ''; my $password; my $learn = 0; my $saconfig; my $usesa = 1; my $dryrun = 0; my $nuke = 0; # because sometimes you don't care my $debug = 0; GetOptions( "url:s" => \$listurl, "password:s" => \$password, "learn!" => \$learn, "nuke!" => \$nuke, "sa!" => \$usesa, "saconfig:s" => \$saconfig, "dryrun!" => \$dryrun, "debug!" => \$debug, ) or usage(); usage() unless $listurl; usage() unless $password; my $mech = new WWW::Mechanize( env_proxy => 1 ); my $formparams = { }; my $res = $mech->get( $listurl ); if ( !$res->is_success ) { die "Request failed: " . $res->code . "\n"; } $res = $mech->submit_form( fields => { adminpw => $password } ); if ( !$res->is_success ) { die "Request failed: " . $res->code . "\n"; } # catch sneaky spamassassin 2.1 if ( $mech->find_link( url_regex => qr/details=all/ )) { $res = $mech->follow_link( url_regex => qr/details=all/ ); # so we get a useful link in the report $listurl .= "?details=all"; if ( !$res->is_success ) { die "Request failed: " . $res->code . "\n"; } } my $content = $mech->content; # Set up for SPAM ASSASSINATIONS, muahahah! my $spamass; if ( defined( $saconfig )) { $spamass = new Mail::SpamAssassin { userprefs_filename => $saconfig }; } else { $spamass = new Mail::SpamAssassin; } if ( $learn ) { $spamass->init_learner(); } my $p = HTML::TokeParser->new( \$content ); my $text = ""; my $tag = ""; my $count = 0; my %lusers; $p->get_tag( "form" ) or exit; while ( 1 ) { $p->get_tag( "h2" ) or last; $text = $p->get_trimmed_text; if ( $text !~ /Posting Held for Approval/is ) { print "Unexpected: $text\n"; last; } else { my ($msgnum) = $text =~ m{\((\d+) of \d+\)}; #}; # find the From address $p->get_tag( "td" ); # label $p->get_tag( "td" ); my $from = $p->get_trimmed_text(); # and the subject $p->get_tag( "td" ); # label $p->get_tag( "td" ); my $subject = $p->get_trimmed_text(); # Now get the ID so we can nuke it later $tag = $p->get_tag( "input" ); my $id = $tag->[1]{'name'}; if ( !defined( $id ) or $id !~ /^\d+$/ ) { warn "Lost track at message $msgnum\n"; last; } # Next, grab the message headers $tag = $p->get_tag( "textarea" ); # if you reject this post... $tag = $p->get_tag( "textarea" ); if ( $tag->[1]{'name'} !~ /headers-$id/ ) { warn "Lost track at message $msgnum\n"; last; } my $headers = $p->get_text( "/textarea" ); # and a message excerpt $tag = $p->get_tag( "textarea" ); my $excerpt = $p->get_text( "/textarea" ); # I'm not entirely sure about this. It looks like 2.4 uses # some new code to fetch the headers that breaks on the # "From_" line. So, if we're 2.4 or newer, discard the $from # header entirely - it's included in $headers anyway. if ( $Mail::SpamAssassin::VERSION <= 2.3 ) { $headers = "$from\n$headers"; } my $msg =<<"EOT"; $headers $excerpt EOT my ( $is_spam, $score, $required, $rules, @lines, $mail_obj ); @lines = split (/^/m, $msg); print STDERR " creating spamassassin mail object.." if $debug; if ( $Mail::SpamAssassin::VERSION >= 3.0 ) { $mail_obj = $spamass->parse (\@lines); } else { $mail_obj = Mail::SpamAssassin::NoMailAudit->new ('data' => \@lines); } print STDERR " done." if $debug; if ( $learn ) { my $status; print STDERR " learning spam status..." if $debug; eval { $status = $spamass->learn( $mail_obj, undef, 1, 0 ); print STDERR "done." if $debug; }; if ( $@ ) { print STDERR "failed: $@\n" if $debug; print "Failed to learn as spam: $!\n"; $is_spam = 0; $score = 0; $required = 5; $rules = "LEARN"; } if ( $nuke or $status->did_learn()) { if ( $status and $status->did_learn ) { print "Learned as spam:\n"; $rules = "LEARN"; } else { print "Failed to learn as spam but nuking anyway.\n"; $rules = "NUKE"; } # these are arbitrary fake values, since we're telling # SA that this is spam. $is_spam = 1; $score = 5; $required = 5; } else { print "Failed to learn as spam: $!\n"; $is_spam = 0; $score = 0; $required = 5; $rules = "LEARN"; } $status->finish if $status; $mail_obj->finish if ref ( $mail_obj ) eq "Mail::SpamAssassin::Message"; } elsif ( $usesa ) { my $status; eval { $status = $spamass->check ( $mail_obj ); $is_spam = $status->is_spam(); $score = $status->get_hits(); $required = $status->get_required_hits(); $rules = $status->get_names_of_tests_hit(); $status->finish; $mail_obj->finish if ref ( $mail_obj ) eq "Mail::SpamAssassin::Message"; }; if ( $@ ) { $is_spam = 0; $score = 0; $required = 5; $rules = "ERROR"; } } else { # if we're not using SA, we assume there's already SA # headers on the message, so try and extract the data and # fake up the status object. # The standard SA header looks like this. If you've # modified it ($conf->{*report_template}), This won't # work. If this is important to you, bug me. # X-Spam-Status: No, hits=-4.9 required=5.0 tests=BAYES_00 autolearn=ham version=2.63 ( $is_spam, $score, $required ) = $headers =~ m/X-Spam-Status: (Yes|No), hits=([0-9.-]+) required=([0-9.-]+)/s; if ( !defined( $score )) { warn "Can't find SA header in this message, setting score to 0\n"; $score = 0; } # we should respect local conventions on scores. # SA doesn't give me API access to its config object, grr. $required = $spamass->{conf}->{required_hits}; $is_spam = ( $score >= $required ); } if ( $is_spam ) { print "Spam: " . $score . "/" . $required . " ($rules)\n"; print " Blipping $from\n"; $formparams->{$id} = '3'; $count++; } else { # HTA: This branch is for nonspam. Report. print "Not Spam: " . $score . "/" . $required . " ($rules)\n"; print " Keeping $from\n"; $lusers{$from} ||= 0; $lusers{$from}++; } } } # make the request with these parameters #$req = POST $listurl, $formparams; if ( $count ) { if ( !$dryrun ) { $res = $mech->submit_form( fields => $formparams ); if ( !$res->is_success ) { die "Request failed: " . $res->code . "\n"; } print "Okay, done. Cleared out $count bits of crap\n"; } else { print "Would have cleared $count bits of crap\n"; } } else { print "Can't find any requests. Being overzealous, are we?\n" unless %lusers; } if ( %lusers ) { print "Leftovers:\n"; print join( "\n", map { $_ . " (" . $lusers{$_} . ")" } sort { my ($sa) = $a =~ /@(.*)$/; my ( $sb ) = $b =~ /@(.*)$/; $sa ||= $a; $sb ||= $b; $sa cmp $sb;} keys %lusers ) . "\n"; print "Visit $listurl to clean 'em up\n"; } if ( $learn ) { $spamass->finish_learner(); $spamass->rebuild_learner_caches({ verbose => 1 }); } sub usage { die "usage: $0 [--url=] --password= [--[no]sa] [--saconfig=]\n default is to use SA\n"; }