#!/usr/bin/perl -w
#
# MAILMAN BLOWS GOATS.

# You can't block mails to mailman, you can only redirect them to the
# admin address. So my attempts to stop up some of the spam on the
# BBDB list results in me getting a steady stream of crap at the admin
# address instead. This script is designed to go in and rip them
# out. If there's a real admin request in there, tough. I might notice
# it in the mailshots I get, and then again I might not.

# 22/05/2003 Rewrote to use HTML::TokeParser and also Mail::SpamAssassin
#
# 22/08/2003 HTA: Added some stuff to make Mailman 2.1 easier
#                 (mailman 2.1 has to have details=all in order to get
#                 the messages, but fails if you add that into the URL
#                 when blipping them)
#                 Also added reporting of nonspam.
#
# 23/08/2003 Add support for custom SpamAssassin config file (--saconfig)
#
# 15/04/2003 Add learning support (Ted T'so)
#
# 24/04/2003 Merge SpamAssassin 3.0.0 and MailMan 2.1.4 support (Justin Mason)
#
# 27/05/2006 Switch to using WWW::Mechanize
#
# TODO:
# * cgi mode, where it generates a Mailman admin page, but with
#   defaults set sanely (i.e. if it's spam, set the default option to
#   'discard')
#
use WWW::Mechanize;
use Getopt::Long;
use HTML::TokeParser;
use Mail::SpamAssassin;

package main;

$| = 1;

my $listurl = '';
my $password;
my $learn = 0;
my $saconfig;
my $usesa = 1;
my $dryrun = 0;
my $nuke = 0; # because sometimes you don't care
my $debug = 0;

GetOptions( "url:s" => \$listurl,
            "password:s" => \$password,
            "learn!" => \$learn,
            "nuke!" => \$nuke,
            "sa!" => \$usesa,
            "saconfig:s" => \$saconfig,
            "dryrun!" => \$dryrun,
            "debug!" => \$debug,
          ) or usage();

usage() unless $listurl;
usage() unless $password;

my $mech = new WWW::Mechanize( env_proxy => 1 );
my $formparams = { };
my $res = $mech->get( $listurl );
if ( !$res->is_success ) {
    die "Request failed: " . $res->code . "\n";
}
$res = $mech->submit_form( fields => { adminpw => $password } );
if ( !$res->is_success ) {
    die "Request failed: " . $res->code . "\n";
}

# catch sneaky spamassassin 2.1
if ( $mech->find_link( url_regex => qr/details=all/ )) {
    $res = $mech->follow_link( url_regex => qr/details=all/ );
    # so we get a useful link in the report
    $listurl .= "?details=all";
    if ( !$res->is_success ) {
        die "Request failed: " . $res->code . "\n";
    }
}

my $content = $mech->content;

# Set up for SPAM ASSASSINATIONS, muahahah!
my $spamass;

if ( defined( $saconfig )) {
    $spamass = new Mail::SpamAssassin { userprefs_filename => $saconfig };
} else {
    $spamass = new Mail::SpamAssassin;
}

if ( $learn ) {
    $spamass->init_learner();
}

my $p = HTML::TokeParser->new( \$content );
my $text = "";
my $tag = "";
my $count = 0;
my %lusers;

$p->get_tag( "form" ) or exit;

while ( 1 ) {
    $p->get_tag( "h2" ) or last;
    $text = $p->get_trimmed_text;
    if ( $text !~ /Posting Held for Approval/is ) {
        print "Unexpected: $text\n";
        last;
    } else {
        my ($msgnum) = $text =~ m{\((\d+) of \d+\)}; #};

        # find the From address
        $p->get_tag( "td" );    # label
        $p->get_tag( "td" );
        my $from = $p->get_trimmed_text();

        # and the subject
        $p->get_tag( "td" );    # label
        $p->get_tag( "td" );
        my $subject = $p->get_trimmed_text();

        # Now get the ID so we can nuke it later
        $tag = $p->get_tag( "input" );
        my $id = $tag->[1]{'name'};

        if ( !defined( $id ) or $id !~ /^\d+$/ ) {
            warn "Lost track at message $msgnum\n";
            last;
        }

        # Next, grab the message headers
        $tag = $p->get_tag( "textarea" ); # if you reject this post...
        $tag = $p->get_tag( "textarea" );
        if ( $tag->[1]{'name'} !~ /headers-$id/ ) {
            warn "Lost track at message $msgnum\n";
            last;
        }
        my $headers = $p->get_text( "/textarea" );

        # and a message excerpt
        $tag = $p->get_tag( "textarea" );
        my $excerpt = $p->get_text( "/textarea" );

        # I'm not entirely sure about this. It looks like 2.4 uses
        # some new code to fetch the headers that breaks on the
        # "From_" line. So, if we're 2.4 or newer, discard the $from
        # header entirely - it's included in $headers anyway.
        if ( $Mail::SpamAssassin::VERSION <= 2.3 ) {
            $headers = "$from\n$headers";
        }

        my $msg =<<"EOT";
$headers

$excerpt
EOT

        my ( $is_spam, $score, $required, $rules, @lines, $mail_obj );

        @lines = split (/^/m, $msg);
        print STDERR " creating spamassassin mail object.." if $debug;
        if ( $Mail::SpamAssassin::VERSION >= 3.0 ) {
            $mail_obj = $spamass->parse (\@lines);
        } else {
            $mail_obj = Mail::SpamAssassin::NoMailAudit->new ('data' => \@lines);
        }
        print STDERR " done." if $debug;

        if ( $learn ) {
            my $status;
            print STDERR " learning spam status..." if $debug;
            eval {
                $status = $spamass->learn( $mail_obj, undef, 1, 0 );
                print STDERR "done." if $debug;
            };
            if ( $@ ) {
                print STDERR "failed: $@\n" if $debug;
                print "Failed to learn as spam: $!\n";
                $is_spam = 0;
                $score = 0;
                $required = 5;
                $rules = "LEARN";
            }

            if ( $nuke or $status->did_learn()) {
                if ( $status and $status->did_learn ) {
                    print "Learned as spam:\n";
                    $rules = "LEARN";
                } else {
                    print "Failed to learn as spam but nuking anyway.\n";
                    $rules = "NUKE";
                }

                # these are arbitrary fake values, since we're telling
                # SA that this is spam.
                $is_spam = 1;
                $score = 5;
                $required = 5;
            } else {
                print "Failed to learn as spam: $!\n";
                $is_spam = 0;
                $score = 0;
                $required = 5;
                $rules = "LEARN";
            }
            $status->finish if $status;
            $mail_obj->finish
                if ref ( $mail_obj ) eq "Mail::SpamAssassin::Message";
        } elsif ( $usesa ) {
            my $status;
            eval {
                $status = $spamass->check ( $mail_obj );
                $is_spam = $status->is_spam();
                $score = $status->get_hits();
                $required = $status->get_required_hits();
                $rules = $status->get_names_of_tests_hit();
                $status->finish;
                $mail_obj->finish
                    if ref ( $mail_obj ) eq "Mail::SpamAssassin::Message";
            };
            if ( $@ ) {
                $is_spam = 0;
                $score = 0;
                $required = 5;
                $rules = "ERROR";
            }
        } else {
            # if we're not using SA, we assume there's already SA
            # headers on the message, so try and extract the data and
            # fake up the status object.
            # The standard SA header looks like this. If you've
            # modified it ($conf->{*report_template}), This won't
            # work. If this is important to you, bug me.
            # X-Spam-Status: No, hits=-4.9 required=5.0 tests=BAYES_00 autolearn=ham version=2.63
            ( $is_spam, $score, $required ) = $headers =~ m/X-Spam-Status: (Yes|No), hits=([0-9.-]+) required=([0-9.-]+)/s;

            if ( !defined( $score )) {
                warn "Can't find SA header in this message, setting score to 0\n";
                $score = 0;
            }

            # we should respect local conventions on scores.
            # SA doesn't give me API access to its config object, grr.
            $required = $spamass->{conf}->{required_hits};
            $is_spam = ( $score >= $required );
        }

        if ( $is_spam ) {
            print "Spam: " . $score . "/" . $required . " ($rules)\n";
            print "  Blipping $from\n";
            $formparams->{$id} = '3';
            $count++;
        } else {
            # HTA: This branch is for nonspam. Report.
            print "Not Spam: " . $score . "/" . $required . " ($rules)\n";
            print "  Keeping $from\n";
            $lusers{$from} ||= 0;
            $lusers{$from}++;
        }
    }
}

# make the request with these parameters
#$req = POST $listurl, $formparams;

if ( $count ) {
    if ( !$dryrun ) {
        $res = $mech->submit_form( fields => $formparams );
        if ( !$res->is_success ) {
            die "Request failed: " . $res->code . "\n";
        }
        print "Okay, done. Cleared out $count bits of crap\n";
    } else {
        print "Would have cleared $count bits of crap\n";
    }
} else {
    print "Can't find any requests. Being overzealous, are we?\n" unless %lusers;
}

if ( %lusers ) {
    print "Leftovers:\n";
    print join( "\n", map { $_ . " (" . $lusers{$_} . ")" } sort { my ($sa) = $a =~ /@(.*)$/; my ( $sb ) = $b =~ /@(.*)$/; $sa ||= $a; $sb ||= $b; $sa cmp $sb;} keys %lusers ) . "\n";
    print "Visit $listurl to clean 'em up\n";
}

if ( $learn ) {
    $spamass->finish_learner();
    $spamass->rebuild_learner_caches({ verbose => 1 });
}

sub usage {
    die "usage: $0 [--url=<url>] --password=<password> [--[no]sa] [--saconfig=<path>]\n    default is to use SA\n";
}
