#!/usr/bin/perl -w use LWP::UserAgent; use Date::Parse; use HTML::Filter; use CGI; my $query; my $debug = $ENV{'DEBUG'}; my $doc; my $base; # CGI-mode hooks if ( $ENV{'REMOTE_ADDR'}) { $| = 1; $query = new CGI; print $query->header; if ( $ENV{'REMOTE_USER'}) { print "

snorq is generating your page, please wait...

";
  } else {
	print "

You'll have to log in to run snorq!

"; exit; } } chdir( "$ENV{'HOME'}/public_html/misc/avantgo" ); my $stylesheet_link = ''; my %pages; #$pages{'Cringely'} = # [ # 'http://www.pbs.org/cgi-registry/cringely/thisweek.pl?pulpit' # ]; #$filters{'Cringely'} = # [ # '', # '', # '^\s*', 'Cringely', # '$', '' # ]; $pages{'jwzrants'} = [ 'http://www.jwz.org/gruntle', '', # '\", # title # '\", # credit # '\', # '


', # '', '', # 'bgcolor="#......"', '' # ]; # Hah. This is fun. my $t1 = scalar( localtime( time - ( 60 * 60 * 24 * 30 ))); my $t2 = scalar( localtime( time )); my @bits1 = split( /\s+/, $t1 ); my @bits2 = split( /\s+/, $t2 ); # I sold my shares. Actually, they were compulsorily # acquired. Bastards. DIE DIE FORNICATE DIE 666 DIE, as Jerrell would # say. #$pages{'Eircom'} = # [ # "http://www.ise.ie/php3/graph_make.php3" . # "?CompID=189&Index=None&CompID_A=189&CompID_B=None" . # "&Start=" . $bits1[ 2 ] . "+" . $bits1[ 1 ] . "+" . $bits1[ 4 ] . # "&End=" . $bits2[ 2 ] . "+" . $bits2[ 1 ] . "+" . $bits2[ 4 ] # ]; #$pages {'Goats'} = # [ # 'http://www.goats.com/', # 'img src="(/comix/\d+/goats\d+\.gif)"' # ]; #$pages {'JerkCity'} = # [ # 'http://www.jerkcity.com/', # 'img [^>]*src="/(jerkcity\d+.gif)' # ]; #$pages {'Loadza' } = # [ # 'http://www.unison.ie/loadza/display_results.php3?cat_id=258' # ]; #$pages {'Register'} = # [ # 'http://www.theregister.co.uk/' # ]; #$filters{'Register'} = # [ # '

','' # ]; # also no more #$pages {'Tatemae'} = # [ # 'http://www.cloudiness.com/tatemae/' # ]; #$pages{'Doonesbury'} = # [ # 'http://www.doonesbury.com/strip/dailydose/', # 'img.*?src="[^"]*(/comics/db/\d+/db\d+.gif)"', # ]; # ----------------------------- page layout, such as it is -------------------- my ( $host, $now ); $now = scalar( localtime ); $host = `/bin/hostname`; my $compilation =<<"COMP"; Compilation Page for $ENV{'LOGNAME'} $stylesheet_link

Compilation Page for $ENV{'LOGNAME'}


Generated by snorq on $host at $now
COMP # ------------------------------- end of setup -------------------------------- my $ua = new LWP::UserAgent; $ua->agent( "Snorq/0.1" . $ua->agent ); my ( $req, $res ); $ua->env_proxy(); for my $page ( sort keys %pages ) { if ( $#ARGV != -1 ) { next unless grep /$page/i, @ARGV; } print "$page\n" if $debug; # Figure out what we're getting! my $content = ""; my $contenttype = ""; my $numrules = $#{$pages{$page}}; my $n = -1; # gack my $url; RULE: for my $rule ( @{$pages{$page}} ) { # increment rule number $n++; print " rule ", $n + 1, " of ", $numrules + 1, " : $rule\n" if $debug; if ( !$content ) { # First rule is always a URL $url = $rule; } else { ( $url ) = $content =~ m/$rule/si; if (!defined( $url )) { print " error extracting $rule\n" if $debug; $content = undef; last RULE; } } # Patch in base and stuff if ( defined $base ) { $uri = new URI $url; if ( !defined( $uri->scheme ) or !$uri->scheme ) { $uri = new URI $url, $base->scheme; } # Gack! relative URL! if ( $uri->path !~ m|^/| ) { local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack $uri = URI->new($url)->abs( $base ); } if ( !defined( $uri->host )) { $uri->scheme( $base->scheme ); $uri->host( $base->host ); } $url = $uri->as_string; } print " fetching $url\n" if $debug; $cached = 0; # if this is the terminal rule, try a HEAD instead of a GET if ( $n == $numrules ) { $req = new HTTP::Request HEAD => $url; $res = $ua->request( $req ); if ( $res->is_success ) { my $utime; # See if we get a datestamp $date = $res->headers->header( 'Last-Modified' ); if ( defined( $date )) { print " Last Mod: $date\n" if $debug; $utime = str2time( $date ); } else { $utime = 0; } $contenttype = $res->content_type; # And this is what we call a "hack" $filename = "${page}_$contenttype"; $filename =~ s|/|.|g; if ( -f $filename ) { (undef, undef, undef, undef, undef, undef, undef, undef, undef, $mtime, undef, undef, undef ) = stat( $filename ); if ( $mtime > $utime ) { $cached = 1; } else { $cached = 0; } } } else { print " head failed, for some reason.\n" if $debug; } } # Screw caching, since it seems not to work. $cached = 0; if ( $cached ) { print " cached, not fetching\n" if $debug; } else { $req = new HTTP::Request GET => $url; $res = $ua->request( $req ); if ( $res->is_success ) { $content = $res->content; $contenttype = $res->content_type; # And this is what we call a "hack" $filename = "${page}_$contenttype"; $filename =~ s|/|.|g; $base = $res->base; } else { print " error fetching data\n" if $debug; $page = $res->as_string; undef $content; last RULE; } } next if !defined( $content ); next if $n < $numrules; print " Item $page, content type $contenttype successfully fetched.\n" if $debug; # Now, filter the page. if ( defined( $filters{$page})) { print " filtering it: " if $debug; print "start..." if $debug; my @filters = reverse @{$filters{$page}}; my $filter = pop @filters; $content =~ s/^.*?$filter//si; print "end..." if $debug; $filter = pop @filters; $content =~ s/$filter.*?$//si; if ( $#filters != -1 ) { print "body..." if $debug; while ( $#filters != -1 ) { my $search = pop @filters; my $replace = pop @filters; $content =~ s/$search/$replace/sgie; } } print "done.\n" if $debug; } } # Don't bother doing more if we couldn't get the page next unless $content; # Fix up URLs if ( $contenttype =~ /^text\/html/i ) { print " Repatching URLs to $base\n" if $debug; $doc = ""; my $parser = HTML::Parser->new( api_version => 3, start_h => [\&p_start, "tagname, text, attr"], default_h => [ sub { $doc .= shift }, "text"] ); $parser->parse( $content ); $parser->eof; $content = $doc; } # Save the damn thing open( PAGE, ">$filename" ); binmode(PAGE); # GRR. print PAGE $content; close( PAGE ); # Figure out the link type, and add it. if ( $contenttype =~ /^image/i ) { print " Slicing image... [$page/$contenttype]" if $debug; $new = carve_image( $page, $contenttype ); unlink( $filename ); # don't leave the old image lying around print "done.\n" if $debug; # See if it's got a place of its own to go into. if (!( $compilation =~ s|()|$new\n|)) { $compilation =~ s|()|$new\n$1|; } } else { my $srcurl = ""; $srcurl = " (from $url)
" if $debug; $srcurl .= " ($date)" if $date; if (!( $compilation =~ s|()|$page$srcurl\n| )) { $compilation =~ s|()|$page$srcurl\n$1|; } } } open( PAGE, ">index.html" ); print PAGE $compilation; close( PAGE ); if ( defined( $query )) { print "

All done. Redirecting...

\n"; print <<"EOT"; EOT } # This is ghastly, but noone seems to have a nice image processing # module for Perl that I could use instead. sub carve_image { my ( $name, $type ) = @_; my $html = ""; my $filename = "${name}_$type"; $filename =~ s|/|.|g; # Make directory FIXME nuke it if it exists mkdir $name, 0755 unless -d $name; # Convert to a pnm if ( $type eq "image/png" ) { `pngtopnm $filename > $name/$filename 2>/dev/null`; } else { `anytopnm $filename > $name/$filename 2>/dev/null`; } # Get dimensions (use Image::Info for this!) $pnmfile = `pnmfile $name/$filename`; ( $wide, $high ) = $pnmfile =~ m/:.*?,\s(\d+)\sby\s(\d+).*?/i; return qq(
$pnmfile
\n) if !defined( $wide ) or !defined( $high ); $html = qq(\n); for ( $y = 0; $y < $high; $y += 140 ) { if ( $y + 140 > $high ) { $h = $high - $y; } else { $h = 140; } $html.=""; for ( $x = 0; $x < $wide; $x += 150 ) { if ( $x + 150 > $wide ) { $w = $wide - $x; } else { $w = 150; } `pnmcut $x $y $w $h $name/$filename | ppmquant 256 2>/dev/null | ppmtogif > $name/$ {name}_$ {x}_$ {y}.gif 2>/dev/null`; $html .= qq(); } $html.="\n"; } $html .= "
\n"; # Cleanup unlink( "$name/$filename" ); return $html; } sub patchurl { my $base = shift; my $url = shift; my $uri = new URI $url; eval { if ( !defined( $uri->scheme ) or !$uri->scheme ) { $uri = new URI $url, ($base->scheme || 'http'); # what the hell? } # Gack! relative URL! if ( $uri->path !~ m|^/| ) { local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack $uri = URI->new($url)->abs( $base ); } if ( !defined( $uri->host )) { $uri->scheme( $base->scheme || 'http' ); $uri->host( $base->host ); } }; $uri->scheme( 'http' ) unless $uri->scheme; # thanks, slashdot return $url if $@; # bail out if there's an error. $uri->as_string; } sub p_start { my $tag = $_[1]; if (( $_[0] eq "a" ) || ( $_[0] eq "img" ) || ( $_[0] eq "link" ) || ( $_[0] eq "script" ) || ( $_[0] eq "form" ) || ( $_[0] eq "input" )) { $tag = "<$_[0]"; for my $a ( keys %{$_[2]} ) { my $t = $_[2]->{$a}; if ( $a =~ /^href|src|action$/i ) { $t = patchurl( $base, $t ); $tag .= qq( $a="$t" ); } else { $tag .= qq( $a="$t" ); } } $tag =~ s/\s+$//; # just in case $tag .= ">"; } $doc .= $tag; }