#!/usr/bin/perl -w use LWP::UserAgent; use Date::Parse; use HTML::Filter; use CGI; my $query; my $debug = $ENV{'DEBUG'}; my $doc; my $base; # CGI-mode hooks if ( $ENV{'REMOTE_ADDR'}) { $| = 1; $query = new CGI; print $query->header; if ( $ENV{'REMOTE_USER'}) { print "
snorq is generating your page, please wait...
";
} else {
print "You'll have to log in to run snorq!
";
exit;
}
}
chdir( "$ENV{'HOME'}/public_html/misc/avantgo" );
my $stylesheet_link =
'';
my %pages;
#$pages{'Cringely'} =
# [
# 'http://www.pbs.org/cgi-registry/cringely/thisweek.pl?pulpit'
# ];
#$filters{'Cringely'} =
# [
# '',
# '',
# '^\s*', 'Cringely ',
# '$', ''
# ];
$pages{'jwzrants'} =
[
'http://www.jwz.org/gruntle',
'',
# '\", # title
# '\", # credit
# '\',
# '
',
# '?font.*?>', '',
# 'bgcolor="#......"', ''
# ];
# Hah. This is fun.
my $t1 = scalar( localtime( time - ( 60 * 60 * 24 * 30 )));
my $t2 = scalar( localtime( time ));
my @bits1 = split( /\s+/, $t1 );
my @bits2 = split( /\s+/, $t2 );
# I sold my shares. Actually, they were compulsorily
# acquired. Bastards. DIE DIE FORNICATE DIE 666 DIE, as Jerrell would
# say.
#$pages{'Eircom'} =
# [
# "http://www.ise.ie/php3/graph_make.php3" .
# "?CompID=189&Index=None&CompID_A=189&CompID_B=None" .
# "&Start=" . $bits1[ 2 ] . "+" . $bits1[ 1 ] . "+" . $bits1[ 4 ] .
# "&End=" . $bits2[ 2 ] . "+" . $bits2[ 1 ] . "+" . $bits2[ 4 ]
# ];
#$pages {'Goats'} =
# [
# 'http://www.goats.com/',
# 'img src="(/comix/\d+/goats\d+\.gif)"'
# ];
#$pages {'JerkCity'} =
# [
# 'http://www.jerkcity.com/',
# 'img [^>]*src="/(jerkcity\d+.gif)'
# ];
#$pages {'Loadza' } =
# [
# 'http://www.unison.ie/loadza/display_results.php3?cat_id=258'
# ];
#$pages {'Register'} =
# [
# 'http://www.theregister.co.uk/'
# ];
#$filters{'Register'} =
# [
# '
',''
# ];
# also no more
#$pages {'Tatemae'} =
# [
# 'http://www.cloudiness.com/tatemae/'
# ];
#$pages{'Doonesbury'} =
# [
# 'http://www.doonesbury.com/strip/dailydose/',
# 'img.*?src="[^"]*(/comics/db/\d+/db\d+.gif)"',
# ];
# ----------------------------- page layout, such as it is --------------------
my ( $host, $now );
$now = scalar( localtime );
$host = `/bin/hostname`;
my $compilation =<<"COMP";
Compilation Page for $ENV{'LOGNAME'}
$stylesheet_link
Compilation Page for $ENV{'LOGNAME'}
Generated by snorq on $host at $now
COMP
# ------------------------------- end of setup --------------------------------
my $ua = new LWP::UserAgent;
$ua->agent( "Snorq/0.1" . $ua->agent );
my ( $req, $res );
$ua->env_proxy();
for my $page ( sort keys %pages ) {
if ( $#ARGV != -1 ) {
next unless grep /$page/i, @ARGV;
}
print "$page\n" if $debug;
# Figure out what we're getting!
my $content = "";
my $contenttype = "";
my $numrules = $#{$pages{$page}};
my $n = -1; # gack
my $url;
RULE:
for my $rule ( @{$pages{$page}} ) {
# increment rule number
$n++;
print " rule ", $n + 1, " of ", $numrules + 1, " : $rule\n" if $debug;
if ( !$content ) {
# First rule is always a URL
$url = $rule;
} else {
( $url ) = $content =~ m/$rule/si;
if (!defined( $url )) {
print " error extracting $rule\n" if $debug;
$content = undef;
last RULE;
}
}
# Patch in base and stuff
if ( defined $base ) {
$uri = new URI $url;
if ( !defined( $uri->scheme ) or !$uri->scheme ) {
$uri = new URI $url, $base->scheme;
}
# Gack! relative URL!
if ( $uri->path !~ m|^/| ) {
local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack
$uri = URI->new($url)->abs( $base );
}
if ( !defined( $uri->host )) {
$uri->scheme( $base->scheme );
$uri->host( $base->host );
}
$url = $uri->as_string;
}
print " fetching $url\n" if $debug;
$cached = 0;
# if this is the terminal rule, try a HEAD instead of a GET
if ( $n == $numrules ) {
$req = new HTTP::Request
HEAD => $url;
$res = $ua->request( $req );
if ( $res->is_success ) {
my $utime;
# See if we get a datestamp
$date = $res->headers->header( 'Last-Modified' );
if ( defined( $date )) {
print " Last Mod: $date\n" if $debug;
$utime = str2time( $date );
} else {
$utime = 0;
}
$contenttype = $res->content_type;
# And this is what we call a "hack"
$filename = "${page}_$contenttype";
$filename =~ s|/|.|g;
if ( -f $filename ) {
(undef, undef, undef, undef, undef, undef, undef, undef,
undef, $mtime, undef, undef, undef ) = stat( $filename );
if ( $mtime > $utime ) {
$cached = 1;
} else {
$cached = 0;
}
}
} else {
print " head failed, for some reason.\n" if $debug;
}
}
# Screw caching, since it seems not to work.
$cached = 0;
if ( $cached ) {
print " cached, not fetching\n" if $debug;
} else {
$req = new HTTP::Request
GET => $url;
$res = $ua->request( $req );
if ( $res->is_success ) {
$content = $res->content;
$contenttype = $res->content_type;
# And this is what we call a "hack"
$filename = "${page}_$contenttype";
$filename =~ s|/|.|g;
$base = $res->base;
} else {
print " error fetching data\n" if $debug;
$page = $res->as_string;
undef $content;
last RULE;
}
}
next if !defined( $content );
next if $n < $numrules;
print " Item $page, content type $contenttype successfully fetched.\n" if $debug;
# Now, filter the page.
if ( defined( $filters{$page})) {
print " filtering it: " if $debug;
print "start..." if $debug;
my @filters = reverse @{$filters{$page}};
my $filter = pop @filters;
$content =~ s/^.*?$filter//si;
print "end..." if $debug;
$filter = pop @filters;
$content =~ s/$filter.*?$//si;
if ( $#filters != -1 ) {
print "body..." if $debug;
while ( $#filters != -1 ) {
my $search = pop @filters;
my $replace = pop @filters;
$content =~ s/$search/$replace/sgie;
}
}
print "done.\n" if $debug;
}
}
# Don't bother doing more if we couldn't get the page
next unless $content;
# Fix up URLs
if ( $contenttype =~ /^text\/html/i ) {
print " Repatching URLs to $base\n" if $debug;
$doc = "";
my $parser = HTML::Parser->new( api_version => 3,
start_h => [\&p_start,
"tagname, text, attr"],
default_h =>
[ sub { $doc .= shift }, "text"]
);
$parser->parse( $content );
$parser->eof;
$content = $doc;
}
# Save the damn thing
open( PAGE, ">$filename" );
binmode(PAGE); # GRR.
print PAGE $content;
close( PAGE );
# Figure out the link type, and add it.
if ( $contenttype =~ /^image/i ) {
print " Slicing image... [$page/$contenttype]" if $debug;
$new = carve_image( $page, $contenttype );
unlink( $filename ); # don't leave the old image lying around
print "done.\n" if $debug;
# See if it's got a place of its own to go into.
if (!( $compilation =~
s|()|$new\n|)) {
$compilation =~ s|()|$new\n$1|;
}
} else {
my $srcurl = "";
$srcurl = " (from $url)
" if $debug;
$srcurl .= " ($date)" if $date;
if (!( $compilation =~
s|()|$page$srcurl\n|
)) {
$compilation =~
s|()|$page$srcurl\n$1|;
}
}
}
open( PAGE, ">index.html" );
print PAGE $compilation;
close( PAGE );
if ( defined( $query )) {
print "All done. Redirecting...
\n";
print <<"EOT";
EOT
}
# This is ghastly, but noone seems to have a nice image processing
# module for Perl that I could use instead.
sub carve_image {
my ( $name, $type ) = @_;
my $html = "";
my $filename = "${name}_$type";
$filename =~ s|/|.|g;
# Make directory FIXME nuke it if it exists
mkdir $name, 0755 unless -d $name;
# Convert to a pnm
if ( $type eq "image/png" ) {
`pngtopnm $filename > $name/$filename 2>/dev/null`;
} else {
`anytopnm $filename > $name/$filename 2>/dev/null`;
}
# Get dimensions (use Image::Info for this!)
$pnmfile = `pnmfile $name/$filename`;
( $wide, $high ) = $pnmfile =~ m/:.*?,\s(\d+)\sby\s(\d+).*?/i;
return qq( $pnmfile
\n)
if !defined( $wide ) or !defined( $high );
$html = qq(\n);
for ( $y = 0; $y < $high; $y += 140 ) {
if ( $y + 140 > $high ) {
$h = $high - $y;
} else {
$h = 140;
}
$html.="";
for ( $x = 0; $x < $wide; $x += 150 ) {
if ( $x + 150 > $wide ) {
$w = $wide - $x;
} else {
$w = 150;
}
`pnmcut $x $y $w $h $name/$filename | ppmquant 256 2>/dev/null | ppmtogif > $name/$ {name}_$ {x}_$ {y}.gif 2>/dev/null`;
$html .= qq(
);
}
$html.=" \n";
}
$html .= "
\n";
# Cleanup
unlink( "$name/$filename" );
return $html;
}
sub patchurl
{
my $base = shift;
my $url = shift;
my $uri = new URI $url;
eval {
if ( !defined( $uri->scheme ) or !$uri->scheme ) {
$uri = new URI $url, ($base->scheme || 'http'); # what the hell?
}
# Gack! relative URL!
if ( $uri->path !~ m|^/| ) {
local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack
$uri = URI->new($url)->abs( $base );
}
if ( !defined( $uri->host )) {
$uri->scheme( $base->scheme || 'http' );
$uri->host( $base->host );
}
};
$uri->scheme( 'http' ) unless $uri->scheme; # thanks, slashdot
return $url if $@; # bail out if there's an error.
$uri->as_string;
}
sub p_start
{
my $tag = $_[1];
if (( $_[0] eq "a" ) || ( $_[0] eq "img" ) || ( $_[0] eq "link" ) ||
( $_[0] eq "script" ) || ( $_[0] eq "form" ) || ( $_[0] eq "input" )) {
$tag = "<$_[0]";
for my $a ( keys %{$_[2]} ) {
my $t = $_[2]->{$a};
if ( $a =~ /^href|src|action$/i ) {
$t = patchurl( $base, $t );
$tag .= qq( $a="$t" );
} else {
$tag .= qq( $a="$t" );
}
}
$tag =~ s/\s+$//; # just in case
$tag .= ">";
}
$doc .= $tag;
}