#!/usr/bin/perl -w
use LWP::UserAgent;
use Date::Parse;
use HTML::Filter;
use CGI;

my $query;                     # CGI object, set only when run via the web server
my $debug = $ENV{'DEBUG'};     # any true value enables progress chatter
my $doc;                       # accumulator used by the HTML::Parser handlers below
my $base;                      # base URI of the most recently fetched page

# CGI-mode hooks: when invoked through the web server, emit the HTTP
# header immediately and require an authenticated user before doing
# any real work.
if ( $ENV{'REMOTE_ADDR'}) {
  $| = 1;                      # unbuffered, so the "please wait" shows up promptly
  $query = CGI->new;           # direct method call; indirect "new CGI" is deprecated
  print $query->header;
  if ( $ENV{'REMOTE_USER'}) {
	print "<p>snorq is generating your page, please wait...</p><pre>";
  } else {
	print "<p>You'll have to log in to run snorq!</p>";
	exit;
  }
}

# Everything below writes output files relative to this directory, so
# bail out if we can't get there (the old unchecked chdir silently
# left output wherever the script happened to start).
chdir( "$ENV{'HOME'}/public_html/misc/avantgo" )
  or die "can't chdir to $ENV{'HOME'}/public_html/misc/avantgo: $!";

# Stylesheet reference stuffed into the generated page's <head>.
my $stylesheet_link =
  '<link rel="stylesheet" href="../../waider.css" type="text/css">';

# Feed table.  Each entry is [ start URL, then zero or more regexes ].
# Each regex is matched against the previously fetched page and its
# first capture becomes the next URL to fetch; the final fetch is what
# gets saved.  A long tail of dead feeds (Cringely, Bobbins,
# StateSecrets, Slashdot, NTK, DNALounge, Eircom, Goats, JerkCity,
# Loadza, Register, Tatemae, Doonesbury) has been retired from this
# table over the years.
my %pages = (
  'jwzrants' =>
  [
   'http://www.jwz.org/gruntle',
   '<TR><TD VALIGN=TOP ALIGN=RIGHT><B><A HREF="([^"]+)"'
  ],

  'Dilbert' =>
  [
   'http://www.dilbert.com/comics/dilbert/archive/',
   'img.*?src="(/comics/dilbert/archive/images/dilbert\d+.gif)"'
  ],

  'TomTomorrow' =>
  [
   'http://www.workingforchange.com/column_lst.cfm?AuthrId=43',
   'HREF="(article.cfm\?ItemID=\d+)"',
   'src="(http://workingforchange.speedera.net/www.workingforchange.com/webgraphics/wfc/TM.*?)"'
  ],

  # Bobbins is no more, but hoorah, there is scarygoround!
  'ScaryGoRound' =>
  [
   'http://www.scarygoround.com/',
   'img src="(strips/.*?\.png)"'
  ],

  'RedMeat' =>
  [
   'http://www.redmeat.com/redmeat/current/index.html',
   'img src="(index\-\d+.gif)"'
  ],
);

# Date window (today and thirty days back) split into fields.  This
# fed the retired Eircom share-price-graph URL and is otherwise unused,
# kept for the day that feed comes back.
my $t1 = scalar( localtime( time - ( 60 * 60 * 24 * 30 )));
my $t2 = scalar( localtime( time ));
my @bits1 = split( /\s+/, $t1 );
my @bits2 = split( /\s+/, $t2 );

# ----------------------------- page layout, such as it is --------------------
my ( $host, $now );
$now = scalar( localtime );
$host = `/bin/hostname`;
# hostname(1) output ends in a newline; previously it was interpolated
# into the <address> line unchomped.
chomp $host if defined $host;
# Guard against an unset LOGNAME (e.g. under cron) so the heredoc
# interpolation doesn't warn under -w.
my $logname = defined $ENV{'LOGNAME'} ? $ENV{'LOGNAME'} : '';

# Skeleton output page.  The "<!-- feed me X -->" markers are replaced
# by the fetch loop below: a per-page marker first if one exists,
# otherwise the generic text/pix buckets.
my $compilation =<<"COMP";
<html>
  <head>
    <meta name="generator" content="snorq">
    <meta name="HandheldFriendly" content="True">
    <title>Compilation Page for $logname</title>
    <link rel="shortcut icon" HREF="/favicon.ico" TYPE="image/x-icon">
    $stylesheet_link
  </head>
  <body>
    <h1>Compilation Page for $logname</h1>
    <table border="0" cellpadding="0">
      <tr>
        <td><!-- feed me jwzrants --></td>
      </tr>
    </table>
    <!-- feed me text -->
    <!-- feed me pix -->
    <hr>
    <address>Generated by <a href="http://www.waider.ie/hacks/snorq.pl">snorq</a> on $host at $now</address>
  </body>
</html>
COMP

# ------------------------------- end of setup --------------------------------

# One shared user agent for every fetch.
my $ua = LWP::UserAgent->new;
$ua->env_proxy();                         # honour http_proxy and friends
$ua->agent( "Snorq/0.1" . $ua->agent );   # prepend our tag to the default UA string
my ( $req, $res );                        # request/response, reused per fetch

# Main fetch loop: for each feed (optionally restricted to names
# matching an @ARGV pattern), chase its rule chain to a final URL,
# fetch (and maybe filter) the result, save it to disk, and splice a
# link or inlined image into $compilation.
for my $page ( sort keys %pages ) {
  if ( $#ARGV != -1 ) {
	next unless grep /$page/i, @ARGV;
  }

  print "$page\n" if $debug;

  # Figure out what we're getting!
  my $content = "";
  my $contenttype = "";
  my $numrules = $#{$pages{$page}};
  my $n = -1; # gack
  my $url;

 RULE:
  for my $rule ( @{$pages{$page}} ) {

	# increment rule number
	$n++;

	print "   rule ", $n + 1, " of ", $numrules + 1, " : $rule\n" if $debug;

	# Empty $content means nothing fetched yet, so this rule is the
	# start URL; later rules are regexes whose first capture is the
	# next URL to fetch.  (NOTE(review): a page that legitimately
	# fetched as "" would wrongly re-enter URL mode.)
	if ( !$content ) {
	  # First rule is always a URL
	  $url = $rule;
	} else {
	  ( $url ) = $content =~ m/$rule/si;
	  if (!defined( $url )) {
		print "   error extracting $rule\n" if $debug;
		$content = undef;
		last RULE;
	  }
	}

	# Patch in base and stuff
	# $base is the base URI of the previous response, used to
	# absolutize extracted URLs.  ($uri is an unscoped global; this is
	# a near-duplicate of the logic in patchurl() below.)
	if ( defined $base ) {
	  $uri = new URI $url;
	  if ( !defined( $uri->scheme ) or !$uri->scheme ) {
		$uri = new URI $url, $base->scheme;
	  }

	  # Gack! relative URL!
	  if ( $uri->path !~ m|^/| ) {
		local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack
		$uri = URI->new($url)->abs( $base );
	  }

	  if ( !defined( $uri->host )) {
		$uri->scheme( $base->scheme );
		$uri->host( $base->host );
	  }

	  $url = $uri->as_string;
	}

	print "   fetching $url\n" if $debug;

	$cached = 0;

	# if this is the terminal rule, try a HEAD instead of a GET
	# (compare Last-Modified against the mtime of the previously saved
	# file to decide whether a re-fetch could be skipped).
	if ( $n == $numrules ) {
	  $req = new HTTP::Request
		HEAD => $url;

	  $res = $ua->request( $req );

	  if ( $res->is_success ) {
		my $utime;

		# See if we get a datestamp
		# (NOTE(review): $date is an unscoped global, so a feed whose
		# HEAD fails can inherit a stale $date from an earlier feed.)
		$date = $res->headers->header( 'Last-Modified' );
		if ( defined( $date )) {
		  print "   Last Mod: $date\n" if $debug;
		  $utime = str2time( $date );
		} else {
		  $utime = 0;
		}
		$contenttype = $res->content_type;

		# And this is what we call a "hack"
		# Saved filename is "<page>_<content-type>" with "/" -> ".".
		$filename = "${page}_$contenttype";
		$filename =~ s|/|.|g;

		if ( -f $filename ) {
		  (undef, undef, undef, undef, undef, undef, undef, undef,
		   undef, $mtime, undef, undef, undef ) = stat( $filename );
		  if ( $mtime > $utime ) {
			$cached = 1;
		  } else {
			$cached = 0;
		  }
		}
	  } else {
		print "   head failed, for some reason.\n" if $debug;
	  }
	}

	# Screw caching, since it seems not to work.
	# (This deliberately defeats the whole HEAD/mtime check above.)
	$cached = 0;

	if ( $cached ) {
	  print "   cached, not fetching\n" if $debug;
	} else {
	  $req = new HTTP::Request
		GET => $url;

	  $res = $ua->request( $req );

	  if ( $res->is_success ) {
		$content = $res->content;
		$contenttype = $res->content_type;

		# And this is what we call a "hack"
		$filename = "${page}_$contenttype";
		$filename =~ s|/|.|g;

		$base = $res->base;
	  } else {
		print "   error fetching data\n" if $debug;
		# NOTE(review): clobbers the loop variable so the failure text
		# would appear in debug output; $content is undeffed so the
		# post-loop code skips this feed entirely.
		$page = $res->as_string;
		undef $content;
		last RULE;
	  }
	}

	next if !defined( $content );
	next if $n < $numrules;

	print "   Item $page, content type $contenttype successfully fetched.\n" if $debug;

	# Now, filter the page.
	# Filter layout: first element trims everything up to and
	# including it, second trims from itself to the end, remaining
	# elements are search/replace pairs applied with s///sgie.
	# (NOTE(review): every %filters definition above is commented out,
	# so this branch is currently dead code.)
	if ( defined( $filters{$page})) {
	  print "   filtering it: " if $debug;

	  print "start..." if $debug;
	  my @filters = reverse @{$filters{$page}};

	  my $filter = pop @filters;
	  $content =~ s/^.*?$filter//si;

	  print "end..." if $debug;
	  $filter = pop @filters;
	  $content =~ s/$filter.*?$//si;

	  if ( $#filters != -1 ) {
		print "body..." if $debug;

		while ( $#filters != -1 ) {
		  my $search = pop @filters;
		  my $replace = pop @filters;

		  # /e evaluates the replacement as the expression "$replace",
		  # i.e. its plain string value, without re-interpolation.
		  $content =~ s/$search/$replace/sgie;
		}
	  }
	  print "done.\n" if $debug;
	}

  }

  # Don't bother doing more if we couldn't get the page
  next unless $content;

  # Fix up URLs
  # Re-parse fetched HTML and rewrite link attributes to absolute URLs
  # via p_start()/patchurl(); $doc accumulates the rewritten document.
  if ( $contenttype =~ /^text\/html/i ) {
	print "   Repatching URLs to $base\n" if $debug;
	$doc = "";
	my $parser = HTML::Parser->new( api_version => 3,
									start_h => [\&p_start,
												"tagname, text, attr"],
									default_h =>
									[ sub { $doc .= shift }, "text"]
								  );
	$parser->parse( $content );
	$parser->eof;
	$content = $doc;
  }

  # Save the damn thing
  open( PAGE, ">$filename" );
  binmode(PAGE); # GRR.
  print PAGE $content;
  close( PAGE );

  # Figure out the link type, and add it.
  # Images get sliced into handheld-sized tiles and inlined; anything
  # else gets a hyperlink.  Either lands on the page-specific
  # "<!-- feed me $page -->" marker if present, otherwise on the
  # generic pix/text marker (which is kept, via $1, for later feeds).
  if ( $contenttype =~ /^image/i ) {
	print "   Slicing image... [$page/$contenttype]" if $debug;
	$new = carve_image( $page, $contenttype );
	unlink( $filename ); # don't leave the old image lying around
	print "done.\n" if $debug;

	# See if it's got a place of its own to go into.
	if (!( $compilation =~
		   s|(<!-- feed me $page -->)|$new\n|)) {
	  $compilation =~ s|(<!-- feed me pix -->)|$new\n$1|;
	}
  } else {
	my $srcurl = "";
	$srcurl = " (<a href=\"$url\">from $url</a>)<br>" if $debug;
	$srcurl .= " ($date)" if $date;
	if (!(	$compilation =~
			s|(<!-- feed me $page -->)|<a href="$filename">$page</a>$srcurl\n|
)) {
	  $compilation =~
		s|(<!-- feed me text -->)|<a href="$filename">$page</a>$srcurl\n$1|;
	}
  }
}

# Write the assembled compilation page.  Die on failure rather than
# silently producing nothing (the old bareword two-arg open was
# completely unchecked); the write error, if any, surfaces at close.
open( my $out, '>', 'index.html' ) or die "can't write index.html: $!";
print $out $compilation;
close( $out ) or die "error writing index.html: $!";

# In CGI mode, close the <pre> opened earlier and bounce the browser
# to the generated pages.
if ( defined( $query )) {
  print "</pre><p>All done. Redirecting...</p>\n";
  print <<"EOT";
<script>document.location = "http://www.waider.ie/avantgo/misc/"</script>
EOT
}

# This is ghastly, but noone seems to have a nice image processing
# module for Perl that I could use instead.
sub carve_image {
  # Slice the saved image file "<name>_<type-with-dots>" into a grid
  # of 150x140-pixel GIF tiles under directory $name/ and return HTML
  # for a <table> that reassembles them (handheld-friendly tile sizes,
  # presumably -- TODO confirm the 150x140 choice).  Relies on the
  # external netpbm tools (pngtopnm/anytopnm, pnmfile, pnmcut,
  # ppmquant, ppmtogif) being on $PATH; $pnmfile/$wide/$high/$x/$y/
  # $w/$h are unscoped globals.
  my ( $name, $type ) = @_;
  my $html = "";

  my  $filename = "${name}_$type";
  $filename =~ s|/|.|g;

  # Make directory FIXME nuke it if it exists
  mkdir $name, 0755 unless -d $name;

  # Convert to a pnm
  # (NOTE(review): $filename/$name are interpolated unquoted into a
  # shell command -- page names and content types are assumed to be
  # shell-safe.)
  if ( $type eq "image/png" ) {
	`pngtopnm $filename > $name/$filename 2>/dev/null`;
  } else {
	`anytopnm $filename > $name/$filename 2>/dev/null`;
  }

  # Get dimensions (use Image::Info for this!)
  # pnmfile reports something like "...: PPM raw, 600 by 400 ...";
  # capture the "W by H" pair.
  $pnmfile = `pnmfile $name/$filename`;
  ( $wide, $high ) = $pnmfile =~ m/:.*?,\s(\d+)\sby\s(\d+).*?/i;

  # Couldn't parse the dimensions: surface the raw pnmfile output in
  # the page instead of an image table.
  return qq( <pre>$pnmfile</pre>\n)
	if !defined( $wide ) or !defined( $high );


  $html = qq(<table cellpadding="0" cellspacing="0" border="0">\n);

  # Walk the image in 140-high rows of 150-wide tiles; edge tiles get
  # whatever remains.  The "$ {name}" spacing is deliberate: it is
  # ${name} written so the brace isn't swallowed by the interpolation.
  for ( $y = 0; $y < $high; $y += 140 ) {
	if ( $y + 140 > $high ) {
	  $h = $high - $y;
	} else {
	  $h = 140;
	}

	$html.="<tr>";

	for ( $x = 0; $x < $wide; $x += 150 ) {
	  if ( $x + 150 > $wide ) {
		$w = $wide - $x;
	  } else {
		$w = 150;
	  }

	  `pnmcut $x $y $w $h $name/$filename | ppmquant 256 2>/dev/null | ppmtogif > $name/$ {name}_$ {x}_$ {y}.gif 2>/dev/null`;
	  $html .= qq(<td><img src="$name/$ {name}_$ {x}_$ {y}.gif" width="$w" height="$h"></td>);
	}
	$html.="</tr>\n";
  }
  $html .= "</table>\n";

  # Cleanup
  # Remove the intermediate PNM; the GIF tiles stay for the web server.
  unlink( "$name/$filename" );

  return $html;
}

# Resolve $url against the base URI $base, defaulting to an http
# scheme whenever one can't be determined.  Any error during the
# resolution steps makes us hand back the original URL string
# untouched.  Duplicates the inline logic in the main fetch loop.
sub patchurl {
  my ( $base, $url ) = @_;

  my $u = URI->new( $url );

  eval {
	# No scheme at all?  Borrow the base document's, or guess http.
	if ( !defined( $u->scheme ) or !$u->scheme ) {
	  $u = URI->new( $url, ($base->scheme || 'http') );
	}

	# Relative path: resolve it against the base URL.
	if ( $u->path !~ m|^/| ) {
	  local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
	  $u = URI->new( $url )->abs( $base );
	}

	# Still hostless: graft on the base's scheme and host.
	if ( !defined( $u->host )) {
	  $u->scheme( $base->scheme || 'http' );
	  $u->host( $base->host );
	}
  };

  # Some pages (thanks, slashdot) still yield scheme-less URLs.
  $u->scheme( 'http' ) unless $u->scheme;

  # Resolution blew up somewhere: bail out with the raw URL.
  return $url if $@;

  return $u->as_string;
}

# HTML::Parser start-tag handler (registered with argspec
# "tagname, text, attr").  For tags that can carry URLs, the tag is
# rebuilt with href/src/action attribute values absolutized through
# patchurl(); all other tags pass through verbatim.  Output
# accumulates in the global $doc.
#
# NOTE(review): attribute order comes from "keys", so rebuilt tags may
# list attributes in a different order than the input, and values are
# re-emitted without HTML escaping -- both preexisting behaviour.
sub p_start {
  my ( $tagname, $text, $attr ) = @_;
  my $tag = $text;

  if (( $tagname eq "a" ) || ( $tagname eq "img" ) || ( $tagname eq "link" ) ||
	  ( $tagname eq "script" ) || ( $tagname eq "form" ) || ( $tagname eq "input" )) {
	$tag = "<$tagname";
	for my $a ( keys %{$attr} ) {
	  my $t = $attr->{$a};
	  # Anchored, grouped alternation.  The original /^href|src|action$/
	  # parsed as (^href)|(src)|(action$), so e.g. "hreflang" or any
	  # attribute merely containing "src" got rewritten too.
	  $t = patchurl( $base, $t ) if $a =~ /^(?:href|src|action)$/i;
	  $tag .= qq( $a="$t" );
	}
	$tag =~ s/\s+$//; # drop the trailing space from the last attribute
	$tag .= ">";
  }
  $doc .= $tag;
}

