#!/usr/bin/perl -w # Link maintenance toy for a website # Waider 2000 use strict; use HTML::Parser; use GDBM_File; use File::Find; use File::Spec; use File::Basename; use Storable qw( freeze thaw ); use Getopt::Long; my $p; my $newdoc; my $linktext; my $href; my $url; my $script = 0; # Parse document text chunk by chunk my %linkfarm; chdir "$ENV{'HOME'}/public_html" or die "You don't have a public_html directory\n"; my $db = tie %linkfarm, 'GDBM_File', "$ENV{'HOME'}/public_html/.linkfarm", &GDBM_WRCREAT, 0644 or die $!; sub usage { print STDERR "usage:\n\t$0 [-dump] [-elisp] [-quiet] [-update] []\n"; exit 1; } my ( $dump, $elisp, $silent, $update, $lookup ) = ( 0, 0, 0, 0, "" ); GetOptions( "dump" => \$dump, "elisp" => \$elisp, "quiet" => \$silent, "update" => \$update, "lookup:s" => \$lookup, ) or usage(); # Dump the link farm in the form "phrase\0link1\0link2..." if ( $dump ) { for ( sort keys %linkfarm ) { print $_, "\0", join( "\0", @{thaw $linkfarm{$_}}), "\n"; } exit; } # new hack: output an alist directly. if ( $elisp ) { print "(\n"; for ( sort keys %linkfarm ) { print "(\""; print lisp_clean( $_ ); print "\" \""; print lisp_clean( @{thaw $linkfarm{$_}}); print "\")\n"; } print ")\n"; exit; } # lookup if ( $lookup ) { shift; my $phrase = lc( $lookup ); if ( defined( $linkfarm{$phrase})) { print "$lookup:\n "; print join( "\n ", @{thaw $linkfarm{$phrase}}), "\n" } exit; } # update if ( $update ) { print "Scanning for files..." unless $silent; find( { wanted => sub { push @ARGV, $File::Find::name if !-d $File::Find::name && $File::Find::name =~ /\.html$/; }, no_chdir => 1 }, '.'); print "done.\n" unless $silent; # Also, this is a full update, so flush what we have. for ( sort keys %linkfarm ) { delete $linkfarm{$_}; # is there a faster way to do this? } } # update 1 page if ( $#ARGV == -1 ) { # "Can't find any HTML pages to scan.\nStopped"; usage(); } # Create parser object $p = HTML::Parser->new(api_version => 3, start_h => [\&start, "tagname, attr, text" ], end_h => [\&end, "tagname, text" ], text_h => [\&default, 'text' ], default_h => [sub { $newdoc .= shift }, 'text' ], ); $p->unbroken_text( 1 ); while ($#ARGV > -1 ) { print STDERR "Scanning file $ARGV[0] for links..." unless $silent; $newdoc = ""; $linktext = ""; $p->parse_file($ARGV[0]); print STDERR "done.\n" unless $silent; if ( $href > 0 ) { print STDERR "WARNING: " . $ARGV[0] . " has an unclosed A tag.\n"; $href = 0; } elsif ( $href < 0 ) { print STDERR "WARNING: " . $ARGV[0] . " has an extra A close tag.\n"; $href = 0; } shift; } undef $db; untie %linkfarm; # Subs. sub normalise { my $url = shift; return if !defined( $url ); return $url if $url =~ /^[a-z]+:/; # go back already return undef if $url =~ /^#/; # don't care about in-page bookmarks # Get absolute path to link my $cwd = dirname( File::Spec->rel2abs( $ARGV[ 0 ], "$ENV{'HOME'}/public_html" )); $url =~ s|/~waider/||; if ( $url !~ /^\// ) { $url = File::Spec->rel2abs( $url, $cwd ); } if ( -d $url ) { $url .= "/index.html"; } my $tmpurl = $url; $tmpurl =~ s/#.*$//; # remove bookmarks if ( !-f $tmpurl ) { print STDERR "$url in $ARGV[0] is a dead link.\n" unless $silent; return undef; # ignore broken links! } # HAHAHAH. canonpath isn't REALLY canonical. Thanks for these, troc. 1 while ($url =~ s{(^|/)\.(/|$)}{$1&&$2&&'/'}gex); # cperl '}); 1 while ($url =~ s{(^|/)[^/]+?/\.\.(/|$)}{$1&&$2&&'/'}gex); # fix '}); $url =~ s{([^:/])/+}{$1/}g; # Finally, make it relative to my web tree $url = File::Spec->abs2rel( $url, "$ENV{'HOME'}/public_html" ); $url; } sub start { if ( $_[0] =~ /^a$/i ) { $url = normalise( $_[1]->{'href'} ); $href++; } $newdoc .= $_[2]; } sub end { my $cur; if ( $_[0] =~ /^a$/i ) { $linktext =~ s/[\n\r \t]+/ /gs; $linktext =~ s/^\s+//; $linktext =~ s/\s+$//; $linktext =~ s/"/"/; $linktext = lc( $linktext ); if ( length( $linktext ) > 1 ) { $cur = $linkfarm{$linktext}; if ( defined( $cur )) { $cur = thaw $cur; if ( defined( $url )) { if ( !grep /$url/, @$cur ) { push @$cur, $url; $linkfarm{$linktext} = freeze $cur; } } } else { if ( defined( $url )) { $cur = []; push @{$cur}, $url; $linkfarm{$linktext} = freeze $cur; } } } $linktext = ""; $href = 0; } $newdoc .= $_[1]; } # check if we should be saving text here. sub default { if ( $href ) { $linktext .= $_[0]; } $newdoc .= $_[0]; } sub lisp_clean { my $string = shift; $string =~ s/"/\\"/g; $string; }