#!/usr/bin/perl # # All the cool kids are doing RSS. ME TOO! # use lib qw( /sw/lib/perl5 ); use XML::RSS; use XML::Atom::Feed; use LWP::UserAgent; use Date::Parse; use CGI; use POSIX; use Digest::MD5 qw( md5_hex ); use Data::Dumper; use strict; use HTML::Entities; use Unicode::String qw( utf8 ); use DBI; use DBD::mysql; use Storable qw( freeze ); use Encode qw( encode_utf8 decode_utf8 ); use HTML::TokeParser; use Getopt::Long; BEGIN { # web user will not have HOME set, so be cheeky and assume it's my # own homedir that's required. if ( !( $ENV{HOME}||"" )) { my ( $name, $passwd, $uid, $gid, $quota, $comment, $gcos, $dir, $shell, $expire ) = getpwnam( "waider" ); $ENV{HOME} = $dir; } } use lib "$ENV{HOME}/src/perl"; use WaiderDotIe qw( getconfig ); # IWBNI: # * Users, with passwords and preferences and what not. # * Clean up cache dir/database if a feed is removed # * Clean out expired items my $conf = getconfig(); # connect to the database my $dbh = DBI->connect( "DBI:mysql:database=$conf->{database};host=$conf->{host};port=$conf->{port}" . ( $conf->{ssl} ? ';mysql_ssl=1' : '' ), $conf->{dbuser}, $conf->{dbpass} ) or die $DBI::errstr; # in case of flakiness $dbh->{mysql_auto_reconnect} = 1; # get the list of feeds + urls my @urls; my $feeds = $dbh->selectall_arrayref( 'SELECT name, url, feedid, error FROM feeds' ) or die $DBI::errstr; @urls = @{$feeds}; my $debug = 0; my $reparse = 0; my $refresh = 0; GetOptions( 'debug!' => \$debug, 'refresh!' => \$refresh, 'reparse!' => \$reparse ) or die; my $only = shift; $XML::RSS::AUTO_ADD = 1; # force acceptance of extra namespaces my $ua = new LWP::UserAgent; $ua->agent( "RSS/0.1 " . $ua->agent ); $ua->env_proxy(); # Piping hot! $| = 1; for my $feed ( @urls ) { my ( $title, $url, $feedid, $lasterror ) = @{$feed}; my ( $rss, $channel, $res, $req, $content ); my $lastupdated = time; if ( defined( $only )) { print STDERR "> $feedid: $title\n" if $debug; next unless $title eq $only; } print STDERR "Doing $title\n" if $debug; # see what we have in the database my $feeddata = $dbh->selectall_arrayref( 'SELECT feedid, name, url, ' . 'UNIX_TIMESTAMP(lastupdate), rawfeed, ' . 'etag, modified, cleanfeed, charset ' . 'FROM feeds WHERE feedid=?', undef, $feedid ); my %thisfeed = ( feedid => $feeddata->[0]->[0], name => $feeddata->[0]->[1], url => $feeddata->[0]->[2], lastupdate => $feeddata->[0]->[3], rawfeed => $feeddata->[0]->[4], etag => $feeddata->[0]->[5], modified => $feeddata->[0]->[6], cleanfeed => $feeddata->[0]->[7], charset => $feeddata->[0]->[8], ); if ( $refresh or !$thisfeed{rawfeed}) { $thisfeed{lastupdate} = undef; $thisfeed{rawfeed} = undef; $thisfeed{etag} = undef; $thisfeed{modified} = undef; $thisfeed{cleanfeed} = undef; } # First, check against the cachefile whether we need to update or # not. This includes checking how often the cachefile itself says we # should update. NB this operates on the *cleaned* feed. if ( $thisfeed{cleanfeed} ) { my $updateFreq = 1800; # minimum 30 minutes between refreshes $rss = new XML::RSS; eval { $rss->parse( $thisfeed{cleanfeed} ) or warn $!; }; # check if it needs to be refreshed $channel = $rss->{channel}; if ( defined( $channel )) { my $syn = $channel->{syn}; if ( defined( $syn )) { $updateFreq = $syn->{updateFrequency}; my $updatePeriod = $syn->{updatePeriod}; if ( $updatePeriod eq 'hourly' ) { $updateFreq *= ( 60 * 60 ); } elsif ( $updatePeriod eq 'daily' ) { $updateFreq *= ( 60 * 60 * 24 ); } else { warn "No idea what to do with $updatePeriod\n"; $updateFreq *= ( 60 * 60 * 24 ); } } } # now check against the file date my $mtime = $thisfeed{lastupdate}; $lastupdated = $mtime; if ( time > $mtime + $updateFreq ) { undef $rss; # forces reload } else { print STDERR " not due for update yet\n" if $debug; next unless $only or $refresh or $reparse; } } else { print STDERR " no clean feed found\n" if $debug; } $lasterror = '' if $refresh; if ( !$rss ) { print STDERR " Fetching $url...\n" if $debug; $req = new HTTP::Request GET => $url; # Support ETag and If-Modified-Since # No point if the cache file is missing, mind you. if ( $thisfeed{cleanfeed}||'' ) { if ( $thisfeed{etag}||'' ) { $req->push_header( 'If-None-Match', $thisfeed{etag} ); } if ( $thisfeed{modified}||'' ) { $req->push_header( 'If-Modified-Since', $thisfeed{modified} ); } } $res = $ua->request( $req ); if ( $res->is_success ) { $content = $res->content; $lastupdated = time; my $save = $dbh->prepare( 'UPDATE feeds SET rawfeed=? WHERE feedid=?' ); my $saveres = $save->execute( $content, $thisfeed{feedid} ); if ( !defined( $saveres )) { die $DBI::errstr; } # save the ETag/Last-Modified bits for my $hdr ( "ETag", "Last-Modified", "Content-Type" ) { if ( $debug ) { print STDERR " $hdr: " . ( $res->headers->header( $hdr ) || "unset" ) . "\n"; } if ( my $val = $res->headers->header( $hdr )) { if ( $hdr eq "ETag" ) { $save = $dbh->prepare( 'UPDATE feeds SET etag=? WHERE feedid=?' ); } elsif ( $hdr eq "Last-Modified" ) { $save = $dbh->prepare( 'UPDATE feeds SET modified=? WHERE feedid=?' ); # snag the last-modified date if it's present my $tval = str2time( $val ); $lastupdated = $tval if $tval; } else { $save = $dbh->prepare( 'UPDATE feeds SET charset=? WHERE feedid=?' ); $thisfeed{charset} = $val; } $saveres = $save->execute( $val, $thisfeed{feedid} ); if ( !defined( $saveres )) { warn "saving headers: $DBI::errstr"; } } } } else { # Not modifed if ( $res->code == 304 ) { print STDERR " Page not modified.\n" if $debug; next unless $reparse; } else { print STDERR " Failed to fetch page: " . $res->code . " " . $res->message . "\n" if $debug; next; } } $reparse = 1; } if ( $reparse ) { # if the content is undefined, then snork it up from the cache file. if ( !defined( $content ) or $content eq "" ) { print STDERR " Using cached version\n" if $debug; if ( $thisfeed{rawfeed}||"" ) { $content = $thisfeed{rawfeed}; } else { print STDERR " No cached feed for $title!\n" if $debug; } } # If we still have no content, flee the premises. if ( !defined( $content )) { print STDERR "No content\n" if $debug; next; } my ( $base ) = $url =~ m@^(.*://?.+)/@; my ( $site ) = $url =~ m@^(.*://?[^/]+)/@; my $original = $content; my $preparsed; my ( $charset, $contenttype ); if ( $contenttype = $thisfeed{charset} ) { if ( $contenttype =~ /\bcharset=([^ ;]+)/ ) { print STDERR " Charset: $1\n" if $debug; $charset = $1; } else { $charset = ""; } $contenttype =~ s/;.*$//; print STDERR " Cleaned Type: $contenttype\n" if $debug; } if (( $contenttype ne "text/xml" ) && # is it REALLY html? ( $content =~ /^<\?xml/ )) { $contenttype = "text/xml"; print STDERR " Fixed content type\n" if $debug; } # convert everything to Atom # if ( $content !~ m@(http://purl.org/atom|xmlns=["']?http://www.w3.org/2005/Atom)@si ) { # my $atom = new XML::Atom::Feed; # my $rss = new XML::RSS; # $rss->parse( $content ); # } # next; # if it's an Atom feed, switch it to RSS. This is horribly rough. if ( $content =~ m@(http://purl.org/atom|xmlns=["']?http://www.w3.org/2005/Atom)@si ) { print STDERR " Converting to RSS..." if $debug; eval { $preparsed = atom_to_rss( $content, $url ); }; if ( $@ ) { print STDERR "Feed $feedid ($url): conversion to rss from atom failed! $@"; next; } print STDERR "done.\n" if $debug; } # throw away everything before the first XML declaration so we # can get a clean parse if at all possible. $content =~ s/^.*?(<\?xml)/$1/s; # should be a one-off, but. $content =~ s/£/\&pound/gs; # debugging if ( $debug ) { open( PREPARSE, ">/var/tmp/$title.preparse" ) or warn $!; binmode( PREPARSE ); print PREPARSE $content; close( PREPARSE ); } print STDERR " Parsing RSS " if $debug; eval { # clean up fail-files unlink( "/var/tmp/$feedid.failed" ); unlink( "/var/tmp/$feedid.parsed" ); unlink( "/var/tmp/$title.new" ); die "content-type incorrect" if $contenttype ne "text/xml"; if ( !defined( $preparsed )) { $rss = new XML::RSS( version => "1.0", encoding => 'UTF-8', encode_output => 1 ); if ( !$rss->parse( $content )) { my $err = ""; $err = $@ if $@; $err .= " ($!)" if $! and $! ne "Success"; $err ||= "unknown error"; # Eval will catch print STDERR " $err\n" if $debug; die "\nparser failed: $err. File in /var/tmp/$feedid.failed"; } } else { $rss = $preparsed; } print STDERR $rss->{version} . "\n" if $debug; if ( $debug ) { open( PARSED, ">/var/tmp/$feedid.parsed" ) or warn $!; binmode( PARSED ); print PARSED Dumper( $rss ); close( PARSED ); } # pubDate needs to be in strict Mail date format for RSS2 $channel = $rss->{channel}; if ( defined( $channel->{pubDate})) { print STDERR " Fixing pubDate for channel: " if $debug; my $fixdate = str2time( $channel->{pubDate}); if ( defined( $fixdate )) { $channel->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z", localtime( $fixdate )); } else { # best we can do $channel->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /; } print STDERR $channel->{pubDate} . "\n" if $debug; } # XML::RSS barfs on feed images without titles. But it # barfs if there's no image tag, too. What a piece of # shit. delete $rss->{image}->{url} if defined( $rss->{image} ); # Fix a variety of per-item bogosity map { # trim title $_->{title} =~ s/\s+$//s; $_->{title} =~ s/^\s+$//s; # ARGH. XML::RSS won't save items with blank titles. $_->{title} = "(untitled)" if !( $_->{title}); # RTE: wtf? if ( exists( $_->{item} )) { delete $_->{item}; } # and the fucking stupidity with the entities #$_->{title} =~ s/&(?!(amp|gt|lt))/&/gs; # RSS 2.0 gives a content:encoded block which contains # a formatted version of the post. Ideally I'd like to # use this. Both description and content:encoded have # entity escapes, though. my $ctag = $_->{content}; if ( defined( $ctag ) and ref $ctag eq "HASH" and defined( $ctag->{encoded})) { $_->{description} = $ctag->{encoded}; } # cope with 2.0 RSS feed. if ( defined( $_->{guid})) { # don't overwrite good links $_->{link} = $_->{guid} unless $_->{link}; } # Some of the pubDate stuff is messed up. if ( defined( $_->{pubDate} )) { my $fixdate = str2time( $_->{pubDate}); if ( defined( $fixdate )) { $_->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z", localtime( $fixdate )); } else { # best we can do $_->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /; } } # HURGH. I don't know whose fault this is, but I don't like it. #$_->{description} =~ s/=&quot;(.+?)&quot;/="$1"/gis; # DIE PUNY HUMANS, so to speak. Turns out Warren ain't # alone. also, I should probably do this with a # parser. $_->{description} =~ s@(href|src|data)="/@$1="$site/@gs; $_->{description} =~ s@(href|src|data)="(?!(http|ftp))@$1="$base/@gs; # final cleanup: nuke leading/trailing space $_->{description} =~ s/^\s+//; $_->{description} =~ s/\s+$//; } @{$rss->{items}}; if ( @{$rss->{items}}) { $rss->{charset} = $charset; timestamp( $title, $rss, $lastupdated, $thisfeed{feedid} ); } else { print STDERR "no items in $title\n"; if ( $debug ) { print STDERR $original . "\n"; } } # this tosspot saves entites as unicode, causing further # irritation. $rss->{channel}->{title} =~ s/&(?!(amp|gt|lt))/&/gs; }; my $errst = $dbh->prepare( "UPDATE feeds SET error=? WHERE feedid=?" ); if ( $@ ) { # did this break already? if ( !$lasterror ) { chomp( $@ ); warn "$title ($url): " . $@; if ( $@ =~ /not well-formed.*byte (\d+)|at line \d+, column \d+, byte (\d+)/ ) { my $offset = ( $1 || $2 ); print STDERR "Excerpt around byte $offset:\n"; print STDERR substr( $content, $offset - 20, 20 ) . "=>" . substr( $content, $offset, 1 ) . "<=" . substr( $content, $offset + 1, 20 ); print STDERR "\n"; } else { print STDERR "failed for unknown reason\n"; } my $res = $errst->execute( substr( $@, 0, 255 ), $thisfeed{feedid} ); if ( !$res ) { die $DBI::errstr; } } else { print STDERR " feed failed and already logged\n" if $debug; } open( SAVED, ">/var/tmp/$title.failed" ) or warn $!; binmode( SAVED ); print SAVED $content; close( SAVED ); next; } else { my $res = $errst->execute( "", $thisfeed{feedid} ); die $DBI::errstr unless $res; } # now check if it changed at all. my $newfeed = ""; eval { $newfeed = $rss->as_string; }; if ( $@ ) { warn "$title ($url): " . $@; if ( $@ =~ /not well-formed.*byte (\d+)|mismatched tag at line \d+, column \d+, byte (\d+)/ ) { print STDERR "Excerpt:\n"; print STDERR substr( $content, $1 - 20, 20 ) . "=>" . substr( $content, $1, 1 ) . "<=" . substr( $content, $1 + 1, 20 ); print STDERR "\n"; } else { print STDERR Dumper( $rss ); } open( SAVED, ">/var/tmp/$title.failed" ) or warn $!; binmode( SAVED ); print SAVED $content; close( SAVED ); next; } if (( $thisfeed{cleanfeed}||'') ne $newfeed ) { my $save = $dbh->prepare( 'UPDATE feeds SET name=?,cleanfeed=? WHERE feedid=?' ); my $saveres = $save->execute( $rss->{channel}->{title}||$title, $newfeed, $thisfeed{feedid} ) or warn "Saving $title: $DBI::errstr"; } # otherwise clean up any existing failed files unlink( "/var/tmp/$title.failed" ); if ( $debug ) { open( SAVED, ">/var/tmp/$title.new" ) or warn $!; binmode( SAVED ); print SAVED Dumper( $newfeed ); close( SAVED ); } # housekeeping: the hdr file is invalid if we couldn't # successfully parse the RSS feed. On the other hand, if we did # parse the feed we don't need the fetched file. # no real database equivalent for this... check if there are # any items in the new feed, perhaps. We can also erase the # rawfeed column. } else { print STDERR " Using data from " . scalar( localtime( $lastupdated )) . "\n" if $debug; die Dumper( $rss ); } # make the timestamp on the file correct. $dbh->do( 'UPDATE feeds SET lastupdate=FROM_UNIXTIME(?) WHERE feedid=?', undef, $lastupdated, $thisfeed{feedid} ) or warn( "timestamp update for $title failed ($DBI::errstr)" ); } # Attempt to attach timestamps to untimestamped feeds. Won't do # anything useful the first time through. sub timestamp { my ( $feed, $rss, $lastupdated, $feedid ) = @_; my $expire = $dbh->do( 'UPDATE items SET active=0 WHERE feedid=' . $feedid ); print STDERR " Timestamping $feed\n" if $debug; for my $item (@{$rss->{items}}) { my ( $itemid, $ts, $date ); # fixme: this should be configurable if ( defined( $item->{dc} ) and defined( $item->{dc}->{date})) { $date = rss_date_to_unix( $item->{dc}->{date} ); print STDERR " Using dc date " . scalar( localtime( $date )) . "\n" if $debug; } elsif ( defined( $item->{pubDate})) { $date = rss_date_to_unix( $item->{pubDate}); print STDERR " Using pubdate " . scalar( localtime( $date )) . "\n" if $debug; } else { my ( $y, $m, $d ); my $guid = $item->{guid}||$item->{link}; my $title = $item->{title}; my $desc = $item->{description}; if ( defined( $guid )) { # dnalounge ( $y, $m, $d ) = $guid =~ m{(\d{4})/(\d{2}).html#(\d{2})}; goto GLOM if $y and $m and $d; } if ( defined( $title )) { # arcamax doonesbury feed ( $m, $d, $y ) = $title =~ m{Doonesbury (\d+)/(\d+)/(\d+)}; goto GLOM if $y and $m and $d; # doonesbury, I hate you next if $title eq "Past Stories"; # jerkcity ( $d, $m, $y ) = $title =~ m{\b(\d{1,2})-(\w{3})-(\d{4})\b}; goto GLOM if $y and $m and $d; # NTK ( $y, $m, $d ) = $title =~ m{\b(\d{4})-(\d{2})-(\d{2})}; goto GLOM if $y and $m and $d; } if ( defined( $desc )) { # john shirley ( $d, $m, $y ) = $desc =~ m{\b(\d{1,2})-(\w{3})-(\d{4})\b}; goto GLOM if $y and $m and $d; } GLOM: $date = sprintf( "%04d-%02d-%02d", $y, $m, $d ) if $y and $m and $d; if ( defined( $date )) { $date = str2time( $date ); } } if ( !defined( $date )) { my $feedfunc = $feed; $feedfunc =~ s/[^a-zA-Z]//gs; eval '$date = ts_' . $feedfunc . '( $item );'; print STDERR " Faked date from item: $date\n" if ( $debug && defined( $date )); $date = rss_date_to_unix( $date ) if $date; } # stuff dc->date if ( defined( $date ) and !defined( $item->{dc})) { my %dc; # stupid W3CDTF can't handle timezones :( $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $date )); $item->{dc} = \%dc; print STDERR " adding dc:date $date => " . $dc{date} . "\n" if $debug; $ts = $date; } # need to fake up a datestamp, catering for potential in-line utf8. # rawarticlehash is quite obviously a poor name at this point. my $text = $item->{description}; my $p = new HTML::TokeParser( \$text ); my $c = ""; while ( my $t = $p->get_token()) { next unless $t->[0] eq "T"; $c .= $t->[1]; } my $digest = md5_hex( encode_utf8(( squish( $item->{title}||'' )) . ( squish( $c )) . ( $item->{link}||'' ))); print STDERR " Calculated digest $digest\n" if $debug; # nice and all as this is, it's not preserved by the stupid conversion # if ( defined( $item->{guid} )) { # $digest = $item->{guid}; # print " using GUID\n" if $debug; # } my $itemdata = $dbh->selectall_arrayref( 'SELECT itemid, unix_timestamp(ts) FROM items WHERE feedid=? AND rawarticlehash=?', undef, $feedid, $digest ); if ( defined( $itemdata ) and @{$itemdata} ) { $itemid = $itemdata->[0]->[0]; $ts = $itemdata->[0]->[1] unless $ts; } else { $ts = $lastupdated unless $ts; } if ( defined( $itemid )) { print STDERR " wait, this is already in the database!\n" if $debug; } my $stamp; if ( !defined( $itemid )) { print STDERR " generating timestamp for " . encode_utf8( $item->{title} ) . "\n" if $debug; $stamp = $dbh->prepare( 'REPLACE INTO items(feedid,rawarticle,rawarticlehash,subject,body,url,ts,active) VALUES(?,?,?,?,?,?,FROM_UNIXTIME(?),1)' ); #, { mysql_is_blob => [ 0, 1, 0, 0 ] }); # existing date overrides if ( defined( $date )) { print STDERR " Using item date " . scalar( localtime( $date )) . "\n" if $debug; $ts = $date; } my $frozen = freeze( $item ); my $res = $stamp->execute( $feedid, $frozen, $digest, $item->{title}, $item->{description}, $item->{link}, $ts ); } else { $stamp = $dbh->prepare( 'UPDATE items set feedid=?,rawarticle=?,rawarticlehash=?,subject=?,body=?,url=?,ts=FROM_UNIXTIME(?),active=1 WHERE itemid=?' ); my $frozen = freeze( $item ); my $res = $stamp->execute( $feedid, $frozen, $digest, $item->{title}, $item->{description}, $item->{link}, $ts, $itemid ); } my $newid = $dbh->last_insert_id( "", "rss", "items", "itemid" ); print STDERR " inserted as item $newid\n" if $debug and $newid and $newid != ( $itemid || 0 ); if ( !defined( $item->{dc})) { my %dc; #$dc{date} = strftime( '%Y-%m-%dT%H:%M+0000', gmtime( $ts )); # XXXX # stupid W3CDTF can't handle timezones :( $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $ts )); $item->{dc} = \%dc; print STDERR " patching in time $ts => " . $dc{date} . "\n" if $debug; } print STDERR "\n" if $debug; } # now clean up anything that's not active #my $expire = $dbh->do( 'DELETE FROM items WHERE active=0' ); } # convert rss date to unix time_t sub rss_date_to_unix { my $date = shift; my $inp = $date; # RSS 2.0 uses a totally different date format. On the plus side, # Date::Parse should be able to handle it unchanged. if ( $date =~ /^\d+-/ ) { # Blogger's variation: 2003-07-16 17:44:13Z $date =~ s/[T ](\d+:\d+):\d+Z$/T$1+00:00/; # Z = Zulu time = GMT } # str2date vs ISO8601 $date =~ s/(\d)T(\d)/$1 $2/; $date =~ s/(\d\d):(\d\d)$/$1$2/; my $outp = str2time( $date ); print STDERR " in: $inp out: $outp\n" if $debug; $outp; } # per-feed timestamps # if a sub called ts_FeedName exists, it'll get called to try and # extract a usable timestamp from an RSS item. sub ts_Cloudiness { my $item = shift; my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|; if ( defined( $date )) { $date =~ s@/@-@g; $date =~ s@$@T00:00-0600@; } $date; } sub ts_KevLyda { my $item = shift; my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|; if ( defined( $date )) { $date =~ s@/@-@g; $date =~ s@$@T00:00+0000@; } $date; } # maybe it's time I just folded all these into ts_Bloxsom and added a # bloxsom detector... sub ts_Nanocrew { my $item = shift; my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|; if ( defined( $date )) { $date =~ s@/@-@g; $date =~ s@$@T00:00+0000@; } $date; } sub ts_DNALounge { my $item = shift; my $date = $item->{title}; if ( defined( $date )) { $date =~ s/ \(.*\)//; $date = str2time( $date ); if ( defined( $date )) { $date = strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $date ));; } } $date; } sub ts_LungFish { my $item = shift; my ( $date ) = $item->{link} =~ m/^.*\.(\d+)$/; if ( defined( $date )) { $date = strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $date ));; } $date; } sub ts_RedMeat { my $item = shift; my ( $date ) = $item->{link} =~ m/redmeat\/(\d{4}-\d{2}-\d{2})/; if ( defined( $date )) { $date = str2time( $date ); if ( defined( $date )) { $date = strftime( '%Y-%m-%dT%H:%M-0000', localtime( $date ));; } } return undef; } sub atom_to_rss { my $feed = shift; my $url = shift; $feed =~ s/^.*?<\?xml/new( \$feed ) or die $!; my $rss = new XML::RSS( version => '1.0', encoding => 'UTF-8', encode_output => 1, ); my @links = $atom->link; while ( @links ) { last if $links[0]->type eq "text/html"; shift @links; } $rss->channel ( title => $atom->title, link => ( defined( $links[0] ) ? $links[0]->href : $url ), description => $atom->subtitle, dc => { date => $atom->modified || $atom->updated, language => $atom->language, } ); for my $entry ( $atom->entries ) { if ( !defined( $entry->content )) { if ( !defined( $entry->summary )) { $entry->content( "no content" ); } else { $entry->content( $entry->summary ); } } my %item = ( title => $entry->title, link => $entry->link->href, description => $entry->content->body, dc => { date => $entry->issued || $entry->updated, }, ); if ( $entry->author ) { $item{dc}->{creator} = $entry->author->name; } $rss->add_item( %item ); } return $rss; } sub squish { my $text = shift; $text =~ s/[^[:word:][:space:]]//gs; # whitespace cleanup $text =~ s/ +/ /gs; $text =~ s/^\s+//gs; $text =~ s/\s+$//gs; $text; }