ALT Linux Sisyphus

#!/usr/bin/perl

# $Id: rss2mail2 605 2006-10-07 17:57:15Z struan $

# we need to make sure all this is compiled etc before we do 
# anything in main otherwise it's not found.
BEGIN {
package RSS2Mail::Config;

use AppConfig qw( :argcount :expand );
use vars qw( $base_dir @mail_to $mail_from $oldest $mail_per_item 
             $DEBUG $DUMPER $diff $max_not_found );

my $c = AppConfig->new( { GLOBAL => {
                                ARGCOUNT    =>  ARGCOUNT_ONE,
                                EXPAND      =>  EXPAND_ENV
                            }
                        }, qw( dsn user pass base_dir oldest 
                               mail_to=s@ mail_from mail_per_item!
                               max_not_found no_diffs! )
                      );
my $conf_file = get_conf_file();

$c->file( $conf_file );

# edit these for configuration
$base_dir = $c->base_dir;
@mail_to = @{ $c->mail_to };
$mail_from = $c->mail_from;
$oldest = $c->oldest || 30; # how many days to keep items for
$RSS2Mail::DBI::DSN = $c->dsn;
$RSS2Mail::DBI::user = $c->user;
$RSS2Mail::DBI::pass = $c->pass;

# set to one to get a mail per item
$mail_per_item = 0;
$mail_per_item = 1 if $c->mail_per_item;

$max_not_found = $c->max_not_found || 30;

$diff = 1;
$diff = 0 if $c->no_diffs;

# Don't touch these.
$DEBUG = 0;
$DUMPER = 0;

# convert oldest into epoch seconds...
$oldest *= 24 * 60 * 60;
$RSS2Mail::oldest = time - $oldest;

sub get_conf_file {
    my $conf_file = shift;
    my @possible_locs = ( $ENV{HOME} . "/.rss2mail2rc",
                          '/etc/rss2mail2rc' );

    unshift @possible_locs, $conf_file if $conf_file;

    for ( @possible_locs ) {
        return $_ if -e $_; 
    }

    die "failed to find a configuration file";
}

1;

package RSS2Mail::DBI;

use base 'Class::DBI::BaseDSN';

use vars qw($DSN $user $pass);

__PACKAGE__->set_db("Main", $DSN, $user, $pass);

sub _init_db {
    my $self = shift;

    my $sql = qq{
CREATE TABLE feeds (
    id_feed INTEGER PRIMARY KEY,
    name VARCHAR(50),
    last_update VARCHAR(20),
    headers TEXT,
    failed_updates INTEGER,
    uri VARCHAR(255)
);

CREATE TABLE items (
    id_item INTEGER PRIMARY KEY,
    id_feed INTEGER,
    title VARCHAR(255),
    permalink VARCHAR(255),
    content TEXT,
    last_update VARCHAR(20),
    md5 TEXT
);

};

    my $dbh = $self->db_Main;

    for my $statement (split /;/, $sql) {                                       
        $statement =~ s/\#.*$//mg; # strip # comments                           
        next unless $statement =~ /\S/;                                         
        eval { $dbh->do($statement) };                                          
        die "$@: $statement" if $@;
    }
    return 1;
}

1;

}; # END BEGIN

package RSS2Mail::Feeds;

use base 'RSS2Mail::DBI';

__PACKAGE__->set_up_table('feeds');
__PACKAGE__->has_many('items' => 'RSS2Mail::Items', 'id_feed');
__PACKAGE__->has_a( headers => 'HTTP::Headers', 
                        inflate => \&get_headers,
                        deflate => \&save_headers
                  );

sub save_headers {
    my $headers = shift;
    return '' unless $headers;
    my $header_string = "If-Modified-Since: "
                      . $headers->header('Last-Modified')
                      . "\n"
                      . "If-None-Match: "
                      . $headers->header('ETag')
                      . "\n";
    return $header_string;
}

sub get_headers {
    my $header_string = shift;
    my $headers = HTTP::Headers->new();

    my %headers = map { m/([^:]*):\s*(.*)/; $1 => $2; } 
                                        split("\n", $header_string );

    foreach my $header ( keys %headers ) {
        $headers->header($header   =>  $headers{$header});
    }

    return $headers;
}

1;

package RSS2Mail::Items;

use base 'RSS2Mail::DBI';
use Digest::MD5 qw(md5_hex);

__PACKAGE__->set_up_table('items');
__PACKAGE__->has_a('id_feed' => 'RSS2Mail::Feeds');
__PACKAGE__->columns( TEMP => qw( link diff date author category ) );
__PACKAGE__->set_sql('old_items', <<"");
    SELECT __ESSENTIAL__
      FROM __TABLE__
     WHERE last_update < ?

sub remove_old_items {
    my $self = shift;

    warn $RSS2Mail::oldest;

    foreach my $item ($self->search_old_items($RSS2Mail::oldest)) {
        $item->delete();
        # __PACKAGE__->dbi_commit();
    }
}

1;

package RSS2Mail::Exception; 

use Exception::Class (
    RSS2Mail::Exception => {
        fields  =>  'feed',
        description =>  'rss2mail2 error',
    },
    RSS2Mail::Exception::Fetch => {
        isa =>  'RSS2Mail::Exception',
        fields  =>  [ qw( code msg feed ) ],
        description =>  'error fetching rss',
    },
    RSS2Mail::Exception::Fetch::MaxNotFound => {
        isa =>  'RSS2Mail::Exception',
        fields  =>  [ qw( code msg feed ) ],
        description =>  'Exceeded max not found count',
    },
    RSS2Mail::Exception::Fetch::NoFeed => {
        isa =>  'RSS2Mail::Exception',
        fields  =>  [ qw( code msg feed ) ],
        description =>  'Failed to fetch feed on inital attempt',
    },
    RSS2Mail::Exception::Fetch::NotAllowed => {
        isa =>  'RSS2Mail::Exception',
        fields  =>  [ qw( code msg feed ) ],
        description =>  'Feed is either gone or access forbidden',
    },
    RSS2Mail::Exception::Parse => {
        isa =>  'RSS2Mail::Exception',
        fields => [ qw( fatal feed ) ],
        description =>  'error parsing rss',
    },
    RSS2Mail::Exception::Autodiscover => {
        isa =>  'RSS2Mail::Exception',
        fields => [ qw( uri feed ) ],
        description =>  'autodiscovery failed to find feed',
    },
    RSS2Mail::Exception::Require => {
        isa =>  'RSS2Mail::Exception',
        fields => [ qw( module feed ) ],
        description => 'optional module could not be loaded',
    }
);

# we need this to go after the use Exception::Class as it's
# actually in Exception::Class
use base 'Exception::Class::Base';

sub full_message {
    my $self = shift;

    my $message = 'ERROR' . ( $self->feed ? 
                            ' with ' . $self->feed :
                            '' ) . ': ' . $self->description;

    if ( UNIVERSAL::can( $self, 'code' ) ) {
        $message .= ' (' . $self->code . ' error: ' . $self->msg . ')';
    } elsif ( UNIVERSAL::isa( $self, 'RSS2Mail::Exception::Autodiscover' ) ) {
        $message .= ' (uri: ' . $self->uri . ')';
    } elsif ( UNIVERSAL::isa( $self, 'RSS2Mail::Exception::Require' ) ) {
        $message .= ' (' . $self->module . ')';
    } else {
        $message .= ' (' . $self->error . ')';
    }

    return $message;
}

# hacky xml generation of this sort is asking for trouble
# but anything else is frankly overblown...
sub rss {
    my $self = shift;
    my $feed = shift;
    my $name = $feed->name;
    my $uri = $feed->uri;

    my $rss = qq(<?xml version="1.0"?>
    <rss version="0.91">
    <channel>
        <title>$name</title>
        <link>$uri</link>
        <description>Automatic unsubscribe info</description>
        <language>en</language>
        <item>
             <title>Unsubscribed from feed</title>
             <link>$uri</link>
             <description>);
    $rss .= $self->description;
    $rss .= qq(</description>
            </item>
        </channel>
    </rss>);

    return $rss;
}

1;

# nasty hack to allow us to require the above for testing reasons.
return unless $0 =~ /rss2mail2/;

package MAIN;

use strict;
use warnings;
use XML::Feed;
use List::Util qw( first );
use Carp;
use Digest::MD5 qw(md5_hex);
use File::Path;
use MIME::Lite;
use Fcntl qw( :flock :seek );
use File::Spec::Functions qw(rel2abs splitpath);
use Text::Autoformat;
use Text::Diff;
use HTTP::Headers;
use HTTP::Request;
use LWP::UserAgent;
use HTML::FormatText::WithLinks;
use Getopt::Long qw( :config no_ignore_case );

# we need this for unicode in 5.8+
BEGIN {                                                                         
    unless ( $] < 5.007 ) {                                                     
        require Encode;                                                         
        import Encode;                                                          
    }                                                                           
}; 

use vars qw( $VERSION $max_not_found $no_lint %entities $print_mail );

# lets see if we can use data dumper in our debugging
eval {
    require Data::Dumper;
    import Data::Dumper;
};

$RSS2Mail::Config::DUMPER = 1 unless $@;

# and if we can use HTML::Lint for HTML munging
eval {
    require HTML::Lint;
};

$no_lint = $@;
$max_not_found = $RSS2Mail::Config::max_not_found;

$VERSION = '2.29';

my ($edit, $add, $delete, $list, $help, $opt_uri, $base, $opt_debug, $auto,
    $cleanup, $fetch, $individual, $version, $opml );

GetOptions(
    'edit=s'    =>  \$edit,
    'add=s'     =>  \$add,
    auto        =>  \$auto,
    'delete=s'  =>  \$delete,
    'base=s'    =>  \$base,
    list        =>  \$list,
    cleanup     =>  \$cleanup,
    print_mail  =>  \$print_mail,
    help        =>  \$help,
    'Debug=i'   =>  \$opt_debug,
    'uri=s'     =>  \$opt_uri,
    'fetch=s'   =>  \$fetch,
    items       =>  \$individual,
    Version     =>  \$version,
    opml        =>  \$opml,
);

# deal with setting some options
if ( $base ) {
    $RSS2Mail::Config::base_dir = $base;
} 
if ( $opt_debug ) {
    $RSS2Mail::Config::DEBUG = $opt_debug;
}
if ( $individual ) {
    $RSS2Mail::Config::mail_per_item = 1;
}

# deal with working our what we are going to do
if ($opml) {
    eval {
        require XML::OPML;
    };

    die "Cannot output OPML without XML::OPML module\n" if $@;

    my $o = new XML::OPML( version => "1.1" );

    $o->head( 
        title   => "rss2mail2 subscriptions"
    );

    my $feeds = RSS2Mail::Feeds->retrieve_all();
    while ( my $feed = $feeds->next ) {
        $o->add_outline(
            text    =>  $feed->name,
            xmlUrl  =>  $feed->uri,
        )
    }
    print $o->as_string;
    exit;
} elsif ( $edit ) {
    my $feeds = RSS2Mail::Feeds->search(
                    name    =>  $edit,
               );
    if ( $feeds->count == 1 and $opt_uri ) {
        my $feed = $feeds->first;

        if ( $auto ) {
            my $uri;
            eval {
                $uri = _autodiscover( $opt_uri );
            };
            if ( $@ ) {
                warn $@;
                print "Feed not updated\n ";
                exit;
            }
            $opt_uri = $uri;
        }

        $feed->uri( $opt_uri );
        $feed->update();
        print "Feed updated: $edit\n";
    } elsif ( $feeds->count > 1 ) {
        print "Mmmm, there seems to be more than one feed called $edit\n";
        print "As we can't work out what to do we're going to bail\n";
    } elsif ( $opt_uri ) {
        print "$edit is not the name of a feed\n";
    } else {
        print "You need to supply the uri of the feed\n";
    }
    exit;
} elsif ( $add ) {
    my $auto_name = 0;

    if ( $add =~ /^http/ ) {
        $auto_name = 1;
        $opt_uri = $add;
    }

    if ( defined $add and $opt_uri ) {

        if ( $auto ) {
            my $uri;
            eval { 
                $uri = _autodiscover( $opt_uri );
            };
            if ( $@ ) {
                warn $@;
                print "Feed not added\n";
                exit;
            }

            $opt_uri = $uri;
        }

        my $existing_feed = RSS2Mail::Feeds->search({
                        uri =>  $opt_uri,
                    });

        if ( ! $existing_feed->count ) {

            if ( $auto_name ) {
                my $content = fetch( { name => 'new feed', uri => $opt_uri } );
                $add = parse_rss( string => $content, title_only => 1 );
            }

            my $feed = RSS2Mail::Feeds->create({
                            name    =>  $add,
                            uri     =>  $opt_uri,
                       });
            print "Feed added: $add\n";
        } else {
            die "Already a feed for $opt_uri\n";
        }
    } elsif ( $opt_uri ) {
        print "You need to supply the name of the feed\n";
    } else {
        print "You need to supply the uri of the feed\n";
    }
    exit;
} elsif ( $delete ) {
    my $feeds = RSS2Mail::Feeds->search(
                    name    =>  $delete,
               );
    if ( $feeds->count == 1 ) {
        my $feed = $feeds->first();
        $feed->delete();
        print "Feed deleted: $delete\n";
    } elsif ( $feeds->count > 1 ) {
        print "Mmmm, there seems to be more than one feed called $delete\n";
        print "As we can't work out what to do we're going to bail\n";
    } else {
        print "$delete is not the name of a feed\n";
    }
    exit;
} elsif ( $list ) {
    my $feeds = RSS2Mail::Feeds->retrieve_all();
    while ( my $feed = $feeds->next ) {
        print $feed->name . ": " . $feed->uri . "\n";
    }
    exit;
} elsif ( $cleanup ) { 
    RSS2Mail::Items->remove_old_items();
    print "Old items deleted\n";
} elsif ( $fetch ) {
            my $feed = RSS2Mail::Feeds->find_or_create({
                            name    =>  'TEMP',
            });
            $feed->uri( $fetch );
            $feed->update();
} elsif ( $help ) {
    print <<"HERE";
rss2mail2 v$VERSION
Options are:
    --list : list all feeds and uris
    --add feed_name : add a feed 
    --del feed_name : delete that feed
    --edit feed_name : change the uri of that feed
The uri should be supplied with the --uri option.
If you use the --auto option with --uri then rss2mail2 will attempt
to autodiscover a feed at the supplied uri.
HERE
exit;
} elsif ( $version ) {
    print "This is rss2mail2 version $VERSION\n";
    exit;
}

# these are the entities that HTML::Entities doesn't deal with
# in 5.6 and below as they are unicode chars so we have to
# mangle them by hand. Or at least these are the common ones.
# Also deals with fucked up windows badness that 'nix terminals
# disagree with
%entities = (
    '&lsquo;'   =>  q/'/,
    '&rsquo;'   =>  q/'/,
    '&ndash;'   =>  q/--/,
    '&#233;'    =>  q/И/,
    '&#333;'    =>  q/o/,  # what is this even?
    '&#8211;'   =>  q/--/,
    '&#8212;'   =>  q/--/,
    '&#8216;'   =>  q/'/,
    '&#8217;'   =>  q/'/,
    '&#8220;'   =>  q/"/,
    '&#8221;'   =>  q/"/,
    '&#8230;'   =>  q/.../,
    '&#8364;'   =>  q/EUR/,
    '\205'      =>  q/ --/,
    '\221'      =>  q/`/,
    '\222'      =>  q/'/,
    '\223'      =>  q/``/,
    '\224'      =>  q/''/,
    '\225'      =>  q/*/,
    '\226'      =>  q/-/,
    '\227'      =>  q/ --/,
    '\230'      =>  q/~/,
    'ц╘'        =>  q/И/,   # this seems to be an XML::Parser weirdness
);

sub fetch_feed {
    my $feed = shift;
    my $content;

    eval { 
        $content = fetch( { feed => $feed } );
    };
    if ( $@ ) {
        if ( UNIVERSAL::isa( $@, 'RSS2Mail::Exception::Fetch::NoFeed' ) 
             or UNIVERSAL::isa( $@, 'RSS2Mail::Exception::Fetch::MaxNotFound' ) 
             or UNIVERSAL::isa( $@, 'RSS2Mail::Exception::Fetch::NotAllowed' ) 
        ) {
            warn $@;
            $content = $@->rss($feed);
            $feed->delete;
        } elsif ( UNIVERSAL::isa( $@, 'RSS2Mail::Exception::Fetch' ) ) {
            warn $@;
        } else {
            croak $@;
        }
    }

    return $content;
}

# grab the rss from the intarweb
# we grab the various headers from the db so we can do a conditional
# GET and then return undef if nothing's changed
# also magic to make sure we do the right thing with 301, 410 and 403
# responses.
sub fetch {
    my ( $feed, $name, $uri, $head );
    my $args = shift;
    if ( $args->{feed} ) {
        $feed = $args->{feed};
        $name = $feed->name;
        $uri  = $feed->uri;
    } elsif ( $args->{name} and $args->{uri} ) {
        $name = $args->{name};
        $uri  = $args->{uri};
    }

    # set headers for only updated HTTP GET magic
    if ( $feed ) {
        $head = $feed->headers();
    }
    my $ua = LWP::UserAgent->new(
        agent   =>  "rss2mail/$VERSION (http://exo.org.uk/code/rss2mail/)",
        timeout =>  30,     # otherwise we have to wait 180, YAWN
    );
    my $req = HTTP::Request->new(GET =>  $uri, $head || undef);
    my $resp = $ua->simple_request($req);

    # some sort of redirect was issued so we need to follow it, 
    # however as some people like to issue multiple redirects
    # for reasons unknowable we check till all is well.
    while ( $resp->code != 304 and $resp->code >= 300 and $resp->code < 400 ) { 

        # 'cause you know, some people are idiots
        RSS2Mail::Exception::Fetch->throw(
            feed  => $name,
            error => 'No location header on redirect',
        ) unless $resp->header('location');

        # permanent redirect
        $feed->uri( $resp->header('location') ) if $feed and $resp->code == 301;

        $req = HTTP::Request->new(
                GET =>  _uri_to_abs( $resp->header('Location'), 
                                     $resp->base ),
                    $head
               );
        $resp = $ua->simple_request($req);
    }

    # we're never going to be able to fetch this so unsubscribe
    if ( $resp->code == 410 || $resp->code == 403 ) {
        RSS2Mail::Exception::Fetch::NotAllowed->throw(
            feed  => $name,
            code  => $resp->code,
            msg   => $resp->message, 
            error => 'Cannot fetch content: gone or forbidden',
        );
    }

    # if we had some sort of error then check if we've
    # been able to fetch before. if not then lets just
    # unsub. if it looks like a persistant issue unsub too
    if ( $resp->code >= 400 ) {
        unless ( $feed and $feed->last_update() ) {
            RSS2Mail::Exception::Fetch::NoFeed->throw(
                feed  => $name,
                code  => $resp->code,
                msg   => $resp->message, 
                error => 'No feed found on initial fetch',
            );
        }

        if ( $feed ) {
            $feed->failed_updates( 
                $feed->failed_updates ?
                    $feed->failed_updates + 1 :
                    1
            );
            $feed->update();

            if ( $feed->failed_updates > $max_not_found ) {
                RSS2Mail::Exception::Fetch::MaxNotFound->throw(
                    feed  => $feed->name,
                    code  => $resp->code,
                    msg   => $resp->message, 
                    error => 'Exceeded max not found count',
                );
            }
        }
    }

    # content not changed
    return undef if $resp->code == 304;

    # something else bad happened
    unless ($resp->is_success) {
        RSS2Mail::Exception::Fetch->throw(
            feed  => $name,
            code  => $resp->code,
            msg   => $resp->message,
            error => 'Failed to fetch',
        );
    }

    if ( $feed and $feed->failed_updates ) {
        $feed->failed_updates(0);
    }

    if ( $feed ) {
        # this might look like the wrong place to stick this but regardless 
        # of what the content is like there is NO point in fetching a feed
        # again until it changes. Even if the feed can't be parsed that fact
        # won't change until the feed is updated and hence the last-updated
        # and etag headers change
        $feed->headers($resp->headers);

        $feed->update();
    }

    my $content = $resp->content;
    return $content;
}

# this is all basically to get round the less than optimal unicode 
# processing in perl 5.6. it means we loose some content but that things
# don't blow up in our face. it is less than optimal :(
# it also gets round some issue XML::RSS has with weird windows chars
sub clean_rss {
    my $rss = shift;
    # quick bad feed cleanup hack
    # should weed out worst of problems with ctrl chars and the like
    if ( $] < 5.007 ) {
        $rss =~ s/[^[:ascii:]]+/ /g; # assumes not using too many unicode chars
        $rss =~ s/[^[:graph:][:blank:]\n]+/ /g;
    }
    $rss =~ s/ & / &amp; /g;      # 'cause some people still can't encode...

    # some XML::RSS versions don't seem to like windows encoding maps...
    # god alone knows what happens if there are any windows encoded
    # characters in there. 
    #if ($XML::RSS::VERSION < 2.32 ) {
    #    $rss =~ s#encoding="windows[^"]*"#encoding="iso-8859-1"#;
    #}

    # 'cause Joel on Software has bogus dates and DateTime::Format::Mail
    # seems to ignore the day ;) this is, of course, hacky
    #$rss =~ s#<pubDate>(?=\d)#<pubDate>Mon, #g;
    #$rss =~ s#EST</pubDate>#-0500</pubDate>#g;
    return $rss;
}

# THIS NEEDS SOME WORK!
sub parse_rss {
    my %args = @_;

    my $failed_parse = 0;
    my $parse_count = 0;
    my $parser; # = XML::Feed->new();
    PARSE:
    eval {
        # local $^W = 0; # XML::RSS spews warnings on some rss files :(
        if ( exists( $args{file} ) ) {
            $parser = XML::Feed->parse( $args{file}) 
                or die XML::Feed->errstr;
        } else {
            $parser = XML::Feed->parse( \$args{string} )
                or die XML::Feed->errstr;
        }
    };
    if ($@) {
        # sometimes things fail to parse if we have unicode in them 
        # but they're not properly encoded so we try this to see if
        # it helps. if not then spit out the original error
        unless ( $failed_parse and $] < 5.007 ) {
            $args{ string } = encode( 'utf8', $args{ string } );
            $failed_parse = $@;
            if ( $RSS2Mail::Config::DEBUG > 1 ) {
                warn "DEBUG: reparsing\n";
                warn "DEBUG: original error: $@\n";
            }
            $parse_count++;
            goto PARSE if $parse_count < 2;
        }
        RSS2Mail::Exception::Parse->throw( 
            feed => $args{feed_name},
            error => ( $failed_parse or $@ )
        );
    }

    if ( $args{ title_only } ) {
        return $parser->title;
    }

    my @feed;
    foreach my $item ( $parser->entries ) {
        my $desc = $item->content->body || '[no content]';
        unless ( $] < 5.007 ) {
            $desc = decode( 'utf8', $desc ) 
                unless Encode::is_utf8( $desc );
        }

        my $title = $item->title
                  || '[no title]';
        $title =~s/\n//g;
        unless ( $] < 5.007 ) {
            $title = decode( 'utf8', $title )
                unless Encode::is_utf8( $title );
        }

        my $date = '';
        eval {
            $date = $item->issued->dmy . ' ' . $item->issued->hms
                if $item->issued; 
        };
        if ($@) {
            RSS2Mail::Exception::Parse->throw(
                feed => $args{feed_name},
                error => $@
            );
        }

        my $permalink = $item->id;

        my $author = $item->author 
                     || '';

        my $link = $item->link;

        # this is to deal with a mixture of issues with the way
        # XML::Feed seems to work things out and Sam Ruby's
        # atom feed being too clever by half. It's all very
        # heuristicy
        if ( not $link or $link !~ /^http:/ ) {
            $link = $item->id;
            if ( not $link or $link !~ /^http:/ ) {
                warn "DEBUG: no http in link or id, trying extreme measures\n"
                    if ( $RSS2Mail::Config::DEBUG > 2 );
                # XML::Feed only looks for links with rel eq 'alternate'
                # but the atom spec tells us that if there's no rel then
                # we can assume it meant alternate
                # this is fixed in recent versions of XML::Feed
                $link = undef;
                if ( $item->{entry} and UNIVERSAL::can( $item->{entry}, 'link' ) ) {
                    $link = first { not $_->rel } $item->{entry}->link;
                } elsif ( $item->{entry} and $item->{entry}->{link} ne "") {
                    $link = $item->{entry}->{link};
                }

                if ( $link and UNIVERSAL::can( $link, 'href' ) ) {
                    $link = $link->href;
                }
            }
            # sometime we need to use the base URI in combination 
            # with the link to get an absolute URI...
            if ( not $link or $link !~ /^http:/ ) {
                warn "DEBUG: still no http in link or id, useing _uri_to_abs on
                      $link and " . $parser->link . "\n"
                    if ( $RSS2Mail::Config::DEBUG > 2 );
                my $base = $parser->link;
                $base = $args{feed_uri} if $base eq '.';
                $link = _uri_to_abs($link, $base);    
            }

        }

        my $category = $item->category || '';

        push @feed, { 
                      author => $author,
                      content => $desc, 
                      title => $title, 
                      date => $date, 
                      permalink => $permalink,
                      category => $category,
                      link => $link || '[no link]',
                    };
    }

    return \@feed;
}

sub process_feed {    
    my $items = shift;
    my $feed = shift;
    my $feed_uri = shift;
    my @items_to_mail;

    foreach my $fetched_item ( @$items ) {
        my $string_to_md5 = $fetched_item->{ 'title' } 
                          . $fetched_item->{ 'content' };

        # we do this to stop wide character warnings
        $string_to_md5 = encode('utf8', $string_to_md5) unless $] < 5.007;
        my $md5 = md5_hex( $string_to_md5 );
        $fetched_item->{ 'md5' } = $md5;

        my @items = RSS2Mail::Items->search(
                            permalink => $fetched_item->{'permalink'},
                            id_feed   => UNIVERSAL::can( $feed, 'id' ) ? 
                                            $feed->id 
                                            : 0,
                    );

        my $item;
        if ( @items == 0) {         # new item

            $fetched_item = format_item( $fetched_item, $feed_uri );
            push @items_to_mail, $fetched_item;

        } elsif ( @items == 1 ) {   # we've seen this before
            # skip if it's not changed
            next unless $md5 ne $items[0]->md5();

            $fetched_item = format_item( $fetched_item, $feed_uri );

            # has it changed in any significant way though?
            my $diff;
            if ( $fetched_item->{'content'} and 
                 $diff = diff_text( $fetched_item->{'content'},
                                       $items[0]->content() ) ) 
            {
                if ( $RSS2Mail::Config::DEBUG > 1 ) {
                    warn "DEBUG: found diff\n";
                    warn "DEBUG: stored content\n";
                    warn $items[0]->content . "\n";
                }

                $items[0]->content( $fetched_item->{'content'} );
                $items[0]->title( $fetched_item->{'title'} );
                $items[0]->date( $fetched_item->{'date'} );
                $items[0]->link( $fetched_item->{'link'} );
                $items[0]->author( $fetched_item->{'author'} );
                $items[0]->category( $fetched_item->{'category'} );
                $items[0]->diff( $diff );
                $items[0]->md5( $md5 );
                push @items_to_mail, $items[0];
            } else {
                warn "DEBUG: diff not found\n" if $RSS2Mail::Config::DEBUG > 1;
                next;
            }
        } else {            # more than one item with that permalink
            # should do something more useful here but as it will
            # never happen we'll just skip happily to the next item ;)
            warn "DEBUG: multiple items\n" if $RSS2Mail::Config::DEBUG > 1;
            next;
        }
    }

    return \@items_to_mail;
}

sub format_item {
    my $fetched_item = shift;
    my $feed_uri = shift;

    if ( $RSS2Mail::Config::DEBUG > 1 ) {
        warn "DEBUG: new item\n";
        warn "DEBUG: raw title\n";
        warn $fetched_item->{'title'} . "\n";
        warn "DEBUG: raw content\n";
        warn $fetched_item->{'content'} . "\n";
    }

    # autoformat needs to have ignore set or it applies 
    # an ignore_headers rule which means lines with no indent and
    # a colon in them aren't formatted
    $fetched_item->{'content'} =
        html_to_text( $feed_uri, $fetched_item->{'content'} )
        || autoformat( $fetched_item->{'content'}, 
                        { left => 4, right => 72, 
                          all => 1, ignore => qr/^$/ } );
    $fetched_item->{'title'} =
        html_to_text( $feed_uri, $fetched_item->{'title'} ) 
        || $fetched_item->{'title'};

    for ($fetched_item->{'title'}) {
        s/\A[[:cntrl:]\s]+//;
        s/[[:cntrl:]\s]+\z//;
        s/[[:cntrl:]\s]+/ /g;
    }

    if ( $RSS2Mail::Config::DEBUG > 1 ) {
        warn "DEBUG: formatted title\n";
        warn $fetched_item->{'title'} . "\n";
        warn "DEBUG: formatted content\n";
        warn $fetched_item->{'content'} . "\n";
    }

    # sometimes something in html_to_text causes text to be encoded so
    # we have to check and decode it
    unless ( $] < 5.007 ) {
        $fetched_item->{content} = decode( 'utf8', $fetched_item->{content} )
            unless Encode::is_utf8( $fetched_item->{content} );
    }
    return $fetched_item;
}

sub is_html {
    my $html = shift;

    # does this look like html?
    if ( $html =~ m{(?:&#?[[:alnum:]]+?;)|(?:<[\w /\n]+?>)} ) {

        # so, it looks like html but it might just be a soup of
        # pseudo html tags. if we have HTML::Lint then we pull 
        # out any pseudo, or bad, tags, stuff them in a regex 
        # and then use that to see if it's really HTML.
        # moderate shonk factor alert
        unless ($no_lint) {
            warn "DEBUG: Running through HTML::Lint\n" 
                if $RSS2Mail::Config::DEBUG > 1;
            my $l = HTML::Lint->new();
            $l->parse($html);
            my @re;
            my $re_tmpl = '(?:&#?[[:alnum:]]+?;)|(?:(?!RE)<[\w /\n]+?>)';

            if ($l->errors > 0) {
                foreach my $e ($l->errors) {
                    warn "DEBUG: HTML::Lint error: " . $e->as_string . "\n" 
                        if $RSS2Mail::Config::DEBUG > 1;

                    if ( $e->as_string =~ /Unknown element <(.*)>/ ) {
                        push @re, $1;
                    }
                }
                if (scalar @re) {
                    my $bad_tags = join('|', @re);
                    $re_tmpl =~ s/RE/$bad_tags/;
                    warn "DEBUG: Using Regex: $re_tmpl\n"
                        if $RSS2Mail::Config::DEBUG > 1;
                    return $html =~ m/$re_tmpl/i ? 1 : undef;
                }
            } else {
                return 1;
            }
        }

        # no HTML::Lint so just hope that what looks like HTML is
        return 1;
    } else {
        warn "DEBUG: Not HTML\n"
            if $RSS2Mail::Config::DEBUG > 1;
        return undef;
    }
}

# take some content and if it looks like html make it plain text
# should also canonacilise any relative links in the html as well
# as do some entity conversion
sub html_to_text {
    my $base = shift;
    my $html = shift || ''; 

    return undef unless is_html($html);
    warn "DEBUG: is html\n" if $RSS2Mail::Config::DEBUG > 1;

    # encoded & cause uneeded 2nd parsing, and yes the XML part of 
    # XML::RSS should fix this but this will get round people putting
    # this sort of thing in CDATA tags or other crazy things...
    # $html =~ s/&amp;/&/gi;

    my $format = HTML::FormatText::WithLinks->new(
                            rightmargin => 72, 
                            before_link     =>  '[%n] ',
                            footnote        =>  '[%n] %l', 
                            link_num_generator => sub { shift(); },
                            with_emphasis   =>  1,
                            base => $base);
    $html = $format->parse($html);

    # make good the entities we do know about
    # we need to do this as HTML::Entities doesn't deal with
    # some entities if we're not using perl 5.8 and above.
    if ( $] < 5.007 ) {
        foreach my $entity (keys %entities) {
            $html =~ s/$entity/$entities{$entity}/gi;
        }
    }

    # and remove the ones we don't. 7bit ascii all the way baby
    $html =~ s/&#?[[:alnum:]]+?;/ /g;

    return $html;
}

sub diff_text {
    my $new = shift;
    my $orig = shift;

    # sqlite, i assume, doesn't keep the utf8 flag
    unless ( $] < 5.007 ) {
        $orig = decode( 'utf8', $orig )
            unless Encode::is_utf8( $orig );
    }

    # Text::Diff doesn't add a newline so you can get the lines in
    # the diff running together...
    $new .= "\n" unless $new =~ /\n$/s;
    $orig .= "\n" unless $orig =~ /\n$/s;

    my $diff = diff \$orig, \$new;

    return $diff;
}

sub create_mail_body {
    my $items = shift;
    my $body;

    foreach my $item ( @$items ) {
        $body = '' if $RSS2Mail::Config::mail_per_item;

        my %temp;
        if ( ref( $item ) =~ /RSS2Mail::Item/ ) {
            $temp{title} = $item->title();
            $temp{date} = $item->date();
            $temp{link} = $item->link() || $item->permalink();
            $temp{author} = $item->author(),
            $temp{category} = $item->category();
            $temp{content} = $item->content();
            $temp{diff} = $item->diff();
        } else {
            $temp{title} = $item->{title};
            $temp{date} = $item->{date};
            $temp{author} = $item->{author};
            $temp{category} = $item->{category};
            $temp{link} = $item->{link} || $item->{guid};
            $temp{content} = $item->{content};
        }

        $body .= "$temp{title}\n$temp{link}\n";
        $body .= "\tby $temp{author}\n" if $temp{author};
        $body .= "\tat $temp{date}\n" if $temp{date};
        $body .= "\tcategory: $temp{category}\n" if $temp{category};
        $body .= "\n";

        # make sure there are no newlines at the end of the 
        # title. 
        $temp{title} =~ s/\n+$//;

        #unless ( $] < 5.007 ) {
        #    $temp{content} = decode( 'utf8', $temp{content})
        #        unless Encode::is_utf8( $temp{content} );
        #}

        $body .= sprintf("%s\n\n", $temp{content}) if $temp{content};

        if ( defined $temp{diff} and $RSS2Mail::Config::diff ) {
            #unless ( $] < 5.007 ) {
            #    $temp{diff} = decode( 'utf8', $temp{diff})
            #        unless Encode::is_utf8( $temp{diff} );
            #}
            $body .= "$temp{diff}\n\n";
        }

        $item->{ body } = $body if $RSS2Mail::Config::mail_per_item;
    }

    return $body;
}

sub send_mail {
    my %args = @_;
    # need to do this so that Perl knows it's got UTF8 in it
    # otherwise we get wide character warnings
    # also, as we are claiming the content is UTF8...

    if ( $RSS2Mail::Config::mail_per_item ) {
        # reverse as want mail in mailbox in right order...
        for my $item ( reverse @{ $args{ items_to_mail } } ) {
            my $title = $item->{title};
            unless ($] < 5.007) {
                Encode::_utf8_off($title);
                use MIME::Words qw(encode_mimeword);
                $title = encode_mimeword($title, 'B', "UTF-8")
                    if $title =~ /[^[:print:]]/;
            }
            $item->{body} = encode('utf8',$item->{body}) unless $] < 5.007;
            _send_mail(
                subject => $args{feed} . ": " . $title,
                data    => $item->{body},
            );
        }
    } else {
        $args{body} = encode('utf8',$args{body}) unless $] < 5.007;

        _send_mail(
            subject =>  $args{feed},
            data    =>  $args{body},
        );
    }

    return 1;
}

sub _send_mail {
    my %args = @_;

    my $mail = MIME::Lite->new(
        To      =>  join(',', @RSS2Mail::Config::mail_to),
        From    =>  $RSS2Mail::Config::mail_from,
        Subject =>  $args{subject},
        Data    =>  $args{data},
    );

    $mail->attr('content-type.charset' => 'utf-8') unless $] < 5.007;

    if ( $print_mail ) {
        print $mail->as_string;
    } else {
        $mail->send();
    }
}

sub update_items {
    my $items = shift;
    my $feed = shift;
    foreach my $item ( @$items ) {
        if ( UNIVERSAL::isa( $item, 'RSS2Mail::Items') ) {
            $item->last_update( time() );
            $item->update;
        } else {
            RSS2Mail::Items->create( {
                permalink   =>  $item->{ 'permalink' },
                title       =>  $item->{ 'title' },
                content     =>  $item->{ 'content' },
                md5         =>  $item->{ 'md5' },
                last_update =>  time(),
                id_feed     =>  $feed,
            } );
        }
    }

    return 1;
}

# quick wrapper for RSS autodiscovery
sub _autodiscover {
    my $uri = shift;

    warn "DEBUG: Attempting autodiscovery at $uri\n" if $RSS2Mail::Config::DEBUG;

    my @links = XML::Feed->find_feeds( $uri );

    if ( $RSS2Mail::Config::DUMPER ) {
        warn "DEBUG: autodiscovery found:\n" if $RSS2Mail::Config::DEBUG > 1;
        warn Dumper( \@links ) if $RSS2Mail::Config::DEBUG > 1;
    }

    # just return the first link
    if ( @links ) {
        return $links[0];
    } else {
        RSS2Mail::Exception::Autodiscover->throw(
            uri => $uri,
        );
    }
}

sub save_content {
    my ($dir, $target, $content) = @_;
    return undef unless $content;
    _write_to_file("$dir/$target.rss", $content, $target) or return undef;
    return "$dir/$target.rss";
}

sub _write_to_file {
    my $file = shift;
    my $string = shift;
    my $feed_name = shift;

    return undef unless $file;
    return undef unless $string;

    $file = rel2abs($file);

    my $dir = (splitpath($file))[1];
    mkpath $dir unless -e $dir;

    open FILE, ">", "$file" 
        or do {
            my $_file_err = "Failed to write to [$file] - $!";
            RSS2Mail::Exception->throw(
                feed => $feed_name,
                error => $_file_err
            );
        };
    flock FILE, LOCK_EX;
    seek FILE, 0, SEEK_SET;
    # binmode FILE, "utf8" if $] > 5.007; 
    print FILE $string;
    close FILE;

    return 1;
}

# stolen from LWP::UserAgent::request()
sub _uri_to_abs {
    my $uri = shift;
    my $base = shift;
    local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
    return $HTTP::URI_CLASS->new($uri, $base)                                 
        ->abs($base);
}

RSS2Mail::Exception->Trace(1) if $RSS2Mail::Config::DEBUG;

my $feeds;

if ( $fetch ) {
        $feeds = RSS2Mail::Feeds->search({
                        name =>  'TEMP',
                    });
} else {
    $feeds = RSS2Mail::Feeds->retrieve_all();
}

while ( my $feed = $feeds->next ) {
    warn "DEBUG: Processing " . $feed->name . "\n" if $RSS2Mail::Config::DEBUG;

    my ( $content, $rss, $items, @items_to_mail, $file, $feed_name, $feed_uri );

    # grab this here as if we delete the feed then we still need the name
    # for the unsubscribed rss generation
    $feed_name = $feed->name();

    $content = fetch_feed( $feed );

    if ( UNIVERSAL::can( $feed, 'uri' ) ) {
        $feed_uri = $feed->uri;
    } else {
        $feed_uri = '';
    }

    next unless $content;

    warn "DEBUG: Fetched RSS\n" if $RSS2Mail::Config::DEBUG;
    warn "$content\n" if $RSS2Mail::Config::DEBUG > 1;

    $rss = clean_rss( $content );

    warn "DEBUG: Cleaned RSS\n" if $RSS2Mail::Config::DEBUG;
    warn "$rss\n" if $RSS2Mail::Config::DEBUG > 1;

    # we save the file as the debianised version of the module has a
    # patch which makes memory usage balloon if you don't use a file
    # plus having the file around is handy for debugging
    eval {
        $file = save_content($RSS2Mail::Config::base_dir, $feed_name, $rss);
    };
    if ( $@ ) {
        if ( UNIVERSAL::isa( $@, 'RSS2Mail::Exception' ) ) {
            carp $@->error;
        } else {
            croak $@ unless ( UNIVERSAL::isa( $feed, 'Class::Deleted' )
                        or UNIVERSAL::isa( $feed, 'Class::DBI::Object::Has::Been::Deleted' )
                    );
        }
    }

    eval {
        $items = parse_rss( string => $rss, feed_name => $feed_name, feed_uri => $feed_uri );
    };
    if ( $@ ) {
        if ( UNIVERSAL::isa( $@, 'RSS2Mail::Exception::Parse' ) ) {
            warn $@;
            next if $@->fatal;
        } else {
            croak "Error in $feed_name: " . $@;
        }
    }

    warn "DEBUG: RSS parsed\n" if $RSS2Mail::Config::DEBUG;

    @items_to_mail = @{ process_feed( $items, $feed, $feed_uri ) };
    next unless @items_to_mail;

    my $mail_body = create_mail_body( \@items_to_mail );

    warn "DEBUG: mail formatted\n" if $RSS2Mail::Config::DEBUG;
    warn "$mail_body\n" if $RSS2Mail::Config::DEBUG > 1;

    my %send_mail_args;

    if ( $RSS2Mail::Config::mail_per_item ) {
        %send_mail_args = ( 
            feed    =>  $feed_name,
            items_to_mail   =>  \@items_to_mail,
        );
    } else {
        %send_mail_args = (
            feed => $feed_name,
            body => $mail_body,
        );
    }

    if ( send_mail( %send_mail_args ) ) {
        # need to check if feed has been deleted before we try update
        if ( UNIVERSAL::isa( $feed, 'RSS2Mail::Feeds' ) ) {
            update_items( \@items_to_mail, $feed );
            $feed->last_update(time());
            $feed->update();
        }
        warn "DEBUG: mail sent\n" if $RSS2Mail::Config::DEBUG;
    } else {
        carp "Failed to create email for " . $feed->name;
    }
}

if ( $fetch ) {
    warn "DEBUG: deleting one time fetch feed\n" if $RSS2Mail::Config::DEBUG;
    RSS2Mail::Feeds->search( name => 'TEMP' )->first()->delete();
}

# initial set up of database
BEGIN {
    my $base_dir = $RSS2Mail::Config::base_dir;

    unless ( -e $base_dir ) {
        mkpath $base_dir;
    }

    my ($db_file) = ( $RSS2Mail::DBI::DSN =~ /=([^=]+)$/ );
    unless ( -e $db_file ) {
        RSS2Mail::DBI->_init_db();
    }
};

exit;

__END__

=head1 NAME

rss2mail2 - an RSS aggregator that delivers feeds as plain text email

=head1 SYNOPSIS

    # add a feed
    rss2mail2 -add test_feed -uri http://example.com/index.rss

    # add a feed getting the name from the feed file
    rss2mail2 -add http://example.com/index.rss

    # edit feed details
    rss2mail2 -edit test_feed -uri http://example.com/index.atom

    # list all feeds
    rss2mail2 -list
        test_feed: http://example.com/index.rdf

    # fetch feeds
    rss2mail2

    # delete a feed
    rss2mail2 -del test_feed

    # fetch a feed once
    rss2mail2 -fetch http://example.com/index/rss

    # get a summary of the common options
    rss2mail2 -help

    # get the version of rss2mail2
    rss2mail2 -Version

Both the add and edit options also support the --auto option which will
attempt to find an RSS feed at the supplied URI. It also copes with 
feeds in the Atom format.

=head1 DESCRIPTION

rss2mail2 is an RSS aggregator for those that like to get all their 
information delivered to their inbox. It runs through your list of
feeds and sends you one mail per updated feed containing all the
new and updated items in that feed. Optionally it will provide you 
with a universal style diff of the updated feeds.

Any HTML will be converted to plain text and to keep things simple
it will, by default, strip out pretty much anything that isn't
plain text. If you have Perl 5.8 then any Unicode content should
be fine. If you have an earlier version of Perl rss2mail2 will try
to do the right thing but if it doesn't know what that is it will
replace the character with a space.

rss2mail2 tries to do the right thing with missing feeds or changes
in feed location. If you try to subscribe to a URL that isn't there
then it won't subscribe. If the URL for a feed stops working then it
will keep trying for a user configurable number of attempts (this
defaults to 30) and then unsubscribe. If the feed moves but provides 
a permanent redirect then it will update the URL accordingly. If it
does unsubscribe from a feed then a notice to that effect will appear 
as an item in that feed. It also spits out errors on STDERR. 

=head1 OPTIONS

rss2mail2 can be called with the following options:

=over 4

=item -add feed_name

Add a feed with feed_name as the name of the feed. 

If feed_name begins with http then rss2mail will assume you've given it
the address of a feed file and attempt to fetch it. It will then extract
the name from that file, assuming it's a valid feed file.

Alternatively you can supply your own name and supply the location of the
feed using the uri option.

=item -edit feed_name

Changes the location of feed_name. You need to pass the new feed location
with the uri option.

=item -uri feed_location

Used with the add and edit options to supply the location of the feed.

=item -auto

If called with either the add or edit options then it will try and guess
the feed location from the URI passed to the uri option. 

This works with the auto naming behaviour of add.

=item -delete feed_name

Deletes feed_name and all entries from the database.

=item -list 

Lists all the feeds and their locations.

=item -opml

Outputs an OPML file with details of all the feeds. It only provides the
name of the feed and the URL of the feed file but should be enough if 
you want to move your subscriptions to another feed reader that supports
OPML import. This does require the optional XML::OPML module.

=item -fetch feed_location

Performs a one time fetch of the feed at feed_location. 

=item -help

Prints out a usage message summarising commonly used options.

=item -items

Send one email per item instead of an email per feed.

=item -cleanup

Deletes old entries. By default any entry older than 30 days is deleted. 
See the CONFIGURATION for how to alter this.

=item -print_mail

Instead of sending the mails just print them out to STDOUT. Only really
useful for debugging.

=item -Version

Display the version number of rss2mail2 and exit.

=item -Debug level

If set to greater than 0 then rss2mail2 will spew out a whole load of 
debugging information. You'll probably never want to use this. If
set to 3 or above you'll get an awful lot of information.

=back

All of the above options have a short form consisting of only the 
first letter of the option so add can also be called as -a and edit 
as -e.

=head1 CONFIGURATION

rss2mail2 looks for a configuration file called .rss2mail2rc in the
users home directory or, if that doesn't exist, in /etc/rss2mail2rc. If
neither of these exists it spits out an error message and quits.

The configuration file should be readable by L<AppConfig> but in case you
can't be bothered to read the docs for that an example config file 
looks like this:

    base_dir = ${HOME}/.rss2mail2
    mail_to = ${USER}
    mail_to = struan
    mail_to = bob
    mail_from = rss2mail2@example.com
    oldest = 30
    dsn  = dbi:SQLite:dbname=${HOME}/.rss2mail2/rss.db

This example will mail struan and bob and use an SQLite database located
in ~/.rss2mail2/rss2mail.db to store feed data. Any files it needs to save
will also be stored in ~/.rss2mail2 and the mails will use rss@example.com
as a From address. The oldest option is used when you use the -cleanup option 
and means that feed entries older than 30 days will be purged from the database.

Probably all you really need to change is mail_to and mail_from. As above 
if you want to sent the mails to multiple people you should have multiple
mail_to options in the file with one email address each.

If you prefer to receive a mail for each item in the feed then you can
either add a line with mail_per_feed on it to the config file or run 
rss2mail2 with the -i option.

If you don't like the diffs then you can turn them off by adding a line
with no_diffs. 

The only other option you can set is max_not_found which configures
how many times in a row fetching a feed fails before it is deleted.
The default is 30.

Sample procmail recipe:

       :0:
       * ^From: rss2mail2
       * ^Subject: [a-z_]+:
       * ^Subject: \/[a-z_]+
       rss/$MATCH

=head1 CAVEATS

It should be noted that the diffs may not be ideal as the limitations 
of Text::Diff hold. Most notably it's not good with changes in 
whitespace.

Feed auto discovery currently picks the first available feed.

=head1 REQUIRES

What would the world be without a raft of dependencies?

=over 4

=item L<XML::Feed>

=item L<Text::Autoformat>

=item L<LWP>

=item L<AppConfig>

=item L<MIME::Lite>

=item L<MIME::Tools>

=item L<List::Util>

=item L<HTML::FormatText::WithLinks>

=item L<Text::Diff>

=item L<Exception::Class>

=item L<DBI>, L<DBD::SQLite> 

=item L<Class::DBI>, L<Class::DBI::SQLite>, L<Class::DBI::BaseDSN>

=back

Plus of course all the dependencies that they require.

Just run it and install modules till it stops complaining.

rss2mail2 will use the following modules if they are available:

=over 4

=item L<HTML::Lint>

This is used to improve the HTML to text processing 

=item L<XML::OPML>

This is used to spit out the feed list in OPML format. 

=back

=head1 AUTHOR

Struan Donald <code@exo.org.uk>, http://exo.org.uk/

=head1 COPYRIGHT

=head1 LICENSE

Same terms as Perl itself

=head1 SEE ALSO

procmail(1)

=cut

S:	2.29-alt2
5.0:	2.29-alt1
4.1:	2.29-alt1
4.0:	2.29-alt1