#!/usr/bin/perl 
## insert -- stuff messages into the database
## Copyright (c) 2000 Christopher League; see LICENSE for details

## TODO: How to get sendmail to invoke this script robustly?


## I require these packages:
##  - DBI-1.11 (Generic database interface)
##  - Msql-Mysql-modules-1.2209 (Mysql interface)
##  - MailTools-1.13 (RFC822 message parsing) by Graham Barr
##  - TimeDate-1.08 (Date parsing) by Graham Barr
use DBI;
use Mail::Internet;
use Mail::Header;
use Mail::Field::Date;
use Mail::Field::AddrList;
use Date::Format;
use Getopt::Long;
use strict;

## Descriptive text
my $version = "0.1";

my $version_text =
"insert (Meba) $version
Copyright (c) 2000 Christopher League
Meba comes with ABSOLUTELY NO WARRANTY.
You may redistribute and/or modify Meba under the terms of the 
GNU General Public License;  see the file LICENSE for details.
";

my $usage_text = 
"Usage: insert [OPTIONS] [FILES] ...
Insert the email messages in FILES into the database.
If no FILES are given, read from standard input.

  -c  --conf-file=FILE read configuration from FILE (default meba.conf)
  -n, --dry-run        output the queries, but do not execute them
  -v, --verbose        describe what is happening in gross detail
      --help           display this help and exit
      --version        output version information and exit

The following options override the settings in the meba.conf file:

  -d, --database=NAME  name of database to use (default to meba_`whoami`)
  -t, --prefix=TEXT    prefix table names with `TEXT' (default `')
  -h, --host=HOST      host running the database server (default to localhost)
      --port=NUM       port number to use for connection, if non-standard
  -u, --user=NAME      login to server as this user (default to current user)
  -p, --password=TEXT  password to use when connecting to server

Report bugs to meba-bugs\@contrapunctus.net.
";

## We want command line options to override configuration file settings,
## but the user can specify a configuration file as a command line 
## option!  The solution is to parse the command line options first,
## putting the results into a hash called `cmd_line_options'.  Then
## we look for and parse the configuration file(s), putting the results
## into the hash `options'.  Finally, we merge the `cmd_line_options'
## into `options', giving precedence to the former.
my %cmd_line_options;
my %options;
$options{user} = getpwuid $<;     # Q: do I want real or effective UID here?
$options{database} = "meba_$options{user}";

## These are the specs for command-line options
my @cmd_line_specs = 
    (\%cmd_line_options, 
     "database|d=s", "host|h=s", "dry-run|n", "conf-file|c=s",
     "password|p=s", "port=s", "user|u=s", "verbose|v", "prefix|t=s",
     "help", "version" );

## Parse the command line into `cmd_line_options'.
my $getopt_result = GetOptions (@cmd_line_specs);

## If the user wanted the usage or version blurbs, then we can 
## output them and stop now.
print STDERR $usage_text and exit 1
    if $cmd_line_options{help};

print STDERR $version_text and exit 2
    if $cmd_line_options{version};

## If there was an error in the command line options, we can
## also stop now.
unless ($getopt_result) {
    print STDERR "Try `$0 --help' for more information.\n";
    exit 3;
}

## Next, look for and parse the configuration file(s).
## We read each of the following files, in reverse order.
## That way the FIRST file in this list has precedence.
my @conf_file_list = ("./meba.conf", "$ENV{HOME}/.meba.conf");

## Put user-specified configuration file at head of list.
unshift @conf_file_list, $cmd_line_options{"conf-file"}
    if $cmd_line_options{"conf-file"};

## Read configuration file(s) in reverse order..
my $conf_file;
while ($conf_file = pop @conf_file_list) { # for each potential conf file
    if(open (CONF, $conf_file)) {      # if it can be opened,
        print "Reading $conf_file...\n" if $cmd_line_options{verbose};
        while(<CONF>) {                # read each line
            chop;
            if (/\s*\#.*/) { }         # ignore comments
            elsif (/\s*([\w\.]+)\s*\=\s*(.*)/) { # varname = value
                $options{$1} = $2;
            }
            elsif (/\s*/) { }           # ignore blank lines
            else {
                print STDERR "$conf_file:$.: bad configuration command\n";
                print STDERR "Try `$0 --help' for more information.\n";
                exit 5;
            }
        }
        close CONF;
    }
    else {
        print "Configuration file $conf_file could not be opened.\n"
            if $cmd_line_options{verbose};
    }
}

## Now we need to merge the `cmd_line_options' into `options', 
## giving priority to the former in case of conflict.
my @cmd_line_keys = keys %cmd_line_options;
my $key;
while ($key = shift @cmd_line_keys) {
    $options{$key} = $cmd_line_options{$key};
}

## Verbosity: dump key/value pairs in %options
if ($options{verbose}) {
    my $key; my $value;
    print "Configuration:\n";
    while (($key, $value) = each %options) {
        print "  $key = $value\n";
    }
}

##### END OF OPTION PROCESSING, now on to the important stuff...

## Determine arguments to connect()
my $data_source = "DBI:mysql:$options{database}";
if ($options{host}) {
    $data_source .= ":$options{host}";
    $data_source .= ":$options{port}" if $options{port};
}
print "Connect ($data_source, $options{user}, $options{password})\n"
    if $options{verbose};

## Go ahead and connect, unless this is a dry run
my $db;
unless ($options{"dry-run"}) {
    $db = DBI->connect ($data_source, $options{user}, $options{password})
        || exit 6;
}

## Process each file argument in ARGV, or use standard input
## if no arguments exist.
if ($#ARGV < 0) {
    print "Reading from standard input...\n" if $options{verbose};
    &do_stream (\*STDIN);
} 
else {
    my $file;
    while ($file = shift @ARGV) {
        if(open (FILE, $file)) {
            print "Processing `$file'... ";
            print "\n" if $options{verbose};
            &do_stream (\*FILE);
            print "done\n" unless $options{verbose};
            close FILE;
        }
        else {
            print STDERR "Error: could not open `$file'.\n";
        }
    }
}

## All done -- disconnect and exit
unless ($options{"dry-run"}) {
    $db->disconnect || die "Error disconnecting: " . $db->errstr;
}
exit 0;

##### END OF MAIN PROGRAM, subroutines follow.

## do_stream STREAM -- parse and insert a single RFC822 message from
## the input stream STREAM.  Returns nothing.
sub do_stream {
    my ($stream) = @_;

    ## Parse the stream as a single RFC822 message
    my $msg = new Mail::Internet $stream;

    ## Fetch and interpret all the attributes of the message.
    ## Multi-valued attributes will be represented by lists in this hash.
    my %attributes = &parse_message ($msg);

    ## Insert the attributes into the appropriate tables.
    my $id = &insert_attributes (%attributes);

    ## Mark the new message as unseen
    &mark_unseen ($id);
}

## parse_message MSG -- returns a hash containing all the relevant
## attributes of the message MSG, including the `body', complete
## `headers', and specially parsed headers such as `subject' and
## `date'.  Multi-valued attributes will be bound to a list reference
## rather than a string.  None of the strings have been quoted for SQL
## yet.
sub parse_message {
    my ($msg) = @_;
    my %attr;

    ## body text and headers are lists of newline-terminated strings,
    ## which we just join into a single string.
    $attr{body} = join '', @{ $msg->body };
    $attr{headers} = 
        join '', grep {defined } @{ $msg->head->{'mail_hdr_list'}};

    print ("  Body/header contains ", 
           length $attr{body}, "/", 
           length $attr{headers}, " bytes.\n")
        if $options{verbose};

    ## subject -- nothing complicated here.
    $attr{subject} = &get_header($msg, "Subject") || "";

    ## date needs to be parsed
    my $d = extract Mail::Field ("Date", $msg->head);
    $attr{date} = time2str("%Y-%m-%e %H:%M:%S", ($d)? $d->time : 0);

    print "  Date `", &get_header($msg, "Date"), "' => $attr{date}\n"
        if $options{verbose};

    ## message ID needs to be parsed
    my $msgid = (&get_header($msg, "Message-ID") ||
                 &get_header($msg, "Message-Id"));
    if ($msgid =~ /\<([^\>]*)\>/) { $msgid = $1; } # remove angle brackets
    $attr{message_id} = $msgid || "";

    print "  Message ID `$msgid'\n" if $options{verbose};

    ## there should be only one "From" address
    my ($from_addr, $from_name) = &get_addresses($msg, "From");
    $attr{from_addr} = $from_addr->[0] || "";
    $attr{from_name} = $from_name->[0];

    ## recipient addresses
    ($attr{to_addrs}, $attr{to_names}) = &get_addresses($msg, "To");
    ($attr{cc_addrs}, $attr{cc_names}) = &get_addresses($msg, "Cc");
### ($attr{bcc_addrs}, $attr{bcc_names}) = &get_addresses($msg, "Bcc");
### BUG: Bcc will not work as of now, because the Mail::Field 
### extractor does not recognize "Bcc" as an AddrList. -- 8 Apr 2000

    ## Message-IDs to which this message refers
    $attr{references} = &get_references($msg, $attr{from_addr});

    return %attr;
}

## get_header(MSG,HDR) -- returns a string containing the value of 
## header HDR in the message MSG.
sub get_header {
    my ($msg, $hdr) = @_;
    my $v = $msg->head->get($hdr);
    chop $v;                    # remove newline
    return $v;
}

## get_addresses(MSG,HDR) -- where HDR is one of `From', `To', `Cc',
## and so on.  Returns a pair of array references.  The arrays (which
## should be the same size) contain the email addresses and full
## names, respectively, that were present in the HDR field.
sub get_addresses {
    my ($msg, $tag) = @_;
    my @addrs;
    my @names;

    my $a = extract Mail::Field ($tag, $msg->head);
    @addrs = $a->addresses() if $a;
    @names = $a->names() if $a;

    if ($options{verbose}) {
        my $i;
        for ($i = 0;  $i < @addrs;  $i++) {
            print "  $tag", "[$i] $names[$i] <$addrs[$i]>\n";
        }
    }

    return (\@addrs, \@names);
}

## get_references(MSG,FROM) -- return a reference to an array containing
## Message IDs of messages to which this one refers.  This works by checking
## the References and In-Reply-To headers.  Some formats of In-Reply-To do 
## not contain message IDs, but they do have exact dates.  So with FROM, the
## email address in the From field of this message, it may be possible to 
## find the message in the database to which this one is a reply.
sub get_references {
    my ($msg, $from_addr) = @_;
    my @refs;
    ## The format of the References header is the most reliable.
    ## It should be References: <ID@HOST> <ID@HOST> <ID@HOST> ...
    my $refs_line = &get_header($msg, "References");
    if ($refs_line) {
        print "  Parsing References: header...\n" if $options{verbose};
        ## Find and delete each message ID, starting from the
        ## rightmost.  This regexp starts at the beginning of the line
        ## and matches a maximal string, labeling it as \1: ^(.*)
        ## Then, it matches a message ID, which is a string that does
        ## not contain any whitespace or `>', enclosed in `<...>'.
        ## This string, without the angle brackets, is labeled \2.
        ## Finally there might be some white space at the end.  We
        ## substitute all of $refs_line, then, with just \1,
        ## effectively deleting the message ID in \2.
        while ($refs_line =~ s/^(.*)\<([^\s\>]+)\>\s*/\1/s) {
            push @refs, $2;
            print "    found $2\n" if $options{verbose};
        }
    }
    ## According to Jamie Zawinski (from the Mozilla Grendel implementation):
    ##   "The most common forms of In-Reply-To seem to be
    ##        31%  NAME's message of TIME <ID@HOST>
    ##        22%  <ID@HOST>
    ##         9%  <ID@HOST> from NAME at "TIME"
    ##         8%  USER's message of TIME <ID@HOST>
    ##         7%  USER's message of TIME
    ##         6%  Your message of "TIME"
    ##        17%  hundreds of other variants (average 0.4% each?)"
    my $repl_line = (&get_header($msg, "In-Reply-To") ||
                     &get_header($msg, "In-reply-to"));
    if ($repl_line) {
        print "  Parsing In-Reply-To: header...\n" if $options{verbose};
        ## To find a message ID, if there is one, we will skip over
        ## everything up to and including "<", which we assume begins
        ## a message ID.
        if ($repl_line =~ /^[^\<]*\<([^\s\>]+)\>/s) {
            push @refs, $1;
            print "    found $1\n" if $options{verbose};
        }
        ## With a query, we might even be able to deal with the
        ## "Your message of TIME" variants.
        elsif ($repl_line =~ /message\s+of\s+\"?([^\"]*)/s) {
            ## $1 should contain a date/time string.  See if the
            ## MailTools and TimeDate can make sense of it.
            my $f = new Mail::Field ("Date", $1);
            my $d = time2str("%Y-%m-%e %H:%M:%S", $f->time);
            ## Now we could potentially query for a message 
            ## TO (or Cc) this person at that date...
            print "    query recipient $from_addr at $d ??\n"
                if $options{verbose};
        }
        else {
            print "    cannot understand '$repl_line'\n" if $options{verbose};
        }
    }
    ## Remove duplicate message IDs from @refs array
    @refs = &remove_duplicates (@refs);
    return \@refs;
}

## remove_duplicates(LIST) -- return a new list containing every unique
## string in the given LIST.  It also might mutate the given LIST.
## The best way to use this, then, is @list = remove_duplicates(@list);
sub remove_duplicates {
    my (@a) = @_;
    my (@b, $x);

    ## This implementation could be (n lg n) rather than quadratic,
    ## but what's an order of magnitude between friends?
    while ($x = pop @a) {
        unless (grep $_ eq $x, @b) {
            push @b, $x;
        }
    }

    return @b;
}

## insert_attributes(DB,ATTR) -- all of the relevant attributes have
## been collected, parsed, and massaged, and are now stored in the
## ATTR hash.  This subroutine will insert them into the various
## tables in the database.
sub insert_attributes {
    my (%attr) = @_;
    
    if ($options{"dry-run"}) {
        ## since this doesn't count, there's no harm in truncating the
        ## really long fields for display purposes.
        substr($attr{headers},60) = " [...]" if (length($attr{headers})) > 65;
        substr($attr{body},60) = " [...]" if (length($attr{body})) > 65;
        $attr{headers} =~ tr/\n/ /s;
        $attr{body} =~ tr/\n/ /s;
    }

    ## The `message' table is primary, all other tables contain
    ## references to the message_nums here.  We put all single-
    ## valued attributes into `message'...
    my @message_attrs = qw(from_addr from_name
                           subject date message_id
                           headers body);
    my $id = &insert_from_hash(\%attr, "$options{prefix}message", 
                               \@message_attrs);

    ## The various multi-valued recipient fields
    &insert_recipients($id, "To", $attr{to_addrs}, $attr{to_names});
    &insert_recipients($id, "Cc", $attr{cc_addrs}, $attr{cc_names});
###  &insert_recipients($id, "Bcc", $attr{bcc_addrs}, $attr{bcc_names});

    ## The reference table is for the multi-value references 
    ## attribute.
    &insert_references($id, $attr{references});

    my @cc_addr = @{ $attr{cc_addrs} };
    return $id;
}

sub insert_from_hash {
    my ($attr, $table, $keys) = @_;
    my %attr = %{ $attr };
    my @keys = @{ $keys };
    my $fields = join ",\n", @keys;
    my @values = map $attr{$_}, @keys;
    unless ($options{'dry-run'}) {
        @values = map $db->quote($_), @values;
    }
    my $values = join ",\n", @values;
    ## construct the query string
    my $q = 
"INSERT INTO $table (
$fields
) VALUES (
$values
)
";
    my $id = -1;
    print $q if $options{"dry-run"} or $options{verbose};
    unless ($options{"dry-run"}) {
        my $s = $db->prepare($q);
        $s->execute or exit 8;
        $id = $s->{mysql_insertid}; # Non-portable
    }
    return $id;
}

sub insert_recipients {
    my ($id, $tag, $addrs, $names) = @_;
    my @addrs = @{ $addrs };
    my @names = @{ $names };
    my @keys = qw(message_num kind address name);
    my $addr;
    while ($addr = pop @addrs) {
        my $name = pop @names;
        ## don't insert if there is a duplicate address down the line..
        if (grep lc $_ eq lc $addr, @addrs) {
            print "Not inserting $addr, duplicate.\n" if $options{verbose};
        } else {
            my %attr = ('message_num' => $id,
                        'kind' => $tag,
                        'address' => $addr,
                        'name' => $name);
            &insert_from_hash (\%attr, "$options{prefix}recipient", \@keys);
        }
    }
}

sub insert_references {
    my ($id, $refs) = @_;
    my @refs = @{ $refs };
    my @keys = qw(message_num message_id);
    my $ref;
    while ($ref = pop @refs) {
        my %attr = (message_num => $id,
                    message_id => $ref);
        &insert_from_hash (\%attr, "$options{prefix}reference", \@keys);
    }
}

sub mark_unseen {
    my ($id) = @_;
    my @keys = qw(message_num tag);
    my %attr = (message_num => $id,
                tag => "unseen");
    &insert_from_hash (\%attr, "$options{prefix}tag", \@keys);
}

