Forcibly decode headers with unencoded 8bit chars.

[usenet/newsstats.git] / bin / parsedb.pl
diff --git a/bin/parsedb.pl b/bin/parsedb.pl

index 10a1a5d..8004cb1 100755 (executable)
--- a/bin/parsedb.pl
+++ b/bin/parsedb.pl
@@ -27,6 +27,9 @@ use DBI;
  use Getopt::Long qw(GetOptions);
  Getopt::Long::config ('bundling');
  
+use Encode qw/decode/;
+use Mail::Address;
+
  ################################# Definitions ##################################
  
  # define header names with separate database fields
@@ -127,12 +130,19 @@ $DBQuery->execute()
  binmode(STDOUT, ":utf8");
  $DBHandle->do("SET NAMES 'utf8'");
  
+# create a list of supported encondings
+my %LegalEncodings;
+foreach (Encode->encodings()) {
+  $LegalEncodings{$_} = 1;
+}
  # parse data in a loop and write it out
  print "-------------- Parsing data ... -------------\n" if $OptDebug;
  while (my $HeadersR = $DBQuery->fetchrow_hashref) {
    my %Headers = %{$HeadersR};
  
    # parse $Headers{'headers'} ('headers' from DBTableRaw)
+  # remove empty lines (that should not even exist in a header!)
+  $Headers{'headers'} =~ s/\n\s*\n/\n/g;
    # merge continuation lines
    # from Perl Cookbook, 1st German ed. 1999, pg. 91
    $Headers{'headers'} =~ s/\n\s+/ /g;
@@ -140,9 +150,29 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
    my $OtherHeaders;
    for (split(/\n/,$Headers{'headers'})) {
      # split header lines in header name and header content
-    my ($key,$value) = split(/:/,$_,2);
-    $key =~ s/\s*//;
-    $value =~ s/^\s*(.+)\s*$/$1/;
+    my ($key,$value);
+    if ($_ =~ /:/) {
+      ($key,$value) = split(/:/,$_,2);
+      $key =~ s/\s*//;
+      $value =~ s/^\s*(.+)\s*$/$1/;
+    } else {
+      &Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s",
+                       $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
+                       $Headers{'id'},$_));
+      next;
+    }
+    # check for empty (mandatory) fields from DBTableRaw
+    # and set them from $Headers{'headers', if necessary
+    if (lc($key) =~ /^(message-id|path|newsgroups)$/) {
+      my $HeaderName = lc($key);
+      $HeaderName    = 'mid' if ($HeaderName eq 'message-id');
+      if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') {
+        $Headers{$HeaderName} = $value;
+        &Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.",
+                         $HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
+                         $Headers{'id'}));
+      }
+    }
      # save each header, separate database fields in %Headers,
      # the rest in $OtherHeaders (but not Message-ID, Path, Peer
      # and Newsgroups as those do already exist)
@@ -157,6 +187,56 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
    chomp($OtherHeaders);
    $Headers{'headers'} = $OtherHeaders;
  
+  foreach ('from_','sender', 'replyto', 'subject') {
+    if ($Headers{$_}) {
+      my $HeaderName = $_;
+      $HeaderName  =~ s/_$//;
+      # decode From: / Sender: / Reply-To: / Subject:
+      if ($Headers{$_} =~ /\?(B|Q)\?/) {
+        # check for legal encoding and decode
+        (my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/;
+        $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_})
+          if (exists($LegalEncodings{$Encoding}));
+      }
+      # forcibly modify headers with un-encoded 8bit data assuming utf-8
+      # TODO: try to guess correct enconding
+      elsif ($Headers{$_} =~ /[^\x00-\x7F]/) {
+        $Headers{$_} = decode('utf-8',$Headers{$_});
+      }
+      # extract name(s) and mail(s) from From: / Sender: / Reply-To:
+      # in parsed form, if available
+      if ($_ ne 'subject') {
+        my @Address;
+        # start parser on header or parsed header
+        # @Address will have an array of Mail::Address objects, one for
+        # each name/mail (you can have more than one person in From:!)
+        if (defined($Headers{$HeaderName.'_parsed'})) {
+          @Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'});
+        } else {
+          @Address = Mail::Address->parse($Headers{$_});
+        }
+        # split each Mail::Address object to @Names and @Adresses
+        my (@Names,@Adresses);
+        foreach (@Address) {
+          # take address part in @Addresses
+          push (@Adresses, $_->address());
+          # take name part form "phrase", if there is one:
+          # From: My Name <addr@ess> (Comment)
+          # otherwise, take it from "comment":
+          # From: addr@ess (Comment)
+          # and push it in @Names
+          my ($Name);
+          $Name = $_->comment() unless $Name = $_->phrase;
+          $Name =~ s/^\((.+)\)$/$1/;
+          push (@Names, $Name);
+        }
+        # put all @Adresses and all @Names in %Headers as comma separated lists
+        $Headers{$HeaderName.'_address'} = join(', ',@Adresses);
+        $Headers{$HeaderName.'_name'}    = join(', ',@Names);
+      }
+    }
+  }
+
    # order output for database entry: fill @SQLBindVars
    print "-------------- Next entry:\n" if $OptDebug;
    my @SQLBindVars;
@@ -183,9 +263,9 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
                                         split(/ /,'? ' x scalar(@DBFields)))
                                  ));
    $DBWrite->execute(@SQLBindVars)
-      or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
+      or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s for %s: ".
                            "$DBI::errstr\n",$Period,
-                          $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
+                          $Conf{'DBDatabase'},$Conf{'DBTableParse'}, $Headers{'mid'}));
      $DBWrite->finish;
    }
  };
@@ -213,11 +293,32 @@ See L<doc/README>.
  
  =head1 DESCRIPTION
  
-...
+This script will parse raw, unstructured headers from a database table which is
+fed from F<feedlog.pl> for a given time period and write its results to
+nother database table with separate fields (columns) for most (or even all)
+relevant headers.
+
+I<Subject:>, I<From:>, I<Sender:> and I<Reply-To:> will be parsed from MIME
+encoded words to UTF-8 as needed while the unparsed copy is kept. From that
+parsed copy, I<From:>, I<Sender:> and I<Reply-To:> will also be split into
+separate name(s) and address(es) fields while the un-splitted copy is kept,
+too.
+
+B<parsedb> should be run nightly from cron for yesterdays data so all
+other scripts get current information. The time period to act on defaults to
+yesterday, accordingly; you can assign another time period or a single day via
+the B<--day> option (see below).
  
  =head2 Configuration
  
-...
+B<parsedb> will read its configuration from F<newsstats.conf>
+should be present in etc/ via Config::Auto or from a configuration file
+submitted by the B<--conffile> option.
+
+See L<doc/INSTALL> for an overview of possible configuration options.
+
+You can override configuration options via the B<--rawdb> and
+B<--parsedb> options, respectively.
  
  =head1 OPTIONS
  
@@ -268,7 +369,13 @@ See L<doc/INSTALL>.
  
  =head1 EXAMPLES
  
-...
+An example crontab entry:
+
+    0 1 * * * /path/to/bin/parsedb.pl
+
+Do a dry run for yesterday's data, showing results of processing:
+
+    parsedb --debug --test | less
  
  =head1 FILES