X-Git-Url: https://code.th-h.de/?p=usenet%2Fnewsstats.git;a=blobdiff_plain;f=bin%2Fparsedb.pl;fp=bin%2Fparsedb.pl;h=b4c20566ede19eb318ff5d90168b39bb778ca540;hp=0000000000000000000000000000000000000000;hb=84e9923abeadcfb488985b9e60eb78d4eaa950fd;hpb=fd0717a15cb1b36e25a019060d3f40aac9ca9517 diff --git a/bin/parsedb.pl b/bin/parsedb.pl new file mode 100755 index 0000000..b4c2056 --- /dev/null +++ b/bin/parsedb.pl @@ -0,0 +1,425 @@ +#! /usr/bin/perl +# +# parsedb.pl +# +# This script will parse a database with raw header information +# from a INN feed to a structured database. +# +# It is part of the NewsStats package. +# +# Copyright (c) 2013 Thomas Hochstein +# +# It can be redistributed and/or modified under the same terms under +# which Perl itself is published. + +BEGIN { + our $VERSION = "0.01"; + use File::Basename; + # we're in .../bin, so our module is in ../lib + push(@INC, dirname($0).'/../lib'); +} +use strict; +use warnings; + +use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper); + +use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); + +use Encode qw/decode/; +use Mail::Address; + +################################# Definitions ################################## + +# define header names with separate database fields +my %DBFields = ('date' => 'date', + 'references' => 'refs', + 'followup-to' => 'fupto', + 'from' => 'from_', + 'sender' => 'sender', + 'reply-to' => 'replyto', + 'subject' => 'subject', + 'organization' => 'organization', + 'lines' => 'linecount', + 'approved' => 'approved', + 'supersedes' => 'supersedes', + 'expires' => 'expires', + 'user-agent' => 'useragent', + 'x-newsreader' => 'xnewsreader', + 'x-mailer' => 'xmailer', + 'x-no-archive' => 'xnoarchive', + 'content-type' => 'contenttype', + 'content-transfer-encoding' => 'contentencoding', + 'cancel-lock' => 'cancellock', + 'injection-info' => 'injectioninfo', + 'x-trace' => 'xtrace', + 'nntp-posting-host' => 'postinghost'); + +# define field list for database +my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed + from_name from_address sender sender_parsed sender_name + sender_address replyto replyto_parsed replyto_name + replyto_address subject subject_parsed organization linecount + approved supersedes expires useragent xnewsreader xmailer + xnoarchive contenttype contentencoding cancellock injectioninfo + xtrace postinghost headers disregard/; + +################################# Main program ################################# + +### read commandline options +my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile); +GetOptions ('d|day=s' => \$OptDay, + 'debug!' => \$OptDebug, + 'parsedb=s' => \$OptParseDB, + 'rawdb=s' => \$OptRawDB, + 't|test!' => \$OptTest, + 'conffile=s' => \$OptConfFile, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; + +### read configuration +my %Conf = %{ReadConfig($OptConfFile)}; + +### override configuration via commandline options +my %ConfOverride; +$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB; +&OverrideConfig(\%Conf,\%ConfOverride); + +### get time period +### and set $Period for output and expression for SQL 'WHERE' clause +my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day'); +# bail out if --month is invalid or "all" +&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ". + "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time'); + +### init database +my $DBHandle = InitDB(\%Conf,1); + +### get & write data +&Bleat(1,'Test mode. Database is not updated.') if $OptTest; + +# create $SQLWhereClause +my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard'); + +# delete old data for current period +if (!$OptTest) { + print "----------- Deleting old data ... -----------\n" if $OptDebug; + my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s", + $Conf{'DBDatabase'},$Conf{'DBTableParse'}, + $SQLWhereClause)) + or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); +}; + +# read from DBTableRaw +print "-------------- Reading data ... -------------\n" if $OptDebug; +my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ". + "newsgroups, headers, disregard ". + "FROM %s.%s %s", $Conf{'DBDatabase'}, + $Conf{'DBTableRaw'}, $SQLWhereClause)); +$DBQuery->execute() + or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + +# set output and database connection to UTF-8 +# as we're going to write decoded header contents containing UTF-8 chars +binmode(STDOUT, ":utf8"); +$DBHandle->do("SET NAMES 'utf8'"); + +# create a list of supported encondings +my %LegalEncodings; +foreach (Encode->encodings()) { + $LegalEncodings{$_} = 1; +} +# parse data in a loop and write it out +print "-------------- Parsing data ... -------------\n" if $OptDebug; +while (my $HeadersR = $DBQuery->fetchrow_hashref) { + my %Headers = %{$HeadersR}; + + # parse $Headers{'headers'} ('headers' from DBTableRaw) + # remove empty lines (that should not even exist in a header!) + $Headers{'headers'} =~ s/\n\s*\n/\n/g; + # merge continuation lines + # from Perl Cookbook, 1st German ed. 1999, pg. 91 + $Headers{'headers'} =~ s/\n\s+/ /g; + # split headers in single lines + my $OtherHeaders; + for (split(/\n/,$Headers{'headers'})) { + # split header lines in header name and header content + my ($key,$value); + if ($_ =~ /:/) { + ($key,$value) = split(/:/,$_,2); + $key =~ s/\s*//; + $value =~ s/^\s*(.+)\s*$/$1/; + } else { + &Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s", + $Conf{'DBDatabase'}, $Conf{'DBTableRaw'}, + $Headers{'id'},$_)); + next; + } + # check for empty (mandatory) fields from DBTableRaw + # and set them from $Headers{'headers', if necessary + if (lc($key) =~ /^(message-id|path|newsgroups)$/) { + my $HeaderName = lc($key); + $HeaderName = 'mid' if ($HeaderName eq 'message-id'); + if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') { + $Headers{$HeaderName} = $value; + &Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.", + $HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'}, + $Headers{'id'})); + } + } + # save each header, separate database fields in %Headers, + # the rest in $OtherHeaders (but not Message-ID, Path, Peer + # and Newsgroups as those do already exist) + if (defined($DBFields{lc($key)})) { + $Headers{$DBFields{lc($key)}} = $value; + } else { + $OtherHeaders .= sprintf("%s: %s\n",$key,$value) + if lc($key) !~ /^(message-id|path|peer|newsgroups)$/; + } + } + # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders + chomp($OtherHeaders); + $Headers{'headers'} = $OtherHeaders; + + foreach ('from_','sender', 'replyto', 'subject') { + if ($Headers{$_}) { + my $HeaderName = $_; + $HeaderName =~ s/_$//; + # decode From: / Sender: / Reply-To: / Subject: + if ($Headers{$_} =~ /\?(B|Q)\?/) { + # check for legal encoding and decode + (my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/; + $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_}) + if (exists($LegalEncodings{$Encoding})); + } + # extract name(s) and mail(s) from From: / Sender: / Reply-To: + # in parsed form, if available + if ($_ ne 'subject') { + my @Address; + # start parser on header or parsed header + # @Address will have an array of Mail::Address objects, one for + # each name/mail (you can have more than one person in From:!) + if (defined($Headers{$HeaderName.'_parsed'})) { + @Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'}); + } else { + @Address = Mail::Address->parse($Headers{$_}); + } + # split each Mail::Address object to @Names and @Adresses + my (@Names,@Adresses); + foreach (@Address) { + # take address part in @Addresses + push (@Adresses, $_->address()); + # take name part form "phrase", if there is one: + # From: My Name (Comment) + # otherwise, take it from "comment": + # From: addr@ess (Comment) + # and push it in @Names + my ($Name); + $Name = $_->comment() unless $Name = $_->phrase; + $Name =~ s/^\((.+)\)$/$1/; + push (@Names, $Name); + } + # put all @Adresses and all @Names in %Headers as comma separated lists + $Headers{$HeaderName.'_address'} = join(', ',@Adresses); + $Headers{$HeaderName.'_name'} = join(', ',@Names); + } + } + } + + # order output for database entry: fill @SQLBindVars + print "-------------- Next entry:\n" if $OptDebug; + my @SQLBindVars; + foreach (@DBFields) { + if (defined($Headers{$_}) and $Headers{$_} ne '') { + push (@SQLBindVars,$Headers{$_}); + printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug; + } else { + push (@SQLBindVars,undef); + } + } + + # write data to DBTableParse + if (!$OptTest) { + print "-------------- Writing data ... -------------\n" if $OptDebug; + my $DBWrite = + $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)", + $Conf{'DBDatabase'}, + $Conf{'DBTableParse'}, + # get field names from @DBFields + join(', ',@DBFields), + # create a list of '?' for each DBField + join(', ', + split(/ /,'? ' x scalar(@DBFields))) + )); + $DBWrite->execute(@SQLBindVars) + or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); + $DBWrite->finish; + } +}; +$DBQuery->finish; + +### close handles +$DBHandle->disconnect; + +print "------------------- DONE! -------------------\n" if $OptDebug; +__END__ + +################################ Documentation ################################# + +=head1 NAME + +parsedb - parse raw data and save it to a database + +=head1 SYNOPSIS + +B [B<-Vht>] [B<--day> I | I] [B<--rawdb> I] [B<--parsedb> I] [B<--conffile> I] [B<--debug>] + +=head1 REQUIREMENTS + +See L. + +=head1 DESCRIPTION + +This script will parse raw, unstructured headers from a database table which is +fed from F for a given time period and write its results to +nother database table with separate fields (columns) for most (or even all) +relevant headers. + +I, I, I and I will be parsed from MIME +encoded words to UTF-8 as needed while the unparsed copy is kept. From that +parsed copy, I, I and I will also be split into +separate name(s) and address(es) fields while the un-splitted copy is kept, +too. + +B should be run nightly from cron for yesterdays data so all +other scripts get current information. The time period to act on defaults to +yesterday, accordingly; you can assign another time period or a single day via +the B<--day> option (see below). + +=head2 Configuration + +B will read its configuration from F +should be present in etc/ via Config::Auto or from a configuration file +submitted by the B<--conffile> option. + +See L for an overview of possible configuration options. + +You can override configuration options via the B<--rawdb> and +B<--parsedb> options, respectively. + +=head1 OPTIONS + +=over 3 + +=item B<-V>, B<--version> + +Print out version and copyright information and exit. + +=item B<-h>, B<--help> + +Print this man page and exit. + +=item B<--debug> + +Output (rather much) debugging information to STDOUT while processing. + +=item B<-t>, B<--test> + +Do not write results to database. You should use B<--debug> in +conjunction with B<--test> ... everything else seems a bit pointless. + +=item B<-d>, B<--day> I + +Set processing period to a single day in YYYY-MM-DD format or to a time +period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated +by a colon). + +Defaults to yesterday. + +=item B<--rawdb> I (raw data table) + +Override I from F. + +=item B<--parsedb> I
(parsed data table) + +Override I from F. + +=item B<--conffile> I + +Load configuration from I instead of F. + +=back + +=head1 INSTALLATION + +See L. + +=head1 EXAMPLES + +An example crontab entry: + + 0 1 * * * /path/to/bin/parsedb.pl + +Do a dry run for yesterday's data, showing results of processing: + + parsedb --debug --test | less + +=head1 FILES + +=over 4 + +=item F + +The script itself. + +=item F + +Library functions for the NewsStats package. + +=item F + +Runtime configuration file. + +=back + +=head1 BUGS + +Please report any bugs or feature requests to the author or use the +bug tracker at L! + +=head1 SEE ALSO + +=over 2 + +=item - + +L + +=item - + +L + +=back + +This script is part of the B package. + +=head1 AUTHOR + +Thomas Hochstein + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2013 Thomas Hochstein + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +=cut