X-Git-Url: https://code.th-h.de/?p=usenet%2Fnewsstats.git;a=blobdiff_plain;f=bin%2Fparsedb.pl;fp=bin%2Fparsedb.pl;h=10a1a5d02bdea2adcb1b955b5258deb3b25ca762;hp=0000000000000000000000000000000000000000;hb=6d72dad2c0b70499877bfa844d378fb0ecb58322;hpb=3634010808edf7421d7034e94b8f7928dfadd059 diff --git a/bin/parsedb.pl b/bin/parsedb.pl new file mode 100755 index 0000000..10a1a5d --- /dev/null +++ b/bin/parsedb.pl @@ -0,0 +1,323 @@ +#! /usr/bin/perl +# +# parsedb.pl +# +# This script will parse a database with raw header information +# from a INN feed to a structured database. +# +# It is part of the NewsStats package. +# +# Copyright (c) 2013 Thomas Hochstein +# +# It can be redistributed and/or modified under the same terms under +# which Perl itself is published. + +BEGIN { + our $VERSION = "0.01"; + use File::Basename; + # we're in .../bin, so our module is in ../lib + push(@INC, dirname($0).'/../lib'); +} +use strict; +use warnings; + +use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper); + +use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); + +################################# Definitions ################################## + +# define header names with separate database fields +my %DBFields = ('date' => 'date', + 'references' => 'refs', + 'followup-to' => 'fupto', + 'from' => 'from_', + 'sender' => 'sender', + 'reply-to' => 'replyto', + 'subject' => 'subject', + 'organization' => 'organization', + 'lines' => 'linecount', + 'approved' => 'approved', + 'supersedes' => 'supersedes', + 'expires' => 'expires', + 'user-agent' => 'useragent', + 'x-newsreader' => 'xnewsreader', + 'x-mailer' => 'xmailer', + 'x-no-archive' => 'xnoarchive', + 'content-type' => 'contenttype', + 'content-transfer-encoding' => 'contentencoding', + 'cancel-lock' => 'cancellock', + 'injection-info' => 'injectioninfo', + 'x-trace' => 'xtrace', + 'nntp-posting-host' => 'postinghost'); + +# define field list for database +my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed + from_name from_address sender sender_parsed sender_name + sender_address replyto replyto_parsed replyto_name + replyto_address subject subject_parsed organization linecount + approved supersedes expires useragent xnewsreader xmailer + xnoarchive contenttype contentencoding cancellock injectioninfo + xtrace postinghost headers disregard/; + +################################# Main program ################################# + +### read commandline options +my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile); +GetOptions ('d|day=s' => \$OptDay, + 'debug!' => \$OptDebug, + 'parsedb=s' => \$OptParseDB, + 'rawdb=s' => \$OptRawDB, + 't|test!' => \$OptTest, + 'conffile=s' => \$OptConfFile, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; + +### read configuration +my %Conf = %{ReadConfig($OptConfFile)}; + +### override configuration via commandline options +my %ConfOverride; +$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB; +&OverrideConfig(\%Conf,\%ConfOverride); + +### get time period +### and set $Period for output and expression for SQL 'WHERE' clause +my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day'); +# bail out if --month is invalid or "all" +&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ". + "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time'); + +### init database +my $DBHandle = InitDB(\%Conf,1); + +### get & write data +&Bleat(1,'Test mode. Database is not updated.') if $OptTest; + +# create $SQLWhereClause +my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard'); + +# delete old data for current period +if (!$OptTest) { + print "----------- Deleting old data ... -----------\n" if $OptDebug; + my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s", + $Conf{'DBDatabase'},$Conf{'DBTableParse'}, + $SQLWhereClause)) + or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); +}; + +# read from DBTableRaw +print "-------------- Reading data ... -------------\n" if $OptDebug; +my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ". + "newsgroups, headers, disregard ". + "FROM %s.%s %s", $Conf{'DBDatabase'}, + $Conf{'DBTableRaw'}, $SQLWhereClause)); +$DBQuery->execute() + or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + +# set output and database connection to UTF-8 +# as we're going to write decoded header contents containing UTF-8 chars +binmode(STDOUT, ":utf8"); +$DBHandle->do("SET NAMES 'utf8'"); + +# parse data in a loop and write it out +print "-------------- Parsing data ... -------------\n" if $OptDebug; +while (my $HeadersR = $DBQuery->fetchrow_hashref) { + my %Headers = %{$HeadersR}; + + # parse $Headers{'headers'} ('headers' from DBTableRaw) + # merge continuation lines + # from Perl Cookbook, 1st German ed. 1999, pg. 91 + $Headers{'headers'} =~ s/\n\s+/ /g; + # split headers in single lines + my $OtherHeaders; + for (split(/\n/,$Headers{'headers'})) { + # split header lines in header name and header content + my ($key,$value) = split(/:/,$_,2); + $key =~ s/\s*//; + $value =~ s/^\s*(.+)\s*$/$1/; + # save each header, separate database fields in %Headers, + # the rest in $OtherHeaders (but not Message-ID, Path, Peer + # and Newsgroups as those do already exist) + if (defined($DBFields{lc($key)})) { + $Headers{$DBFields{lc($key)}} = $value; + } else { + $OtherHeaders .= sprintf("%s: %s\n",$key,$value) + if lc($key) !~ /^(message-id|path|peer|newsgroups)$/; + } + } + # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders + chomp($OtherHeaders); + $Headers{'headers'} = $OtherHeaders; + + # order output for database entry: fill @SQLBindVars + print "-------------- Next entry:\n" if $OptDebug; + my @SQLBindVars; + foreach (@DBFields) { + if (defined($Headers{$_}) and $Headers{$_} ne '') { + push (@SQLBindVars,$Headers{$_}); + printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug; + } else { + push (@SQLBindVars,undef); + } + } + + # write data to DBTableParse + if (!$OptTest) { + print "-------------- Writing data ... -------------\n" if $OptDebug; + my $DBWrite = + $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)", + $Conf{'DBDatabase'}, + $Conf{'DBTableParse'}, + # get field names from @DBFields + join(', ',@DBFields), + # create a list of '?' for each DBField + join(', ', + split(/ /,'? ' x scalar(@DBFields))) + )); + $DBWrite->execute(@SQLBindVars) + or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); + $DBWrite->finish; + } +}; +$DBQuery->finish; + +### close handles +$DBHandle->disconnect; + +print "------------------- DONE! -------------------\n" if $OptDebug; +__END__ + +################################ Documentation ################################# + +=head1 NAME + +parsedb - parse raw data and save it to a database + +=head1 SYNOPSIS + +B [B<-Vht>] [B<--day> I | I] [B<--rawdb> I] [B<--parsedb> I] [B<--conffile> I] [B<--debug>] + +=head1 REQUIREMENTS + +See L. + +=head1 DESCRIPTION + +... + +=head2 Configuration + +... + +=head1 OPTIONS + +=over 3 + +=item B<-V>, B<--version> + +Print out version and copyright information and exit. + +=item B<-h>, B<--help> + +Print this man page and exit. + +=item B<--debug> + +Output (rather much) debugging information to STDOUT while processing. + +=item B<-t>, B<--test> + +Do not write results to database. You should use B<--debug> in +conjunction with B<--test> ... everything else seems a bit pointless. + +=item B<-d>, B<--day> I + +Set processing period to a single day in YYYY-MM-DD format or to a time +period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated +by a colon). + +Defaults to yesterday. + +=item B<--rawdb> I (raw data table) + +Override I from F. + +=item B<--parsedb> I
(parsed data table) + +Override I from F. + +=item B<--conffile> I + +Load configuration from I instead of F. + +=back + +=head1 INSTALLATION + +See L. + +=head1 EXAMPLES + +... + +=head1 FILES + +=over 4 + +=item F + +The script itself. + +=item F + +Library functions for the NewsStats package. + +=item F + +Runtime configuration file. + +=back + +=head1 BUGS + +Please report any bugs or feature requests to the author or use the +bug tracker at L! + +=head1 SEE ALSO + +=over 2 + +=item - + +L + +=item - + +L + +=back + +This script is part of the B package. + +=head1 AUTHOR + +Thomas Hochstein + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2013 Thomas Hochstein + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +=cut