5 # This script will parse a database with raw header information
6 # from a INN feed to a structured database.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
24 use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
27 use Getopt::Long qw(GetOptions);
28 Getopt::Long::config ('bundling');
30 use Encode qw/decode/;
33 ################################# Definitions ##################################
35 # define header names with separate database fields
36 my %DBFields = ('date' => 'date',
37 'references' => 'refs',
38 'followup-to' => 'fupto',
41 'reply-to' => 'replyto',
42 'subject' => 'subject',
43 'organization' => 'organization',
44 'lines' => 'linecount',
45 'approved' => 'approved',
46 'supersedes' => 'supersedes',
47 'expires' => 'expires',
48 'user-agent' => 'useragent',
49 'x-newsreader' => 'xnewsreader',
50 'x-mailer' => 'xmailer',
51 'x-no-archive' => 'xnoarchive',
52 'content-type' => 'contenttype',
53 'content-transfer-encoding' => 'contentencoding',
54 'cancel-lock' => 'cancellock',
55 'injection-info' => 'injectioninfo',
56 'x-trace' => 'xtrace',
57 'nntp-posting-host' => 'postinghost');
59 # define field list for database
60 my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
61 from_name from_address sender sender_parsed sender_name
62 sender_address replyto replyto_parsed replyto_name
63 replyto_address subject subject_parsed organization linecount
64 approved supersedes expires useragent xnewsreader xmailer
65 xnoarchive contenttype contentencoding cancellock injectioninfo
66 xtrace postinghost headers disregard/;
68 ################################# Main program #################################
70 ### read commandline options
71 my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
72 GetOptions ('d|day=s' => \$OptDay,
73 'debug!' => \$OptDebug,
74 'parsedb=s' => \$OptParseDB,
75 'rawdb=s' => \$OptRawDB,
76 't|test!' => \$OptTest,
77 'conffile=s' => \$OptConfFile,
78 'h|help' => \&ShowPOD,
79 'V|version' => \&ShowVersion) or exit 1;
81 ### read configuration
82 my %Conf = %{ReadConfig($OptConfFile)};
84 ### override configuration via commandline options
86 $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
87 $ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
88 &OverrideConfig(\%Conf,\%ConfOverride);
91 ### and set $Period for output and expression for SQL 'WHERE' clause
92 my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
93 # bail out if --month is invalid or "all"
94 &Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
95 "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
98 my $DBHandle = InitDB(\%Conf,1);
101 &Bleat(1,'Test mode. Database is not updated.') if $OptTest;
103 # create $SQLWhereClause
104 my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
106 # delete old data for current period
108 print "----------- Deleting old data ... -----------\n" if $OptDebug;
109 my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
110 $Conf{'DBDatabase'},$Conf{'DBTableParse'},
112 or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
113 "$DBI::errstr\n",$Period,
114 $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
117 # read from DBTableRaw
118 print "-------------- Reading data ... -------------\n" if $OptDebug;
119 my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
120 "newsgroups, headers, disregard ".
121 "FROM %s.%s %s", $Conf{'DBDatabase'},
122 $Conf{'DBTableRaw'}, $SQLWhereClause));
124 or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
125 "$DBI::errstr\n",$Period,
126 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
128 # set output and database connection to UTF-8
129 # as we're going to write decoded header contents containing UTF-8 chars
130 binmode(STDOUT, ":utf8");
131 $DBHandle->do("SET NAMES 'utf8'");
133 # parse data in a loop and write it out
134 print "-------------- Parsing data ... -------------\n" if $OptDebug;
135 while (my $HeadersR = $DBQuery->fetchrow_hashref) {
136 my %Headers = %{$HeadersR};
138 # parse $Headers{'headers'} ('headers' from DBTableRaw)
139 # merge continuation lines
140 # from Perl Cookbook, 1st German ed. 1999, pg. 91
141 $Headers{'headers'} =~ s/\n\s+/ /g;
142 # split headers in single lines
144 for (split(/\n/,$Headers{'headers'})) {
145 # split header lines in header name and header content
146 my ($key,$value) = split(/:/,$_,2);
148 $value =~ s/^\s*(.+)\s*$/$1/;
149 # save each header, separate database fields in %Headers,
150 # the rest in $OtherHeaders (but not Message-ID, Path, Peer
151 # and Newsgroups as those do already exist)
152 if (defined($DBFields{lc($key)})) {
153 $Headers{$DBFields{lc($key)}} = $value;
155 $OtherHeaders .= sprintf("%s: %s\n",$key,$value)
156 if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
159 # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
160 chomp($OtherHeaders);
161 $Headers{'headers'} = $OtherHeaders;
163 foreach ('from_','sender', 'replyto', 'subject') {
166 $HeaderName =~ s/_$//;
167 # decode From: / Sender: / Reply-To: / Subject:
168 if ($Headers{$_} =~ /\?(B|Q)\?/) {
169 $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_});
171 # extract name(s) and mail(s) from From: / Sender: / Reply-To:
172 # in parsed form, if available
173 if ($_ ne 'subject') {
175 # start parser on header or parsed header
176 # @Address will have an array of Mail::Address objects, one for
177 # each name/mail (you can have more than one person in From:!)
178 if (defined($Headers{$HeaderName.'_parsed'})) {
179 @Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'});
181 @Address = Mail::Address->parse($Headers{$_});
183 # split each Mail::Address object to @Names and @Adresses
184 my (@Names,@Adresses);
186 # take address part in @Addresses
187 push (@Adresses, $_->address());
188 # take name part form "phrase", if there is one:
189 # From: My Name <addr@ess> (Comment)
190 # otherwise, take it from "comment":
191 # From: addr@ess (Comment)
192 # and push it in @Names
194 $Name = $_->comment() unless $Name = $_->phrase;
195 $Name =~ s/^\((.+)\)$/$1/;
196 push (@Names, $Name);
198 # put all @Adresses and all @Names in %Headers as comma separated lists
199 $Headers{$HeaderName.'_address'} = join(', ',@Adresses);
200 $Headers{$HeaderName.'_name'} = join(', ',@Names);
205 # order output for database entry: fill @SQLBindVars
206 print "-------------- Next entry:\n" if $OptDebug;
208 foreach (@DBFields) {
209 if (defined($Headers{$_}) and $Headers{$_} ne '') {
210 push (@SQLBindVars,$Headers{$_});
211 printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
213 push (@SQLBindVars,undef);
217 # write data to DBTableParse
219 print "-------------- Writing data ... -------------\n" if $OptDebug;
221 $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
223 $Conf{'DBTableParse'},
224 # get field names from @DBFields
225 join(', ',@DBFields),
226 # create a list of '?' for each DBField
228 split(/ /,'? ' x scalar(@DBFields)))
230 $DBWrite->execute(@SQLBindVars)
231 or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
232 "$DBI::errstr\n",$Period,
233 $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
240 $DBHandle->disconnect;
242 print "------------------- DONE! -------------------\n" if $OptDebug;
245 ################################ Documentation #################################
249 parsedb - parse raw data and save it to a database
253 B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
261 This script will parse raw, unstructured headers from a database table which is
262 fed from F<feedlog.pl> for a given time period and write its results to
263 nother database table with separate fields (columns) for most (or even all)
266 I<Subject:>, I<From:>, I<Sender:> and I<Reply-To:> will be parsed from MIME
267 encoded words to UTF-8 as needed while the unparsed copy is kept. From that
268 parsed copy, I<From:>, I<Sender:> and I<Reply-To:> will also be split into
269 separate name(s) and address(es) fields while the un-splitted copy is kept,
272 B<parsedb> should be run nightly from cron for yesterdays data so all
273 other scripts get current information. The time period to act on defaults to
274 yesterday, accordingly; you can assign another time period or a single day via
275 the B<--day> option (see below).
279 B<parsedb> will read its configuration from F<newsstats.conf>
280 should be present in etc/ via Config::Auto or from a configuration file
281 submitted by the B<--conffile> option.
283 See L<doc/INSTALL> for an overview of possible configuration options.
285 You can override configuration options via the B<--rawdb> and
286 B<--parsedb> options, respectively.
292 =item B<-V>, B<--version>
294 Print out version and copyright information and exit.
296 =item B<-h>, B<--help>
298 Print this man page and exit.
302 Output (rather much) debugging information to STDOUT while processing.
304 =item B<-t>, B<--test>
306 Do not write results to database. You should use B<--debug> in
307 conjunction with B<--test> ... everything else seems a bit pointless.
309 =item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
311 Set processing period to a single day in YYYY-MM-DD format or to a time
312 period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
315 Defaults to yesterday.
317 =item B<--rawdb> I<table> (raw data table)
319 Override I<DBTableRaw> from F<newsstats.conf>.
321 =item B<--parsedb> I<table> (parsed data table)
323 Override I<DBTableParse> from F<newsstats.conf>.
325 =item B<--conffile> I<filename>
327 Load configuration from I<filename> instead of F<newsstats.conf>.
337 An example crontab entry:
339 0 1 * * * /path/to/bin/parsedb.pl
341 Do a dry run for yesterday's data, showing results of processing:
343 parsedb --debug --test | less
349 =item F<bin/parsedb.pl>
353 =item F<lib/NewsStats.pm>
355 Library functions for the NewsStats package.
357 =item F<etc/newsstats.conf>
359 Runtime configuration file.
365 Please report any bugs or feature requests to the author or use the
366 bug tracker at L<http://bugs.th-h.de/>!
382 This script is part of the B<NewsStats> package.
386 Thomas Hochstein <thh@inter.net>
388 =head1 COPYRIGHT AND LICENSE
390 Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
392 This program is free software; you may redistribute it and/or modify it
393 under the same terms as Perl itself.