Forcibly decode headers with unencoded 8bit chars.
[usenet/newsstats.git] / bin / parsedb.pl
CommitLineData
6d72dad2
TH
1#! /usr/bin/perl
2#
3# parsedb.pl
4#
5# This script will parse a database with raw header information
6# from a INN feed to a structured database.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
20}
21use strict;
22use warnings;
23
24use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
25
26use DBI;
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
29
9630376c
TH
30use Encode qw/decode/;
31use Mail::Address;
32
6d72dad2
TH
33################################# Definitions ##################################
34
35# define header names with separate database fields
36my %DBFields = ('date' => 'date',
37 'references' => 'refs',
38 'followup-to' => 'fupto',
39 'from' => 'from_',
40 'sender' => 'sender',
41 'reply-to' => 'replyto',
42 'subject' => 'subject',
43 'organization' => 'organization',
44 'lines' => 'linecount',
45 'approved' => 'approved',
46 'supersedes' => 'supersedes',
47 'expires' => 'expires',
48 'user-agent' => 'useragent',
49 'x-newsreader' => 'xnewsreader',
50 'x-mailer' => 'xmailer',
51 'x-no-archive' => 'xnoarchive',
52 'content-type' => 'contenttype',
53 'content-transfer-encoding' => 'contentencoding',
54 'cancel-lock' => 'cancellock',
55 'injection-info' => 'injectioninfo',
56 'x-trace' => 'xtrace',
57 'nntp-posting-host' => 'postinghost');
58
59# define field list for database
60my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
61 from_name from_address sender sender_parsed sender_name
62 sender_address replyto replyto_parsed replyto_name
63 replyto_address subject subject_parsed organization linecount
64 approved supersedes expires useragent xnewsreader xmailer
65 xnoarchive contenttype contentencoding cancellock injectioninfo
66 xtrace postinghost headers disregard/;
67
68################################# Main program #################################
69
70### read commandline options
71my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
72GetOptions ('d|day=s' => \$OptDay,
73 'debug!' => \$OptDebug,
74 'parsedb=s' => \$OptParseDB,
75 'rawdb=s' => \$OptRawDB,
76 't|test!' => \$OptTest,
77 'conffile=s' => \$OptConfFile,
78 'h|help' => \&ShowPOD,
79 'V|version' => \&ShowVersion) or exit 1;
80
81### read configuration
82my %Conf = %{ReadConfig($OptConfFile)};
83
84### override configuration via commandline options
85my %ConfOverride;
86$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
87$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
88&OverrideConfig(\%Conf,\%ConfOverride);
89
90### get time period
91### and set $Period for output and expression for SQL 'WHERE' clause
92my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
93# bail out if --month is invalid or "all"
94&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
95 "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
96
97### init database
98my $DBHandle = InitDB(\%Conf,1);
99
100### get & write data
101&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
102
103# create $SQLWhereClause
104my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
105
106# delete old data for current period
107if (!$OptTest) {
108 print "----------- Deleting old data ... -----------\n" if $OptDebug;
109 my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
110 $Conf{'DBDatabase'},$Conf{'DBTableParse'},
111 $SQLWhereClause))
112 or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
113 "$DBI::errstr\n",$Period,
114 $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
115};
116
117# read from DBTableRaw
118print "-------------- Reading data ... -------------\n" if $OptDebug;
119my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
120 "newsgroups, headers, disregard ".
121 "FROM %s.%s %s", $Conf{'DBDatabase'},
122 $Conf{'DBTableRaw'}, $SQLWhereClause));
123$DBQuery->execute()
124 or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
125 "$DBI::errstr\n",$Period,
126 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
127
128# set output and database connection to UTF-8
129# as we're going to write decoded header contents containing UTF-8 chars
130binmode(STDOUT, ":utf8");
131$DBHandle->do("SET NAMES 'utf8'");
132
48c8d4bb
TH
133# create a list of supported encondings
134my %LegalEncodings;
135foreach (Encode->encodings()) {
136 $LegalEncodings{$_} = 1;
137}
6d72dad2
TH
138# parse data in a loop and write it out
139print "-------------- Parsing data ... -------------\n" if $OptDebug;
140while (my $HeadersR = $DBQuery->fetchrow_hashref) {
141 my %Headers = %{$HeadersR};
142
143 # parse $Headers{'headers'} ('headers' from DBTableRaw)
48c8d4bb
TH
144 # remove empty lines (that should not even exist in a header!)
145 $Headers{'headers'} =~ s/\n\s*\n/\n/g;
6d72dad2
TH
146 # merge continuation lines
147 # from Perl Cookbook, 1st German ed. 1999, pg. 91
148 $Headers{'headers'} =~ s/\n\s+/ /g;
149 # split headers in single lines
150 my $OtherHeaders;
151 for (split(/\n/,$Headers{'headers'})) {
152 # split header lines in header name and header content
48c8d4bb
TH
153 my ($key,$value);
154 if ($_ =~ /:/) {
155 ($key,$value) = split(/:/,$_,2);
156 $key =~ s/\s*//;
157 $value =~ s/^\s*(.+)\s*$/$1/;
158 } else {
159 &Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s",
160 $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
161 $Headers{'id'},$_));
162 next;
163 }
164 # check for empty (mandatory) fields from DBTableRaw
165 # and set them from $Headers{'headers', if necessary
166 if (lc($key) =~ /^(message-id|path|newsgroups)$/) {
167 my $HeaderName = lc($key);
168 $HeaderName = 'mid' if ($HeaderName eq 'message-id');
169 if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') {
170 $Headers{$HeaderName} = $value;
171 &Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.",
172 $HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
173 $Headers{'id'}));
174 }
175 }
6d72dad2
TH
176 # save each header, separate database fields in %Headers,
177 # the rest in $OtherHeaders (but not Message-ID, Path, Peer
178 # and Newsgroups as those do already exist)
179 if (defined($DBFields{lc($key)})) {
180 $Headers{$DBFields{lc($key)}} = $value;
181 } else {
182 $OtherHeaders .= sprintf("%s: %s\n",$key,$value)
183 if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
184 }
185 }
186 # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
187 chomp($OtherHeaders);
188 $Headers{'headers'} = $OtherHeaders;
189
9630376c
TH
190 foreach ('from_','sender', 'replyto', 'subject') {
191 if ($Headers{$_}) {
192 my $HeaderName = $_;
193 $HeaderName =~ s/_$//;
194 # decode From: / Sender: / Reply-To: / Subject:
195 if ($Headers{$_} =~ /\?(B|Q)\?/) {
48c8d4bb
TH
196 # check for legal encoding and decode
197 (my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/;
198 $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_})
199 if (exists($LegalEncodings{$Encoding}));
9630376c 200 }
b99d4101
TH
201 # forcibly modify headers with un-encoded 8bit data assuming utf-8
202 # TODO: try to guess correct enconding
203 elsif ($Headers{$_} =~ /[^\x00-\x7F]/) {
204 $Headers{$_} = decode('utf-8',$Headers{$_});
205 }
9630376c
TH
206 # extract name(s) and mail(s) from From: / Sender: / Reply-To:
207 # in parsed form, if available
208 if ($_ ne 'subject') {
209 my @Address;
210 # start parser on header or parsed header
211 # @Address will have an array of Mail::Address objects, one for
212 # each name/mail (you can have more than one person in From:!)
213 if (defined($Headers{$HeaderName.'_parsed'})) {
214 @Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'});
215 } else {
216 @Address = Mail::Address->parse($Headers{$_});
217 }
aef5467b
TH
218 # split each Mail::Address object to @Names and @Adresses
219 my (@Names,@Adresses);
9630376c 220 foreach (@Address) {
aef5467b
TH
221 # take address part in @Addresses
222 push (@Adresses, $_->address());
9630376c
TH
223 # take name part form "phrase", if there is one:
224 # From: My Name <addr@ess> (Comment)
225 # otherwise, take it from "comment":
226 # From: addr@ess (Comment)
aef5467b
TH
227 # and push it in @Names
228 my ($Name);
229 $Name = $_->comment() unless $Name = $_->phrase;
230 $Name =~ s/^\((.+)\)$/$1/;
231 push (@Names, $Name);
9630376c 232 }
aef5467b
TH
233 # put all @Adresses and all @Names in %Headers as comma separated lists
234 $Headers{$HeaderName.'_address'} = join(', ',@Adresses);
235 $Headers{$HeaderName.'_name'} = join(', ',@Names);
9630376c
TH
236 }
237 }
238 }
239
6d72dad2
TH
240 # order output for database entry: fill @SQLBindVars
241 print "-------------- Next entry:\n" if $OptDebug;
242 my @SQLBindVars;
243 foreach (@DBFields) {
244 if (defined($Headers{$_}) and $Headers{$_} ne '') {
245 push (@SQLBindVars,$Headers{$_});
246 printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
247 } else {
248 push (@SQLBindVars,undef);
249 }
250 }
251
252 # write data to DBTableParse
253 if (!$OptTest) {
254 print "-------------- Writing data ... -------------\n" if $OptDebug;
255 my $DBWrite =
256 $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
257 $Conf{'DBDatabase'},
258 $Conf{'DBTableParse'},
259 # get field names from @DBFields
260 join(', ',@DBFields),
261 # create a list of '?' for each DBField
262 join(', ',
263 split(/ /,'? ' x scalar(@DBFields)))
264 ));
265 $DBWrite->execute(@SQLBindVars)
6deb7dba 266 or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s for %s: ".
6d72dad2 267 "$DBI::errstr\n",$Period,
6deb7dba 268 $Conf{'DBDatabase'},$Conf{'DBTableParse'}, $Headers{'mid'}));
6d72dad2
TH
269 $DBWrite->finish;
270 }
271};
272$DBQuery->finish;
273
274### close handles
275$DBHandle->disconnect;
276
277print "------------------- DONE! -------------------\n" if $OptDebug;
278__END__
279
280################################ Documentation #################################
281
282=head1 NAME
283
284parsedb - parse raw data and save it to a database
285
286=head1 SYNOPSIS
287
288B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
289
290=head1 REQUIREMENTS
291
292See L<doc/README>.
293
294=head1 DESCRIPTION
295
13e00610
TH
296This script will parse raw, unstructured headers from a database table which is
297fed from F<feedlog.pl> for a given time period and write its results to
298nother database table with separate fields (columns) for most (or even all)
299relevant headers.
300
301I<Subject:>, I<From:>, I<Sender:> and I<Reply-To:> will be parsed from MIME
302encoded words to UTF-8 as needed while the unparsed copy is kept. From that
303parsed copy, I<From:>, I<Sender:> and I<Reply-To:> will also be split into
304separate name(s) and address(es) fields while the un-splitted copy is kept,
305too.
306
307B<parsedb> should be run nightly from cron for yesterdays data so all
308other scripts get current information. The time period to act on defaults to
309yesterday, accordingly; you can assign another time period or a single day via
310the B<--day> option (see below).
6d72dad2
TH
311
312=head2 Configuration
313
13e00610
TH
314B<parsedb> will read its configuration from F<newsstats.conf>
315should be present in etc/ via Config::Auto or from a configuration file
316submitted by the B<--conffile> option.
317
318See L<doc/INSTALL> for an overview of possible configuration options.
319
320You can override configuration options via the B<--rawdb> and
321B<--parsedb> options, respectively.
6d72dad2
TH
322
323=head1 OPTIONS
324
325=over 3
326
327=item B<-V>, B<--version>
328
329Print out version and copyright information and exit.
330
331=item B<-h>, B<--help>
332
333Print this man page and exit.
334
335=item B<--debug>
336
337Output (rather much) debugging information to STDOUT while processing.
338
339=item B<-t>, B<--test>
340
341Do not write results to database. You should use B<--debug> in
342conjunction with B<--test> ... everything else seems a bit pointless.
343
344=item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
345
346Set processing period to a single day in YYYY-MM-DD format or to a time
347period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
348by a colon).
349
350Defaults to yesterday.
351
352=item B<--rawdb> I<table> (raw data table)
353
354Override I<DBTableRaw> from F<newsstats.conf>.
355
356=item B<--parsedb> I<table> (parsed data table)
357
358Override I<DBTableParse> from F<newsstats.conf>.
359
360=item B<--conffile> I<filename>
361
362Load configuration from I<filename> instead of F<newsstats.conf>.
363
364=back
365
366=head1 INSTALLATION
367
368See L<doc/INSTALL>.
369
370=head1 EXAMPLES
371
13e00610
TH
372An example crontab entry:
373
374 0 1 * * * /path/to/bin/parsedb.pl
375
376Do a dry run for yesterday's data, showing results of processing:
377
378 parsedb --debug --test | less
6d72dad2
TH
379
380=head1 FILES
381
382=over 4
383
384=item F<bin/parsedb.pl>
385
386The script itself.
387
388=item F<lib/NewsStats.pm>
389
390Library functions for the NewsStats package.
391
392=item F<etc/newsstats.conf>
393
394Runtime configuration file.
395
396=back
397
398=head1 BUGS
399
400Please report any bugs or feature requests to the author or use the
401bug tracker at L<http://bugs.th-h.de/>!
402
403=head1 SEE ALSO
404
405=over 2
406
407=item -
408
409L<doc/README>
410
411=item -
412
413L<doc/INSTALL>
414
415=back
416
417This script is part of the B<NewsStats> package.
418
419=head1 AUTHOR
420
421Thomas Hochstein <thh@inter.net>
422
423=head1 COPYRIGHT AND LICENSE
424
425Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
426
427This program is free software; you may redistribute it and/or modify it
428under the same terms as Perl itself.
429
430=cut
This page took 0.031268 seconds and 4 git commands to generate.