Create a database table with parsed raw data.
[usenet/newsstats.git] / bin / parsedb.pl
CommitLineData
6d72dad2
TH
1#! /usr/bin/perl
2#
3# parsedb.pl
4#
5# This script will parse a database with raw header information
6# from a INN feed to a structured database.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
20}
21use strict;
22use warnings;
23
24use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
25
26use DBI;
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
29
30################################# Definitions ##################################
31
32# define header names with separate database fields
33my %DBFields = ('date' => 'date',
34 'references' => 'refs',
35 'followup-to' => 'fupto',
36 'from' => 'from_',
37 'sender' => 'sender',
38 'reply-to' => 'replyto',
39 'subject' => 'subject',
40 'organization' => 'organization',
41 'lines' => 'linecount',
42 'approved' => 'approved',
43 'supersedes' => 'supersedes',
44 'expires' => 'expires',
45 'user-agent' => 'useragent',
46 'x-newsreader' => 'xnewsreader',
47 'x-mailer' => 'xmailer',
48 'x-no-archive' => 'xnoarchive',
49 'content-type' => 'contenttype',
50 'content-transfer-encoding' => 'contentencoding',
51 'cancel-lock' => 'cancellock',
52 'injection-info' => 'injectioninfo',
53 'x-trace' => 'xtrace',
54 'nntp-posting-host' => 'postinghost');
55
56# define field list for database
57my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
58 from_name from_address sender sender_parsed sender_name
59 sender_address replyto replyto_parsed replyto_name
60 replyto_address subject subject_parsed organization linecount
61 approved supersedes expires useragent xnewsreader xmailer
62 xnoarchive contenttype contentencoding cancellock injectioninfo
63 xtrace postinghost headers disregard/;
64
65################################# Main program #################################
66
67### read commandline options
68my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
69GetOptions ('d|day=s' => \$OptDay,
70 'debug!' => \$OptDebug,
71 'parsedb=s' => \$OptParseDB,
72 'rawdb=s' => \$OptRawDB,
73 't|test!' => \$OptTest,
74 'conffile=s' => \$OptConfFile,
75 'h|help' => \&ShowPOD,
76 'V|version' => \&ShowVersion) or exit 1;
77
78### read configuration
79my %Conf = %{ReadConfig($OptConfFile)};
80
81### override configuration via commandline options
82my %ConfOverride;
83$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
84$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
85&OverrideConfig(\%Conf,\%ConfOverride);
86
87### get time period
88### and set $Period for output and expression for SQL 'WHERE' clause
89my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
90# bail out if --month is invalid or "all"
91&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
92 "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
93
94### init database
95my $DBHandle = InitDB(\%Conf,1);
96
97### get & write data
98&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
99
100# create $SQLWhereClause
101my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
102
103# delete old data for current period
104if (!$OptTest) {
105 print "----------- Deleting old data ... -----------\n" if $OptDebug;
106 my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
107 $Conf{'DBDatabase'},$Conf{'DBTableParse'},
108 $SQLWhereClause))
109 or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
110 "$DBI::errstr\n",$Period,
111 $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
112};
113
114# read from DBTableRaw
115print "-------------- Reading data ... -------------\n" if $OptDebug;
116my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
117 "newsgroups, headers, disregard ".
118 "FROM %s.%s %s", $Conf{'DBDatabase'},
119 $Conf{'DBTableRaw'}, $SQLWhereClause));
120$DBQuery->execute()
121 or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
122 "$DBI::errstr\n",$Period,
123 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
124
125# set output and database connection to UTF-8
126# as we're going to write decoded header contents containing UTF-8 chars
127binmode(STDOUT, ":utf8");
128$DBHandle->do("SET NAMES 'utf8'");
129
130# parse data in a loop and write it out
131print "-------------- Parsing data ... -------------\n" if $OptDebug;
132while (my $HeadersR = $DBQuery->fetchrow_hashref) {
133 my %Headers = %{$HeadersR};
134
135 # parse $Headers{'headers'} ('headers' from DBTableRaw)
136 # merge continuation lines
137 # from Perl Cookbook, 1st German ed. 1999, pg. 91
138 $Headers{'headers'} =~ s/\n\s+/ /g;
139 # split headers in single lines
140 my $OtherHeaders;
141 for (split(/\n/,$Headers{'headers'})) {
142 # split header lines in header name and header content
143 my ($key,$value) = split(/:/,$_,2);
144 $key =~ s/\s*//;
145 $value =~ s/^\s*(.+)\s*$/$1/;
146 # save each header, separate database fields in %Headers,
147 # the rest in $OtherHeaders (but not Message-ID, Path, Peer
148 # and Newsgroups as those do already exist)
149 if (defined($DBFields{lc($key)})) {
150 $Headers{$DBFields{lc($key)}} = $value;
151 } else {
152 $OtherHeaders .= sprintf("%s: %s\n",$key,$value)
153 if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
154 }
155 }
156 # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
157 chomp($OtherHeaders);
158 $Headers{'headers'} = $OtherHeaders;
159
160 # order output for database entry: fill @SQLBindVars
161 print "-------------- Next entry:\n" if $OptDebug;
162 my @SQLBindVars;
163 foreach (@DBFields) {
164 if (defined($Headers{$_}) and $Headers{$_} ne '') {
165 push (@SQLBindVars,$Headers{$_});
166 printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
167 } else {
168 push (@SQLBindVars,undef);
169 }
170 }
171
172 # write data to DBTableParse
173 if (!$OptTest) {
174 print "-------------- Writing data ... -------------\n" if $OptDebug;
175 my $DBWrite =
176 $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
177 $Conf{'DBDatabase'},
178 $Conf{'DBTableParse'},
179 # get field names from @DBFields
180 join(', ',@DBFields),
181 # create a list of '?' for each DBField
182 join(', ',
183 split(/ /,'? ' x scalar(@DBFields)))
184 ));
185 $DBWrite->execute(@SQLBindVars)
186 or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
187 "$DBI::errstr\n",$Period,
188 $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
189 $DBWrite->finish;
190 }
191};
192$DBQuery->finish;
193
194### close handles
195$DBHandle->disconnect;
196
197print "------------------- DONE! -------------------\n" if $OptDebug;
198__END__
199
200################################ Documentation #################################
201
202=head1 NAME
203
204parsedb - parse raw data and save it to a database
205
206=head1 SYNOPSIS
207
208B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
209
210=head1 REQUIREMENTS
211
212See L<doc/README>.
213
214=head1 DESCRIPTION
215
216...
217
218=head2 Configuration
219
220...
221
222=head1 OPTIONS
223
224=over 3
225
226=item B<-V>, B<--version>
227
228Print out version and copyright information and exit.
229
230=item B<-h>, B<--help>
231
232Print this man page and exit.
233
234=item B<--debug>
235
236Output (rather much) debugging information to STDOUT while processing.
237
238=item B<-t>, B<--test>
239
240Do not write results to database. You should use B<--debug> in
241conjunction with B<--test> ... everything else seems a bit pointless.
242
243=item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
244
245Set processing period to a single day in YYYY-MM-DD format or to a time
246period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
247by a colon).
248
249Defaults to yesterday.
250
251=item B<--rawdb> I<table> (raw data table)
252
253Override I<DBTableRaw> from F<newsstats.conf>.
254
255=item B<--parsedb> I<table> (parsed data table)
256
257Override I<DBTableParse> from F<newsstats.conf>.
258
259=item B<--conffile> I<filename>
260
261Load configuration from I<filename> instead of F<newsstats.conf>.
262
263=back
264
265=head1 INSTALLATION
266
267See L<doc/INSTALL>.
268
269=head1 EXAMPLES
270
271...
272
273=head1 FILES
274
275=over 4
276
277=item F<bin/parsedb.pl>
278
279The script itself.
280
281=item F<lib/NewsStats.pm>
282
283Library functions for the NewsStats package.
284
285=item F<etc/newsstats.conf>
286
287Runtime configuration file.
288
289=back
290
291=head1 BUGS
292
293Please report any bugs or feature requests to the author or use the
294bug tracker at L<http://bugs.th-h.de/>!
295
296=head1 SEE ALSO
297
298=over 2
299
300=item -
301
302L<doc/README>
303
304=item -
305
306L<doc/INSTALL>
307
308=back
309
310This script is part of the B<NewsStats> package.
311
312=head1 AUTHOR
313
314Thomas Hochstein <thh@inter.net>
315
316=head1 COPYRIGHT AND LICENSE
317
318Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
319
320This program is free software; you may redistribute it and/or modify it
321under the same terms as Perl itself.
322
323=cut
This page took 0.023988 seconds and 4 git commands to generate.