Be more fault-tolerant when reading checkgroups.
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from a INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
ad609792 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
2832c235
TH
23
24use DBI;
880c3eb2
TH
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
2832c235
TH
27
28################################# Definitions ##################################
29
30# define types of information that can be gathered
31# all / groups (/ clients / hosts)
880c3eb2
TH
32my %LegalStats;
33@LegalStats{('all','groups')} = ();
2832c235
TH
34
35################################# Main program #################################
36
37### read commandline options
880c3eb2
TH
38my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
39 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
40GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
41 'clientsdb=s' => \$OptClientsDB,
42 'd|debug!' => \$OptDebug,
43 'groupsdb=s' => \$OptGroupsDB,
44 'hierarchy=s' => \$OptTLH,
45 'hostsdb=s' => \$OptHostsDB,
46 'm|month=s' => \$OptMonth,
47 'rawdb=s' => \$OptRawDB,
48 's|stats=s' => \$OptStatsType,
49 't|test!' => \$OptTest,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
2832c235
TH
52
53### read configuration
880c3eb2 54my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
2832c235
TH
55
56### override configuration via commandline options
57my %ConfOverride;
880c3eb2
TH
58$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
59$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
60$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
61$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
62$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
2832c235
TH
63&OverrideConfig(\%Conf,\%ConfOverride);
64
65### get type of information to gather, defaulting to 'all'
880c3eb2
TH
66$OptStatsType = 'all' if !$OptStatsType;
67&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
68 if !exists($LegalStats{$OptStatsType});
2832c235 69
880c3eb2
TH
70### get time period from --month
71# get verbal description of time period, drop SQL code
72my ($Period) = &GetTimePeriod($OptMonth);
73&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
74 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
2832c235 75
17ffbeba
TH
76### reformat $Conf{'TLH'}
77my $TLH;
78if ($Conf{'TLH'}) {
79 # $Conf{'TLH'} is parsed as an array by Config::Auto;
80 # make a flat list again, separated by :
43a0fc77 81 if (ref($Conf{'TLH'}) eq 'ARRAY') {
17ffbeba
TH
82 $TLH = join(':',@{$Conf{'TLH'}});
83 } else {
84 $TLH = $Conf{'TLH'};
85 }
86 # strip whitespace
87 $TLH =~ s/\s//g;
7773fb6d
TH
88 # add trailing dots if none are present yet
89 # (using negative look-behind assertions)
90 $TLH =~ s/(?<!\.):/.:/g;
91 $TLH =~ s/(?<!\.)$/./;
17ffbeba 92 # check for illegal characters
880c3eb2 93 &Bleat(2,'Config error - illegal characters in TLH definition!')
314e31aa 94 if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
7773fb6d
TH
95 # escape dots
96 $TLH =~ s/\./\\./g;
17ffbeba 97 if ($TLH =~ /:/) {
880c3eb2 98 # reformat $TLH from a:b to (a)|(b),
43a0fc77 99 # e.g. replace ':' by ')|('
17ffbeba
TH
100 $TLH =~ s/:/)|(/g;
101 $TLH = '(' . $TLH . ')';
102 };
103};
104
880c3eb2
TH
105# read list of newsgroups from --checkgroups into a hash; declared
106# unconditionally ('my ... if' is undefined behaviour), empty without --checkgroups
107my %ValidGroups = $OptCheckgroupsFile ? %{ReadGroupList($OptCheckgroupsFile)} : ();
ad609792 108
2832c235
TH
109### init database
110my $DBHandle = InitDB(\%Conf,1);
111
112### get data for each month
880c3eb2
TH
113&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
114foreach my $Month (&ListMonth($Period)) {
2832c235 115
880c3eb2 116 print "---------- $Month ----------\n" if $OptDebug;
2832c235 117
880c3eb2 118 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
2832c235
TH
119 ### ----------------------------------------------
120 ### get groups data (number of postings per group)
121 # get groups data from raw table for given month
880c3eb2
TH
122 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
123 "WHERE day LIKE ? AND NOT disregard",
124 $Conf{'DBDatabase'},
125 $Conf{'DBTableRaw'}));
126 $DBQuery->execute($Month.'-%')
127 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
128 $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
129 $DBI::errstr)); # passed as an argument so a '%' in the error text is printed literally
2832c235
TH
130
131 # count postings per group
132 my %Postings;
2832c235
TH
133 while (($_) = $DBQuery->fetchrow_array) {
134 # get list of newsgroups and hierarchies from Newsgroups:
880c3eb2
TH
135 my %Newsgroups = ListNewsgroups($_,$TLH,
136 $OptCheckgroupsFile ? \%ValidGroups : '');
2832c235
TH
137 # count each newsgroup and hierarchy once
138 foreach (sort keys %Newsgroups) {
2832c235
TH
139 $Postings{$_}++;
140 };
141 };
142
880c3eb2 143 # add valid but empty groups if --checkgroups is set
ad609792
TH
144 if (%ValidGroups) {
145 foreach (sort keys %ValidGroups) {
146 if (!defined($Postings{$_})) {
147 $Postings{$_} = 0 ;
148 warn (sprintf("ADDED: %s as empty group\n",$_));
149 }
150 };
151 };
152
71f0178b 153 # delete old data for that month
880c3eb2
TH
154 if (!$OptTest) {
155 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
156 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
157 undef,$Month)
158 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
159 $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
160 $DBI::errstr)); # passed as an argument so a '%' in the error text is printed literally
71f0178b
TH
161 };
162
880c3eb2 163 print "----- GroupStats -----\n" if $OptDebug;
2832c235 164 foreach my $Newsgroup (sort keys %Postings) {
880c3eb2
TH
165 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
166 if (!$OptTest) {
2832c235 167 # write to database
880c3eb2
TH
168 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
169 "(month,newsgroup,postings) ".
170 "VALUES (?, ?, ?)",
171 $Conf{'DBDatabase'},
172 $Conf{'DBTableGrps'}));
173 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
174 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
175 $Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
176 $DBI::errstr)); # passed as an argument so a '%' in the error text is printed literally
2832c235
TH
177 $DBQuery->finish;
178 };
179 };
180 } else {
181 # other types of information go here - later on
182 };
183};
184
185### close handles
186$DBHandle->disconnect;
187
188__END__
189
190################################ Documentation #################################
191
192=head1 NAME
193
194gatherstats - process statistical data from a raw source
195
196=head1 SYNOPSIS
197
880c3eb2 198B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats> [B<-c> I<checkgroups file>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
199
200=head1 REQUIREMENTS
201
880c3eb2 202See L<doc/README>.
2832c235
TH
203
204=head1 DESCRIPTION
205
206This script will extract and process statistical information from a
207database table which is fed from F<feedlog.pl> for a given time period
313610f6 208and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
209with I<'disregard'> in the database will be ignored; currently, you
210have to set this flag yourself, using your database management tools.
211You can exclude erroneous entries that way (e.g. automatic reposts
212(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
213
214The time period to act on defaults to last month; you can assign
880c3eb2
TH
215another time period or a single month via the B<--month> option (see
216below).
2832c235
TH
217
218By default B<gatherstats> will process all types of information; you
880c3eb2
TH
219can change that using the B<--stats> option and assigning the type of
220information to process. Currently that doesn't matter yet as only
221processing of the number of postings per group per month is
222implemented anyway.
2832c235
TH
223
224Possible information types include:
225
226=over 3
227
228=item B<groups> (postings per group per month)
229
230B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
231counted for each single group they appear in. Groups not in I<TLH>
232will be ignored.
233
234B<gatherstats> will also add up the number of postings for each
235hierarchy level, but only count each posting once. A posting to
236de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
237respectively. A crossposting to de.alt.test and de.alt.admin, on the
238other hand, will be counted for de.alt.test and de.alt.admin each, but
239only once for de.alt.ALL and de.ALL.
240
880c3eb2
TH
241Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
242override that default through the B<--groupsdb> option.
2832c235
TH
243
244=back
245
246=head2 Configuration
247
880c3eb2 248B<gatherstats> will read its configuration from F<newsstats.conf>
2832c235
TH
249which should be present in the same directory via Config::Auto.
250
880c3eb2 251See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 252
880c3eb2
TH
253You can override configuration options via the B<--hierarchy>,
254B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
255respectively.
2832c235
TH
256
257=head1 OPTIONS
258
259=over 3
260
880c3eb2 261=item B<-V>, B<--version>
2832c235 262
880c3eb2 263Print out version and copyright information and exit.
2832c235 264
880c3eb2 265=item B<-h>, B<--help>
2832c235
TH
266
267Print this man page and exit.
268
880c3eb2 269=item B<-d>, B<--debug>
2832c235
TH
270
271Output debugging information to STDOUT while processing (number of
272postings per group).
273
880c3eb2 274=item B<-t>, B<--test>
2832c235 275
880c3eb2
TH
276Do not write results to database. You should use B<--debug> in
277conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 278
880c3eb2 279=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 280
880c3eb2
TH
281Set processing period to a single month in YYYY-MM format or to a time
282period between two months in YYYY-MM:YYYY-MM format (two months, separated
283by a colon).
2832c235 284
2832c235 285
880c3eb2 286=item B<-s>, B<--stats> I<type>
2832c235
TH
287
288Set processing type to one of I<all> and I<groups>. Defaults to all
289(and is currently rather pointless as only I<groups> has been
290implemented).
291
880c3eb2 292=item B<-c>, B<--checkgroups> I<filename>
ad609792
TH
293
294Check each group against a list of valid newsgroups read from
295I<filename>, one group on each line and ignoring everything after the
296first whitespace (so you can use a file in checkgroups format or (part
297of) your INN active file).
298
299Newsgroups not found in I<filename> will be dropped (and logged to
300STDERR), and newsgroups found in I<filename> but having no postings
301will be added with a count of 0 (and logged to STDERR).
302
880c3eb2 303=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
304
305Override I<TLH> from F<newsstats.conf>.
306
880c3eb2 307=item B<--rawdb> I<table> (raw data table)
2832c235
TH
308
309Override I<DBTableRaw> from F<newsstats.conf>.
310
880c3eb2 311=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
312
313Override I<DBTableGrps> from F<newsstats.conf>.
314
880c3eb2 315=item B<--clientsdb> I<table> (client data table)
2832c235
TH
316
317Override I<DBTableClnts> from F<newsstats.conf>.
318
880c3eb2 319=item B<--hostsdb> I<table> (host data table)
2832c235
TH
320
321Override I<DBTableHosts> from F<newsstats.conf>.
322
323=back
324
325=head1 INSTALLATION
326
880c3eb2 327See L<doc/INSTALL>.
2832c235
TH
328
329=head1 EXAMPLES
330
331Process all types of information for last month:
332
333 gatherstats
334
335Do a dry run, showing results of processing:
336
880c3eb2 337 gatherstats --debug --test
2832c235
TH
338
339Process all types of information for January of 2010:
340
880c3eb2 341 gatherstats --month 2010-01
2832c235 342
ad609792
TH
343Process only number of postings for the year of 2010,
344checking against checkgroups-2010.txt:
2832c235 345
880c3eb2 346 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
2832c235
TH
347
348=head1 FILES
349
350=over 4
351
352=item F<gatherstats.pl>
353
354The script itself.
355
356=item F<NewsStats.pm>
357
358Library functions for the NewsStats package.
359
360=item F<newsstats.conf>
361
880c3eb2 362Runtime configuration file.
2832c235
TH
363
364=back
365
366=head1 BUGS
367
368Please report any bugs or feature requests to the author or use the
369bug tracker at L<http://bugs.th-h.de/>!
370
371=head1 SEE ALSO
372
373=over 2
374
375=item -
376
880c3eb2 377L<doc/README>
2832c235
TH
378
379=item -
380
880c3eb2 381L<doc/INSTALL>
2832c235
TH
382
383=back
384
385This script is part of the B<NewsStats> package.
386
387=head1 AUTHOR
388
389Thomas Hochstein <thh@inter.net>
390
391=head1 COPYRIGHT AND LICENSE
392
880c3eb2 393Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
394
395This program is free software; you may redistribute it and/or modify it
396under the same terms as Perl itself.
397
398=cut
This page took 0.033282 seconds and 4 git commands to generate.