Merge branch 'thh-checkinput' into next
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
3f817eb4 1#! /usr/bin/perl
2832c235
TH
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
BEGIN {
  # version of this script
  # NOTE(review): presumably reported by ShowVersion() - confirm in NewsStats.pm
  our $VERSION = "0.01";
  # dirname() is needed to find the directory this script lives in
  use File::Basename;
  # make modules placed next to the script (e.g. NewsStats.pm) loadable
  push @INC, dirname($0);
}
20use strict;
3f817eb4 21use warnings;
2832c235 22
1703b8e3 23use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
2832c235
TH
24
25use DBI;
880c3eb2
TH
26use Getopt::Long qw(GetOptions);
27Getopt::Long::config ('bundling');
2832c235
TH
28
29################################# Definitions ##################################
30
# types of information that can be gathered:
# all / groups (/ clients / hosts)
# Only the existence of a key is checked later on, so all values
# are left undefined.
my %LegalStats = map { $_ => undef } qw(all groups);
2832c235
TH
35
36################################# Main program #################################
37
### read commandline options
# what to process
my ($OptCheckgroupsFile,$OptMonth,$OptStatsType);
# overrides for settings from newsstats.conf
my ($OptClientsDB,$OptGroupsDB,$OptHostsDB,$OptRawDB,$OptTLH);
# run modes
my ($OptDebug,$OptTest);

# option specification => target variable or handler;
# --help and --version are handled by callbacks
my @OptionSpec = (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
);
# terminate with exit status 1 if the commandline cannot be parsed
GetOptions(@OptionSpec) or exit 1;
2832c235
TH
53
### read configuration
# newsstats.conf is expected below $HomePath
my $ConfigFile = $HomePath.'/newsstats.conf';
my %Conf = %{ReadConfig($ConfigFile)};

### override configuration via commandline options
# Only options that were given with a true value end up in
# %ConfOverride; everything else keeps its configured setting.
my %ConfOverride;
foreach my $Override (['DBTableRaw',   $OptRawDB],
                      ['DBTableGrps',  $OptGroupsDB],
                      ['DBTableClnts', $OptClientsDB],
                      ['DBTableHosts', $OptHostsDB],
                      ['TLH',          $OptTLH]) {
  my ($Setting,$Value) = @{$Override};
  $ConfOverride{$Setting} = $Value if $Value;
}
&OverrideConfig(\%Conf,\%ConfOverride);
65
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
# only types listed in %LegalStats are accepted
# NOTE(review): Bleat() with level 2 presumably terminates the script -
# confirm in NewsStats.pm
unless (exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# GetTimePeriod is called in list context; only its first return value
# (the verbal description of the time period) is kept, the SQL code
# is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# neither a missing period nor 'all time' is acceptable here
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
2832c235 76
17ffbeba
TH
### reformat $Conf{'TLH'}
# Turn the configured top-level hierarchies into a regular expression
# fragment: a single hierarchy "de" becomes "de\.", a list "de:at"
# becomes "(de\.)|(at\.)". $TLH stays undefined if no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  $TLH = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                      : $Conf{'TLH'};
  # strip whitespace
  $TLH =~ s/\s//g;
  # add trailing dots if none are present yet
  # (using negative look-behind assertions)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
  # escape regular expression metacharacters; besides '.', the check
  # above also lets '+' pass, which would otherwise act as a quantifier
  # (or make the pattern invalid, as in "c++")
  $TLH =~ s/([.+])/\\$1/g;
  if ($TLH =~ /:/) {
    # reformat $TLH from a:b to (a)|(b),
    # e.g. replace ':' by ')|('
    # NOTE(review): if the consumer anchors this pattern (e.g. /^$TLH/),
    # the anchor applies to the first alternative only - verify in
    # ListNewsgroups()
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  }
}
105
2832c235
TH
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period: count postings per newsgroup
# (and per hierarchy, as returned by ListNewsgroups) from the raw table,
# optionally add valid but empty groups, and replace that month's rows
# in the groups table. In test mode the database is only read.
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash;
    # the file name is the template from the commandline, a dash and
    # the current month.
    # Declaration and conditional assignment are kept apart on purpose:
    # the behaviour of "my ... if ..." is undefined in Perl.
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $DBSelect = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                              "WHERE day LIKE ? AND NOT disregard",
                                              $Conf{'DBDatabase'},
                                              $Conf{'DBTableRaw'}));
    # $DBI::errstr is passed as an argument (not interpolated into the
    # format string) so that a '%' in the message cannot confuse sprintf
    $DBSelect->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBSelect->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      foreach my $Group (keys %Newsgroups) {
        $Postings{$Group}++;
      }
    }

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach my $Group (sort keys %ValidGroups) {
        next if defined($Postings{$Group});
        # add current newsgroup as empty group
        $Postings{$Group} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$Group));
        # add empty hierarchies for current newsgroup as needed
        foreach my $Level (ParseHierarchies($Group)) {
          my $Hierarchy = $Level . '.ALL';
          next if defined($Postings{$Hierarchy});
          $Postings{$Hierarchy} = 0;
          warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
        }
      }
    }

    # statement handle for writing results; stays undefined in test mode
    my $DBInsert;
    if (!$OptTest) {
      # delete old data for that month
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
      # the INSERT is identical for every newsgroup, so prepare it
      # once per month instead of once per row
      $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                             "(month,newsgroup,postings) ".
                                             "VALUES (?, ?, ?)",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableGrps'}));
    }

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      # nothing is written in test mode
      next if $OptTest;
      # write to database
      $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                            $Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    }
  } else {
    # other types of information go here - later on
  }
}

### close handles
$DBHandle->disconnect;
198
199__END__
200
201################################ Documentation #################################
202
203=head1 NAME
204
205gatherstats - process statistical data from a raw source
206
207=head1 SYNOPSIS
208
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
210
211=head1 REQUIREMENTS
212
880c3eb2 213See L<doc/README>.
2832c235
TH
214
215=head1 DESCRIPTION
216
217This script will extract and process statistical information from a
218database table which is fed from F<feedlog.pl> for a given time period
313610f6 219and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
220with I<'disregard'> in the database will be ignored; currently, you
221have to set this flag yourself, using your database management tools.
222You can exclude erroneous entries that way (e.g. automatic reposts
223(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
224
225The time period to act on defaults to last month; you can assign
880c3eb2
TH
226another time period or a single month via the B<--month> option (see
227below).
2832c235
TH
228
229By default B<gatherstats> will process all types of information; you
880c3eb2
TH
230can change that using the B<--stats> option and assigning the type of
231information to process. Currently that doesn't matter yet as only
232processing of the number of postings per group per month is
233implemented anyway.
2832c235
TH
234
235Possible information types include:
236
237=over 3
238
239=item B<groups> (postings per group per month)
240
241B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
242counted for each single group they appear in. Groups not in I<TLH>
243will be ignored.
244
245B<gatherstats> will also add up the number of postings for each
246hierarchy level, but only count each posting once. A posting to
247de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
248respectively. A crossposting to de.alt.test and de.alt.admin, on the
249other hand, will be counted for de.alt.test and de.alt.admin each, but
250only once for de.alt.ALL and de.ALL.
251
880c3eb2
TH
252Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
253override that default through the B<--groupsdb> option.
2832c235
TH
254
255=back
256
257=head2 Configuration
258
B<gatherstats> will read its configuration via Config::Auto from
F<newsstats.conf>, which should be present in the same directory.
261
880c3eb2 262See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 263
880c3eb2
TH
264You can override configuration options via the B<--hierarchy>,
265B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
266respectively.
2832c235
TH
267
268=head1 OPTIONS
269
270=over 3
271
880c3eb2 272=item B<-V>, B<--version>
2832c235 273
880c3eb2 274Print out version and copyright information and exit.
2832c235 275
880c3eb2 276=item B<-h>, B<--help>
2832c235
TH
277
278Print this man page and exit.
279
880c3eb2 280=item B<-d>, B<--debug>
2832c235
TH
281
282Output debugging information to STDOUT while processing (number of
283postings per group).
284
880c3eb2 285=item B<-t>, B<--test>
2832c235 286
880c3eb2
TH
287Do not write results to database. You should use B<--debug> in
288conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 289
880c3eb2 290=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 291
880c3eb2
TH
Set processing period to a single month in YYYY-MM format or to a time
period between two months in YYYY-MM:YYYY-MM format (two months, separated
by a colon).
2832c235 295
2832c235 296
880c3eb2 297=item B<-s>, B<--stats> I<type>
2832c235
TH
298
299Set processing type to one of I<all> and I<groups>. Defaults to all
300(and is currently rather pointless as only I<groups> has been
301implemented).
302
93c8eae2
TH
303=item B<-c>, B<--checkgroups> I<filename template>
304
305Check each group against a list of valid newsgroups read from a file,
306one group on each line and ignoring everything after the first
307whitespace (so you can use a file in checkgroups format or (part of)
308your INN active file).
309
The filename is taken from I<filename template>, amended by a dash and
each month (see B<--month>) B<gatherstats> is processing, so that
312
313 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 314
93c8eae2
TH
315will check against F<checkgroups-2010-01> for January 2010, against
316F<checkgroups-2010-02> for February 2010 and so on.
ad609792 317
93c8eae2
TH
318Newsgroups not found in the checkgroups file will be dropped (and
319logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
320will be added with a count of 0 (and logged to STDERR).
321
880c3eb2 322=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
323
324Override I<TLH> from F<newsstats.conf>.
325
880c3eb2 326=item B<--rawdb> I<table> (raw data table)
2832c235
TH
327
328Override I<DBTableRaw> from F<newsstats.conf>.
329
880c3eb2 330=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
331
332Override I<DBTableGrps> from F<newsstats.conf>.
333
880c3eb2 334=item B<--clientsdb> I<table> (client data table)
2832c235
TH
335
336Override I<DBTableClnts> from F<newsstats.conf>.
337
880c3eb2 338=item B<--hostsdb> I<table> (host data table)
2832c235
TH
339
340Override I<DBTableHosts> from F<newsstats.conf>.
341
342=back
343
344=head1 INSTALLATION
345
880c3eb2 346See L<doc/INSTALL>.
2832c235
TH
347
348=head1 EXAMPLES
349
Process all types of information for last month:
351
352 gatherstats
353
354Do a dry run, showing results of processing:
355
880c3eb2 356 gatherstats --debug --test
2832c235
TH
357
358Process all types of information for January of 2010:
359
880c3eb2 360 gatherstats --month 2010-01
2832c235 361
ad609792 362Process only number of postings for the year of 2010,
93c8eae2 363checking against checkgroups-*:
2832c235 364
93c8eae2 365 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
366
367=head1 FILES
368
369=over 4
370
371=item F<gatherstats.pl>
372
373The script itself.
374
375=item F<NewsStats.pm>
376
377Library functions for the NewsStats package.
378
379=item F<newsstats.conf>
380
880c3eb2 381Runtime configuration file.
2832c235
TH
382
383=back
384
385=head1 BUGS
386
387Please report any bugs or feature requests to the author or use the
388bug tracker at L<http://bugs.th-h.de/>!
389
390=head1 SEE ALSO
391
392=over 2
393
394=item -
395
880c3eb2 396L<doc/README>
2832c235
TH
397
398=item -
399
880c3eb2 400L<doc/INSTALL>
2832c235
TH
401
402=back
403
404This script is part of the B<NewsStats> package.
405
406=head1 AUTHOR
407
408Thomas Hochstein <thh@inter.net>
409
410=head1 COPYRIGHT AND LICENSE
411
880c3eb2 412Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
413
414This program is free software; you may redistribute it and/or modify it
415under the same terms as Perl itself.
416
417=cut
This page took 0.034347 seconds and 4 git commands to generate.