--sums is not compatible with --checkgroups.
[usenet/newsstats.git] / bin / gatherstats.pl
CommitLineData
3f817eb4 1#! /usr/bin/perl
2832c235
TH
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
dfc2b81c 7#
2832c235
TH
8# It is part of the NewsStats package.
9#
07c0b258 10# Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
2832c235 11#
dfc2b81c 12# It can be redistributed and/or modified under the same terms under
2832c235
TH
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
2ad99c20
TH
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
2832c235
TH
20}
21use strict;
3f817eb4 22use warnings;
2832c235 23
1703b8e3 24use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
2832c235
TH
25
26use DBI;
880c3eb2
TH
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
2832c235
TH
29
30################################# Definitions ##################################
31
32# define types of information that can be gathered
33# all / groups (/ clients / hosts)
880c3eb2
TH
34my %LegalStats;
35@LegalStats{('all','groups')} = ();
2832c235
TH
36
37################################# Main program #################################
38
39### read commandline options
880c3eb2 40my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
23ab67a0 41 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,$OptConfFile);
880c3eb2
TH
42GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
43 'clientsdb=s' => \$OptClientsDB,
44 'd|debug!' => \$OptDebug,
45 'groupsdb=s' => \$OptGroupsDB,
46 'hierarchy=s' => \$OptTLH,
47 'hostsdb=s' => \$OptHostsDB,
48 'm|month=s' => \$OptMonth,
49 'rawdb=s' => \$OptRawDB,
50 's|stats=s' => \$OptStatsType,
51 't|test!' => \$OptTest,
23ab67a0 52 'conffile=s' => \$OptConfFile,
880c3eb2
TH
53 'h|help' => \&ShowPOD,
54 'V|version' => \&ShowVersion) or exit 1;
2832c235
TH
55
56### read configuration
23ab67a0 57my %Conf = %{ReadConfig($OptConfFile)};
2832c235
TH
58
59### override configuration via commandline options
60my %ConfOverride;
880c3eb2
TH
61$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
62$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
63$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
64$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
65$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
2832c235
TH
66&OverrideConfig(\%Conf,\%ConfOverride);
67
68### get type of information to gather, defaulting to 'all'
880c3eb2
TH
69$OptStatsType = 'all' if !$OptStatsType;
70&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
71 if !exists($LegalStats{$OptStatsType});
2832c235 72
880c3eb2
TH
73### get time period from --month
74# get verbal description of time period, drop SQL code
75my ($Period) = &GetTimePeriod($OptMonth);
76&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
77 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
2832c235 78
17ffbeba
TH
79### reformat $Conf{'TLH'}
# Turn the configured top-level hierarchies ($Conf{'TLH'}) into a regular
# expression fragment in $TLH: a single hierarchy becomes e.g. 'de\.', a list
# 'de:at' becomes '(de\.)|(at\.)'. $TLH stays undefined if nothing is
# configured.
my $TLH;
if ($Conf{'TLH'}) {
  # Config::Auto may hand us an array reference instead of a plain string;
  # flatten it to a colon-separated list in that case
  my $Configured = $Conf{'TLH'};
  $TLH = ref($Configured) eq 'ARRAY' ? join(':',@{$Configured})
                                     : $Configured;
  # drop all whitespace
  $TLH =~ s/\s//g;
  # make sure each hierarchy ends in a dot: first those followed by a
  # separator, then the last one (negative look-behind keeps existing dots)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # only letters, digits and ':', '+', '.', '-' are acceptable
  unless ($TLH =~ /^[a-zA-Z0-9:+.-]+$/) {
    &Bleat(2,'Config error - illegal characters in TLH definition!');
  }
  # dots have to match literally later on
  $TLH =~ s/\./\\./g;
  # more than one hierarchy: rewrite a:b as (a)|(b)
  if (index($TLH,':') >= 0) {
    $TLH = '(' . join(')|(',split(/:/,$TLH,-1)) . ')';
  }
};
107
2832c235
TH
108### init database
109my $DBHandle = InitDB(\%Conf,1);
110
111### get data for each month
880c3eb2
TH
112&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
# Process each month of the requested period on its own: count the postings
# per newsgroup and per hierarchy level found in the raw table and, unless
# --test is set, replace that month's rows in the groups table with the
# fresh counts.
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash
    # (declaration and conditional assignment are kept apart on purpose:
    # the behaviour of "my ... if ..." is undefined in Perl)
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $SelectQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                                 "WHERE day LIKE ? AND NOT disregard",
                                                 $Conf{'DBDatabase'},
                                                 $Conf{'DBTableRaw'}));
    # the driver's error text is passed as an argument, not spliced into the
    # format string, so a '%' in it cannot be mistaken for a conversion
    $SelectQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $SelectQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      foreach my $Name (keys %Newsgroups) {
        $Postings{$Name}++;
      };
    };

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach my $ValidGroup (sort keys %ValidGroups) {
        next if defined($Postings{$ValidGroup});
        # add current newsgroup as empty group
        $Postings{$ValidGroup} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
        # add empty hierarchies for current newsgroup as needed
        foreach my $Level (ParseHierarchies($ValidGroup)) {
          my $Hierarchy = $Level . '.ALL';
          next if defined($Postings{$Hierarchy});
          $Postings{$Hierarchy} = 0;
          warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
        };
      };
    };

    # delete old data for that month
    if (!$OptTest) {
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };

    print "----- GroupStats -----\n" if $OptDebug;

    # prepare the INSERT once per month; it is executed once per newsgroup
    my $InsertQuery;
    if (!$OptTest) {
      $InsertQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                                "(month,newsgroup,postings) ".
                                                "VALUES (?, ?, ?)",
                                                $Conf{'DBDatabase'},
                                                $Conf{'DBTableGrps'}));
    };

    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      # test mode: report only, never touch the database
      next if $OptTest;
      # write to database
      $InsertQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                            $Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };
  } else {
    # other types of information go here - later on
  };
};
197
198### close handles
199$DBHandle->disconnect;
200
201__END__
202
203################################ Documentation #################################
204
205=head1 NAME
206
207gatherstats - process statistical data from a raw source
208
209=head1 SYNOPSIS
210
23ab67a0 211B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
2832c235
TH
212
213=head1 REQUIREMENTS
214
880c3eb2 215See L<doc/README>.
2832c235
TH
216
217=head1 DESCRIPTION
218
219This script will extract and process statistical information from a
220database table which is fed from F<feedlog.pl> for a given time period
313610f6 221and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
222with I<'disregard'> in the database will be ignored; currently, you
223have to set this flag yourself, using your database management tools.
224You can exclude erroneous entries that way (e.g. automatic reposts
225(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
226
227The time period to act on defaults to last month; you can assign
880c3eb2
TH
228another time period or a single month via the B<--month> option (see
229below).
2832c235
TH
230
231By default B<gatherstats> will process all types of information; you
880c3eb2
TH
232can change that using the B<--stats> option and assigning the type of
233information to process. Currently that doesn't matter yet as only
234processing of the number of postings per group per month is
235implemented anyway.
2832c235
TH
236
237Possible information types include:
238
239=over 3
240
241=item B<groups> (postings per group per month)
242
243B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
244counted for each single group they appear in. Groups not in I<TLH>
245will be ignored.
246
247B<gatherstats> will also add up the number of postings for each
248hierarchy level, but only count each posting once. A posting to
249de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
250respectively. A crossposting to de.alt.test and de.alt.admin, on the
251other hand, will be counted for de.alt.test and de.alt.admin each, but
252only once for de.alt.ALL and de.ALL.
253
880c3eb2
TH
254Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
255override that default through the B<--groupsdb> option.
2832c235
TH
256
257=back
258
259=head2 Configuration
260
880c3eb2 261B<gatherstats> will read its configuration from F<newsstats.conf>
2832c235
TH
262which should be present in the same directory via Config::Auto.
263
880c3eb2 264See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 265
880c3eb2
TH
266You can override configuration options via the B<--hierarchy>,
267B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
268respectively.
2832c235
TH
269
270=head1 OPTIONS
271
272=over 3
273
880c3eb2 274=item B<-V>, B<--version>
2832c235 275
880c3eb2 276Print out version and copyright information and exit.
2832c235 277
880c3eb2 278=item B<-h>, B<--help>
2832c235
TH
279
280Print this man page and exit.
281
880c3eb2 282=item B<-d>, B<--debug>
2832c235
TH
283
284Output debugging information to STDOUT while processing (number of
285postings per group).
286
880c3eb2 287=item B<-t>, B<--test>
2832c235 288
880c3eb2
TH
289Do not write results to database. You should use B<--debug> in
290conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 291
880c3eb2 292=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 293
880c3eb2
TH
294Set processing period to a single month in YYYY-MM format or to a time
295period between two months in YYYY-MM:YYYY-MM format (two months, separated
dfc2b81c 296by a colon).
2832c235 297
880c3eb2 298=item B<-s>, B<--stats> I<type>
2832c235
TH
299
300Set processing type to one of I<all> and I<groups>. Defaults to all
301(and is currently rather pointless as only I<groups> has been
302implemented).
303
93c8eae2
TH
304=item B<-c>, B<--checkgroups> I<filename template>
305
306Check each group against a list of valid newsgroups read from a file,
307one group on each line and ignoring everything after the first
308whitespace (so you can use a file in checkgroups format or (part of)
309your INN active file).
310
95d9fe2c
TH
311The filename is taken from I<filename template>, amended by each
312B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
313so that
93c8eae2
TH
314
315 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 316
93c8eae2
TH
317will check against F<checkgroups-2010-01> for January 2010, against
318F<checkgroups-2010-02> for February 2010 and so on.
ad609792 319
93c8eae2
TH
320Newsgroups not found in the checkgroups file will be dropped (and
321logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
322will be added with a count of 0 (and logged to STDERR).
323
880c3eb2 324=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
325
326Override I<TLH> from F<newsstats.conf>.
327
880c3eb2 328=item B<--rawdb> I<table> (raw data table)
2832c235
TH
329
330Override I<DBTableRaw> from F<newsstats.conf>.
331
880c3eb2 332=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
333
334Override I<DBTableGrps> from F<newsstats.conf>.
335
880c3eb2 336=item B<--clientsdb> I<table> (client data table)
2832c235
TH
337
338Override I<DBTableClnts> from F<newsstats.conf>.
339
880c3eb2 340=item B<--hostsdb> I<table> (host data table)
2832c235
TH
341
342Override I<DBTableHosts> from F<newsstats.conf>.
343
23ab67a0
TH
344=item B<--conffile> I<filename>
345
346Load configuration from I<filename> instead of F<newsstats.conf>.
347
2832c235
TH
348=back
349
350=head1 INSTALLATION
351
880c3eb2 352See L<doc/INSTALL>.
2832c235
TH
353
354=head1 EXAMPLES
355
356Process all types of information for last month:
357
358 gatherstats
359
360Do a dry run, showing results of processing:
361
880c3eb2 362 gatherstats --debug --test
2832c235
TH
363
364Process all types of information for January of 2010:
365
880c3eb2 366 gatherstats --month 2010-01
2832c235 367
ad609792 368Process only number of postings for the year of 2010,
93c8eae2 369checking against checkgroups-*:
2832c235 370
93c8eae2 371 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
372
373=head1 FILES
374
375=over 4
376
2ad99c20 377=item F<bin/gatherstats.pl>
2832c235
TH
378
379The script itself.
380
2ad99c20 381=item F<lib/NewsStats.pm>
2832c235
TH
382
383Library functions for the NewsStats package.
384
2ad99c20 385=item F<etc/newsstats.conf>
2832c235 386
880c3eb2 387Runtime configuration file.
2832c235
TH
388
389=back
390
391=head1 BUGS
392
393Please report any bugs or feature requests to the author or use the
394bug tracker at L<http://bugs.th-h.de/>!
395
396=head1 SEE ALSO
397
398=over 2
399
400=item -
401
880c3eb2 402L<doc/README>
2832c235
TH
403
404=item -
405
880c3eb2 406L<doc/INSTALL>
2832c235
TH
407
408=back
409
410This script is part of the B<NewsStats> package.
411
412=head1 AUTHOR
413
414Thomas Hochstein <thh@inter.net>
415
416=head1 COPYRIGHT AND LICENSE
417
880c3eb2 418Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
419
420This program is free software; you may redistribute it and/or modify it
421under the same terms as Perl itself.
422
423=cut
This page took 0.037807 seconds and 4 git commands to generate.