Change interpretation of --checkgroups to template
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
ad609792 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
2832c235
TH
23
24use DBI;
880c3eb2
TH
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
2832c235
TH
27
28################################# Definitions ##################################
29
# Types of information this script knows how to gather:
# all / groups (/ clients / hosts - not implemented yet).
# Only the keys matter; the values stay undefined.
my %LegalStats = map { $_ => undef } qw(all groups);

################################# Main program #################################

### read commandline options
my $OptCheckgroupsFile;
my $OptClientsDB;
my $OptDebug;
my $OptGroupsDB;
my $OptTLH;
my $OptHostsDB;
my $OptMonth;
my $OptRawDB;
my $OptStatsType;
my $OptTest;
GetOptions (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
) or exit 1;

### read configuration
my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};

### override configuration via commandline options
# each pair maps a configuration key to the option value replacing it;
# only options that were actually given (true values) are taken over
my %ConfOverride;
foreach my $Override (['DBTableRaw',   $OptRawDB],
                      ['DBTableGrps',  $OptGroupsDB],
                      ['DBTableClnts', $OptClientsDB],
                      ['DBTableHosts', $OptHostsDB],
                      ['TLH',          $OptTLH]) {
  my ($ConfKey,$OptValue) = @{$Override};
  if ($OptValue) {
    $ConfOverride{$ConfKey} = $OptValue;
  }
}
&OverrideConfig(\%Conf,\%ConfOverride);

### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
if (!exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# only the first return value (the verbal description of the time
# period) is kept; the SQL code that follows it is dropped
my ($Period) = &GetTimePeriod($OptMonth);
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
2832c235 75
17ffbeba
TH
### reformat $Conf{'TLH'}
# Turns the configured top level hierarchies into a regular expression
# fragment: "a:b" becomes "(a\.)|(b\.)", a single hierarchy "a" becomes
# "a\.". $TLH stays undefined if no TLH is configured.
# NOTE(review): $TLH is handed to ListNewsgroups() further down; that it
# is used as a regular expression there is inferred from the escaping
# done here - confirm against NewsStats.pm.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  if (ref($Conf{'TLH'}) eq 'ARRAY') {
    $TLH = join(':',@{$Conf{'TLH'}});
  } else {
    $TLH = $Conf{'TLH'};
  }
  # strip whitespace
  $TLH =~ s/\s//g;
  # add trailing dots if none are present yet
  # (using negative look-behind assertions)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
  # escape all regular expression metacharacters that passed the check
  # above: dots and '+' (the latter would otherwise act as a quantifier,
  # so that e.g. "c++" would not match a literal "c++")
  $TLH =~ s/([.+])/\\$1/g;
  if ($TLH =~ /:/) {
    # reformat $TLH from a:b to (a)|(b),
    # e.g. replace ':' by ')|('
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  };
};
104
2832c235
TH
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period: count the postings per
# newsgroup / hierarchy from the raw table, then replace that month's
# rows in the groups table (unless --test is set).
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash;
    # the file name is the template given, amended by "-<month>".
    # The declaration is kept separate from the conditional assignment
    # because "my ... if ..." has undefined behaviour (see perlsyn).
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableRaw'}));
    # the driver's error text is passed as a sprintf argument, not as part
    # of the format string, so that a '%' in it cannot garble the message
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach (keys %Newsgroups);
    };

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach (sort keys %ValidGroups) {
        if (!defined($Postings{$_})) {
          $Postings{$_} = 0 ;
          warn (sprintf("ADDED: %s as empty group\n",$_));
        }
      };
    };

    # delete old data for that month and prepare writing the new data
    my $InsertQuery;
    if (!$OptTest) {
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
      # the statement is the same for every newsgroup,
      # so it is prepared only once per month
      $InsertQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                                "(month,newsgroup,postings) ".
                                                "VALUES (?, ?, ?)",
                                                $Conf{'DBDatabase'},
                                                $Conf{'DBTableGrps'}));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      # nothing is written in test mode
      next if $OptTest;
      # write to database
      $InsertQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                            $Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
188
189__END__
190
191################################ Documentation #################################
192
193=head1 NAME
194
195gatherstats - process statistical data from a raw source
196
197=head1 SYNOPSIS
198
93c8eae2 199B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
200
201=head1 REQUIREMENTS
202
880c3eb2 203See L<doc/README>.
2832c235
TH
204
205=head1 DESCRIPTION
206
207This script will extract and process statistical information from a
208database table which is fed from F<feedlog.pl> for a given time period
313610f6 209and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
210with I<'disregard'> in the database will be ignored; currently, you
211have to set this flag yourself, using your database management tools.
212You can exclude erroneous entries that way (e.g. automatic reposts
213(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
214
215The time period to act on defaults to last month; you can assign
880c3eb2
TH
216another time period or a single month via the B<--month> option (see
217below).
2832c235
TH
218
219By default B<gatherstats> will process all types of information; you
880c3eb2
TH
220can change that using the B<--stats> option and assigning the type of
221information to process. Currently that doesn't matter yet as only
222processing of the number of postings per group per month is
223implemented anyway.
2832c235
TH
224
225Possible information types include:
226
227=over 3
228
229=item B<groups> (postings per group per month)
230
231B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
232counted for each single group they appear in. Groups not in I<TLH>
233will be ignored.
234
235B<gatherstats> will also add up the number of postings for each
236hierarchy level, but only count each posting once. A posting to
237de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
238respectively. A crossposting to de.alt.test and de.alt.admin, on the
239other hand, will be counted for de.alt.test and de.alt.admin each, but
240only once for de.alt.ALL and de.ALL.
241
880c3eb2
TH
242Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
243override that default through the B<--groupsdb> option.
2832c235
TH
244
245=back
246
247=head2 Configuration
248
880c3eb2 249B<gatherstats> will read its configuration from F<newsstats.conf>
2832c235
TH
250(which should be present in the same directory) via Config::Auto.
251
880c3eb2 252See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 253
880c3eb2
TH
254You can override configuration options via the B<--hierarchy>,
255B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
256respectively.
2832c235
TH
257
258=head1 OPTIONS
259
260=over 3
261
880c3eb2 262=item B<-V>, B<--version>
2832c235 263
880c3eb2 264Print out version and copyright information and exit.
2832c235 265
880c3eb2 266=item B<-h>, B<--help>
2832c235
TH
267
268Print this man page and exit.
269
880c3eb2 270=item B<-d>, B<--debug>
2832c235
TH
271
272Output debugging information to STDOUT while processing (number of
273postings per group).
274
880c3eb2 275=item B<-t>, B<--test>
2832c235 276
880c3eb2
TH
277Do not write results to database. You should use B<--debug> in
278conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 279
880c3eb2 280=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 281
880c3eb2
TH
282Set processing period to a single month in YYYY-MM format or to a time
283period between two months in YYYY-MM:YYYY-MM format (two months, separated
284by a colon).
2832c235 285
2832c235 286
880c3eb2 287=item B<-s>, B<--stats> I<type>
2832c235
TH
288
289Set processing type to one of I<all> and I<groups>. Defaults to all
290(and is currently rather pointless as only I<groups> has been
291implemented).
292
93c8eae2
TH
293=item B<-c>, B<--checkgroups> I<filename template>
294
295Check each group against a list of valid newsgroups read from a file,
296one group on each line and ignoring everything after the first
297whitespace (so you can use a file in checkgroups format or (part of)
298your INN active file).
299
300The filename is taken from I<filename template>, amended by each
301B<--month> B<gatherstats> is processing, so that
302
303 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 304
93c8eae2
TH
305will check against F<checkgroups-2010-01> for January 2010, against
306F<checkgroups-2010-02> for February 2010 and so on.
ad609792 307
93c8eae2
TH
308Newsgroups not found in the checkgroups file will be dropped (and
309logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
310will be added with a count of 0 (and logged to STDERR).
311
880c3eb2 312=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
313
314Override I<TLH> from F<newsstats.conf>.
315
880c3eb2 316=item B<--rawdb> I<table> (raw data table)
2832c235
TH
317
318Override I<DBTableRaw> from F<newsstats.conf>.
319
880c3eb2 320=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
321
322Override I<DBTableGrps> from F<newsstats.conf>.
323
880c3eb2 324=item B<--clientsdb> I<table> (client data table)
2832c235
TH
325
326Override I<DBTableClnts> from F<newsstats.conf>.
327
880c3eb2 328=item B<--hostsdb> I<table> (host data table)
2832c235
TH
329
330Override I<DBTableHosts> from F<newsstats.conf>.
331
332=back
333
334=head1 INSTALLATION
335
880c3eb2 336See L<doc/INSTALL>.
2832c235
TH
337
338=head1 EXAMPLES
339
340Process all types of information for last month:
341
342 gatherstats
343
344Do a dry run, showing results of processing:
345
880c3eb2 346 gatherstats --debug --test
2832c235
TH
347
348Process all types of information for January of 2010:
349
880c3eb2 350 gatherstats --month 2010-01
2832c235 351
ad609792 352Process only number of postings for the year of 2010,
93c8eae2 353checking against checkgroups-*:
2832c235 354
93c8eae2 355 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
356
357=head1 FILES
358
359=over 4
360
361=item F<gatherstats.pl>
362
363The script itself.
364
365=item F<NewsStats.pm>
366
367Library functions for the NewsStats package.
368
369=item F<newsstats.conf>
370
880c3eb2 371Runtime configuration file.
2832c235
TH
372
373=back
374
375=head1 BUGS
376
377Please report any bugs or feature requests to the author or use the
378bug tracker at L<http://bugs.th-h.de/>!
379
380=head1 SEE ALSO
381
382=over 2
383
384=item -
385
880c3eb2 386L<doc/README>
2832c235
TH
387
388=item -
389
880c3eb2 390L<doc/INSTALL>
2832c235
TH
391
392=back
393
394This script is part of the B<NewsStats> package.
395
396=head1 AUTHOR
397
398Thomas Hochstein <thh@inter.net>
399
400=head1 COPYRIGHT AND LICENSE
401
880c3eb2 402Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
403
404This program is free software; you may redistribute it and/or modify it
405under the same terms as Perl itself.
406
407=cut
This page took 0.033358 seconds and 4 git commands to generate.