Change interpretation of --checkgroups to template
[usenet/newsstats.git] / gatherstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
23
24use DBI;
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
27
28################################# Definitions ##################################
29
# Types of information that can be gathered:
# all / groups (/ clients / hosts)
# Only the keys matter; they are looked up with exists() to validate --stats.
my %LegalStats = map { $_ => undef } qw(all groups);
34
35################################# Main program #################################
36
### read commandline options
# One scalar per option; all of them stay undefined unless the option is
# given on the command line.
my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
    $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);

# Option specification handed to Getopt::Long: spec string => destination.
# --help and --version are dispatched to callbacks instead of variables.
my @OptionSpec = (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
);

# bail out with exit status 1 if the command line could not be parsed
GetOptions (@OptionSpec) or exit 1;
52
### read configuration
my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};

### override configuration via commandline options
# Map each configuration key to the commandline option that may replace it;
# only options carrying a true value are passed on as overrides.
my %OverrideFor = (
  'DBTableRaw'   => $OptRawDB,
  'DBTableGrps'  => $OptGroupsDB,
  'DBTableClnts' => $OptClientsDB,
  'DBTableHosts' => $OptHostsDB,
  'TLH'          => $OptTLH,
);
my %ConfOverride;
foreach my $ConfKey (keys %OverrideFor) {
  next if !$OverrideFor{$ConfKey};
  $ConfOverride{$ConfKey} = $OverrideFor{$ConfKey};
}
&OverrideConfig(\%Conf,\%ConfOverride);
64
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
if (!exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# GetTimePeriod is called in list context; only its first return value
# (the verbal description of the time period) is kept, the rest is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# gathering needs an explicit period: reject a missing/false result as well
# as the description 'all time'
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
75
### reformat $Conf{'TLH'}
# Turn the configured top-level hierarchy (or hierarchies) into a regular
# expression fragment: "de:at" becomes "(de\.)|(at\.)".
# $TLH stays undefined if no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  $TLH = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                      : $Conf{'TLH'};
  # strip whitespace
  $TLH =~ s/\s//g;
  # add trailing dots if none are present yet
  # (using negative look-behind assertions)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
  # escape the regex metacharacters that pass the check above:
  # dots, and '+' which would otherwise act as a quantifier
  $TLH =~ s/([.+])/\\$1/g;
  if ($TLH =~ /:/) {
    # reformat $TLH from a:b to (a)|(b),
    # e.g. replace ':' by ')|('
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  };
};
104
### init database
my $DBHandle = InitDB(\%Conf,1);

# fully qualified table names, used in SQL statements and error messages
my $RawTable  = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
my $GrpsTable = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});

### get data for each month
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash;
    # the file name is the --checkgroups template amended by the month
    # (declaration and conditional assignment are kept apart because the
    # behaviour of "my ... if ..." is undefined in Perl)
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    # ($DBI::errstr is passed as a sprintf argument, not interpolated into
    # the format, so a '%' in the error text cannot garble the message)
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $RawTable))
      or &Bleat(2,sprintf("Can't get groups data for %s from %s: %s\n",
                          $Month,$RawTable,$DBI::errstr));
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s: %s\n",
                          $Month,$RawTable,$DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach keys %Newsgroups;
    };

    # add valid but empty groups if --checkgroups is set
    # (%ValidGroups is empty otherwise, so the loop does nothing);
    # sorted so that the log on STDERR has a stable order
    foreach my $ValidGroup (sort keys %ValidGroups) {
      next if defined($Postings{$ValidGroup});
      $Postings{$ValidGroup} = 0;
      warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
    };

    # unless in test mode: delete old data for that month and prepare
    # the INSERT statement once, to be re-used for every newsgroup
    my $DBWrite;
    if (!$OptTest) {
      $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",$GrpsTable),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s: %s\n",
                            $Month,$GrpsTable,$DBI::errstr));
      $DBWrite = $DBHandle->prepare(sprintf("INSERT INTO %s ".
                                            "(month,newsgroup,postings) ".
                                            "VALUES (?, ?, ?)",
                                            $GrpsTable))
        or &Bleat(2,sprintf("Can't prepare writing groups data for %s to %s: %s\n",
                            $Month,$GrpsTable,$DBI::errstr));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      next if $OptTest;
      # write to database
      $DBWrite->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: %s\n",
                            $Month,$Newsgroup,$GrpsTable,$DBI::errstr));
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
188
189__END__
190
191################################ Documentation #################################
192
193=head1 NAME
194
195gatherstats - process statistical data from a raw source
196
197=head1 SYNOPSIS
198
199B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats> [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
200
201=head1 REQUIREMENTS
202
203See L<doc/README>.
204
205=head1 DESCRIPTION
206
207This script will extract and process statistical information from a
208database table which is fed from F<feedlog.pl> for a given time period
209and write its results to (an)other database table(s). Entries marked
210with I<'disregard'> in the database will be ignored; currently, you
211have to set this flag yourself, using your database management tools.
212You can exclude erroneous entries that way (e.g. automatic reposts
213(think of cancels flood and resurrectors); spam; ...).
214
215The time period to act on defaults to last month; you can assign
216another time period or a single month via the B<--month> option (see
217below).
218
219By default B<gatherstats> will process all types of information; you
220can change that using the B<--stats> option and assigning the type of
221information to process. Currently that doesn't matter yet as only
222processing of the number of postings per group per month is
223implemented anyway.
224
225Possible information types include:
226
227=over 3
228
229=item B<groups> (postings per group per month)
230
231B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
232counted for each single group they appear in. Groups not in I<TLH>
233will be ignored.
234
235B<gatherstats> will also add up the number of postings for each
236hierarchy level, but only count each posting once. A posting to
237de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
238respectively. A crossposting to de.alt.test and de.alt.admin, on the
239other hand, will be counted for de.alt.test and de.alt.admin each, but
240only once for de.alt.ALL and de.ALL.
241
242Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
243override that default through the B<--groupsdb> option.
244
245=back
246
247=head2 Configuration
248
249B<gatherstats> will read its configuration from F<newsstats.conf>
250which should be present in the same directory via Config::Auto.
251
252See L<doc/INSTALL> for an overview of possible configuration options.
253
254You can override configuration options via the B<--hierarchy>,
255B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
256respectively.
257
258=head1 OPTIONS
259
260=over 3
261
262=item B<-V>, B<--version>
263
264Print out version and copyright information and exit.
265
266=item B<-h>, B<--help>
267
268Print this man page and exit.
269
270=item B<-d>, B<--debug>
271
272Output debugging information to STDOUT while processing (number of
273postings per group).
274
275=item B<-t>, B<--test>
276
277Do not write results to database. You should use B<--debug> in
278conjunction with B<--test> ... everything else seems a bit pointless.
279
280=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
281
282Set processing period to a single month in YYYY-MM format or to a time
283period between two months in YYYY-MM:YYYY-MM format (two months, separated
284by a colon).
285
286
287=item B<-s>, B<--stats> I<type>
288
289Set processing type to one of I<all> and I<groups>. Defaults to all
290(and is currently rather pointless as only I<groups> has been
291implemented).
292
293=item B<-c>, B<--checkgroups> I<filename template>
294
295Check each group against a list of valid newsgroups read from a file,
296one group on each line and ignoring everything after the first
297whitespace (so you can use a file in checkgroups format or (part of)
298your INN active file).
299
300The filename is taken from I<filename template>, amended by each
301B<--month> B<gatherstats> is processing, so that
302
303 gatherstats -m 2010-01:2010-12 -c checkgroups
304
305will check against F<checkgroups-2010-01> for January 2010, against
306F<checkgroups-2010-02> for February 2010 and so on.
307
308Newsgroups not found in the checkgroups file will be dropped (and
309logged to STDERR), and newsgroups found there but having no postings
310will be added with a count of 0 (and logged to STDERR).
311
312=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
313
314Override I<TLH> from F<newsstats.conf>.
315
316=item B<--rawdb> I<table> (raw data table)
317
318Override I<DBTableRaw> from F<newsstats.conf>.
319
320=item B<--groupsdb> I<table> (postings per group table)
321
322Override I<DBTableGrps> from F<newsstats.conf>.
323
324=item B<--clientsdb> I<table> (client data table)
325
326Override I<DBTableClnts> from F<newsstats.conf>.
327
328=item B<--hostsdb> I<table> (host data table)
329
330Override I<DBTableHosts> from F<newsstats.conf>.
331
332=back
333
334=head1 INSTALLATION
335
336See L<doc/INSTALL>.
337
338=head1 EXAMPLES
339
340Process all types of information for last month:
341
342 gatherstats
343
344Do a dry run, showing results of processing:
345
346 gatherstats --debug --test
347
348Process all types of information for January of 2010:
349
350 gatherstats --month 2010-01
351
352Process only number of postings for the year of 2010,
353checking against checkgroups-*:
354
355 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
356
357=head1 FILES
358
359=over 4
360
361=item F<gatherstats.pl>
362
363The script itself.
364
365=item F<NewsStats.pm>
366
367Library functions for the NewsStats package.
368
369=item F<newsstats.conf>
370
371Runtime configuration file.
372
373=back
374
375=head1 BUGS
376
377Please report any bugs or feature requests to the author or use the
378bug tracker at L<http://bugs.th-h.de/>!
379
380=head1 SEE ALSO
381
382=over 2
383
384=item -
385
386L<doc/README>
387
388=item -
389
390L<doc/INSTALL>
391
392=back
393
394This script is part of the B<NewsStats> package.
395
396=head1 AUTHOR
397
398Thomas Hochstein <thh@inter.net>
399
400=head1 COPYRIGHT AND LICENSE
401
402Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
403
404This program is free software; you may redistribute it and/or modify it
405under the same terms as Perl itself.
406
407=cut
This page took 0.010597 seconds and 4 git commands to generate.