Fix ea91003a99e679caa5bf56915f085a7c4931805b.
[usenet/newsstats.git] / bin / gatherstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
BEGIN {
    use File::Basename;
    our $VERSION = "0.01";
    # This script lives in .../bin, so the accompanying module is
    # looked up in the sibling ../lib directory (appended to @INC).
    push @INC, dirname($0) . '/../lib';
}
21use strict;
22use warnings;
23
24use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
25
26use DBI;
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
29
30################################# Definitions ##################################
31
# Types of information that can be gathered:
# all / groups (/ clients / hosts).
# Only the keys are significant; the values are left undefined.
my %LegalStats = map { $_ => undef } qw(all groups);
36
37################################# Main program #################################
38
### read commandline options
# One variable per option; they stay undefined unless the option is given.
my $OptCheckgroupsFile;
my $OptClientsDB;
my $OptDebug;
my $OptGroupsDB;
my $OptTLH;
my $OptHostsDB;
my $OptMonth;
my $OptRawDB;
my $OptStatsType;
my $OptTest;
my $OptConfFile;

# Leave with exit status 1 if the command line cannot be parsed.
GetOptions(
    'c|checkgroups=s' => \$OptCheckgroupsFile,
    'clientsdb=s'     => \$OptClientsDB,
    'd|debug!'        => \$OptDebug,
    'groupsdb=s'      => \$OptGroupsDB,
    'hierarchy=s'     => \$OptTLH,
    'hostsdb=s'       => \$OptHostsDB,
    'm|month=s'       => \$OptMonth,
    'rawdb=s'         => \$OptRawDB,
    's|stats=s'       => \$OptStatsType,
    't|test!'         => \$OptTest,
    'conffile=s'      => \$OptConfFile,
    'h|help'          => \&ShowPOD,
    'V|version'       => \&ShowVersion,
) or exit 1;
55
### read configuration
my %Conf = %{ ReadConfig($OptConfFile) };

### override configuration via commandline options
# Each pair maps a configuration key to the commandline value that may
# replace it; only values that are true are taken over.
my %ConfOverride;
my @ConfOverrideSources = (
    [ 'DBTableRaw',   $OptRawDB     ],
    [ 'DBTableGrps',  $OptGroupsDB  ],
    [ 'DBTableClnts', $OptClientsDB ],
    [ 'DBTableHosts', $OptHostsDB   ],
    [ 'TLH',          $OptTLH       ],
);
foreach my $Source (@ConfOverrideSources) {
    my ($ConfKey, $OptValue) = @{$Source};
    next if !$OptValue;
    $ConfOverride{$ConfKey} = $OptValue;
}
&OverrideConfig(\%Conf,\%ConfOverride);
67
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
# complain (via Bleat, level 2) about any type that is not a key of %LegalStats
unless (exists $LegalStats{$OptStatsType}) {
    &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}
72
### get time period from --month
# only the first element returned by GetTimePeriod (the verbal
# description of the time period) is kept; the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# an invalid --month and --month 'ALL' are both rejected;
# the latter is not supported here
if (!$Period or $Period eq 'all time') {
    &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
             "'YYYY-MM:YYYY-MM'!");
}
80
### reformat $Conf{'TLH'}
my $TLH;
if ($Conf{'TLH'}) {
    # Config::Auto may deliver $Conf{'TLH'} as an array reference;
    # turn it back into one flat, colon-separated string
    $TLH = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                        : $Conf{'TLH'};
    # remove all whitespace
    $TLH =~ s/\s//g;
    # make every entry end in a dot: insert one before each ':' and at
    # the end of the string wherever none is present yet
    # (negative look-behind assertions)
    $TLH =~ s/(?<!\.):/.:/g;
    $TLH =~ s/(?<!\.)$/./;
    # reject anything but letters, digits and ':', '+', '.', '-'
    unless ($TLH =~ /^[a-zA-Z0-9:+.-]+$/) {
        &Bleat(2,'Config error - illegal characters in TLH definition!');
    }
    # escape the dots with a backslash
    $TLH =~ s/\./\\./g;
    # more than one entry: turn a:b into (a)|(b)
    # by replacing each ':' with ')|(' and wrapping the result
    if (index($TLH, ':') >= 0) {
        $TLH =~ s/:/)|(/g;
        $TLH = '(' . $TLH . ')';
    }
}
109
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period: count the postings per
# newsgroup and per hierarchy level from the raw table and (unless
# --test is set) replace that month's rows in the groups table.
Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (ListMonth($Period)) {

    print "---------- $Month ----------\n" if $OptDebug;

    if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
        # read list of newsgroups for this month from the --checkgroups
        # file (<template>-<month>) into a hash
        # The declaration is deliberately separated from the conditional
        # assignment: "my ... if ..." has undefined behaviour (perlsyn).
        my %ValidGroups;
        %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
            if $OptCheckgroupsFile;

        ### ----------------------------------------------
        ### get groups data (number of postings per group)
        # get groups data from raw table for given month
        my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                                 "WHERE day LIKE ? AND NOT disregard",
                                                 $Conf{'DBDatabase'},
                                                 $Conf{'DBTableRaw'}));
        # the driver's error text is passed as an argument, not as part
        # of the format string, so a '%' in it cannot garble the message
        unless ($DBQuery and $DBQuery->execute($Month.'-%')) {
            Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                            $DBI::errstr));
        }

        # count postings per group
        my %Postings;
        while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
            # get list of newsgroups and hierarchies from Newsgroups:
            my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                            $OptCheckgroupsFile ? \%ValidGroups : '');
            # count each newsgroup and hierarchy once
            $Postings{$_}++ foreach keys %Newsgroups;
        }

        # add valid but empty groups if --checkgroups is set
        # (%ValidGroups is empty otherwise, so nothing happens)
        foreach my $Group (sort keys %ValidGroups) {
            next if defined($Postings{$Group});
            # add current newsgroup as empty group
            $Postings{$Group} = 0;
            warn (sprintf("ADDED: %s as empty group\n",$Group));
            # add empty hierarchies for current newsgroup as needed
            foreach my $Level (ParseHierarchies($Group)) {
                my $Hierarchy = $Level . '.ALL';
                next if defined($Postings{$Hierarchy});
                $Postings{$Hierarchy} = 0;
                warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
            }
        }

        my $DBInsert;
        if (!$OptTest) {
            # delete old data for that month
            $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                                  $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                          undef,$Month)
                or Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                                   $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                                   $DBI::errstr));

            # prepare the INSERT once per month instead of once per newsgroup
            $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                                   "(month,newsgroup,postings) ".
                                                   "VALUES (?, ?, ?)",
                                                   $Conf{'DBDatabase'},
                                                   $Conf{'DBTableGrps'}))
                or Bleat(2,sprintf("Can't write groups data for %s to %s.%s: %s\n",
                                   $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                                   $DBI::errstr));
        }

        print "----- GroupStats -----\n" if $OptDebug;
        foreach my $Newsgroup (sort keys %Postings) {
            print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
            # test mode: nothing is written to the database
            next if $OptTest;
            # write to database
            $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
                or Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                                   $Month,$Newsgroup,
                                   $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                                   $DBI::errstr));
        }
    } else {
        # other types of information go here - later on
    }
}

### close handles
$DBHandle->disconnect;
202
203__END__
204
205################################ Documentation #################################
206
207=head1 NAME
208
209gatherstats - process statistical data from a raw source
210
211=head1 SYNOPSIS
212
213B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
214
215=head1 REQUIREMENTS
216
217See L<doc/README>.
218
219=head1 DESCRIPTION
220
221This script will extract and process statistical information from a
222database table which is fed from F<feedlog.pl> for a given time period
223and write its results to (an)other database table(s). Entries marked
224with I<'disregard'> in the database will be ignored; currently, you
225have to set this flag yourself, using your database management tools.
226You can exclude erroneous entries that way (e.g. automatic reposts
227(think of cancels flood and resurrectors); spam; ...).
228
229The time period to act on defaults to last month; you can assign
230another time period or a single month via the B<--month> option (see
231below).
232
233By default B<gatherstats> will process all types of information; you
234can change that using the B<--stats> option and assigning the type of
235information to process. Currently that doesn't matter yet as only
236processing of the number of postings per group per month is
237implemented anyway.
238
239Possible information types include:
240
241=over 3
242
243=item B<groups> (postings per group per month)
244
245B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
246counted for each single group they appear in. Groups not in I<TLH>
247will be ignored.
248
249B<gatherstats> will also add up the number of postings for each
250hierarchy level, but only count each posting once. A posting to
251de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
252respectively. A crossposting to de.alt.test and de.alt.admin, on the
253other hand, will be counted for de.alt.test and de.alt.admin each, but
254only once for de.alt.ALL and de.ALL.
255
256Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
257override that default through the B<--groupsdb> option.
258
259=back
260
261=head2 Configuration
262
263B<gatherstats> will read its configuration via Config::Auto from
264F<newsstats.conf>, which should be present in the same directory.
265
266See L<doc/INSTALL> for an overview of possible configuration options.
267
268You can override configuration options via the B<--hierarchy>,
269B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
270respectively.
271
272=head1 OPTIONS
273
274=over 3
275
276=item B<-V>, B<--version>
277
278Print out version and copyright information and exit.
279
280=item B<-h>, B<--help>
281
282Print this man page and exit.
283
284=item B<-d>, B<--debug>
285
286Output debugging information to STDOUT while processing (number of
287postings per group).
288
289=item B<-t>, B<--test>
290
291Do not write results to database. You should use B<--debug> in
292conjunction with B<--test> ... everything else seems a bit pointless.
293
294=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
295
296Set processing period to a single month in YYYY-MM format or to a time
297period between two months in YYYY-MM:YYYY-MM format (two months, separated
298by a colon).
299
300=item B<-s>, B<--stats> I<type>
301
302Set processing type to one of I<all> and I<groups>. Defaults to all
303(and is currently rather pointless as only I<groups> has been
304implemented).
305
306=item B<-c>, B<--checkgroups> I<filename template>
307
308Check each group against a list of valid newsgroups read from a file,
309one group on each line and ignoring everything after the first
310whitespace (so you can use a file in checkgroups format or (part of)
311your INN active file).
312
313The filename is taken from I<filename template>, amended by each
314B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
315so that
316
317 gatherstats -m 2010-01:2010-12 -c checkgroups
318
319will check against F<checkgroups-2010-01> for January 2010, against
320F<checkgroups-2010-02> for February 2010 and so on.
321
322Newsgroups not found in the checkgroups file will be dropped (and
323logged to STDERR), and newsgroups found there but having no postings
324will be added with a count of 0 (and logged to STDERR).
325
326=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
327
328Override I<TLH> from F<newsstats.conf>.
329
330=item B<--rawdb> I<table> (raw data table)
331
332Override I<DBTableRaw> from F<newsstats.conf>.
333
334=item B<--groupsdb> I<table> (postings per group table)
335
336Override I<DBTableGrps> from F<newsstats.conf>.
337
338=item B<--clientsdb> I<table> (client data table)
339
340Override I<DBTableClnts> from F<newsstats.conf>.
341
342=item B<--hostsdb> I<table> (host data table)
343
344Override I<DBTableHosts> from F<newsstats.conf>.
345
346=item B<--conffile> I<filename>
347
348Load configuration from I<filename> instead of F<newsstats.conf>.
349
350=back
351
352=head1 INSTALLATION
353
354See L<doc/INSTALL>.
355
356=head1 EXAMPLES
357
358Process all types of information for last month:
359
360 gatherstats
361
362Do a dry run, showing results of processing:
363
364 gatherstats --debug --test
365
366Process all types of information for January of 2010:
367
368 gatherstats --month 2010-01
369
370Process only number of postings for the year of 2010,
371checking against checkgroups-*:
372
373 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
374
375=head1 FILES
376
377=over 4
378
379=item F<bin/gatherstats.pl>
380
381The script itself.
382
383=item F<lib/NewsStats.pm>
384
385Library functions for the NewsStats package.
386
387=item F<etc/newsstats.conf>
388
389Runtime configuration file.
390
391=back
392
393=head1 BUGS
394
395Please report any bugs or feature requests to the author or use the
396bug tracker at L<http://bugs.th-h.de/>!
397
398=head1 SEE ALSO
399
400=over 2
401
402=item -
403
404L<doc/README>
405
406=item -
407
408L<doc/INSTALL>
409
410=back
411
412This script is part of the B<NewsStats> package.
413
414=head1 AUTHOR
415
416Thomas Hochstein <thh@inter.net>
417
418=head1 COPYRIGHT AND LICENSE
419
420Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
421
422This program is free software; you may redistribute it and/or modify it
423under the same terms as Perl itself.
424
425=cut
This page took 0.011746 seconds and 4 git commands to generate.