Fix some whitespace.
[usenet/newsstats.git] / bin / gatherstats.pl
CommitLineData
3f817eb4 1#! /usr/bin/perl
2832c235
TH
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
dfc2b81c 7#
2832c235
TH
8# It is part of the NewsStats package.
9#
07c0b258 10# Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
2832c235 11#
dfc2b81c 12# It can be redistributed and/or modified under the same terms under
2832c235
TH
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
2ad99c20
TH
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
2832c235
TH
20}
21use strict;
3f817eb4 22use warnings;
2832c235 23
1703b8e3 24use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
2832c235
TH
25
26use DBI;
880c3eb2
TH
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
2832c235
TH
29
30################################# Definitions ##################################
31
# types of information that can be gathered:
# all / groups (/ clients / hosts)
# Only the keys matter (they are checked with exists()); all values are undef.
my %LegalStats = map { $_ => undef } ('all','groups');
2832c235
TH
36
37################################# Main program #################################
38
### read commandline options
# one variable per option; each one is filled in by GetOptions below
my $OptCheckgroupsFile;
my $OptClientsDB;
my $OptDebug;
my $OptGroupsDB;
my $OptTLH;
my $OptHostsDB;
my $OptMonth;
my $OptRawDB;
my $OptStatsType;
my $OptTest;

# option specification => destination (variable or callback)
my @OptionSpec = (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
);

# leave with exit code 1 if the commandline could not be parsed
GetOptions(@OptionSpec) or exit 1;
2832c235
TH
54
### read configuration
# ReadConfig hands back a hash reference; work on a plain hash copy
my %Conf = %{ ReadConfig('') };

### override configuration via commandline options
# Collect every configuration key for which a (true) value was given
# on the commandline, then apply all of them in one go.
my %ConfOverride;
foreach my $Override (['DBTableRaw',   $OptRawDB],
                      ['DBTableGrps',  $OptGroupsDB],
                      ['DBTableClnts', $OptClientsDB],
                      ['DBTableHosts', $OptHostsDB],
                      ['TLH',          $OptTLH]) {
  my ($ConfKey,$OptValue) = @{$Override};
  # options not set (or set to a false value) leave the configuration alone
  next if !$OptValue;
  $ConfOverride{$ConfKey} = $OptValue;
}
&OverrideConfig(\%Conf,\%ConfOverride);
66
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
# only types listed in %LegalStats are accepted
unless (exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}
2832c235 71
880c3eb2
TH
### get time period from --month
# GetTimePeriod returns a list; only its first element (the verbal
# description of the time period) is kept, the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# a missing description or 'all time' is not acceptable here
unless ($Period and $Period ne 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
2832c235 77
17ffbeba
TH
### reformat $Conf{'TLH'}
# $TLH stays undefined if no TLH is configured
my $TLH;
if ($Conf{'TLH'}) {
  # Config::Auto may have parsed $Conf{'TLH'} into an array;
  # turn that into a flat, colon-separated list again
  $TLH = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                      : $Conf{'TLH'};
  # strip whitespace
  $TLH =~ s/\s//g;
  # make sure each entry ends in a dot
  # (negative look-behind: only add one where none is present yet)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  unless ($TLH =~ /^[a-zA-Z0-9:+.-]+$/) {
    &Bleat(2,'Config error - illegal characters in TLH definition!');
  }
  # escape dots
  $TLH =~ s/\./\\./g;
  # more than one entry: reformat from a:b to (a)|(b)
  # by replacing each ':' with ')|(' and wrapping the result in parentheses
  if (index($TLH,':') >= 0) {
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  }
}
106
2832c235
TH
### init database
my $DBHandle = InitDB(\%Conf,1);

### prepare SQL statements
# Both statements are prepared once and then re-executed with fresh bind
# values for each month / newsgroup instead of being prepared over and over.
# get groups data from raw table for a given month
my $DBSelect = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                          "WHERE day LIKE ? AND NOT disregard",
                                          $Conf{'DBDatabase'},
                                          $Conf{'DBTableRaw'}));
# write number of postings per group; nothing is written in test mode,
# so the statement is not prepared then
my $DBInsert;
if (!$OptTest) {
  $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                         "(month,newsgroup,postings) ".
                                         "VALUES (?, ?, ?)",
                                         $Conf{'DBDatabase'},
                                         $Conf{'DBTableGrps'}));
};

### get data for each month
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash;
    # declaration and conditional assignment are kept apart because the
    # behaviour of "my ... if ..." is undefined (see perlsyn)
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month;
    # the error text is passed as an argument (not as part of the format)
    # so that a '%' in it cannot be mistaken for a conversion
    $DBSelect->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBSelect->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      foreach my $Group (keys %Newsgroups) {
        $Postings{$Group}++;
      };
    };

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach my $ValidGroup (sort keys %ValidGroups) {
        next if defined($Postings{$ValidGroup});
        # add current newsgroup as empty group
        $Postings{$ValidGroup} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
        # add empty hierarchies for current newsgroup as needed
        foreach my $Level (ParseHierarchies($ValidGroup)) {
          my $Hierarchy = $Level . '.ALL';
          if (!defined($Postings{$Hierarchy})) {
            $Postings{$Hierarchy} = 0;
            warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
          };
        };
      };
    };

    # delete old data for that month
    if (!$OptTest) {
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      # test mode: report only, do not touch the database
      next if $OptTest;
      # write to database
      $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                            $Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
199
200__END__
201
202################################ Documentation #################################
203
204=head1 NAME
205
206gatherstats - process statistical data from a raw source
207
208=head1 SYNOPSIS
209
95d9fe2c 210B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
211
212=head1 REQUIREMENTS
213
880c3eb2 214See L<doc/README>.
2832c235
TH
215
216=head1 DESCRIPTION
217
218This script will extract and process statistical information from a
219database table which is fed from F<feedlog.pl> for a given time period
313610f6 220and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
221with I<'disregard'> in the database will be ignored; currently, you
222have to set this flag yourself, using your database management tools.
223You can exclude erroneous entries that way (e.g. automatic reposts
224(think of cancel floods and resurrectors); spam; ...).
2832c235
TH
225
226The time period to act on defaults to last month; you can assign
880c3eb2
TH
227another time period or a single month via the B<--month> option (see
228below).
2832c235
TH
229
230By default B<gatherstats> will process all types of information; you
880c3eb2
TH
231can change that using the B<--stats> option and assigning the type of
232information to process. Currently that doesn't matter yet as only
233processing of the number of postings per group per month is
234implemented anyway.
2832c235
TH
235
236Possible information types include:
237
238=over 3
239
240=item B<groups> (postings per group per month)
241
242B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
243counted for each single group they appear in. Groups not in I<TLH>
244will be ignored.
245
246B<gatherstats> will also add up the number of postings for each
247hierarchy level, but only count each posting once. A posting to
248de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
249respectively. A crossposting to de.alt.test and de.alt.admin, on the
250other hand, will be counted for de.alt.test and de.alt.admin each, but
251only once for de.alt.ALL and de.ALL.
252
880c3eb2
TH
253Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
254override that default through the B<--groupsdb> option.
2832c235
TH
255
256=back
257
258=head2 Configuration
259
880c3eb2 260B<gatherstats> will read its configuration from F<newsstats.conf>
2832c235
TH
261which should be present in the same directory via Config::Auto.
262
880c3eb2 263See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 264
880c3eb2
TH
265You can override configuration options via the B<--hierarchy>,
266B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
267respectively.
2832c235
TH
268
269=head1 OPTIONS
270
271=over 3
272
880c3eb2 273=item B<-V>, B<--version>
2832c235 274
880c3eb2 275Print out version and copyright information and exit.
2832c235 276
880c3eb2 277=item B<-h>, B<--help>
2832c235
TH
278
279Print this man page and exit.
280
880c3eb2 281=item B<-d>, B<--debug>
2832c235
TH
282
283Output debugging information to STDOUT while processing (number of
284postings per group).
285
880c3eb2 286=item B<-t>, B<--test>
2832c235 287
880c3eb2
TH
288Do not write results to database. You should use B<--debug> in
289conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 290
880c3eb2 291=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 292
880c3eb2
TH
293Set processing period to a single month in YYYY-MM format or to a time
294period between two months in YYYY-MM:YYYY-MM format (two months, separated
dfc2b81c 295by a colon).
2832c235 296
880c3eb2 297=item B<-s>, B<--stats> I<type>
2832c235
TH
298
299Set processing type to one of I<all> and I<groups>. Defaults to all
300(and is currently rather pointless as only I<groups> has been
301implemented).
302
93c8eae2
TH
303=item B<-c>, B<--checkgroups> I<filename template>
304
305Check each group against a list of valid newsgroups read from a file,
306one group on each line and ignoring everything after the first
307whitespace (so you can use a file in checkgroups format or (part of)
308your INN active file).
309
95d9fe2c
TH
310The filename is taken from I<filename template>, amended by each
311B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
312so that
93c8eae2
TH
313
314 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 315
93c8eae2
TH
316will check against F<checkgroups-2010-01> for January 2010, against
317F<checkgroups-2010-02> for February 2010 and so on.
ad609792 318
93c8eae2
TH
319Newsgroups not found in the checkgroups file will be dropped (and
320logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
321will be added with a count of 0 (and logged to STDERR).
322
880c3eb2 323=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
324
325Override I<TLH> from F<newsstats.conf>.
326
880c3eb2 327=item B<--rawdb> I<table> (raw data table)
2832c235
TH
328
329Override I<DBTableRaw> from F<newsstats.conf>.
330
880c3eb2 331=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
332
333Override I<DBTableGrps> from F<newsstats.conf>.
334
880c3eb2 335=item B<--clientsdb> I<table> (client data table)
2832c235
TH
336
337Override I<DBTableClnts> from F<newsstats.conf>.
338
880c3eb2 339=item B<--hostsdb> I<table> (host data table)
2832c235
TH
340
341Override I<DBTableHosts> from F<newsstats.conf>.
342
343=back
344
345=head1 INSTALLATION
346
880c3eb2 347See L<doc/INSTALL>.
2832c235
TH
348
349=head1 EXAMPLES
350
351Process all types of information for last month:
352
353 gatherstats
354
355Do a dry run, showing results of processing:
356
880c3eb2 357 gatherstats --debug --test
2832c235
TH
358
359Process all types of information for January of 2010:
360
880c3eb2 361 gatherstats --month 2010-01
2832c235 362
ad609792 363Process only number of postings for the year of 2010,
93c8eae2 364checking against checkgroups-*:
2832c235 365
93c8eae2 366 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
367
368=head1 FILES
369
370=over 4
371
2ad99c20 372=item F<bin/gatherstats.pl>
2832c235
TH
373
374The script itself.
375
2ad99c20 376=item F<lib/NewsStats.pm>
2832c235
TH
377
378Library functions for the NewsStats package.
379
2ad99c20 380=item F<etc/newsstats.conf>
2832c235 381
880c3eb2 382Runtime configuration file.
2832c235
TH
383
384=back
385
386=head1 BUGS
387
388Please report any bugs or feature requests to the author or use the
389bug tracker at L<http://bugs.th-h.de/>!
390
391=head1 SEE ALSO
392
393=over 2
394
395=item -
396
880c3eb2 397L<doc/README>
2832c235
TH
398
399=item -
400
880c3eb2 401L<doc/INSTALL>
2832c235
TH
402
403=back
404
405This script is part of the B<NewsStats> package.
406
407=head1 AUTHOR
408
409Thomas Hochstein <thh@inter.net>
410
411=head1 COPYRIGHT AND LICENSE
412
880c3eb2 413Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
414
415This program is free software; you may redistribute it and/or modify it
416under the same terms as Perl itself.
417
418=cut
This page took 0.037127 seconds and 4 git commands to generate.