Some documentation fixes and enhancements.
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
1703b8e3 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
2832c235
TH
23
24use DBI;
880c3eb2
TH
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
2832c235
TH
27
28################################# Definitions ##################################
29
30# define types of information that can be gathered
31# all / groups (/ clients / hosts)
880c3eb2
TH
32my %LegalStats;
33@LegalStats{('all','groups')} = ();
2832c235
TH
34
35################################# Main program #################################
36
37### read commandline options
880c3eb2
TH
38my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
39 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
40GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
41 'clientsdb=s' => \$OptClientsDB,
42 'd|debug!' => \$OptDebug,
43 'groupsdb=s' => \$OptGroupsDB,
44 'hierarchy=s' => \$OptTLH,
45 'hostsdb=s' => \$OptHostsDB,
46 'm|month=s' => \$OptMonth,
47 'rawdb=s' => \$OptRawDB,
48 's|stats=s' => \$OptStatsType,
49 't|test!' => \$OptTest,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
2832c235
TH
52
53### read configuration
880c3eb2 54my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
2832c235
TH
55
56### override configuration via commandline options
57my %ConfOverride;
880c3eb2
TH
58$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
59$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
60$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
61$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
62$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
2832c235
TH
63&OverrideConfig(\%Conf,\%ConfOverride);
64
65### get type of information to gather, defaulting to 'all'
880c3eb2
TH
66$OptStatsType = 'all' if !$OptStatsType;
67&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
68 if !exists($LegalStats{$OptStatsType});
2832c235 69
880c3eb2
TH
70### get time period from --month
71# get verbal description of time period, drop SQL code
72my ($Period) = &GetTimePeriod($OptMonth);
73&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
74 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
2832c235 75
17ffbeba
TH
76### reformat $Conf{'TLH'}
77my $TLH;
78if ($Conf{'TLH'}) {
79 # $Conf{'TLH'} is parsed as an array by Config::Auto;
80 # make a flat list again, separated by :
43a0fc77 81 if (ref($Conf{'TLH'}) eq 'ARRAY') {
17ffbeba
TH
82 $TLH = join(':',@{$Conf{'TLH'}});
83 } else {
84 $TLH = $Conf{'TLH'};
85 }
86 # strip whitespace
87 $TLH =~ s/\s//g;
7773fb6d
TH
88 # add trailing dots if none are present yet
89 # (using negative look-behind assertions)
90 $TLH =~ s/(?<!\.):/.:/g;
91 $TLH =~ s/(?<!\.)$/./;
17ffbeba 92 # check for illegal characters
880c3eb2 93 &Bleat(2,'Config error - illegal characters in TLH definition!')
314e31aa 94 if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
7773fb6d
TH
95 # escape dots
96 $TLH =~ s/\./\\./g;
17ffbeba 97 if ($TLH =~ /:/) {
880c3eb2 98 # reformat $TLH from a:b to (a)|(b),
43a0fc77 99 # i.e. replace ':' by ')|('
17ffbeba
TH
100 $TLH =~ s/:/)|(/g;
101 $TLH = '(' . $TLH . ')';
102 };
103};
104
2832c235
TH
105### init database
106my $DBHandle = InitDB(\%Conf,1);
107
108### get data for each month
880c3eb2
TH
109&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
110foreach my $Month (&ListMonth($Period)) {
2832c235 111
880c3eb2 112 print "---------- $Month ----------\n" if $OptDebug;
2832c235 113
880c3eb2 114 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
93c8eae2
TH
115 # read list of newsgroups from --checkgroups
116 # into a hash
117 my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
118 if $OptCheckgroupsFile;
119
2832c235
TH
120 ### ----------------------------------------------
121 ### get groups data (number of postings per group)
122 # get groups data from raw table for given month
880c3eb2
TH
123 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
124 "WHERE day LIKE ? AND NOT disregard",
125 $Conf{'DBDatabase'},
126 $Conf{'DBTableRaw'}));
127 $DBQuery->execute($Month.'-%')
128 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
129 "$DBI::errstr\n",$Month,
130 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
2832c235
TH
131
132 # count postings per group
133 my %Postings;
2832c235 134 while (($_) = $DBQuery->fetchrow_array) {
b5125b10 135 # get list of newsgroups and hierarchies from Newsgroups:
880c3eb2
TH
136 my %Newsgroups = ListNewsgroups($_,$TLH,
137 $OptCheckgroupsFile ? \%ValidGroups : '');
2832c235
TH
138 # count each newsgroup and hierarchy once
139 foreach (sort keys %Newsgroups) {
2832c235
TH
140 $Postings{$_}++;
141 };
142 };
143
880c3eb2 144 # add valid but empty groups if --checkgroups is set
ad609792
TH
145 if (%ValidGroups) {
146 foreach (sort keys %ValidGroups) {
147 if (!defined($Postings{$_})) {
1703b8e3
TH
148 # add current newsgroup as empty group
149 $Postings{$_} = 0;
150 warn (sprintf("ADDED: %s as empty group\n",$_));
151 # add empty hierarchies for current newsgroup as needed
152 foreach (ParseHierarchies($_)) {
153 my $Hierarchy = $_ . '.ALL';
154 if (!defined($Postings{$Hierarchy})) {
155 $Postings{$Hierarchy} = 0;
156 warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
b5125b10
TH
157 };
158 };
ad609792
TH
159 }
160 };
161 };
b5125b10 162
71f0178b 163 # delete old data for that month
880c3eb2
TH
164 if (!$OptTest) {
165 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
166 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
167 undef,$Month)
168 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
169 "$DBI::errstr\n",$Month,
170 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
71f0178b
TH
171 };
172
880c3eb2 173 print "----- GroupStats -----\n" if $OptDebug;
2832c235 174 foreach my $Newsgroup (sort keys %Postings) {
880c3eb2
TH
175 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
176 if (!$OptTest) {
2832c235 177 # write to database
880c3eb2
TH
178 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
179 "(month,newsgroup,postings) ".
180 "VALUES (?, ?, ?)",
181 $Conf{'DBDatabase'},
182 $Conf{'DBTableGrps'}));
183 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
184 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
185 "$DBI::errstr\n",$Month,$Newsgroup,
186 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
2832c235
TH
187 $DBQuery->finish;
188 };
189 };
190 } else {
191 # other types of information go here - later on
192 };
193};
194
195### close handles
196$DBHandle->disconnect;
197
198__END__
199
200################################ Documentation #################################
201
202=head1 NAME
203
204gatherstats - process statistical data from a raw source
205
206=head1 SYNOPSIS
207
95d9fe2c 208B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
209
210=head1 REQUIREMENTS
211
880c3eb2 212See L<doc/README>.
2832c235
TH
213
214=head1 DESCRIPTION
215
216This script will extract and process statistical information from a
217database table which is fed from F<feedlog.pl> for a given time period
313610f6 218and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
219with I<'disregard'> in the database will be ignored; currently, you
220have to set this flag yourself, using your database management tools.
221You can exclude erroneous entries that way (e.g. automatic reposts
222(think of cancel floods and resurrectors); spam; ...).
2832c235
TH
223
224The time period to act on defaults to last month; you can assign
880c3eb2
TH
225another time period or a single month via the B<--month> option (see
226below).
2832c235
TH
227
228By default B<gatherstats> will process all types of information; you
880c3eb2
TH
229can change that using the B<--stats> option and assigning the type of
230information to process. Currently that doesn't matter as only
231processing of the number of postings per group per month is
232implemented anyway.
2832c235
TH
233
234Possible information types include:
235
236=over 3
237
238=item B<groups> (postings per group per month)
239
240B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
241counted for each single group they appear in. Groups not in I<TLH>
242will be ignored.
243
244B<gatherstats> will also add up the number of postings for each
245hierarchy level, but only count each posting once. A posting to
246de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
247respectively. A crossposting to de.alt.test and de.alt.admin, on the
248other hand, will be counted for de.alt.test and de.alt.admin each, but
249only once for de.alt.ALL and de.ALL.
250
880c3eb2
TH
251Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
252override that default through the B<--groupsdb> option.
2832c235
TH
253
254=back
255
256=head2 Configuration
257
880c3eb2 258B<gatherstats> will read its configuration from F<newsstats.conf>
2832c235
TH
259via Config::Auto; that file should be present in the same directory.
260
880c3eb2 261See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 262
880c3eb2
TH
263You can override configuration options via the B<--hierarchy>,
264B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
265respectively.
2832c235
TH
266
267=head1 OPTIONS
268
269=over 3
270
880c3eb2 271=item B<-V>, B<--version>
2832c235 272
880c3eb2 273Print out version and copyright information and exit.
2832c235 274
880c3eb2 275=item B<-h>, B<--help>
2832c235
TH
276
277Print this man page and exit.
278
880c3eb2 279=item B<-d>, B<--debug>
2832c235
TH
280
281Output debugging information to STDOUT while processing (number of
282postings per group).
283
880c3eb2 284=item B<-t>, B<--test>
2832c235 285
880c3eb2
TH
286Do not write results to database. You should use B<--debug> in
287conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 288
880c3eb2 289=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 290
880c3eb2
TH
291Set processing period to a single month in YYYY-MM format or to a time
292period between two months in YYYY-MM:YYYY-MM format (two months, separated
293by a colon).
2832c235 294
880c3eb2 295=item B<-s>, B<--stats> I<type>
2832c235
TH
296
297Set processing type to one of I<all> and I<groups>. Defaults to all
298(and is currently rather pointless as only I<groups> has been
299implemented).
300
93c8eae2
TH
301=item B<-c>, B<--checkgroups> I<filename template>
302
303Check each group against a list of valid newsgroups read from a file,
304one group on each line and ignoring everything after the first
305whitespace (so you can use a file in checkgroups format or (part of)
306your INN active file).
307
95d9fe2c
TH
308The filename is taken from I<filename template>, amended by each
309B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
310so that
93c8eae2
TH
311
312 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 313
93c8eae2
TH
314will check against F<checkgroups-2010-01> for January 2010, against
315F<checkgroups-2010-02> for February 2010 and so on.
ad609792 316
93c8eae2
TH
317Newsgroups not found in the checkgroups file will be dropped (and
318logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
319will be added with a count of 0 (and logged to STDERR).
320
880c3eb2 321=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
322
323Override I<TLH> from F<newsstats.conf>.
324
880c3eb2 325=item B<--rawdb> I<table> (raw data table)
2832c235
TH
326
327Override I<DBTableRaw> from F<newsstats.conf>.
328
880c3eb2 329=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
330
331Override I<DBTableGrps> from F<newsstats.conf>.
332
880c3eb2 333=item B<--clientsdb> I<table> (client data table)
2832c235
TH
334
335Override I<DBTableClnts> from F<newsstats.conf>.
336
880c3eb2 337=item B<--hostsdb> I<table> (host data table)
2832c235
TH
338
339Override I<DBTableHosts> from F<newsstats.conf>.
340
341=back
342
343=head1 INSTALLATION
344
880c3eb2 345See L<doc/INSTALL>.
2832c235
TH
346
347=head1 EXAMPLES
348
349Process all types of information for last month:
350
351 gatherstats
352
353Do a dry run, showing results of processing:
354
880c3eb2 355 gatherstats --debug --test
2832c235
TH
356
357Process all types of information for January of 2010:
358
880c3eb2 359 gatherstats --month 2010-01
2832c235 360
ad609792 361Process only number of postings for the year of 2010,
93c8eae2 362checking against checkgroups-*:
2832c235 363
93c8eae2 364 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
365
366=head1 FILES
367
368=over 4
369
370=item F<gatherstats.pl>
371
372The script itself.
373
374=item F<NewsStats.pm>
375
376Library functions for the NewsStats package.
377
378=item F<newsstats.conf>
379
880c3eb2 380Runtime configuration file.
2832c235
TH
381
382=back
383
384=head1 BUGS
385
386Please report any bugs or feature requests to the author or use the
387bug tracker at L<http://bugs.th-h.de/>!
388
389=head1 SEE ALSO
390
391=over 2
392
393=item -
394
880c3eb2 395L<doc/README>
2832c235
TH
396
397=item -
398
880c3eb2 399L<doc/INSTALL>
2832c235
TH
400
401=back
402
403This script is part of the B<NewsStats> package.
404
405=head1 AUTHOR
406
407Thomas Hochstein <thh@inter.net>
408
409=head1 COPYRIGHT AND LICENSE
410
880c3eb2 411Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
412
413This program is free software; you may redistribute it and/or modify it
414under the same terms as Perl itself.
415
416=cut
This page took 0.034294 seconds and 4 git commands to generate.