Merge branch 'thh-small-changes' into next
[usenet/newsstats.git] / bin / gatherstats.pl
CommitLineData
3f817eb4 1#! /usr/bin/perl
2832c235
TH
2#
3# gatherstats.pl
4#
# This script will gather statistical information from a database
# containing headers and other information from an INN feed.
dfc2b81c 7#
2832c235
TH
8# It is part of the NewsStats package.
9#
07c0b258 10# Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
2832c235 11#
dfc2b81c 12# It can be redistributed and/or modified under the same terms under
2832c235
TH
13# which Perl itself is published.
14
# Compile-time setup: record the script version and extend the module
# search path so that the bundled NewsStats library can be loaded.
BEGIN {
    use File::Basename;
    our $VERSION = "0.02";
    # this script lives in .../bin, so its module sits in the sibling ../lib
    my $LibDir = dirname($0) . '/../lib';
    push @INC, $LibDir;
}
21use strict;
3f817eb4 22use warnings;
2832c235 23
1703b8e3 24use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
2832c235
TH
25
26use DBI;
880c3eb2
TH
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
2832c235
TH
29
30################################# Definitions ##################################
31
# Types of information that can be gathered:
# all / groups (/ clients / hosts)
# Only the hash keys matter (they are tested with exists()); every value
# stays undefined.
my %LegalStats = map { $_ => undef } ('all', 'groups');
2832c235
TH
36
37################################# Main program #################################
38
### read commandline options
# One variable per option; each one stays undefined unless the matching
# option is given on the command line.
my $OptCheckgroupsFile;    # -c / --checkgroups
my $OptClientsDB;          # --clientsdb
my $OptDebug;              # -d / --debug
my $OptGroupsDB;           # --groupsdb
my $OptTLH;                # --hierarchy
my $OptHostsDB;            # --hostsdb
my $OptMonth;              # -m / --month
my $OptRawDB;              # --rawdb
my $OptStatsType;          # -s / --stats
my $OptTest;               # -t / --test
my $OptConfFile;           # --conffile

# leave with exit status 1 if the command line cannot be parsed;
# --help and --version are dispatched to ShowPOD and ShowVersion
exit 1 unless GetOptions(
    'c|checkgroups=s' => \$OptCheckgroupsFile,
    'clientsdb=s'     => \$OptClientsDB,
    'd|debug!'        => \$OptDebug,
    'groupsdb=s'      => \$OptGroupsDB,
    'hierarchy=s'     => \$OptTLH,
    'hostsdb=s'       => \$OptHostsDB,
    'm|month=s'       => \$OptMonth,
    'rawdb=s'         => \$OptRawDB,
    's|stats=s'       => \$OptStatsType,
    't|test!'         => \$OptTest,
    'conffile=s'      => \$OptConfFile,
    'h|help'          => \&ShowPOD,
    'V|version'       => \&ShowVersion,
);
2832c235
TH
55
### read configuration
# ReadConfig hands back a hash reference; work on a copy in %Conf
my $ConfRef = ReadConfig($OptConfFile);
my %Conf    = %{$ConfRef};

### override configuration via commandline options
# only options given with a true value end up in the override hash
my %ConfOverride;
foreach my $Override (
    [ 'DBTableRaw',   $OptRawDB     ],
    [ 'DBTableGrps',  $OptGroupsDB  ],
    [ 'DBTableClnts', $OptClientsDB ],
    [ 'DBTableHosts', $OptHostsDB   ],
    [ 'TLH',          $OptTLH       ],
) {
    my ($Key, $Value) = @{$Override};
    next if !$Value;
    $ConfOverride{$Key} = $Value;
}
&OverrideConfig(\%Conf,\%ConfOverride);
67
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
# complain about any type that is not a key of %LegalStats
if (!exists($LegalStats{$OptStatsType})) {
    &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# GetTimePeriod is called in list context and only its first return value
# (the verbal description of the time period) is kept; the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# bail out if --month is invalid or set to 'ALL';
# we don't support the latter
if (!$Period or $Period eq 'all time') {
    &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
             "'YYYY-MM:YYYY-MM'!");
}
2832c235 80
17ffbeba
TH
### reformat $Conf{'TLH'}
# Turn the configured top level hierarchies into a single pattern string:
# every hierarchy ends in an (escaped) dot, and several hierarchies are
# combined as alternatives, i.e. a:b becomes (a\.)|(b\.).
# $TLH stays undefined if no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
    # $Conf{'TLH'} is parsed as an array by Config::Auto;
    # make a flat list again, separated by :
    $TLH = ref($Conf{'TLH'}) eq 'ARRAY'
         ? join(':', @{$Conf{'TLH'}})
         : $Conf{'TLH'};
    # strip whitespace
    $TLH =~ s/\s//g;
    # add trailing dots if none are present yet
    # (using negative look-behind assertions)
    $TLH =~ s/(?<!\.):/.:/g;
    $TLH =~ s/(?<!\.)$/./;
    # check for illegal characters
    Bleat(2,'Config error - illegal characters in TLH definition!')
        if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
    # escape dots and plus signs: '+' passes the check above, but would act
    # as a quantifier instead of a literal character if left unescaped
    # NOTE(review): assumes ListNewsgroups interpolates $TLH into a regular
    # expression (as the dot escaping and the alternation suggest) - confirm
    $TLH =~ s/([.+])/\\$1/g;
    if ($TLH =~ /:/) {
        # reformat $TLH from a:b to (a)|(b),
        # e.g. replace ':' by ')|('
        $TLH =~ s/:/)|(/g;
        $TLH = '(' . $TLH . ')';
    }
}
109
2832c235
TH
### init database
# NOTE(review): InitDB comes from NewsStats; the meaning of its second
# argument (1) is not visible in this file
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period: count the postings per newsgroup
# and hierarchy from the raw table and replace that month's rows in the
# groups table (nothing is written in test mode).
Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (ListMonth($Period)) {

    print "---------- $Month ----------\n" if $OptDebug;

    if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
        # read list of newsgroups from --checkgroups into a hash;
        # declaration and conditional assignment are kept apart because the
        # behaviour of "my ... if ..." is undefined in Perl
        my %ValidGroups;
        if ($OptCheckgroupsFile) {
            %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))};
        }

        ### ----------------------------------------------
        ### get groups data (number of postings per group)
        # get groups data from raw table for given month
        my $RawQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                                  "WHERE day LIKE ? AND NOT disregard",
                                                  $Conf{'DBDatabase'},
                                                  $Conf{'DBTableRaw'}));
        # the DBI error text is passed as an argument, not as part of the
        # format string, so a '%' in it cannot garble the message
        $RawQuery->execute($Month.'-%')
            or Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                               $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                               $DBI::errstr));

        # count postings per group
        my %Postings;
        while (my ($NewsgroupsHeader) = $RawQuery->fetchrow_array) {
            # get list of newsgroups and hierarchies from Newsgroups:
            my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                            $OptCheckgroupsFile ? \%ValidGroups : '');
            # count each newsgroup and hierarchy once
            $Postings{$_}++ foreach keys %Newsgroups;
        }

        # add valid but empty groups if --checkgroups is set
        # (%ValidGroups is empty otherwise, so the loop does nothing)
        foreach my $Group (sort keys %ValidGroups) {
            next if defined($Postings{$Group});
            # add current newsgroup as empty group
            $Postings{$Group} = 0;
            warn (sprintf("ADDED: %s as empty group\n",$Group));
            # add empty hierarchies for current newsgroup as needed
            foreach my $Level (ParseHierarchies($Group)) {
                my $Hierarchy = $Level . '.ALL';
                next if defined($Postings{$Hierarchy});
                $Postings{$Hierarchy} = 0;
                warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
            }
        }

        my $InsertQuery;
        if (!$OptTest) {
            # delete old data for that month
            $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                                  $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                          undef,$Month)
                or Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                                   $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                                   $DBI::errstr));
            # prepare the INSERT once per month instead of once per newsgroup
            $InsertQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                                      "(month,newsgroup,postings) ".
                                                      "VALUES (?, ?, ?)",
                                                      $Conf{'DBDatabase'},
                                                      $Conf{'DBTableGrps'}));
        }

        print "----- GroupStats -----\n" if $OptDebug;
        foreach my $Newsgroup (sort keys %Postings) {
            print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
            next if $OptTest;
            # write to database
            $InsertQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
                or Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                                   $Month,$Newsgroup,
                                   $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                                   $DBI::errstr));
        }
    } else {
        # other types of information go here - later on
    }
}

### close handles
$DBHandle->disconnect;
202
203__END__
204
205################################ Documentation #################################
206
207=head1 NAME
208
209gatherstats - process statistical data from a raw source
210
211=head1 SYNOPSIS
212
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
2832c235
TH
214
215=head1 REQUIREMENTS
216
880c3eb2 217See L<doc/README>.
2832c235
TH
218
219=head1 DESCRIPTION
220
221This script will extract and process statistical information from a
222database table which is fed from F<feedlog.pl> for a given time period
313610f6 223and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
224with I<'disregard'> in the database will be ignored; currently, you
225have to set this flag yourself, using your database management tools.
226You can exclude erroneous entries that way (e.g. automatic reposts
227(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
228
229The time period to act on defaults to last month; you can assign
880c3eb2
TH
230another time period or a single month via the B<--month> option (see
231below).
2832c235
TH
232
233By default B<gatherstats> will process all types of information; you
880c3eb2
TH
234can change that using the B<--stats> option and assigning the type of
235information to process. Currently that doesn't matter yet as only
236processing of the number of postings per group per month is
237implemented anyway.
2832c235
TH
238
239Possible information types include:
240
241=over 3
242
243=item B<groups> (postings per group per month)
244
245B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
246counted for each single group they appear in. Groups not in I<TLH>
247will be ignored.
248
249B<gatherstats> will also add up the number of postings for each
250hierarchy level, but only count each posting once. A posting to
251de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
252respectively. A crossposting to de.alt.test and de.alt.admin, on the
253other hand, will be counted for de.alt.test and de.alt.admin each, but
254only once for de.alt.ALL and de.ALL.
255
880c3eb2
TH
256Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
257override that default through the B<--groupsdb> option.
2832c235
TH
258
259=back
260
261=head2 Configuration
262
B<gatherstats> will read its configuration via Config::Auto from
F<newsstats.conf>, which should be present in etc/, or from a
configuration file given by the B<--conffile> option.
2832c235 266
880c3eb2 267See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 268
880c3eb2
TH
269You can override configuration options via the B<--hierarchy>,
270B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
271respectively.
2832c235
TH
272
273=head1 OPTIONS
274
275=over 3
276
880c3eb2 277=item B<-V>, B<--version>
2832c235 278
880c3eb2 279Print out version and copyright information and exit.
2832c235 280
880c3eb2 281=item B<-h>, B<--help>
2832c235
TH
282
283Print this man page and exit.
284
880c3eb2 285=item B<-d>, B<--debug>
2832c235
TH
286
287Output debugging information to STDOUT while processing (number of
288postings per group).
289
880c3eb2 290=item B<-t>, B<--test>
2832c235 291
880c3eb2
TH
292Do not write results to database. You should use B<--debug> in
293conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 294
880c3eb2 295=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 296
880c3eb2
TH
Set processing period to a single month in YYYY-MM format or to a time
period between two months in YYYY-MM:YYYY-MM format (two months, separated
by a colon).
2832c235 300
880c3eb2 301=item B<-s>, B<--stats> I<type>
2832c235
TH
302
303Set processing type to one of I<all> and I<groups>. Defaults to all
304(and is currently rather pointless as only I<groups> has been
305implemented).
306
93c8eae2
TH
307=item B<-c>, B<--checkgroups> I<filename template>
308
309Check each group against a list of valid newsgroups read from a file,
310one group on each line and ignoring everything after the first
311whitespace (so you can use a file in checkgroups format or (part of)
312your INN active file).
313
95d9fe2c
TH
314The filename is taken from I<filename template>, amended by each
315B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
316so that
93c8eae2
TH
317
318 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 319
93c8eae2
TH
320will check against F<checkgroups-2010-01> for January 2010, against
321F<checkgroups-2010-02> for February 2010 and so on.
ad609792 322
93c8eae2
TH
323Newsgroups not found in the checkgroups file will be dropped (and
324logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
325will be added with a count of 0 (and logged to STDERR).
326
880c3eb2 327=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
328
329Override I<TLH> from F<newsstats.conf>.
330
880c3eb2 331=item B<--rawdb> I<table> (raw data table)
2832c235
TH
332
333Override I<DBTableRaw> from F<newsstats.conf>.
334
880c3eb2 335=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
336
337Override I<DBTableGrps> from F<newsstats.conf>.
338
880c3eb2 339=item B<--clientsdb> I<table> (client data table)
2832c235
TH
340
341Override I<DBTableClnts> from F<newsstats.conf>.
342
880c3eb2 343=item B<--hostsdb> I<table> (host data table)
2832c235
TH
344
345Override I<DBTableHosts> from F<newsstats.conf>.
346
23ab67a0
TH
347=item B<--conffile> I<filename>
348
349Load configuration from I<filename> instead of F<newsstats.conf>.
350
2832c235
TH
351=back
352
353=head1 INSTALLATION
354
880c3eb2 355See L<doc/INSTALL>.
2832c235
TH
356
357=head1 EXAMPLES
358
Process all types of information for last month:
360
361 gatherstats
362
363Do a dry run, showing results of processing:
364
880c3eb2 365 gatherstats --debug --test
2832c235
TH
366
367Process all types of information for January of 2010:
368
880c3eb2 369 gatherstats --month 2010-01
2832c235 370
ad609792 371Process only number of postings for the year of 2010,
93c8eae2 372checking against checkgroups-*:
2832c235 373
93c8eae2 374 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
375
376=head1 FILES
377
378=over 4
379
2ad99c20 380=item F<bin/gatherstats.pl>
2832c235
TH
381
382The script itself.
383
2ad99c20 384=item F<lib/NewsStats.pm>
2832c235
TH
385
386Library functions for the NewsStats package.
387
2ad99c20 388=item F<etc/newsstats.conf>
2832c235 389
880c3eb2 390Runtime configuration file.
2832c235
TH
391
392=back
393
394=head1 BUGS
395
396Please report any bugs or feature requests to the author or use the
397bug tracker at L<http://bugs.th-h.de/>!
398
399=head1 SEE ALSO
400
401=over 2
402
403=item -
404
880c3eb2 405L<doc/README>
2832c235
TH
406
407=item -
408
880c3eb2 409L<doc/INSTALL>
2832c235
TH
410
411=back
412
413This script is part of the B<NewsStats> package.
414
415=head1 AUTHOR
416
417Thomas Hochstein <thh@inter.net>
418
419=head1 COPYRIGHT AND LICENSE
420
28717921 421Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
2832c235
TH
422
423This program is free software; you may redistribute it and/or modify it
424under the same terms as Perl itself.
425
426=cut
This page took 0.03842 seconds and 4 git commands to generate.