Merge branch 'feedlog' into next
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
ad609792 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
2832c235
TH
23
24use DBI;
880c3eb2
TH
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
2832c235
TH
27
28################################# Definitions ##################################
29
# Types of information that may be requested via --stats:
#   all / groups   (clients and hosts are not offered here)
# Only the keys are significant - they are looked up with exists();
# every value is left undefined.
my %LegalStats = map { $_ => undef } qw(all groups);
2832c235
TH
34
35################################# Main program #################################
36
### read commandline options
# One lexical per option; each stays undefined unless the matching
# switch is present on the command line.
my $OptCheckgroupsFile;   # -c / --checkgroups  (string)
my $OptClientsDB;         # --clientsdb         (string)
my $OptDebug;             # -d / --debug        (negatable flag)
my $OptGroupsDB;          # --groupsdb          (string)
my $OptTLH;               # --hierarchy         (string)
my $OptHostsDB;           # --hostsdb           (string)
my $OptMonth;             # -m / --month        (string)
my $OptRawDB;             # --rawdb             (string)
my $OptStatsType;         # -s / --stats        (string)
my $OptTest;              # -t / --test         (negatable flag)

# --help and --version are dispatched straight to ShowPOD / ShowVersion;
# if GetOptions reports a parsing failure the script exits with status 1.
GetOptions(
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
) or exit 1;
2832c235
TH
52
### read configuration
# ReadConfig returns a hash reference; work on a copy of that hash.
my %Conf = %{ ReadConfig($HomePath . '/newsstats.conf') };

### override configuration via commandline options
# Only options that carry a true value make it into the override hash,
# so an option that was not given never masks a configured setting.
my %ConfOverride;
foreach my $Pair (['DBTableRaw',   $OptRawDB],
                  ['DBTableGrps',  $OptGroupsDB],
                  ['DBTableClnts', $OptClientsDB],
                  ['DBTableHosts', $OptHostsDB],
                  ['TLH',          $OptTLH]) {
  my ($ConfKey, $OptValue) = @{$Pair};
  next unless $OptValue;
  $ConfOverride{$ConfKey} = $OptValue;
}
&OverrideConfig(\%Conf, \%ConfOverride);
64
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
unless (exists $LegalStats{$OptStatsType}) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# GetTimePeriod is called in list context; only its first return value
# (the verbal description of the period) is kept, the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# an empty period or 'all time' is rejected here
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
2832c235 75
17ffbeba
TH
### reformat $Conf{'TLH'}
# Turn the configured top-level hierarchies into a pattern fragment that
# is handed to ListNewsgroups further down, e.g.
#   'de'      ->  'de\.'
#   'de:at.'  ->  '(de\.)|(at\.)'
# $TLH stays undefined if no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  if (ref($Conf{'TLH'}) eq 'ARRAY') {
    $TLH = join(':',@{$Conf{'TLH'}});
  } else {
    $TLH = $Conf{'TLH'};
  }
  # strip whitespace
  $TLH =~ s/\s//g;
  # add trailing dots if none are present yet
  # (using negative look-behind assertions)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
  # escape every legal character that is special in a regular expression:
  # the dot and the plus sign ('-' is literal outside a character class).
  # NOTE(review): this assumes ListNewsgroups matches $TLH as a regular
  # expression, which the dot escaping and the (a)|(b) rewrite below
  # imply - confirm against NewsStats.pm. Unescaped, a '+' in a
  # hierarchy name would act as a quantifier instead of a literal.
  $TLH =~ s/([.+])/\\$1/g;
  if ($TLH =~ /:/) {
    # reformat $TLH from a:b to (a)|(b),
    # e.g. replace ':' by ')|('
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  };
};
104
2832c235
TH
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period: read the Newsgroups: entries
# from the raw table, count postings per group and hierarchy, optionally
# add valid but empty groups, and replace that month's rows in the groups
# table (unless --test is set).
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash;
    # the declaration is kept separate from the conditional assignment
    # because "my ... if ..." has undefined behaviour in Perl
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableRaw'}));
    # $DBI::errstr is passed as an argument, not interpolated into the
    # format, so that a '%' in the driver message cannot garble the output
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach (keys %Newsgroups);
    };

    # add valid but empty groups if --checkgroups is set
    # (%ValidGroups is empty otherwise, so nothing happens then)
    foreach my $ValidGroup (sort keys %ValidGroups) {
      next if defined($Postings{$ValidGroup});
      # expand newsgroup with hierarchies and add each empty newsgroup
      # and empty hierarchies, too, as needed
      foreach my $Level (ParseHierarchies($ValidGroup)) {
        next if defined($Postings{$Level});
        $Postings{$Level} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$Level));
      };
    };

    my $DBInsert;
    if (!$OptTest) {
      # delete old data for that month
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
      # prepare the INSERT once per month instead of once per newsgroup
      $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                             "(month,newsgroup,postings) ".
                                             "VALUES (?, ?, ?)",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableGrps'}));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      next if $OptTest;
      # write to database
      $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                            $Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
195
196__END__
197
198################################ Documentation #################################
199
200=head1 NAME
201
202gatherstats - process statistical data from a raw source
203
204=head1 SYNOPSIS
205
93c8eae2 206B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats> [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
207
208=head1 REQUIREMENTS
209
880c3eb2 210See L<doc/README>.
2832c235
TH
211
212=head1 DESCRIPTION
213
214This script will extract and process statistical information from a
215database table which is fed from F<feedlog.pl> for a given time period
313610f6 216and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
217with I<'disregard'> in the database will be ignored; currently, you
218have to set this flag yourself, using your database management tools.
219You can exclude erroneous entries that way (e.g. automatic reposts
220(think of cancels flood and resurrectors); spam; ...).
2832c235
TH
221
222The time period to act on defaults to last month; you can assign
880c3eb2
TH
223another time period or a single month via the B<--month> option (see
224below).
2832c235
TH
225
226By default B<gatherstats> will process all types of information; you
880c3eb2
TH
227can change that using the B<--stats> option and assigning the type of
228information to process. Currently that doesn't matter yet as only
229processing of the number of postings per group per month is
230implemented anyway.
2832c235
TH
231
232Possible information types include:
233
234=over 3
235
236=item B<groups> (postings per group per month)
237
238B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
239counted for each single group they appear in. Groups not in I<TLH>
240will be ignored.
241
242B<gatherstats> will also add up the number of postings for each
243hierarchy level, but only count each posting once. A posting to
244de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
245respectively. A crossposting to de.alt.test and de.alt.admin, on the
246other hand, will be counted for de.alt.test and de.alt.admin each, but
247only once for de.alt.ALL and de.ALL.
248
880c3eb2
TH
249Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
250override that default through the B<--groupsdb> option.
2832c235
TH
251
252=back
253
254=head2 Configuration
255
880c3eb2 256B<gatherstats> will read its configuration via Config::Auto from
2832c235
TH
257F<newsstats.conf>, which should be present in the same directory.
258
880c3eb2 259See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 260
880c3eb2
TH
261You can override configuration options via the B<--hierarchy>,
262B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
263respectively.
2832c235
TH
264
265=head1 OPTIONS
266
267=over 3
268
880c3eb2 269=item B<-V>, B<--version>
2832c235 270
880c3eb2 271Print out version and copyright information and exit.
2832c235 272
880c3eb2 273=item B<-h>, B<--help>
2832c235
TH
274
275Print this man page and exit.
276
880c3eb2 277=item B<-d>, B<--debug>
2832c235
TH
278
279Output debugging information to STDOUT while processing (number of
280postings per group).
281
880c3eb2 282=item B<-t>, B<--test>
2832c235 283
880c3eb2
TH
284Do not write results to database. You should use B<--debug> in
285conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 286
880c3eb2 287=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 288
880c3eb2
TH
289Set processing period to a single month in YYYY-MM format or to a time
290period between two months in YYYY-MM:YYYY-MM format (two months, separated
291by a colon).
2832c235 292
2832c235 293
880c3eb2 294=item B<-s>, B<--stats> I<type>
2832c235
TH
295
296Set processing type to one of I<all> and I<groups>. Defaults to all
297(and is currently rather pointless as only I<groups> has been
298implemented).
299
93c8eae2
TH
300=item B<-c>, B<--checkgroups> I<filename template>
301
302Check each group against a list of valid newsgroups read from a file,
303one group on each line and ignoring everything after the first
304whitespace (so you can use a file in checkgroups format or (part of)
305your INN active file).
306
307The filename is taken from I<filename template>, amended by each
308B<--month> B<gatherstats> is processing, so that
309
310 gatherstats -m 2010-01:2010-12 -c checkgroups
ad609792 311
93c8eae2
TH
312will check against F<checkgroups-2010-01> for January 2010, against
313F<checkgroups-2010-02> for February 2010 and so on.
ad609792 314
93c8eae2
TH
315Newsgroups not found in the checkgroups file will be dropped (and
316logged to STDERR), and newsgroups found there but having no postings
ad609792
TH
317will be added with a count of 0 (and logged to STDERR).
318
880c3eb2 319=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
320
321Override I<TLH> from F<newsstats.conf>.
322
880c3eb2 323=item B<--rawdb> I<table> (raw data table)
2832c235
TH
324
325Override I<DBTableRaw> from F<newsstats.conf>.
326
880c3eb2 327=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
328
329Override I<DBTableGrps> from F<newsstats.conf>.
330
880c3eb2 331=item B<--clientsdb> I<table> (client data table)
2832c235
TH
332
333Override I<DBTableClnts> from F<newsstats.conf>.
334
880c3eb2 335=item B<--hostsdb> I<table> (host data table)
2832c235
TH
336
337Override I<DBTableHosts> from F<newsstats.conf>.
338
339=back
340
341=head1 INSTALLATION
342
880c3eb2 343See L<doc/INSTALL>.
2832c235
TH
344
345=head1 EXAMPLES
346
347Process all types of information for last month:
348
349 gatherstats
350
351Do a dry run, showing results of processing:
352
880c3eb2 353 gatherstats --debug --test
2832c235
TH
354
355Process all types of information for January of 2010:
356
880c3eb2 357 gatherstats --month 2010-01
2832c235 358
ad609792 359Process only number of postings for the year of 2010,
93c8eae2 360checking against checkgroups-*:
2832c235 361
93c8eae2 362 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
2832c235
TH
363
364=head1 FILES
365
366=over 4
367
368=item F<gatherstats.pl>
369
370The script itself.
371
372=item F<NewsStats.pm>
373
374Library functions for the NewsStats package.
375
376=item F<newsstats.conf>
377
880c3eb2 378Runtime configuration file.
2832c235
TH
379
380=back
381
382=head1 BUGS
383
384Please report any bugs or feature requests to the author or use the
385bug tracker at L<http://bugs.th-h.de/>!
386
387=head1 SEE ALSO
388
389=over 2
390
391=item -
392
880c3eb2 393L<doc/README>
2832c235
TH
394
395=item -
396
880c3eb2 397L<doc/INSTALL>
2832c235
TH
398
399=back
400
401This script is part of the B<NewsStats> package.
402
403=head1 AUTHOR
404
405Thomas Hochstein <thh@inter.net>
406
407=head1 COPYRIGHT AND LICENSE
408
880c3eb2 409Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
410
411This program is free software; you may redistribute it and/or modify it
412under the same terms as Perl itself.
413
414=cut
This page took 0.033732 seconds and 4 git commands to generate.