Merge branch 'thh-small-changes' into next
[usenet/newsstats.git] / bin / gatherstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
# Compile-time setup: record the script version and append the package's
# library directory to @INC.
BEGIN {
  use File::Basename;
  our $VERSION = "0.02";
  # this script lives in .../bin, so its module is found in the sibling ../lib
  my $LibDir = dirname($0) . '/../lib';
  push @INC, $LibDir;
}
21use strict;
22use warnings;
23
24use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
25
26use DBI;
27use Getopt::Long qw(GetOptions);
28Getopt::Long::config ('bundling');
29
30################################# Definitions ##################################
31
# define types of information that can be gathered
# all / groups (/ clients / hosts)
# only the keys matter (they are checked with exists()); values stay undef
my %LegalStats = map { $_ => undef } qw(all groups);
36
37################################# Main program #################################
38
### read commandline options
# one variable per option; each stays undef unless set on the command line
my $OptCheckgroupsFile;
my $OptClientsDB;
my $OptDebug;
my $OptGroupsDB;
my $OptTLH;
my $OptHostsDB;
my $OptMonth;
my $OptRawDB;
my $OptStatsType;
my $OptTest;
my $OptConfFile;

# option specification => destination (or handler for --help / --version)
my @OptionSpec = (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'conffile=s'      => \$OptConfFile,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
);

# leave with exit status 1 if the command line could not be parsed
exit 1 unless GetOptions(@OptionSpec);
55
### read configuration
my %Conf = %{ReadConfig($OptConfFile)};

### override configuration via commandline options
# only options that were given (and are true) end up in %ConfOverride
my %ConfOverride;
foreach my $Override (['DBTableRaw',   $OptRawDB],
                      ['DBTableGrps',  $OptGroupsDB],
                      ['DBTableClnts', $OptClientsDB],
                      ['DBTableHosts', $OptHostsDB],
                      ['TLH',          $OptTLH]) {
  my ($ConfKey, $OptValue) = @{$Override};
  next if !$OptValue;
  $ConfOverride{$ConfKey} = $OptValue;
}
&OverrideConfig(\%Conf,\%ConfOverride);
67
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
# report any type that is not a key of %LegalStats
unless (exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}
72
### get time period from --month
# only the first value returned (the verbal description of the time
# period) is kept; the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
# an invalid --month, or one set to 'ALL', is rejected;
# the latter is not supported here
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
80
### reformat $Conf{'TLH'}
# Turn the configured hierarchy list (e.g. "de:at.") into the pattern
# string handed to ListNewsgroups() later on: every hierarchy gets a
# trailing dot, special characters are escaped, and several hierarchies
# are combined as (a\.)|(b\.).
# NOTE(review): this presumes ListNewsgroups() uses $TLH as a regular
# expression - confirm against lib/NewsStats.pm.
# $TLH stays undef if no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  $TLH = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                      : $Conf{'TLH'};
  # strip whitespace
  $TLH =~ s/\s//g;
  # add trailing dots if none are present yet
  # (using negative look-behind assertions)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
  # escape dots and plus signs: both pass the check above, and both would
  # otherwise act as regex metacharacters instead of matching literally
  $TLH =~ s/([.+])/\\$1/g;
  if ($TLH =~ /:/) {
    # reformat $TLH from a:b to (a)|(b),
    # e.g. replace ':' by ')|('
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  }
}
109
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month in $Period: read the Newsgroups: data of all postings
# not marked 'disregard' from the raw table, count postings per group
# and hierarchy, and (unless --test is set) replace that month's rows in
# the groups table with the new counts.
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups
    # into a hash
    # (declared separately from the conditional assignment, because
    # "my ... if ..." has undefined behaviour)
    my %ValidGroups;
    %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
      if $OptCheckgroupsFile;

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableRaw'}))
      or &Bleat(2,sprintf("Can't prepare query for groups data from %s.%s: ".
                          "$DBI::errstr\n",
                          $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
                          "$DBI::errstr\n",$Month,
                          $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));

    # count postings per group
    my %Postings;
    # a lexical is used for the fetched column so the global $_ is untouched
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach keys %Newsgroups;
    };

    # add valid but empty groups if --checkgroups is set
    # (%ValidGroups is empty otherwise, so nothing happens)
    foreach my $ValidGroup (sort keys %ValidGroups) {
      next if defined($Postings{$ValidGroup});
      # add current newsgroup as empty group
      $Postings{$ValidGroup} = 0;
      warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
      # add empty hierarchies for current newsgroup as needed
      foreach my $Level (ParseHierarchies($ValidGroup)) {
        my $Hierarchy = $Level . '.ALL';
        next if defined($Postings{$Hierarchy});
        $Postings{$Hierarchy} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
      };
    };

    # statement handle for writing results; stays undef in test mode
    my $DBInsert;
    if (!$OptTest) {
      # delete old data for that month
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
                            "$DBI::errstr\n",$Month,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
      # prepare the INSERT once per month instead of once per newsgroup
      $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                             "(month,newsgroup,postings) ".
                                             "VALUES (?, ?, ?)",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableGrps'}))
        or &Bleat(2,sprintf("Can't prepare query to write groups data to %s.%s: ".
                            "$DBI::errstr\n",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      next if $OptTest;
      # write to database
      $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
                            "$DBI::errstr\n",$Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
202
203__END__
204
205################################ Documentation #################################
206
207=head1 NAME
208
209gatherstats - process statistical data from a raw source
210
211=head1 SYNOPSIS
212
213B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
214
215=head1 REQUIREMENTS
216
217See L<doc/README>.
218
219=head1 DESCRIPTION
220
221This script will extract and process statistical information from a
222database table which is fed from F<feedlog.pl> for a given time period
223and write its results to (an)other database table(s). Entries marked
224with I<'disregard'> in the database will be ignored; currently, you
225have to set this flag yourself, using your database management tools.
226You can exclude erroneous entries that way (e.g. automatic reposts
227(think of cancel floods and resurrectors); spam; ...).
228
229The time period to act on defaults to last month; you can assign
230another time period or a single month via the B<--month> option (see
231below).
232
233By default B<gatherstats> will process all types of information; you
234can change that using the B<--stats> option and assigning the type of
235information to process. Currently that doesn't matter yet as only
236processing of the number of postings per group per month is
237implemented anyway.
238
239Possible information types include:
240
241=over 3
242
243=item B<groups> (postings per group per month)
244
245B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
246counted for each single group they appear in. Groups not in I<TLH>
247will be ignored.
248
249B<gatherstats> will also add up the number of postings for each
250hierarchy level, but only count each posting once. A posting to
251de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
252respectively. A crossposting to de.alt.test and de.alt.admin, on the
253other hand, will be counted for de.alt.test and de.alt.admin each, but
254only once for de.alt.ALL and de.ALL.
255
256Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
257override that default through the B<--groupsdb> option.
258
259=back
260
261=head2 Configuration
262
263B<gatherstats> will read its configuration via Config::Auto from
264F<newsstats.conf>, which should be present in etc/, or from a
265configuration file given by the B<--conffile> option.
266
267See L<doc/INSTALL> for an overview of possible configuration options.
268
269You can override configuration options via the B<--hierarchy>,
270B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
271respectively.
272
273=head1 OPTIONS
274
275=over 3
276
277=item B<-V>, B<--version>
278
279Print out version and copyright information and exit.
280
281=item B<-h>, B<--help>
282
283Print this man page and exit.
284
285=item B<-d>, B<--debug>
286
287Output debugging information to STDOUT while processing (number of
288postings per group).
289
290=item B<-t>, B<--test>
291
292Do not write results to database. You should use B<--debug> in
293conjunction with B<--test> ... everything else seems a bit pointless.
294
295=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
296
297Set processing period to a single month in YYYY-MM format or to a time
298period between two months in YYYY-MM:YYYY-MM format (two months, separated
299by a colon).
300
301=item B<-s>, B<--stats> I<type>
302
303Set processing type to one of I<all> and I<groups>. Defaults to all
304(and is currently rather pointless as only I<groups> has been
305implemented).
306
307=item B<-c>, B<--checkgroups> I<filename template>
308
309Check each group against a list of valid newsgroups read from a file,
310one group on each line and ignoring everything after the first
311whitespace (so you can use a file in checkgroups format or (part of)
312your INN active file).
313
314The filename is taken from I<filename template>, amended by each
315B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
316so that
317
318 gatherstats -m 2010-01:2010-12 -c checkgroups
319
320will check against F<checkgroups-2010-01> for January 2010, against
321F<checkgroups-2010-02> for February 2010 and so on.
322
323Newsgroups not found in the checkgroups file will be dropped (and
324logged to STDERR), and newsgroups found there but having no postings
325will be added with a count of 0 (and logged to STDERR).
326
327=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
328
329Override I<TLH> from F<newsstats.conf>.
330
331=item B<--rawdb> I<table> (raw data table)
332
333Override I<DBTableRaw> from F<newsstats.conf>.
334
335=item B<--groupsdb> I<table> (postings per group table)
336
337Override I<DBTableGrps> from F<newsstats.conf>.
338
339=item B<--clientsdb> I<table> (client data table)
340
341Override I<DBTableClnts> from F<newsstats.conf>.
342
343=item B<--hostsdb> I<table> (host data table)
344
345Override I<DBTableHosts> from F<newsstats.conf>.
346
347=item B<--conffile> I<filename>
348
349Load configuration from I<filename> instead of F<newsstats.conf>.
350
351=back
352
353=head1 INSTALLATION
354
355See L<doc/INSTALL>.
356
357=head1 EXAMPLES
358
359Process all types of information for last month:
360
361 gatherstats
362
363Do a dry run, showing results of processing:
364
365 gatherstats --debug --test
366
367Process all types of information for January of 2010:
368
369 gatherstats --month 2010-01
370
371Process only number of postings for the year of 2010,
372checking against checkgroups-*:
373
374 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
375
376=head1 FILES
377
378=over 4
379
380=item F<bin/gatherstats.pl>
381
382The script itself.
383
384=item F<lib/NewsStats.pm>
385
386Library functions for the NewsStats package.
387
388=item F<etc/newsstats.conf>
389
390Runtime configuration file.
391
392=back
393
394=head1 BUGS
395
396Please report any bugs or feature requests to the author or use the
397bug tracker at L<http://bugs.th-h.de/>!
398
399=head1 SEE ALSO
400
401=over 2
402
403=item -
404
405L<doc/README>
406
407=item -
408
409L<doc/INSTALL>
410
411=back
412
413This script is part of the B<NewsStats> package.
414
415=head1 AUTHOR
416
417Thomas Hochstein <thh@inter.net>
418
419=head1 COPYRIGHT AND LICENSE
420
421Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
422
423This program is free software; you may redistribute it and/or modify it
424under the same terms as Perl itself.
425
426=cut
This page took 0.011149 seconds and 4 git commands to generate.