Fix parsing of more than one TLH in config.
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
880c3eb2 10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
ad609792 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
2832c235
TH
23
24use DBI;
880c3eb2
TH
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
2832c235
TH
27
28################################# Definitions ##################################
29
30# define types of information that can be gathered
31# all / groups (/ clients / hosts)
880c3eb2
TH
32my %LegalStats;
33@LegalStats{('all','groups')} = ();
2832c235
TH
34
35################################# Main program #################################
36
37### read commandline options
880c3eb2
TH
38my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
39 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
40GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
41 'clientsdb=s' => \$OptClientsDB,
42 'd|debug!' => \$OptDebug,
43 'groupsdb=s' => \$OptGroupsDB,
44 'hierarchy=s' => \$OptTLH,
45 'hostsdb=s' => \$OptHostsDB,
46 'm|month=s' => \$OptMonth,
47 'rawdb=s' => \$OptRawDB,
48 's|stats=s' => \$OptStatsType,
49 't|test!' => \$OptTest,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
2832c235
TH
52
53### read configuration
880c3eb2 54my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
2832c235
TH
55
56### override configuration via commandline options
57my %ConfOverride;
880c3eb2
TH
58$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
59$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
60$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
61$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
62$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
2832c235
TH
63&OverrideConfig(\%Conf,\%ConfOverride);
64
65### get type of information to gather, defaulting to 'all'
880c3eb2
TH
66$OptStatsType = 'all' if !$OptStatsType;
67&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
68 if !exists($LegalStats{$OptStatsType});
2832c235 69
880c3eb2
TH
70### get time period from --month
71# get verbal description of time period, drop SQL code
72my ($Period) = &GetTimePeriod($OptMonth);
73&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
74 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
2832c235 75
17ffbeba
TH
76### reformat $Conf{'TLH'}
77my $TLH;
78if ($Conf{'TLH'}) {
79 # $Conf{'TLH'} is parsed as an array by Config::Auto;
80 # make a flat list again, separated by :
43a0fc77 81 if (ref($Conf{'TLH'}) eq 'ARRAY') {
17ffbeba
TH
82 $TLH = join(':',@{$Conf{'TLH'}});
83 } else {
84 $TLH = $Conf{'TLH'};
85 }
86 # strip whitespace
87 $TLH =~ s/\s//g;
88 # check for illegal characters
880c3eb2
TH
89 &Bleat(2,'Config error - illegal characters in TLH definition!')
90 if ($TLH !~ /^[a-zA-Z0-9:]+$/);
17ffbeba 91 if ($TLH =~ /:/) {
880c3eb2 92 # reformat $TLH from a:b to (a)|(b),
43a0fc77 93 # i.e. replace ':' by ')|('
17ffbeba
TH
94 $TLH =~ s/:/)|(/g;
95 $TLH = '(' . $TLH . ')';
96 };
97};
98
880c3eb2
TH
99# read list of newsgroups from --checkgroups
100# into a hash
101my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
ad609792 102
2832c235
TH
103### init database
104my $DBHandle = InitDB(\%Conf,1);
105
106### get data for each month
880c3eb2
TH
107&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
108foreach my $Month (&ListMonth($Period)) {
2832c235 109
880c3eb2 110 print "---------- $Month ----------\n" if $OptDebug;
2832c235 111
880c3eb2 112 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
2832c235
TH
113 ### ----------------------------------------------
114 ### get groups data (number of postings per group)
115 # get groups data from raw table for given month
880c3eb2
TH
116 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
117 "WHERE day LIKE ? AND NOT disregard",
118 $Conf{'DBDatabase'},
119 $Conf{'DBTableRaw'}));
120 $DBQuery->execute($Month.'-%')
121 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
122 "$DBI::errstr\n",$Month,
123 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
2832c235
TH
124
125 # count postings per group
126 my %Postings;
2832c235
TH
127 while (($_) = $DBQuery->fetchrow_array) {
128 # get list of newsgroups and hierarchies from Newsgroups:
880c3eb2
TH
129 my %Newsgroups = ListNewsgroups($_,$TLH,
130 $OptCheckgroupsFile ? \%ValidGroups : '');
2832c235
TH
131 # count each newsgroup and hierarchy once
132 foreach (sort keys %Newsgroups) {
2832c235
TH
133 $Postings{$_}++;
134 };
135 };
136
880c3eb2 137 # add valid but empty groups if --checkgroups is set
ad609792
TH
138 if (%ValidGroups) {
139 foreach (sort keys %ValidGroups) {
140 if (!defined($Postings{$_})) {
141 $Postings{$_} = 0 ;
142 warn (sprintf("ADDED: %s as empty group\n",$_));
143 }
144 };
145 };
146
71f0178b 147 # delete old data for that month
880c3eb2
TH
148 if (!$OptTest) {
149 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
150 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
151 undef,$Month)
152 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
153 "$DBI::errstr\n",$Month,
154 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
71f0178b
TH
155 };
156
880c3eb2 157 print "----- GroupStats -----\n" if $OptDebug;
2832c235 158 foreach my $Newsgroup (sort keys %Postings) {
880c3eb2
TH
159 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
160 if (!$OptTest) {
2832c235 161 # write to database
880c3eb2
TH
162 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
163 "(month,newsgroup,postings) ".
164 "VALUES (?, ?, ?)",
165 $Conf{'DBDatabase'},
166 $Conf{'DBTableGrps'}));
167 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
168 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
169 "$DBI::errstr\n",$Month,$Newsgroup,
170 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
2832c235
TH
171 $DBQuery->finish;
172 };
173 };
174 } else {
175 # other types of information go here - later on
176 };
177};
178
179### close handles
180$DBHandle->disconnect;
181
182__END__
183
184################################ Documentation #################################
185
186=head1 NAME
187
188gatherstats - process statistical data from a raw source
189
190=head1 SYNOPSIS
191
880c3eb2 192B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<checkgroups file>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
2832c235
TH
193
194=head1 REQUIREMENTS
195
880c3eb2 196See L<doc/README>.
2832c235
TH
197
198=head1 DESCRIPTION
199
200This script will extract and process statistical information from a
201database table which is fed from F<feedlog.pl> for a given time period
313610f6 202and write its results to (an)other database table(s). Entries marked
880c3eb2
TH
203with I<'disregard'> in the database will be ignored; currently, you
204have to set this flag yourself, using your database management tools.
205You can exclude erroneous entries that way (e.g. automatic reposts
206(think of cancel floods and resurrectors); spam; ...).
2832c235
TH
207
208The time period to act on defaults to last month; you can assign
880c3eb2
TH
209another time period or a single month via the B<--month> option (see
210below).
2832c235
TH
211
212By default B<gatherstats> will process all types of information; you
880c3eb2
TH
213can change that using the B<--stats> option and assigning the type of
214information to process. Currently that doesn't matter yet as only
215processing of the number of postings per group per month is
216implemented anyway.
2832c235
TH
217
218Possible information types include:
219
220=over 3
221
222=item B<groups> (postings per group per month)
223
224B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
225counted for each single group they appear in. Groups not in I<TLH>
226will be ignored.
227
228B<gatherstats> will also add up the number of postings for each
229hierarchy level, but only count each posting once. A posting to
230de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
231respectively. A crossposting to de.alt.test and de.alt.admin, on the
232other hand, will be counted for de.alt.test and de.alt.admin each, but
233only once for de.alt.ALL and de.ALL.
234
880c3eb2
TH
235Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
236override that default through the B<--groupsdb> option.
2832c235
TH
237
238=back
239
240=head2 Configuration
241
880c3eb2 242B<gatherstats> will read its configuration via Config::Auto from
2832c235
TH
243F<newsstats.conf>, which should be present in the same directory.
244
880c3eb2 245See L<doc/INSTALL> for an overview of possible configuration options.
2832c235 246
880c3eb2
TH
247You can override configuration options via the B<--hierarchy>,
248B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
249respectively.
2832c235
TH
250
251=head1 OPTIONS
252
253=over 3
254
880c3eb2 255=item B<-V>, B<--version>
2832c235 256
880c3eb2 257Print out version and copyright information and exit.
2832c235 258
880c3eb2 259=item B<-h>, B<--help>
2832c235
TH
260
261Print this man page and exit.
262
880c3eb2 263=item B<-d>, B<--debug>
2832c235
TH
264
265Output debugging information to STDOUT while processing (number of
266postings per group).
267
880c3eb2 268=item B<-t>, B<--test>
2832c235 269
880c3eb2
TH
270Do not write results to database. You should use B<--debug> in
271conjunction with B<--test> ... everything else seems a bit pointless.
2832c235 272
880c3eb2 273=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
2832c235 274
880c3eb2
TH
275Set processing period to a single month in YYYY-MM format or to a time
276period between two months in YYYY-MM:YYYY-MM format (two months, separated
277by a colon).
2832c235 278
2832c235 279
880c3eb2 280=item B<-s>, B<--stats> I<type>
2832c235
TH
281
282Set processing type to one of I<all> and I<groups>. Defaults to all
283(and is currently rather pointless as only I<groups> has been
284implemented).
285
880c3eb2 286=item B<-c>, B<--checkgroups> I<filename>
ad609792
TH
287
288Check each group against a list of valid newsgroups read from
289I<filename>, one group on each line and ignoring everything after the
290first whitespace (so you can use a file in checkgroups format or (part
291of) your INN active file).
292
293Newsgroups not found in I<filename> will be dropped (and logged to
294STDERR), and newsgroups found in I<filename> but having no postings
295will be added with a count of 0 (and logged to STDERR).
296
880c3eb2 297=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
2832c235
TH
298
299Override I<TLH> from F<newsstats.conf>.
300
880c3eb2 301=item B<--rawdb> I<table> (raw data table)
2832c235
TH
302
303Override I<DBTableRaw> from F<newsstats.conf>.
304
880c3eb2 305=item B<--groupsdb> I<table> (postings per group table)
2832c235
TH
306
307Override I<DBTableGrps> from F<newsstats.conf>.
308
880c3eb2 309=item B<--clientsdb> I<table> (client data table)
2832c235
TH
310
311Override I<DBTableClnts> from F<newsstats.conf>.
312
880c3eb2 313=item B<--hostsdb> I<table> (host data table)
2832c235
TH
314
315Override I<DBTableHosts> from F<newsstats.conf>.
316
317=back
318
319=head1 INSTALLATION
320
880c3eb2 321See L<doc/INSTALL>.
2832c235
TH
322
323=head1 EXAMPLES
324
325Process all types of information for last month:
326
327 gatherstats
328
329Do a dry run, showing results of processing:
330
880c3eb2 331 gatherstats --debug --test
2832c235
TH
332
333Process all types of information for January of 2010:
334
880c3eb2 335 gatherstats --month 2010-01
2832c235 336
ad609792
TH
337Process only number of postings for the year of 2010,
338checking against checkgroups-2010.txt:
2832c235 339
880c3eb2 340 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
2832c235
TH
341
342=head1 FILES
343
344=over 4
345
346=item F<gatherstats.pl>
347
348The script itself.
349
350=item F<NewsStats.pm>
351
352Library functions for the NewsStats package.
353
354=item F<newsstats.conf>
355
880c3eb2 356Runtime configuration file.
2832c235
TH
357
358=back
359
360=head1 BUGS
361
362Please report any bugs or feature requests to the author or use the
363bug tracker at L<http://bugs.th-h.de/>!
364
365=head1 SEE ALSO
366
367=over 2
368
369=item -
370
880c3eb2 371L<doc/README>
2832c235
TH
372
373=item -
374
880c3eb2 375L<doc/INSTALL>
2832c235
TH
376
377=back
378
379This script is part of the B<NewsStats> package.
380
381=head1 AUTHOR
382
383Thomas Hochstein <thh@inter.net>
384
385=head1 COPYRIGHT AND LICENSE
386
880c3eb2 387Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
2832c235
TH
388
389This program is free software; you may redistribute it and/or modify it
390under the same terms as Perl itself.
391
392=cut
This page took 0.03298 seconds and 4 git commands to generate.