Merge branch 'tools' into pu
[usenet/newsstats.git] / gatherstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from a INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
23
24use DBI;
25
26################################# Definitions ##################################
27
28# define types of information that can be gathered
29# all / groups (/ clients / hosts)
30my %LegalTypes;
31@LegalTypes{('all','groups')} = ();
32
33################################# Main program #################################
34
35### read commandline options
36my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:');
37
38### read configuration
39my %Conf = %{ReadConfig('newsstats.conf')};
40
41### override configuration via commandline options
42my %ConfOverride;
43$ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'};
44$ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'};
45$ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'};
46$ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'};
47$ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'};
48&OverrideConfig(\%Conf,\%ConfOverride);
49
50### get type of information to gather, defaulting to 'all'
51$Options{'t'} = 'all' if !$Options{'t'};
52die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}});
53
54### get time period (-m or -p)
55my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
56
57### reformat $Conf{'TLH'}
58my $TLH;
59if ($Conf{'TLH'}) {
60 # $Conf{'TLH'} is parsed as an array by Config::Auto;
61 # make a flat list again, separated by :
62 if (ref($TLH) eq 'ARRAY') {
63 $TLH = join(':',@{$Conf{'TLH'}});
64 } else {
65 $TLH = $Conf{'TLH'};
66 }
67 # strip whitespace
68 $TLH =~ s/\s//g;
69 # check for illegal characters
70 die "$MySelf: E: Config error - illegal characters in TLH definition\n" if ($TLH !~ /^[a-zA-Z0-9:]+$/);
71 if ($TLH =~ /:/) {
72 # reformat $TLH form a:b to (a)|(b)
73 $TLH =~ s/:/)|(/g;
74 $TLH = '(' . $TLH . ')';
75 };
76};
77
78### read newsgroups list from -l
79my %ValidGroups = %{&ReadGroupList($Options{'l'})} if $Options{'l'};
80
81### init database
82my $DBHandle = InitDB(\%Conf,1);
83
84### get data for each month
85warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'};
86foreach my $Month (&ListMonth($StartMonth,$EndMonth)) {
87
88 print "---------- $Month ----------\n" if $Options{'d'};
89
90 if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') {
91 ### ----------------------------------------------
92 ### get groups data (number of postings per group)
93 # get groups data from raw table for given month
94 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
95 $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
96
97 # count postings per group
98 my %Postings;
99 while (($_) = $DBQuery->fetchrow_array) {
100 # get list oft newsgroups and hierarchies from Newsgroups:
101 my %Newsgroups = ListNewsgroups($_,$TLH,$Options{'l'} ? \%ValidGroups : '');
102 # count each newsgroup and hierarchy once
103 foreach (sort keys %Newsgroups) {
104 $Postings{$_}++;
105 };
106 };
107
108 # add valid but empty groups if -l is set
109 if (%ValidGroups) {
110 foreach (sort keys %ValidGroups) {
111 if (!defined($Postings{$_})) {
112 $Postings{$_} = 0 ;
113 warn (sprintf("ADDED: %s as empty group\n",$_));
114 }
115 };
116 };
117
118 # delete old data for that month
119 if (!$Options{'o'}) {
120 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}),undef,$Month)
121 or warn sprintf("$MySelf: E: Can't delete old groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
122 };
123
124 print "----- GroupStats -----\n" if $Options{'d'};
125 foreach my $Newsgroup (sort keys %Postings) {
126 print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
127 if (!$Options{'o'}) {
128 # write to database
129 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
130 # $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
131 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
132 $DBQuery->finish;
133 };
134 };
135 } else {
136 # other types of information go here - later on
137 };
138};
139
140### close handles
141$DBHandle->disconnect;
142
143__END__
144
145################################ Documentation #################################
146
147=head1 NAME
148
149gatherstats - process statistical data from a raw source
150
151=head1 SYNOPSIS
152
153B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
154
155=head1 REQUIREMENTS
156
157See doc/README: Perl 5.8.x itself and the following modules from CPAN:
158
159=over 2
160
161=item -
162
163Config::Auto
164
165=item -
166
167DBI
168
169=back
170
171=head1 DESCRIPTION
172
173This script will extract and process statistical information from a
174database table which is fed from F<feedlog.pl> for a given time period
175and write its results to (an)other database table(s). Entries marked
176with I<'disregard'> in the database will be ignored; currently, you have
177to set this flag yourself, using your database management tools. You
178can exclude erroneous entries that way (e.g. automatic reposts (think
179of cancels flood and resurrectors); spam; ...).
180
181The time period to act on defaults to last month; you can assign
182another month via the B<-m> switch or a time period via the B<-p>
183switch; the latter takes preference.
184
185By default B<gatherstats> will process all types of information; you
186can change that using the B<-t> switch and assigning the type of
187information to process. Currently only processing of the number of
188postings per group per month is implemented anyway, so that doesn't
189matter yet.
190
191Possible information types include:
192
193=over 3
194
195=item B<groups> (postings per group per month)
196
197B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
198counted for each single group they appear in. Groups not in I<TLH>
199will be ignored.
200
201B<gatherstats> will also add up the number of postings for each
202hierarchy level, but only count each posting once. A posting to
203de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
204respectively. A crossposting to de.alt.test and de.alt.admin, on the
205other hand, will be counted for de.alt.test and de.alt.admin each, but
206only once for de.alt.ALL and de.ALL.
207
208Data is written to I<DBTableGrps> (see doc/INSTALL).
209
210=back
211
212=head2 Configuration
213
214F<gatherstats.pl> will read its configuration from F<newsstats.conf>
215which should be present in the same directory via Config::Auto.
216
217See doc/INSTALL for an overview of possible configuration options.
218
219You can override configuration options via the B<-n>, B<-r>, B<-g>,
220B<-c> and B<-s> switches, respectively.
221
222=head1 OPTIONS
223
224=over 3
225
226=item B<-V> (version)
227
228Print out version and copyright information on B<yapfaq> and exit.
229
230=item B<-h> (help)
231
232Print this man page and exit.
233
234=item B<-d> (debug)
235
236Output debugging information to STDOUT while processing (number of
237postings per group).
238
239=item B<-o> (output only)
240
241Do not write results to database. You should use B<-d> in conjunction
242with B<-o> ... everything else seems a bit pointless.
243
244=item B<-m> I<YYYY-MM> (month)
245
246Set processing period to a month in YYYY-MM format. Ignored if B<-p>
247is set.
248
249=item B<-p> I<YYYY-MM:YYYY-MM> (period)
250
251Set processing period to a time period between two month, each in
252YYYY-MM format, separated by a colon. Overrides B<-m>.
253
254=item B<-t> I<type> (type)
255
256Set processing type to one of I<all> and I<groups>. Defaults to all
257(and is currently rather pointless as only I<groups> has been
258implemented).
259
260=item B<-l> I<filename> (check against list)
261
262Check each group against a list of valid newsgroups read from
263I<filename>, one group on each line and ignoring everything after the
264first whitespace (so you can use a file in checkgroups format or (part
265of) your INN active file).
266
267Newsgroups not found in I<filename> will be dropped (and logged to
268STDERR), and newsgroups found in I<filename> but having no postings
269will be added with a count of 0 (and logged to STDERR).
270
271=item B<-n> I<TLH> (newsgroup hierarchy)
272
273Override I<TLH> from F<newsstats.conf>.
274
275=item B<-r> I<table> (raw data table)
276
277Override I<DBTableRaw> from F<newsstats.conf>.
278
279=item B<-g> I<table> (postings per group table)
280
281Override I<DBTableGrps> from F<newsstats.conf>.
282
283=item B<-c> I<table> (client data table)
284
285Override I<DBTableClnts> from F<newsstats.conf>.
286
287=item B<-s> I<table> (server/host data table)
288
289Override I<DBTableHosts> from F<newsstats.conf>.
290
291=back
292
293=head1 INSTALLATION
294
295See doc/INSTALL.
296
297=head1 EXAMPLES
298
299Process all types of information for lasth month:
300
301 gatherstats
302
303Do a dry run, showing results of processing:
304
305 gatherstats -do
306
307Process all types of information for January of 2010:
308
309 gatherstats -m 2010-01
310
311Process only number of postings for the year of 2010,
312checking against checkgroups-2010.txt:
313
314 gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
315
316=head1 FILES
317
318=over 4
319
320=item F<gatherstats.pl>
321
322The script itself.
323
324=item F<NewsStats.pm>
325
326Library functions for the NewsStats package.
327
328=item F<newsstats.conf>
329
330Runtime configuration file for B<yapfaq>.
331
332=back
333
334=head1 BUGS
335
336Please report any bugs or feature requests to the author or use the
337bug tracker at L<http://bugs.th-h.de/>!
338
339=head1 SEE ALSO
340
341=over 2
342
343=item -
344
345doc/README
346
347=item -
348
349doc/INSTALL
350
351=back
352
353This script is part of the B<NewsStats> package.
354
355=head1 AUTHOR
356
357Thomas Hochstein <thh@inter.net>
358
359=head1 COPYRIGHT AND LICENSE
360
361Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
362
363This program is free software; you may redistribute it and/or modify it
364under the same terms as Perl itself.
365
366=cut
This page took 0.011075 seconds and 4 git commands to generate.