gatherstats.pl: Move TLH check to NewsStats.pm.
[usenet/newsstats.git] / gatherstats.pl
CommitLineData
2832c235
TH
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
6# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
ad609792 22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
2832c235
TH
23
24use DBI;
25
26################################# Definitions ##################################
27
# Types of information this script knows how to gather:
# all / groups (/ clients / hosts - not implemented yet).
# Only the keys matter; lookups are done with exists().
my %LegalTypes = map { $_ => undef } qw(all groups);
32
33################################# Main program #################################
34
### read commandline options
my %Options = ReadOptions('dom:p:t:l:n:r:g:c:s:');

### read configuration
my %Conf = %{ReadConfig('newsstats.conf')};

### override configuration via commandline options
# Only options that carry a true value are put into %ConfOverride, so an
# option that was not given never shadows the value read from the config.
my %ConfOverride;
$ConfOverride{'DBTableRaw'}   = $Options{'r'} if $Options{'r'};
$ConfOverride{'DBTableGrps'}  = $Options{'g'} if $Options{'g'};
$ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'};
$ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'};
$ConfOverride{'TLH'}          = $Options{'n'} if $Options{'n'};
OverrideConfig(\%Conf,\%ConfOverride);

### get type of information to gather, defaulting to 'all'
$Options{'t'} = 'all' if !$Options{'t'};
die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}});

### get time period (-m or -p)
my ($StartMonth,$EndMonth) = GetTimePeriod($Options{'m'},$Options{'p'});

### read newsgroups list from -l
# The declaration is kept separate from the conditional assignment:
# "my ... if COND" has undefined behaviour in Perl (see perlsyn), so the
# hash is always declared (empty) and only filled when -l was given.
my %ValidGroups;
%ValidGroups = %{ReadGroupList($Options{'l'})} if $Options{'l'};

### init database
my $DBHandle = InitDB(\%Conf,1);
62
### get data for each month
warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'};

# The write statement is identical for every month and every newsgroup, so
# it is prepared on first use and then reused instead of being prepared
# again for each row that is written.
my $WriteQuery;

foreach my $Month (ListMonth($StartMonth,$EndMonth)) {

  print "---------- $Month ----------\n" if $Options{'d'};

  if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') {
    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    # (error text is passed as a sprintf argument, never as part of the
    # format string, so a '%' in a DBI message cannot garble the output)
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'}))
      or die sprintf("%s: E: Can't prepare query for groups data from %s.%s: %s\n",$MySelf,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},$DBI::errstr);
    $DBQuery->execute($Month.'-%')
      or die sprintf("%s: E: Can't get groups data for %s from %s.%s: %s\n",$MySelf,$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},$DBI::errstr);

    # count postings per group
    my %Postings;
    # a lexical is used for the fetched header so the global $_ is left alone
    while (my ($Header) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($Header,$Conf{'TLH'},$Options{'l'} ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach keys %Newsgroups;
    };

    # add valid but empty groups if -l is set
    if (%ValidGroups) {
      foreach my $Group (sort keys %ValidGroups) {
        next if defined($Postings{$Group});
        $Postings{$Group} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$Group));
      };
    };

    print "----- GroupStats -----\n" if $Options{'d'};
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
      # output only mode: nothing is written to the database
      next if $Options{'o'};
      # write to database
      if (!$WriteQuery) {
        $WriteQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}))
          or die sprintf("%s: E: Can't prepare query to write groups data to %s.%s: %s\n",$MySelf,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
      };
      $WriteQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or die sprintf("%s: E: Can't write groups data for %s/%s to %s.%s: %s\n",$MySelf,$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
    };
  } else {
    # other types of information go here - later on
  };
};
111
112### close handles
113$DBHandle->disconnect;
114
115__END__
116
117################################ Documentation #################################
118
119=head1 NAME
120
121gatherstats - process statistical data from a raw source
122
123=head1 SYNOPSIS
124
ad609792 125B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
2832c235
TH
126
127=head1 REQUIREMENTS
128
129See doc/README: Perl 5.8.x itself and the following modules from CPAN:
130
131=over 2
132
133=item -
134
135Config::Auto
136
137=item -
138
139DBI
140
141=back
142
143=head1 DESCRIPTION
144
145This script will extract and process statistical information from a
146database table which is fed from F<feedlog.pl> for a given time period
313610f6
TH
147and write its results to (an)other database table(s). Entries marked
148with I<'disregard'> in the database will be ignored; currently, you have
149to set this flag yourself, using your database management tools. You
150can exclude erroneous entries that way (e.g. automatic reposts (think
151of cancels flood and resurrectors); spam; ...).
2832c235
TH
152
153The time period to act on defaults to last month; you can assign
154another month via the B<-m> switch or a time period via the B<-p>
155switch; the latter takes precedence.
156
157By default B<gatherstats> will process all types of information; you
158can change that using the B<-t> switch and assigning the type of
159information to process. Currently only processing of the number of
160postings per group per month is implemented anyway, so that doesn't
161matter yet.
162
163Possible information types include:
164
165=over 3
166
167=item B<groups> (postings per group per month)
168
169B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
170counted for each single group they appear in. Groups not in I<TLH>
171will be ignored.
172
173B<gatherstats> will also add up the number of postings for each
174hierarchy level, but only count each posting once. A posting to
175de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
176respectively. A crossposting to de.alt.test and de.alt.admin, on the
177other hand, will be counted for de.alt.test and de.alt.admin each, but
178only once for de.alt.ALL and de.ALL.
179
180Data is written to I<DBTableGrps> (see doc/INSTALL).
181
182=back
183
184=head2 Configuration
185
186F<gatherstats.pl> will read its configuration from F<newsstats.conf>
187which should be present in the same directory via Config::Auto.
188
189See doc/INSTALL for an overview of possible configuration options.
190
191You can override configuration options via the B<-n>, B<-r>, B<-g>,
192B<-c> and B<-s> switches, respectively.
193
194=head1 OPTIONS
195
196=over 3
197
198=item B<-V> (version)
199
200Print out version and copyright information on B<gatherstats> and exit.
201
202=item B<-h> (help)
203
204Print this man page and exit.
205
206=item B<-d> (debug)
207
208Output debugging information to STDOUT while processing (number of
209postings per group).
210
211=item B<-o> (output only)
212
213Do not write results to database. You should use B<-d> in conjunction
214with B<-o> ... everything else seems a bit pointless.
215
216=item B<-m> I<YYYY-MM> (month)
217
218Set processing period to a month in YYYY-MM format. Ignored if B<-p>
219is set.
220
221=item B<-p> I<YYYY-MM:YYYY-MM> (period)
222
223Set processing period to a time period between two months, each in
224YYYY-MM format, separated by a colon. Overrides B<-m>.
225
226=item B<-t> I<type> (type)
227
228Set processing type to one of I<all> and I<groups>. Defaults to all
229(and is currently rather pointless as only I<groups> has been
230implemented).
231
ad609792
TH
232=item B<-l> I<filename> (check against list)
233
234Check each group against a list of valid newsgroups read from
235I<filename>, one group on each line and ignoring everything after the
236first whitespace (so you can use a file in checkgroups format or (part
237of) your INN active file).
238
239Newsgroups not found in I<filename> will be dropped (and logged to
240STDERR), and newsgroups found in I<filename> but having no postings
241will be added with a count of 0 (and logged to STDERR).
242
2832c235
TH
243=item B<-n> I<TLH> (newsgroup hierarchy)
244
245Override I<TLH> from F<newsstats.conf>.
246
247=item B<-r> I<table> (raw data table)
248
249Override I<DBTableRaw> from F<newsstats.conf>.
250
251=item B<-g> I<table> (postings per group table)
252
253Override I<DBTableGrps> from F<newsstats.conf>.
254
255=item B<-c> I<table> (client data table)
256
257Override I<DBTableClnts> from F<newsstats.conf>.
258
259=item B<-s> I<table> (server/host data table)
260
261Override I<DBTableHosts> from F<newsstats.conf>.
262
263=back
264
265=head1 INSTALLATION
266
267See doc/INSTALL.
268
269=head1 EXAMPLES
270
271Process all types of information for last month:
272
273 gatherstats
274
275Do a dry run, showing results of processing:
276
277 gatherstats -do
278
279Process all types of information for January of 2010:
280
281 gatherstats -m 2010-01
282
ad609792
TH
283Process only number of postings for the year of 2010,
284checking against checkgroups-2010.txt:
2832c235 285
ad609792 286 gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
2832c235
TH
287
288=head1 FILES
289
290=over 4
291
292=item F<gatherstats.pl>
293
294The script itself.
295
296=item F<NewsStats.pm>
297
298Library functions for the NewsStats package.
299
300=item F<newsstats.conf>
301
302Runtime configuration file for B<gatherstats>.
303
304=back
305
306=head1 BUGS
307
308Please report any bugs or feature requests to the author or use the
309bug tracker at L<http://bugs.th-h.de/>!
310
311=head1 SEE ALSO
312
313=over 2
314
315=item -
316
317doc/README
318
319=item -
320
321doc/INSTALL
322
323=back
324
325This script is part of the B<NewsStats> package.
326
327=head1 AUTHOR
328
329Thomas Hochstein <thh@inter.net>
330
331=head1 COPYRIGHT AND LICENSE
332
333Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
334
335This program is free software; you may redistribute it and/or modify it
336under the same terms as Perl itself.
337
338=cut
This page took 0.024908 seconds and 4 git commands to generate.