Fix parsing of more than one TLH in config.
[usenet/newsstats.git] / gatherstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl -W
2#
3# gatherstats.pl
4#
5# This script will gather statistical information from a database
# containing headers and other information from an INN feed.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
# Compile-time setup: declare the script version and make modules that
# live next to this script (its own directory) loadable by appending that
# directory to the module search path.
BEGIN {
  use File::Basename;
  our $VERSION = "0.01";
  my $ScriptDir = dirname($0);
  push @INC, $ScriptDir;
}
20use strict;
21
22use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
23
24use DBI;
25use Getopt::Long qw(GetOptions);
26Getopt::Long::config ('bundling');
27
28################################# Definitions ##################################
29
# Set of information types that may be requested via --stats:
# all / groups (/ clients / hosts - not available yet).
# Only the keys matter (they are checked with exists()); values stay undef.
my %LegalStats = map { $_ => undef } ('all','groups');
34
35################################# Main program #################################
36
### read commandline options
# one scalar per option; each stays undefined unless the option is given
my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
    $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);

# option specification => destination (variable or callback)
my @OptionSpec = (
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
);

# leave with exit status 1 if the commandline could not be parsed
GetOptions (@OptionSpec) or exit 1;
52
### read configuration
my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};

### override configuration via commandline options
# configuration key => value of the commandline option overriding it
my @OverrideCandidates = (
  [ 'DBTableRaw',   $OptRawDB     ],
  [ 'DBTableGrps',  $OptGroupsDB  ],
  [ 'DBTableClnts', $OptClientsDB ],
  [ 'DBTableHosts', $OptHostsDB   ],
  [ 'TLH',          $OptTLH       ],
);
my %ConfOverride;
foreach my $Candidate (@OverrideCandidates) {
  my ($ConfKey,$OptValue) = @{$Candidate};
  # only options that were actually set (to a true value) override anything
  $ConfOverride{$ConfKey} = $OptValue if $OptValue;
}
&OverrideConfig(\%Conf,\%ConfOverride);
64
### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
if (!exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# only the first return value (the verbal description of the time period)
# is kept; the SQL code is dropped
my ($Period) = &GetTimePeriod($OptMonth);
if (!$Period or $Period eq 'all time') {
  &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
           "'YYYY-MM:YYYY-MM'!");
}
75
### reformat $Conf{'TLH'}
# Result in $TLH:
#   - undefined      if no TLH is configured
#   - 'a'            for a single TLH 'a'
#   - '(a)|(b)'      for a list of TLHs 'a:b'
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  my $TLHList = ref($Conf{'TLH'}) eq 'ARRAY' ? join(':',@{$Conf{'TLH'}})
                                              : $Conf{'TLH'};
  # strip whitespace
  $TLHList =~ s/\s//g;
  # check for illegal characters
  &Bleat(2,'Config error - illegal characters in TLH definition!')
    if ($TLHList !~ /^[a-zA-Z0-9:]+$/);
  # drop empty elements (leading, trailing or doubled ':'); they would
  # otherwise end up as an empty group '()', which matches anything
  my @TLHs = grep { length } split(/:/,$TLHList);
  # NOTE(review): Bleat(2,...) is presumably fatal - confirm in NewsStats.pm;
  # if it returns, $TLH simply stays undefined here
  &Bleat(2,'Config error - empty TLH definition!') if !@TLHs;
  if (@TLHs > 1) {
    # reformat from a:b to (a)|(b)
    $TLH = join('|', map { '(' . $_ . ')' } @TLHs);
  } else {
    # a single TLH is used as it is
    $TLH = $TLHs[0];
  };
};
98
# read list of newsgroups from --checkgroups into a hash;
# the hash stays empty if --checkgroups is not set
# (declaration and conditional assignment are kept apart on purpose: the
# behaviour of "my ... if ..." is undefined, see perlsyn)
my %ValidGroups;
%ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
102
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month of the requested period, the postings per newsgroup (and
# per hierarchy, as returned by ListNewsgroups) are counted from the raw
# table; the rows for that month in the groups table are then replaced.
# With --test nothing is written to the database, with --debug the results
# are printed to STDOUT.
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    # (the DBI error text is always passed as an argument to sprintf, never
    # as part of the format string, so a '%' in it can't garble the message)
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableRaw'}))
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
                          $Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Groups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                  $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach (keys %Groups);
    };

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach my $ValidGroup (sort keys %ValidGroups) {
        if (!defined($Postings{$ValidGroup})) {
          $Postings{$ValidGroup} = 0;
          warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
        }
      };
    };

    my $InsertQuery;
    if (!$OptTest) {
      # delete old data for that month
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
      # prepare the INSERT only once per month, not once per newsgroup
      $InsertQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                                "(month,newsgroup,postings) ".
                                                "VALUES (?, ?, ?)",
                                                $Conf{'DBDatabase'},
                                                $Conf{'DBTableGrps'}))
        or &Bleat(2,sprintf("Can't write groups data for %s to %s.%s: %s\n",
                            $Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    };

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      if (!$OptTest) {
        # write to database
        $InsertQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
          or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: %s\n",
                              $Month,$Newsgroup,
                              $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                              $DBI::errstr));
      };
    };
  } else {
    # other types of information go here - later on
  };
};

### close handles
$DBHandle->disconnect;
181
182__END__
183
184################################ Documentation #################################
185
186=head1 NAME
187
188gatherstats - process statistical data from a raw source
189
190=head1 SYNOPSIS
191
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<checkgroups file>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
193
194=head1 REQUIREMENTS
195
196See L<doc/README>.
197
198=head1 DESCRIPTION
199
200This script will extract and process statistical information from a
201database table which is fed from F<feedlog.pl> for a given time period
202and write its results to (an)other database table(s). Entries marked
203with I<'disregard'> in the database will be ignored; currently, you
204have to set this flag yourself, using your database management tools.
205You can exclude erroneous entries that way (e.g. automatic reposts
206(think of cancels flood and resurrectors); spam; ...).
207
208The time period to act on defaults to last month; you can assign
209another time period or a single month via the B<--month> option (see
210below).
211
212By default B<gatherstats> will process all types of information; you
213can change that using the B<--stats> option and assigning the type of
214information to process. Currently that doesn't matter yet as only
215processing of the number of postings per group per month is
216implemented anyway.
217
218Possible information types include:
219
220=over 3
221
222=item B<groups> (postings per group per month)
223
224B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
225counted for each single group they appear in. Groups not in I<TLH>
226will be ignored.
227
228B<gatherstats> will also add up the number of postings for each
229hierarchy level, but only count each posting once. A posting to
230de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
231respectively. A crossposting to de.alt.test and de.alt.admin, on the
232other hand, will be counted for de.alt.test and de.alt.admin each, but
233only once for de.alt.ALL and de.ALL.
234
235Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
236override that default through the B<--groupsdb> option.
237
238=back
239
240=head2 Configuration
241
B<gatherstats> will read its configuration via Config::Auto from
F<newsstats.conf>, which should be present in the same directory.
244
245See L<doc/INSTALL> for an overview of possible configuration options.
246
247You can override configuration options via the B<--hierarchy>,
248B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
249respectively.
250
251=head1 OPTIONS
252
253=over 3
254
255=item B<-V>, B<--version>
256
257Print out version and copyright information and exit.
258
259=item B<-h>, B<--help>
260
261Print this man page and exit.
262
263=item B<-d>, B<--debug>
264
265Output debugging information to STDOUT while processing (number of
266postings per group).
267
268=item B<-t>, B<--test>
269
270Do not write results to database. You should use B<--debug> in
271conjunction with B<--test> ... everything else seems a bit pointless.
272
273=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
274
Set processing period to a single month in YYYY-MM format or to a time
period between two months in YYYY-MM:YYYY-MM format (two months, separated
by a colon).
278
279
280=item B<-s>, B<--stats> I<type>
281
282Set processing type to one of I<all> and I<groups>. Defaults to all
283(and is currently rather pointless as only I<groups> has been
284implemented).
285
286=item B<-c>, B<--checkgroups> I<filename>
287
288Check each group against a list of valid newsgroups read from
289I<filename>, one group on each line and ignoring everything after the
290first whitespace (so you can use a file in checkgroups format or (part
291of) your INN active file).
292
293Newsgroups not found in I<filename> will be dropped (and logged to
294STDERR), and newsgroups found in I<filename> but having no postings
295will be added with a count of 0 (and logged to STDERR).
296
297=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
298
299Override I<TLH> from F<newsstats.conf>.
300
301=item B<--rawdb> I<table> (raw data table)
302
303Override I<DBTableRaw> from F<newsstats.conf>.
304
305=item B<--groupsdb> I<table> (postings per group table)
306
307Override I<DBTableGrps> from F<newsstats.conf>.
308
309=item B<--clientsdb> I<table> (client data table)
310
311Override I<DBTableClnts> from F<newsstats.conf>.
312
313=item B<--hostsdb> I<table> (host data table)
314
315Override I<DBTableHosts> from F<newsstats.conf>.
316
317=back
318
319=head1 INSTALLATION
320
321See L<doc/INSTALL>.
322
323=head1 EXAMPLES
324
Process all types of information for last month:
326
327 gatherstats
328
329Do a dry run, showing results of processing:
330
331 gatherstats --debug --test
332
333Process all types of information for January of 2010:
334
335 gatherstats --month 2010-01
336
337Process only number of postings for the year of 2010,
338checking against checkgroups-2010.txt:
339
340 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
341
342=head1 FILES
343
344=over 4
345
346=item F<gatherstats.pl>
347
348The script itself.
349
350=item F<NewsStats.pm>
351
352Library functions for the NewsStats package.
353
354=item F<newsstats.conf>
355
356Runtime configuration file.
357
358=back
359
360=head1 BUGS
361
362Please report any bugs or feature requests to the author or use the
363bug tracker at L<http://bugs.th-h.de/>!
364
365=head1 SEE ALSO
366
367=over 2
368
369=item -
370
371L<doc/README>
372
373=item -
374
375L<doc/INSTALL>
376
377=back
378
379This script is part of the B<NewsStats> package.
380
381=head1 AUTHOR
382
383Thomas Hochstein <thh@inter.net>
384
385=head1 COPYRIGHT AND LICENSE
386
387Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
388
389This program is free software; you may redistribute it and/or modify it
390under the same terms as Perl itself.
391
392=cut
This page took 0.011737 seconds and 4 git commands to generate.