5 # This script will gather statistical information from a database
6 # containing headers and other information from an INN feed.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
# Script version string.
# NOTE(review): presumably reported by ShowVersion (-V) - confirm in NewsStats.
16 our $VERSION = "0.01";
18 # we're in .../bin, so our module is in ../lib (resolved relative to $0)
19 push(@INC, dirname($0).'/../lib');
# NOTE(review): push() runs at run time while 'use' is resolved at compile
# time, so the @INC change only helps 'use NewsStats' if it sits inside a
# BEGIN block; the enclosing lines are not visible in this listing - confirm.
24 use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
27 use Getopt::Long qw(GetOptions);
# allow single-letter options to be bundled (e.g. -dt)
28 Getopt::Long::config ('bundling');
30 ################################# Definitions ##################################
32 # define types of information that can be gathered
33 # all / groups (/ clients / hosts)
# Hash slice assignment: creates the keys 'all' and 'groups' with undef
# values. Only key existence matters; --stats is validated with exists().
35 @LegalStats{('all','groups')} = ();
37 ################################# Main program #################################
39 ### read commandline options
# One scalar per option; each stays undef unless given on the command line.
40 my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
41 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,$OptConfFile);
# '=s' options take a mandatory string value; '!' options are negatable
# flags (--debug / --nodebug). -h and -V invoke a callback directly instead
# of setting a variable.
# NOTE(review): ShowPOD and ShowVersion are not defined in this listing;
# presumably imported from NewsStats - confirm.
# If GetOptions reports a parsing failure the script exits with status 1.
42 GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
43 'clientsdb=s' => \$OptClientsDB,
44 'd|debug!' => \$OptDebug,
45 'groupsdb=s' => \$OptGroupsDB,
46 'hierarchy=s' => \$OptTLH,
47 'hostsdb=s' => \$OptHostsDB,
48 'm|month=s' => \$OptMonth,
49 'rawdb=s' => \$OptRawDB,
50 's|stats=s' => \$OptStatsType,
51 't|test!' => \$OptTest,
52 'conffile=s' => \$OptConfFile,
53 'h|help' => \&ShowPOD,
54 'V|version' => \&ShowVersion) or exit 1;
56 ### read configuration
# ReadConfig returns a hash reference; it is copied into the plain hash %Conf.
# $OptConfFile is undef unless --conffile was given.
57 my %Conf = %{ReadConfig($OptConfFile)};
59 ### override configuration via commandline options
# NOTE(review): the declaration of %ConfOverride is not visible in this
# listing - confirm it is declared with 'my' before these assignments.
# Each override is applied only when the option value is true, so an empty
# string or '0' given on the command line is silently ignored.
61 $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
62 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
63 $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
64 $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
65 $ConfOverride{'TLH'} = $OptTLH if $OptTLH;
# Both hashes are passed by reference.
# NOTE(review): presumably %Conf is updated in place - confirm in NewsStats.
66 &OverrideConfig(\%Conf,\%ConfOverride);
68 ### get type of information to gather, defaulting to 'all'
69 $OptStatsType = 'all' if !$OptStatsType;
# Reject anything that is not a key of %LegalStats.
# NOTE(review): Bleat level 2 is used for every error in this script;
# presumably it terminates the program - confirm in NewsStats.
70 &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
71 if !exists($LegalStats{$OptStatsType});
73 ### get time period from --month
74 # get verbal description of time period, drop SQL code
# (list assignment to a single variable keeps only the first returned value)
75 my ($Period) = &GetTimePeriod($OptMonth);
76 # bail out if --month is invalid or set to 'ALL';
77 # we don't support the latter
# A false $Period is treated as an invalid --month; the string 'all time'
# is rejected explicitly.
78 &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
79 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
81 ### reformat $Conf{'TLH'}
# Goal of this section: turn the configured hierarchy list into a
# parenthesised string, with every hierarchy ending in a dot.
# NOTE(review): the declaration of $TLH, the non-array branch and the
# closing brace of the 'if' below are not visible in this listing.
84 # $Conf{'TLH'} is parsed as an array by Config::Auto;
85 # make a flat list again, separated by :
86 if (ref($Conf{'TLH'}) eq 'ARRAY') {
87 $TLH = join(':',@{$Conf{'TLH'}});
93 # add trailing dots if none are present yet
94 # (using negative look-behind assertions)
# first: insert '.' before every ':' that is not already preceded by '.'
95 $TLH =~ s/(?<!\.):/.:/g;
# then: append '.' at the end of the string unless it already ends in '.'
96 $TLH =~ s/(?<!\.)$/./;
97 # check for illegal characters
# (only letters, digits, ':', '+', '.' and '-' are accepted)
98 &Bleat(2,'Config error - illegal characters in TLH definition!')
99 if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
103 # reformat $TLH from a:b to (a)|(b),
104 # e.g. replace ':' by ')|('
# NOTE(review): the substitution that replaces ':' is not visible in this
# listing; the line below only adds the outer parentheses.
106 $TLH = '(' . $TLH . ')';
# Main work: for every month of the requested period, count postings per
# newsgroup and per hierarchy from the raw table, then write the counts to
# the groups table.
# NOTE(review): this listing has gaps. The closing braces, the declaration
# of %Postings, the counting statement and any --test guard around the
# write statements are not visible here.
# NOTE(review): meaning of the second InitDB argument (1) is not visible
# here - confirm in NewsStats.
111 my $DBHandle = InitDB(\%Conf,1);
113 ### get data for each month
114 &Bleat(1,'Test mode. Database is not updated.') if $OptTest;
115 foreach my $Month (&ListMonth($Period)) {
117 print "---------- $Month ----------\n" if $OptDebug;
119 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
120 # read list of newsgroups from --checkgroups
# The file name is "<template>-<month>", built by the sprintf below.
# NOTE(review): "my ... if COND" is a conditional declaration; perlsyn
# documents the result as undefined when the condition is false. Consider
# declaring %ValidGroups first and filling it in a separate statement.
122 my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
123 if $OptCheckgroupsFile;
125 ### ----------------------------------------------
126 ### get groups data (number of postings per group)
127 # get groups data from raw table for given month
# NOTE(review): the format string has two %s (database.table) but only the
# table argument is visible in this listing - the database argument line
# appears to be missing here.
128 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
129 "WHERE day LIKE ? AND NOT disregard",
131 $Conf{'DBTableRaw'}));
# The placeholder is bound to "<month>-%", so LIKE matches every 'day'
# value that starts with this month.
132 $DBQuery->execute($Month.'-%')
133 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
134 "$DBI::errstr\n",$Month,
135 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
137 # count postings per group
# Each fetched row has a single column (newsgroups), stored in $_.
# The loop ends when fetchrow_array returns an empty list.
139 while (($_) = $DBQuery->fetchrow_array) {
140 # get list of newsgroups and hierarchies from Newsgroups:
# The third argument is a reference to the list of valid groups when
# --checkgroups is set, otherwise an empty string.
141 my %Newsgroups = ListNewsgroups($_,$TLH,
142 $OptCheckgroupsFile ? \%ValidGroups : '');
143 # count each newsgroup and hierarchy once
144 foreach (sort keys %Newsgroups) {
149 # add valid but empty groups if --checkgroups is set
151 foreach (sort keys %ValidGroups) {
152 if (!defined($Postings{$_})) {
153 # add current newsgroup as empty group
155 warn (sprintf("ADDED: %s as empty group\n",$_));
156 # add empty hierarchies for current newsgroup as needed
# The inner foreach aliases $_ to each hierarchy returned by
# ParseHierarchies, shadowing the newsgroup name of the outer loop.
157 foreach (ParseHierarchies($_)) {
158 my $Hierarchy = $_ . '.ALL';
159 if (!defined($Postings{$Hierarchy})) {
160 $Postings{$Hierarchy} = 0;
161 warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
168 # delete old data for that month
# NOTE(review): DBI's do() returns the number of affected rows, not a
# statement handle, so $DBQuery holds a count after this statement. The
# bind-value argument line is not visible in this listing.
170 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
171 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
173 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
174 "$DBI::errstr\n",$Month,
175 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
178 print "----- GroupStats -----\n" if $OptDebug;
179 foreach my $Newsgroup (sort keys %Postings) {
180 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
# NOTE(review): the INSERT statement appears to be prepared again for
# every newsgroup; it could be prepared once per month if so - confirm
# against the full file. Part of the statement text is not visible here.
183 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
184 "(month,newsgroup,postings) ".
187 $Conf{'DBTableGrps'}));
# month, newsgroup name and posting count are passed as bind values
188 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
189 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
190 "$DBI::errstr\n",$Month,$Newsgroup,
191 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
196 # other types of information go here - later on
# close the database connection
201 $DBHandle->disconnect;
205 ################################ Documentation #################################
209 gatherstats - process statistical data from a raw source
213 B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
221 This script will extract and process statistical information from a
222 database table which is fed from F<feedlog.pl> for a given time period
223 and write its results to (an)other database table(s). Entries marked
224 with I<'disregard'> in the database will be ignored; currently, you
225 have to set this flag yourself, using your database management tools.
226 You can exclude erroneous entries that way (e.g. automatic reposts
227 (think of cancel floods and resurrectors); spam; ...).
229 The time period to act on defaults to last month; you can assign
230 another time period or a single month via the B<--month> option (see
233 By default B<gatherstats> will process all types of information; you
234 can change that using the B<--stats> option and assigning the type of
235 information to process. Currently that doesn't matter yet as only
236 processing of the number of postings per group per month is
239 Possible information types include:
243 =item B<groups> (postings per group per month)
245 B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
246 counted for each single group they appear in. Groups not in I<TLH>
249 B<gatherstats> will also add up the number of postings for each
250 hierarchy level, but only count each posting once. A posting to
251 de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
252 respectively. A crossposting to de.alt.test and de.alt.admin, on the
253 other hand, will be counted for de.alt.test and de.alt.admin each, but
254 only once for de.alt.ALL and de.ALL.
256 Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
257 override that default through the B<--groupsdb> option.
263 B<gatherstats> will read its configuration via Config::Auto from
264 F<newsstats.conf>, which should be present in the same directory.
266 See L<doc/INSTALL> for an overview of possible configuration options.
268 You can override configuration options via the B<--hierarchy>,
269 B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
276 =item B<-V>, B<--version>
278 Print out version and copyright information and exit.
280 =item B<-h>, B<--help>
282 Print this man page and exit.
284 =item B<-d>, B<--debug>
286 Output debugging information to STDOUT while processing (number of
289 =item B<-t>, B<--test>
291 Do not write results to database. You should use B<--debug> in
292 conjunction with B<--test> ... everything else seems a bit pointless.
294 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
296 Set processing period to a single month in YYYY-MM format or to a time
297 period between two months in YYYY-MM:YYYY-MM format (two months, separated
300 =item B<-s>, B<--stats> I<type>
302 Set processing type to one of I<all> and I<groups>. Defaults to all
303 (and is currently rather pointless as only I<groups> has been
306 =item B<-c>, B<--checkgroups> I<filename template>
308 Check each group against a list of valid newsgroups read from a file,
309 one group on each line and ignoring everything after the first
310 whitespace (so you can use a file in checkgroups format or (part of)
311 your INN active file).
313 The filename is taken from I<filename template>, amended by each
314 B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
317 gatherstats -m 2010-01:2010-12 -c checkgroups
319 will check against F<checkgroups-2010-01> for January 2010, against
320 F<checkgroups-2010-02> for February 2010 and so on.
322 Newsgroups not found in the checkgroups file will be dropped (and
323 logged to STDERR), and newsgroups found there but having no postings
324 will be added with a count of 0 (and logged to STDERR).
326 =item B<--hierarchy> I<TLH> (newsgroup hierarchy)
328 Override I<TLH> from F<newsstats.conf>.
330 =item B<--rawdb> I<table> (raw data table)
332 Override I<DBTableRaw> from F<newsstats.conf>.
334 =item B<--groupsdb> I<table> (postings per group table)
336 Override I<DBTableGrps> from F<newsstats.conf>.
338 =item B<--clientsdb> I<table> (client data table)
340 Override I<DBTableClnts> from F<newsstats.conf>.
342 =item B<--hostsdb> I<table> (host data table)
344 Override I<DBTableHosts> from F<newsstats.conf>.
346 =item B<--conffile> I<filename>
348 Load configuration from I<filename> instead of F<newsstats.conf>.
358 Process all types of information for last month:
362 Do a dry run, showing results of processing:
364 gatherstats --debug --test
366 Process all types of information for January of 2010:
368 gatherstats --month 2010-01
370 Process only number of postings for the year of 2010,
371 checking against checkgroups-*:
373 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
379 =item F<bin/gatherstats.pl>
383 =item F<lib/NewsStats.pm>
385 Library functions for the NewsStats package.
387 =item F<etc/newsstats.conf>
389 Runtime configuration file.
395 Please report any bugs or feature requests to the author or use the
396 bug tracker at L<http://bugs.th-h.de/>!
412 This script is part of the B<NewsStats> package.
416 Thomas Hochstein <thh@inter.net>
418 =head1 COPYRIGHT AND LICENSE
420 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
422 This program is free software; you may redistribute it and/or modify it
423 under the same terms as Perl itself.