5 # This script will gather statistical information from a database
6 # containing headers and other information from an INN feed.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
25 use Getopt::Long qw(GetOptions);
26 Getopt::Long::config ('bundling');
28 ################################# Definitions ##################################
30 # define types of information that can be gathered
31 # all / groups (/ clients / hosts)
# Hash-slice assignment of an empty list: this creates the keys 'all' and
# 'groups' in %LegalStats with undefined values. Only the existence of a key
# matters; the --stats argument is validated against it with exists() below.
33 @LegalStats{('all','groups')} = ();
35 ################################# Main program #################################
37 ### read commandline options
# Option specifications follow Getopt::Long conventions:
#   '=s'  the option takes a mandatory string value
#   '!'   the option is a negatable flag (e.g. --debug / --nodebug)
#   'x|long' declares a short alias for the long option name
# --help and --version are bound to the ShowPOD and ShowVersion callbacks
# instead of variables. If GetOptions reports a parsing failure the script
# exits with status 1.
38 my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
39 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
40 GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
41 'clientsdb=s' => \$OptClientsDB,
42 'd|debug!' => \$OptDebug,
43 'groupsdb=s' => \$OptGroupsDB,
44 'hierarchy=s' => \$OptTLH,
45 'hostsdb=s' => \$OptHostsDB,
46 'm|month=s' => \$OptMonth,
47 'rawdb=s' => \$OptRawDB,
48 's|stats=s' => \$OptStatsType,
49 't|test!' => \$OptTest,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
53 ### read configuration
# ReadConfig is expected to return a hash reference; it is dereferenced and
# copied into %Conf.
# NOTE(review): $HomePath is not defined in the visible part of this file -
# presumably exported by NewsStats; confirm.
54 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
56 ### override configuration via commandline options
# Each override is only recorded when the option holds a true value, so an
# empty string or '0' given on the command line does not override the
# configuration file.
58 $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
59 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
60 $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
61 $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
62 $ConfOverride{'TLH'} = $OptTLH if $OptTLH;
# Both hashes are passed by reference; presumably OverrideConfig merges the
# overrides into %Conf in place (defined outside this file - verify).
63 &OverrideConfig(\%Conf,\%ConfOverride);
65 ### get type of information to gather, defaulting to 'all'
66 $OptStatsType = 'all' if !$OptStatsType;
# Reject any --stats value that is not a key of %LegalStats.
# NOTE(review): Bleat(2, ...) is used for all fatal conditions in this
# script; presumably level 2 terminates the program - confirm in NewsStats.pm.
67 &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
68 if !exists($LegalStats{$OptStatsType});
70 ### get time period from --month
71 # get verbal description of time period, drop SQL code
# The list assignment keeps only the first value returned by GetTimePeriod;
# any further return values are discarded.
72 my ($Period) = &GetTimePeriod($OptMonth);
# A false result or the literal 'all time' is treated as an invalid --month
# argument, i.e. this script always requires a bounded period.
73 &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
74 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
76 ### reformat $Conf{'TLH'}
79 # $Conf{'TLH'} is parsed as an array by Config::Auto;
80 # make a flat list again, separated by :
81 if (ref($Conf{'TLH'}) eq 'ARRAY') {
82 $TLH = join(':',@{$Conf{'TLH'}});
88 # add trailing dots if none are present yet
89 # (using negative look-behind assertions)
# first pass: insert a '.' in front of every ':' that is not already
# preceded by one, e.g. 'de:at.' becomes 'de.:at.'
90 $TLH =~ s/(?<!\.):/.:/g;
# second pass: append a '.' at the end of the string unless it already
# ends in one
91 $TLH =~ s/(?<!\.)$/./;
92 # check for illegal characters
# only ASCII letters, digits, ':', '+', '.' and '-' are accepted; an empty
# value is rejected as well because at least one character is required
93 &Bleat(2,'Config error - illegal characters in TLH definition!')
94 if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
98 # reformat $TLH from a:b to (a)|(b),
99 # e.g. replace ':' by ')|('
# wrap the whole list in an outer pair of parentheses
101 $TLH = '(' . $TLH . ')';
# Open the database connection described by %Conf.
# NOTE(review): the meaning of the second argument (1) is defined by InitDB
# outside this file - confirm before changing it.
106 my $DBHandle = InitDB(\%Conf,1);
108 ### get data for each month
109 &Bleat(1,'Test mode. Database is not updated.') if $OptTest;
# one iteration per month returned by ListMonth for the requested period
110 foreach my $Month (&ListMonth($Period)) {
112 print "---------- $Month ----------\n" if $OptDebug;
114 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
115 # read list of newsgroups from --checkgroups
# The file name is built as '<template>-<month>'.
# NOTE(review): 'my ... if COND' is a conditional declaration; perlsyn
# documents its behaviour as undefined when COND is false. Consider declaring
# %ValidGroups unconditionally and filling it only when the option is set.
117 my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
118 if $OptCheckgroupsFile;
120 ### ----------------------------------------------
121 ### get groups data (number of postings per group)
122 # get groups data from raw table for given month
# The month is bound to the LIKE placeholder as '<month>-%', so every row
# whose 'day' value starts with that month is selected; rows flagged
# 'disregard' are excluded by the query itself.
123 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
124 "WHERE day LIKE ? AND NOT disregard",
126 $Conf{'DBTableRaw'}));
# NOTE(review): here and in the error messages below, $DBI::errstr is
# interpolated into the sprintf *format* string; a '%' in the driver message
# would be interpreted as a conversion. Passing it as a '%s' argument would
# be safer.
127 $DBQuery->execute($Month.'-%')
128 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
129 "$DBI::errstr\n",$Month,
130 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
132 # count postings per group
# each fetched row has a single column (newsgroups), which is assigned to $_
134 while (($_) = $DBQuery->fetchrow_array) {
135 # get list of newsgroups and hierarchies from Newsgroups:
# the reference to %ValidGroups is only passed when --checkgroups was given;
# otherwise an empty string is passed instead
136 my %Newsgroups = ListNewsgroups($_,$TLH,
137 $OptCheckgroupsFile ? \%ValidGroups : '');
138 # count each newsgroup and hierarchy once
139 foreach (sort keys %Newsgroups) {
144 # add valid but empty groups if --checkgroups is set
146 foreach (sort keys %ValidGroups) {
147 if (!defined($Postings{$_})) {
149 warn (sprintf("ADDED: %s as empty group\n",$_));
154 # delete old data for that month
156 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
157 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
159 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
160 "$DBI::errstr\n",$Month,
161 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
164 print "----- GroupStats -----\n" if $OptDebug;
165 foreach my $Newsgroup (sort keys %Postings) {
166 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
# NOTE(review): the INSERT statement is prepared again for every newsgroup
# although it does not depend on the loop variable; it could be prepared once
# before the loop and only executed here.
169 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
170 "(month,newsgroup,postings) ".
173 $Conf{'DBTableGrps'}));
174 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
175 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
176 "$DBI::errstr\n",$Month,$Newsgroup,
177 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
182 # other types of information go here - later on
187 $DBHandle->disconnect;
191 ################################ Documentation #################################
195 gatherstats - process statistical data from a raw source
199 B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
207 This script will extract and process statistical information from a
208 database table which is fed from F<feedlog.pl> for a given time period
209 and write its results to (an)other database table(s). Entries marked
210 with I<'disregard'> in the database will be ignored; currently, you
211 have to set this flag yourself, using your database management tools.
212 You can exclude erroneous entries that way (e.g. automatic reposts
213 (think of cancel floods and resurrectors); spam; ...).
215 The time period to act on defaults to last month; you can assign
216 another time period or a single month via the B<--month> option (see
219 By default B<gatherstats> will process all types of information; you
220 can change that using the B<--stats> option and assigning the type of
221 information to process. Currently that doesn't matter yet as only
222 processing of the number of postings per group per month is
225 Possible information types include:
229 =item B<groups> (postings per group per month)
231 B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
232 counted for each single group they appear in. Groups not in I<TLH>
235 B<gatherstats> will also add up the number of postings for each
236 hierarchy level, but only count each posting once. A posting to
237 de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
238 respectively. A crossposting to de.alt.test and de.alt.admin, on the
239 other hand, will be counted for de.alt.test and de.alt.admin each, but
240 only once for de.alt.ALL and de.ALL.
242 Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
243 override that default through the B<--groupsdb> option.
249 B<gatherstats> will read its configuration from F<newsstats.conf>,
250 which should be present in the same directory, via Config::Auto.
252 See L<doc/INSTALL> for an overview of possible configuration options.
254 You can override configuration options via the B<--hierarchy>,
255 B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
262 =item B<-V>, B<--version>
264 Print out version and copyright information and exit.
266 =item B<-h>, B<--help>
268 Print this man page and exit.
270 =item B<-d>, B<--debug>
272 Output debugging information to STDOUT while processing (number of
275 =item B<-t>, B<--test>
277 Do not write results to database. You should use B<--debug> in
278 conjunction with B<--test> ... everything else seems a bit pointless.
280 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
282 Set processing period to a single month in YYYY-MM format or to a time
283 period between two months in YYYY-MM:YYYY-MM format (two months, separated
287 =item B<-s>, B<--stats> I<type>
289 Set processing type to one of I<all> and I<groups>. Defaults to all
290 (and is currently rather pointless as only I<groups> has been
293 =item B<-c>, B<--checkgroups> I<filename template>
295 Check each group against a list of valid newsgroups read from a file,
296 one group on each line and ignoring everything after the first
297 whitespace (so you can use a file in checkgroups format or (part of)
298 your INN active file).
300 The filename is taken from I<filename template>, amended by each
301 B<--month> B<gatherstats> is processing, so that
303 gatherstats -m 2010-01:2010-12 -c checkgroups
305 will check against F<checkgroups-2010-01> for January 2010, against
306 F<checkgroups-2010-02> for February 2010 and so on.
308 Newsgroups not found in the checkgroups file will be dropped (and
309 logged to STDERR), and newsgroups found there but having no postings
310 will be added with a count of 0 (and logged to STDERR).
312 =item B<--hierarchy> I<TLH> (newsgroup hierarchy)
314 Override I<TLH> from F<newsstats.conf>.
316 =item B<--rawdb> I<table> (raw data table)
318 Override I<DBTableRaw> from F<newsstats.conf>.
320 =item B<--groupsdb> I<table> (postings per group table)
322 Override I<DBTableGrps> from F<newsstats.conf>.
324 =item B<--clientsdb> I<table> (client data table)
326 Override I<DBTableClnts> from F<newsstats.conf>.
328 =item B<--hostsdb> I<table> (host data table)
330 Override I<DBTableHosts> from F<newsstats.conf>.
340 Process all types of information for last month:
344 Do a dry run, showing results of processing:
346 gatherstats --debug --test
348 Process all types of information for January of 2010:
350 gatherstats --month 2010-01
352 Process only number of postings for the year of 2010,
353 checking against checkgroups-*:
355 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
361 =item F<gatherstats.pl>
365 =item F<NewsStats.pm>
367 Library functions for the NewsStats package.
369 =item F<newsstats.conf>
371 Runtime configuration file.
377 Please report any bugs or feature requests to the author or use the
378 bug tracker at L<http://bugs.th-h.de/>!
394 This script is part of the B<NewsStats> package.
398 Thomas Hochstein <thh@inter.net>
400 =head1 COPYRIGHT AND LICENSE
402 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
404 This program is free software; you may redistribute it and/or modify it
405 under the same terms as Perl itself.