5 # This script will gather statistical information from a database
6 # containing headers and other information from an INN feed.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
# Version of this script.
16 our $VERSION = "0.01";
# Append the directory containing the script to the module search path.
# NOTE(review): dirname() is called here, but no "use File::Basename" is
# visible in this section - confirm it is imported before this line.
18 push(@INC, dirname($0));
# Project library: default exports, the :TimePeriods group and the
# newsgroup/hierarchy helpers that are called further down.
23 use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
# Command line parsing; "bundling" lets single-letter options be combined
# (e.g. -dt).
26 use Getopt::Long qw(GetOptions);
27 Getopt::Long::config ('bundling');
29 ################################# Definitions ##################################
31 # define types of information that can be gathered
32 # all / groups (/ clients / hosts)
# Hash slice assignment: creates the keys 'all' and 'groups' with undefined
# values, so the hash works as a lookup set that is queried with exists().
# NOTE(review): %LegalStats is not declared in the visible code - confirm
# that a declaration precedes this line.
34 @LegalStats{('all','groups')} = ();
36 ################################# Main program #################################
38 ### read commandline options
# One variable per command line option; each one stays undefined unless the
# corresponding option is given.
39 my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
40 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
# Option specs: "=s" requires a string value, "!" makes a flag negatable
# (--nodebug, --notest), "a|b" declares alternative names.
# --help and --version invoke the callbacks ShowPOD and ShowVersion, which
# are not defined in this section - presumably exported by NewsStats; confirm.
# If option parsing fails, the script terminates with exit status 1.
41 GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
42 'clientsdb=s' => \$OptClientsDB,
43 'd|debug!' => \$OptDebug,
44 'groupsdb=s' => \$OptGroupsDB,
45 'hierarchy=s' => \$OptTLH,
46 'hostsdb=s' => \$OptHostsDB,
47 'm|month=s' => \$OptMonth,
48 'rawdb=s' => \$OptRawDB,
49 's|stats=s' => \$OptStatsType,
50 't|test!' => \$OptTest,
51 'h|help' => \&ShowPOD,
52 'V|version' => \&ShowVersion) or exit 1;
54 ### read configuration
# The value returned by ReadConfig() is dereferenced as a hash and copied
# into %Conf.
# NOTE(review): $HomePath is not set in this section - presumably exported
# by NewsStats; confirm.
55 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
57 ### override configuration via commandline options
# Only options that were given with a true value are collected.
# NOTE(review): %ConfOverride is not declared in the visible code - confirm
# that a declaration precedes these lines.
59 $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
60 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
61 $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
62 $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
63 $ConfOverride{'TLH'} = $OptTLH if $OptTLH;
# Both hashes are handed to OverrideConfig() by reference.
64 &OverrideConfig(\%Conf,\%ConfOverride);
66 ### get type of information to gather, defaulting to 'all'
67 $OptStatsType = 'all' if !$OptStatsType;
# Reject any type that is not a key of %LegalStats.
# NOTE(review): Bleat() is not defined in this section; level 2 is presumably
# fatal, since the code below relies on a valid type - confirm in NewsStats.
68 &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
69 if !exists($LegalStats{$OptStatsType});
71 ### get time period from --month
72 # get verbal description of time period, drop SQL code
# The parentheses around $Period force list context, so only the first
# element returned by GetTimePeriod() is kept.
73 my ($Period) = &GetTimePeriod($OptMonth);
# A false result or the literal 'all time' is reported as an invalid --month
# value.
74 &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
75 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
77 ### reformat $Conf{'TLH'}
# NOTE(review): $TLH is not declared in the visible code, and neither the
# closing brace of the following "if" nor a branch for a scalar (non-array)
# TLH value is visible in this section - confirm against the complete file.
80 # $Conf{'TLH'} is parsed as an array by Config::Auto;
81 # make a flat list again, separated by :
82 if (ref($Conf{'TLH'}) eq 'ARRAY') {
83 $TLH = join(':',@{$Conf{'TLH'}});
89 # add trailing dots if none are present yet
90 # (using negative look-behind assertions)
# first substitution: in front of every ':' separator that is not already
# preceded by a dot; second substitution: at the end of the string
91 $TLH =~ s/(?<!\.):/.:/g;
92 $TLH =~ s/(?<!\.)$/./;
93 # check for illegal characters
# allowed: ASCII letters, digits, ':', '+', '.' and '-'
94 &Bleat(2,'Config error - illegal characters in TLH definition!')
95 if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
99 # reformat $TLH from a:b to (a)|(b),
100 # i.e. replace ':' by ')|('
# NOTE(review): only the outer parentheses are added by the statement below;
# the substitution of ':' described above is not visible in this section.
102 $TLH = '(' . $TLH . ')';
# Main processing: for every month of the requested period, count postings
# per newsgroup/hierarchy from the raw table and store the counts in the
# groups table.
# NOTE(review): several closing braces, loop bodies and statement parts are
# not visible in this section; the notes below mark the places to confirm
# against the complete file.
# NOTE(review): the meaning of the second argument (1) to InitDB() cannot be
# told from here - see NewsStats.
107 my $DBHandle = InitDB(\%Conf,1);
109 ### get data for each month
# NOTE(review): test mode is announced here, but no guard that skips the
# DELETE/INSERT statements below is visible in this section - confirm.
110 &Bleat(1,'Test mode. Database is not updated.') if $OptTest;
# One iteration per element returned by ListMonth() for $Period.
111 foreach my $Month (&ListMonth($Period)) {
113 print "---------- $Month ----------\n" if $OptDebug;
115 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
116 # read list of newsgroups from --checkgroups
# The file name is the --checkgroups value with "-<month>" appended.
# NOTE(review): "my ... if ..." has undefined behaviour according to perlsyn
# when the condition is false; consider declaring %ValidGroups first and
# assigning conditionally.
118 my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
119 if $OptCheckgroupsFile;
121 ### ----------------------------------------------
122 ### get groups data (number of postings per group)
123 # get groups data from raw table for given month
# NOTE(review): the format string has two %s placeholders, but only one
# argument ($Conf{'DBTableRaw'}) is visible in this section - confirm that
# the database name is passed as well.
124 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
125 "WHERE day LIKE ? AND NOT disregard",
127 $Conf{'DBTableRaw'}));
# The bound pattern "<month>-%" selects every day value starting with the
# month.
# NOTE(review): $DBI::errstr is interpolated into the sprintf format string
# (here and in the two error messages further down); a '%' in the error text
# would be treated as a format directive.
128 $DBQuery->execute($Month.'-%')
129 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
130 "$DBI::errstr\n",$Month,
131 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
133 # count postings per group
# NOTE(review): %Postings is not declared in the visible code - confirm it
# is declared (and reset) once per month.
# The list assignment yields the number of fetched columns in the loop
# condition, so the loop ends when no row is left; the global $_ receives the
# newsgroups column.
135 while (($_) = $DBQuery->fetchrow_array) {
136 # get list of newsgroups and hierarchies from Newsgroups:
# The third argument is a reference to the list of valid groups, or an empty
# string when --checkgroups is not in use.
137 my %Newsgroups = ListNewsgroups($_,$TLH,
138 $OptCheckgroupsFile ? \%ValidGroups : '');
139 # count each newsgroup and hierarchy once
# NOTE(review): the body of this loop and the closing braces of both loops
# are not visible in this section.
140 foreach (sort keys %Newsgroups) {
145 # add valid but empty groups if --checkgroups is set
# NOTE(review): no check of $OptCheckgroupsFile is visible around this loop -
# confirm that the guard exists in the complete file.
147 foreach (sort keys %ValidGroups) {
148 if (!defined($Postings{$_})) {
149 # add current newsgroup as empty group
# NOTE(review): the assignment that creates the empty entry for the group is
# not visible in this section; only the log message is.
151 warn (sprintf("ADDED: %s as empty group\n",$_));
152 # add empty hierarchies for current newsgroup as needed
# Each element returned by ParseHierarchies() gets an entry "<element>.ALL"
# with a count of 0 unless such an entry is already defined.
153 foreach (ParseHierarchies($_)) {
154 my $Hierarchy = $_ . '.ALL';
155 if (!defined($Postings{$Hierarchy})) {
156 $Postings{$Hierarchy} = 0;
157 warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
164 # delete old data for that month
# $DBQuery is reused here for the return value of do(), not for a statement
# handle.
# NOTE(review): the arguments following the trailing comma (attributes and
# the bind value for "month = ?") are not visible in this section.
166 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
167 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
169 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
170 "$DBI::errstr\n",$Month,
171 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
174 print "----- GroupStats -----\n" if $OptDebug;
# Write one row per counted newsgroup/hierarchy, in sorted key order.
175 foreach my $Newsgroup (sort keys %Postings) {
176 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
# NOTE(review): the statement is prepared again for every newsgroup although
# its text does not depend on the loop variable; the VALUES clause and the
# database name argument are not visible in this section.
179 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
180 "(month,newsgroup,postings) ".
183 $Conf{'DBTableGrps'}));
# Bind values in column order: month, newsgroup, postings.
184 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
185 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
186 "$DBI::errstr\n",$Month,$Newsgroup,
187 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
192 # other types of information go here - later on
# Close the database connection.
197 $DBHandle->disconnect;
201 ################################ Documentation #################################
205 gatherstats - process statistical data from a raw source
209 B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
217 This script will extract and process statistical information from a
218 database table which is fed from F<feedlog.pl> for a given time period
219 and write its results to (an)other database table(s). Entries marked
220 with I<'disregard'> in the database will be ignored; currently, you
221 have to set this flag yourself, using your database management tools.
222 You can exclude erroneous entries that way (e.g. automatic reposts
223 (think of cancel floods and resurrectors); spam; ...).
225 The time period to act on defaults to last month; you can assign
226 another time period or a single month via the B<--month> option (see
229 By default B<gatherstats> will process all types of information; you
230 can change that using the B<--stats> option and assigning the type of
231 information to process. Currently that doesn't matter yet as only
232 processing of the number of postings per group per month is
235 Possible information types include:
239 =item B<groups> (postings per group per month)
241 B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
242 counted for each single group they appear in. Groups not in I<TLH>
245 B<gatherstats> will also add up the number of postings for each
246 hierarchy level, but only count each posting once. A posting to
247 de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
248 respectively. A crossposting to de.alt.test and de.alt.admin, on the
249 other hand, will be counted for de.alt.test and de.alt.admin each, but
250 only once for de.alt.ALL and de.ALL.
252 Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
253 override that default through the B<--groupsdb> option.
259 B<gatherstats> will read its configuration via Config::Auto from
260 F<newsstats.conf>, which should be present in the same directory.
262 See L<doc/INSTALL> for an overview of possible configuration options.
264 You can override configuration options via the B<--hierarchy>,
265 B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
272 =item B<-V>, B<--version>
274 Print out version and copyright information and exit.
276 =item B<-h>, B<--help>
278 Print this man page and exit.
280 =item B<-d>, B<--debug>
282 Output debugging information to STDOUT while processing (number of
285 =item B<-t>, B<--test>
287 Do not write results to database. You should use B<--debug> in
288 conjunction with B<--test> ... everything else seems a bit pointless.
290 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
292 Set processing period to a single month in YYYY-MM format or to a time
293 period between two months in YYYY-MM:YYYY-MM format (two months, separated
297 =item B<-s>, B<--stats> I<type>
299 Set processing type to one of I<all> and I<groups>. Defaults to all
300 (and is currently rather pointless as only I<groups> has been
303 =item B<-c>, B<--checkgroups> I<filename template>
305 Check each group against a list of valid newsgroups read from a file,
306 one group on each line and ignoring everything after the first
307 whitespace (so you can use a file in checkgroups format or (part of)
308 your INN active file).
310 The filename is taken from I<filename template>, amended by each
311 B<--month> B<gatherstats> is processing, so that
313 gatherstats -m 2010-01:2010-12 -c checkgroups
315 will check against F<checkgroups-2010-01> for January 2010, against
316 F<checkgroups-2010-02> for February 2010 and so on.
318 Newsgroups not found in the checkgroups file will be dropped (and
319 logged to STDERR), and newsgroups found there but having no postings
320 will be added with a count of 0 (and logged to STDERR).
322 =item B<--hierarchy> I<TLH> (newsgroup hierarchy)
324 Override I<TLH> from F<newsstats.conf>.
326 =item B<--rawdb> I<table> (raw data table)
328 Override I<DBTableRaw> from F<newsstats.conf>.
330 =item B<--groupsdb> I<table> (postings per group table)
332 Override I<DBTableGrps> from F<newsstats.conf>.
334 =item B<--clientsdb> I<table> (client data table)
336 Override I<DBTableClnts> from F<newsstats.conf>.
338 =item B<--hostsdb> I<table> (host data table)
340 Override I<DBTableHosts> from F<newsstats.conf>.
350 Process all types of information for last month:
354 Do a dry run, showing results of processing:
356 gatherstats --debug --test
358 Process all types of information for January of 2010:
360 gatherstats --month 2010-01
362 Process only number of postings for the year of 2010,
363 checking against checkgroups-*:
365 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
371 =item F<gatherstats.pl>
375 =item F<NewsStats.pm>
377 Library functions for the NewsStats package.
379 =item F<newsstats.conf>
381 Runtime configuration file.
387 Please report any bugs or feature requests to the author or use the
388 bug tracker at L<http://bugs.th-h.de/>!
404 This script is part of the B<NewsStats> package.
408 Thomas Hochstein <thh@inter.net>
410 =head1 COPYRIGHT AND LICENSE
412 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
414 This program is free software; you may redistribute it and/or modify it
415 under the same terms as Perl itself.