5 # This script will gather statistical information from a database
6 # containing headers and other information from a INN feed.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
25 use Getopt::Long qw(GetOptions);
26 Getopt::Long::config ('bundling');
28 ################################# Definitions ##################################
30 # define types of information that can be gathered
31 # all / groups (/ clients / hosts)
33 @LegalStats{('all','groups')} = ();
35 ################################# Main program #################################
37 ### read commandline options
38 my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
39 $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
40 GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
41 'clientsdb=s' => \$OptClientsDB,
42 'd|debug!' => \$OptDebug,
43 'groupsdb=s' => \$OptGroupsDB,
44 'hierarchy=s' => \$OptTLH,
45 'hostsdb=s' => \$OptHostsDB,
46 'm|month=s' => \$OptMonth,
47 'rawdb=s' => \$OptRawDB,
48 's|stats=s' => \$OptStatsType,
49 't|test!' => \$OptTest,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
53 ### read configuration
54 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
56 ### override configuration via commandline options
58 $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
59 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
60 $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
61 $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
62 $ConfOverride{'TLH'} = $OptTLH if $OptTLH;
63 &OverrideConfig(\%Conf,\%ConfOverride);
65 ### get type of information to gather, defaulting to 'all'
66 $OptStatsType = 'all' if !$OptStatsType;
67 &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
68 if !exists($LegalStats{$OptStatsType});
70 ### get time period from --month
71 # get verbal description of time period, drop SQL code
72 my ($Period) = &GetTimePeriod($OptMonth);
73 &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
74 "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
76 ### reformat $Conf{'TLH'}
79 # $Conf{'TLH'} is parsed as an array by Config::Auto;
80 # make a flat list again, separated by :
81 if (ref($TLH) eq 'ARRAY') {
82 $TLH = join(':',@{$Conf{'TLH'}});
88 # check for illegal characters
89 &Bleat(2,'Config error - illegal characters in TLH definition!')
90 if ($TLH !~ /^[a-zA-Z0-9:]+$/);
92 # reformat $TLH from a:b to (a)|(b),
93 # e.g. replace '.' by '|'
95 $TLH = '(' . $TLH . ')';
99 # read list of newsgroups from --checkgroups
101 my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
104 my $DBHandle = InitDB(\%Conf,1);
106 ### get data for each month
107 &Bleat(1,'Test mode. Database is not updated.') if $OptTest;
108 foreach my $Month (&ListMonth($Period)) {
110 print "---------- $Month ----------\n" if $OptDebug;
112 if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
113 ### ----------------------------------------------
114 ### get groups data (number of postings per group)
115 # get groups data from raw table for given month
116 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
117 "WHERE day LIKE ? AND NOT disregard",
119 $Conf{'DBTableRaw'}));
120 $DBQuery->execute($Month.'-%')
121 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
122 "$DBI::errstr\n",$Month,
123 $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
125 # count postings per group
127 while (($_) = $DBQuery->fetchrow_array) {
128 # get list oft newsgroups and hierarchies from Newsgroups:
129 my %Newsgroups = ListNewsgroups($_,$TLH,
130 $OptCheckgroupsFile ? \%ValidGroups : '');
131 # count each newsgroup and hierarchy once
132 foreach (sort keys %Newsgroups) {
137 # add valid but empty groups if --checkgroups is set
139 foreach (sort keys %ValidGroups) {
140 if (!defined($Postings{$_})) {
142 warn (sprintf("ADDED: %s as empty group\n",$_));
147 # delete old data for that month
149 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
150 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
152 or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
153 "$DBI::errstr\n",$Month,
154 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
157 print "----- GroupStats -----\n" if $OptDebug;
158 foreach my $Newsgroup (sort keys %Postings) {
159 print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
162 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
163 "(month,newsgroup,postings) ".
166 $Conf{'DBTableGrps'}));
167 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
168 or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
169 "$DBI::errstr\n",$Month,$Newsgroup,
170 $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
175 # other types of information go here - later on
180 $DBHandle->disconnect;
184 ################################ Documentation #################################
188 gatherstats - process statistical data from a raw source
192 B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats] [B<-c> I<checkgroups file>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
200 This script will extract and process statistical information from a
201 database table which is fed from F<feedlog.pl> for a given time period
202 and write its results to (an)other database table(s). Entries marked
203 with I<'disregard'> in the database will be ignored; currently, you
204 have to set this flag yourself, using your database management tools.
205 You can exclude erroneous entries that way (e.g. automatic reposts
206 (think of cancels flood and resurrectors); spam; ...).
208 The time period to act on defaults to last month; you can assign
209 another time period or a single month via the B<--month> option (see
212 By default B<gatherstats> will process all types of information; you
213 can change that using the B<--stats> option and assigning the type of
214 information to process. Currently that doesn't matter yet as only
215 processing of the number of postings per group per month is
218 Possible information types include:
222 =item B<groups> (postings per group per month)
224 B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
225 counted for each single group they appear in. Groups not in I<TLH>
228 B<gatherstats> will also add up the number of postings for each
229 hierarchy level, but only count each posting once. A posting to
230 de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
231 respectively. A crossposting to de.alt.test and de.alt.admin, on the
232 other hand, will be counted for de.alt.test and de.alt.admin each, but
233 only once for de.alt.ALL and de.ALL.
235 Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
236 override that default through the B<--groupsdb> option.
242 B<gatherstats> will read its configuration from F<newsstats.conf>
243 which should be present in the same directory via Config::Auto.
245 See L<doc/INSTALL> for an overview of possible configuration options.
247 You can override configuration options via the B<--hierarchy>,
248 B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
255 =item B<-V>, B<--version>
257 Print out version and copyright information and exit.
259 =item B<-h>, B<--help>
261 Print this man page and exit.
263 =item B<-d>, B<--debug>
265 Output debugging information to STDOUT while processing (number of
268 =item B<-t>, B<--test>
270 Do not write results to database. You should use B<--debug> in
271 conjunction with B<--test> ... everything else seems a bit pointless.
273 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
275 Set processing period to a single month in YYYY-MM format or to a time
276 period between two month in YYYY-MM:YYYY-MM format (two month, separated
280 =item B<-s>, B<--stats> I<type>
282 Set processing type to one of I<all> and I<groups>. Defaults to all
283 (and is currently rather pointless as only I<groups> has been
286 =item B<-c>, B<--checkgroups> I<filename>
288 Check each group against a list of valid newsgroups read from
289 I<filename>, one group on each line and ignoring everything after the
290 first whitespace (so you can use a file in checkgroups format or (part
291 of) your INN active file).
293 Newsgroups not found in I<filename> will be dropped (and logged to
294 STDERR), and newsgroups found in I<filename> but having no postings
295 will be added with a count of 0 (and logged to STDERR).
297 =item B<--hierarchy> I<TLH> (newsgroup hierarchy)
299 Override I<TLH> from F<newsstats.conf>.
301 =item B<--rawdb> I<table> (raw data table)
303 Override I<DBTableRaw> from F<newsstats.conf>.
305 =item B<--groupsdb> I<table> (postings per group table)
307 Override I<DBTableGrps> from F<newsstats.conf>.
309 =item B<--clientsdb> I<table> (client data table)
311 Override I<DBTableClnts> from F<newsstats.conf>.
313 =item B<--hostsdb> I<table> (host data table)
315 Override I<DBTableHosts> from F<newsstats.conf>.
325 Process all types of information for lasth month:
329 Do a dry run, showing results of processing:
331 gatherstats --debug --test
333 Process all types of information for January of 2010:
335 gatherstats --month 2010-01
337 Process only number of postings for the year of 2010,
338 checking against checkgroups-2010.txt:
340 gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
346 =item F<gatherstats.pl>
350 =item F<NewsStats.pm>
352 Library functions for the NewsStats package.
354 =item F<newsstats.conf>
356 Runtime configuration file.
362 Please report any bugs or feature requests to the author or use the
363 bug tracker at L<http://bugs.th-h.de/>!
379 This script is part of the B<NewsStats> package.
383 Thomas Hochstein <thh@inter.net>
385 =head1 COPYRIGHT AND LICENSE
387 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
389 This program is free software; you may redistribute it and/or modify it
390 under the same terms as Perl itself.