5 # This script will gather statistical information from a database
6 # containing headers and other information from a INN feed.
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
26 ################################# Definitions ##################################
28 # define types of information that can be gathered
29 # all / groups (/ clients / hosts)
31 @LegalTypes{('all','groups')} = ();
33 ################################# Main program #################################
35 ### read commandline options
36 my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:');
38 ### read configuration
39 my %Conf = %{ReadConfig('newsstats.conf')};
41 ### override configuration via commandline options
43 $ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'};
44 $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'};
45 $ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'};
46 $ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'};
47 $ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'};
48 &OverrideConfig(\%Conf,\%ConfOverride);
50 ### get type of information to gather, defaulting to 'all'
51 $Options{'t'} = 'all' if !$Options{'t'};
52 die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}});
54 ### get time period (-m or -p)
55 my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
57 ### reformat $Conf{'TLH'}
60 # $Conf{'TLH'} is parsed as an array by Config::Auto;
61 # make a flat list again, separated by :
62 if (ref($TLH) eq 'ARRAY') {
63 $TLH = join(':',@{$Conf{'TLH'}});
69 # check for illegal characters
70 die "$MySelf: E: Config error - illegal characters in TLH definition\n" if ($TLH !~ /^[a-zA-Z0-9:]+$/);
72 # reformat $TLH form a:b to (a)|(b)
74 $TLH = '(' . $TLH . ')';
78 ### read newsgroups list from -l
79 my %ValidGroups = %{&ReadGroupList($Options{'l'})} if $Options{'l'};
82 my $DBHandle = InitDB(\%Conf,1);
84 ### get data for each month
85 warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'};
86 foreach my $Month (&ListMonth($StartMonth,$EndMonth)) {
88 print "---------- $Month ----------\n" if $Options{'d'};
90 if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') {
91 ### ----------------------------------------------
92 ### get groups data (number of postings per group)
93 # get groups data from raw table for given month
94 my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
95 $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
97 # count postings per group
99 while (($_) = $DBQuery->fetchrow_array) {
100 # get list oft newsgroups and hierarchies from Newsgroups:
101 my %Newsgroups = ListNewsgroups($_,$TLH,$Options{'l'} ? \%ValidGroups : '');
102 # count each newsgroup and hierarchy once
103 foreach (sort keys %Newsgroups) {
108 # add valid but empty groups if -l is set
110 foreach (sort keys %ValidGroups) {
111 if (!defined($Postings{$_})) {
113 warn (sprintf("ADDED: %s as empty group\n",$_));
118 # delete old data for that month
119 if (!$Options{'o'}) {
120 $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}),undef,$Month)
121 or warn sprintf("$MySelf: E: Can't delete old groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
124 print "----- GroupStats -----\n" if $Options{'d'};
125 foreach my $Newsgroup (sort keys %Postings) {
126 print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
127 if (!$Options{'o'}) {
129 $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
130 # $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
131 $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
136 # other types of information go here - later on
141 $DBHandle->disconnect;
145 ################################ Documentation #################################
149 gatherstats - process statistical data from a raw source
153 B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
157 See doc/README: Perl 5.8.x itself and the following modules from CPAN:
173 This script will extract and process statistical information from a
174 database table which is fed from F<feedlog.pl> for a given time period
175 and write its results to (an)other database table(s). Entries marked
176 with I<'disregard'> in the database will be ignored; currently, you have
177 to set this flag yourself, using your database management tools. You
178 can exclude erroneous entries that way (e.g. automatic reposts (think
179 of cancels flood and resurrectors); spam; ...).
181 The time period to act on defaults to last month; you can assign
182 another month via the B<-m> switch or a time period via the B<-p>
183 switch; the latter takes preference.
185 By default B<gatherstats> will process all types of information; you
186 can change that using the B<-t> switch and assigning the type of
187 information to process. Currently only processing of the number of
188 postings per group per month is implemented anyway, so that doesn't
191 Possible information types include:
195 =item B<groups> (postings per group per month)
197 B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
198 counted for each single group they appear in. Groups not in I<TLH>
201 B<gatherstats> will also add up the number of postings for each
202 hierarchy level, but only count each posting once. A posting to
203 de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
204 respectively. A crossposting to de.alt.test and de.alt.admin, on the
205 other hand, will be counted for de.alt.test and de.alt.admin each, but
206 only once for de.alt.ALL and de.ALL.
208 Data is written to I<DBTableGrps> (see doc/INSTALL).
214 F<gatherstats.pl> will read its configuration from F<newsstats.conf>
215 which should be present in the same directory via Config::Auto.
217 See doc/INSTALL for an overview of possible configuration options.
219 You can override configuration options via the B<-n>, B<-r>, B<-g>,
220 B<-c> and B<-s> switches, respectively.
226 =item B<-V> (version)
228 Print out version and copyright information on B<yapfaq> and exit.
232 Print this man page and exit.
236 Output debugging information to STDOUT while processing (number of
239 =item B<-o> (output only)
241 Do not write results to database. You should use B<-d> in conjunction
242 with B<-o> ... everything else seems a bit pointless.
244 =item B<-m> I<YYYY-MM> (month)
246 Set processing period to a month in YYYY-MM format. Ignored if B<-p>
249 =item B<-p> I<YYYY-MM:YYYY-MM> (period)
251 Set processing period to a time period between two month, each in
252 YYYY-MM format, separated by a colon. Overrides B<-m>.
254 =item B<-t> I<type> (type)
256 Set processing type to one of I<all> and I<groups>. Defaults to all
257 (and is currently rather pointless as only I<groups> has been
260 =item B<-l> I<filename> (check against list)
262 Check each group against a list of valid newsgroups read from
263 I<filename>, one group on each line and ignoring everything after the
264 first whitespace (so you can use a file in checkgroups format or (part
265 of) your INN active file).
267 Newsgroups not found in I<filename> will be dropped (and logged to
268 STDERR), and newsgroups found in I<filename> but having no postings
269 will be added with a count of 0 (and logged to STDERR).
271 =item B<-n> I<TLH> (newsgroup hierarchy)
273 Override I<TLH> from F<newsstats.conf>.
275 =item B<-r> I<table> (raw data table)
277 Override I<DBTableRaw> from F<newsstats.conf>.
279 =item B<-g> I<table> (postings per group table)
281 Override I<DBTableGrps> from F<newsstats.conf>.
283 =item B<-c> I<table> (client data table)
285 Override I<DBTableClnts> from F<newsstats.conf>.
287 =item B<-s> I<table> (server/host data table)
289 Override I<DBTableHosts> from F<newsstats.conf>.
299 Process all types of information for lasth month:
303 Do a dry run, showing results of processing:
307 Process all types of information for January of 2010:
309 gatherstats -m 2010-01
311 Process only number of postings for the year of 2010,
312 checking against checkgroups-2010.txt:
314 gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
320 =item F<gatherstats.pl>
324 =item F<NewsStats.pm>
326 Library functions for the NewsStats package.
328 =item F<newsstats.conf>
330 Runtime configuration file for B<yapfaq>.
336 Please report any bugs or feature requests to the author or use the
337 bug tracker at L<http://bugs.th-h.de/>!
353 This script is part of the B<NewsStats> package.
357 Thomas Hochstein <thh@inter.net>
359 =head1 COPYRIGHT AND LICENSE
361 Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
363 This program is free software; you may redistribute it and/or modify it
364 under the same terms as Perl itself.