Commit | Line | Data |
---|---|---|
3f817eb4 | 1 | #! /usr/bin/perl |
2832c235 TH |
2 | # |
3 | # gatherstats.pl | |
4 | # | |
5 | # This script will gather statistical information from a database | |
6 | # containing headers and other information from an INN feed. | |
dfc2b81c | 7 | # |
2832c235 TH |
8 | # It is part of the NewsStats package. |
9 | # | |
07c0b258 | 10 | # Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net> |
2832c235 | 11 | # |
dfc2b81c | 12 | # It can be redistributed and/or modified under the same terms under |
2832c235 TH |
13 | # which Perl itself is published. |
14 | ||
BEGIN {
  # Version of this script (stored in the package variable $VERSION).
  our $VERSION = "0.02";
  use File::Basename;
  # The script lives in .../bin, so the NewsStats module is expected in the
  # sibling directory ../lib; make that directory searchable before the
  # 'use NewsStats' further down is compiled.
  my $LibDir = dirname($0) . '/../lib';
  push @INC, $LibDir;
}
21 | use strict; | |
3f817eb4 | 22 | use warnings; |
2832c235 | 23 | |
1703b8e3 | 24 | use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList); |
2832c235 TH |
25 | |
26 | use DBI; | |
880c3eb2 TH |
27 | use Getopt::Long qw(GetOptions); |
28 | Getopt::Long::config ('bundling'); | |
2832c235 TH |
29 | |
30 | ################################# Definitions ################################## | |
31 | ||
# Types of information that can be gathered:
# all / groups (/ clients / hosts)
# Only the keys are significant; every value is undef and the hash is
# consulted with exists() when validating --stats.
my %LegalStats = map { $_ => undef } qw(all groups);
2832c235 TH |
36 | |
37 | ################################# Main program ################################# | |
38 | ||
### read commandline options
# One lexical per option; a variable stays undef unless its option is given.
my ($OptCheckgroupsFile, $OptClientsDB, $OptDebug, $OptGroupsDB, $OptTLH,
    $OptHostsDB, $OptMonth, $OptRawDB, $OptStatsType, $OptTest, $OptConfFile);
GetOptions(
  'c|checkgroups=s' => \$OptCheckgroupsFile,
  'clientsdb=s'     => \$OptClientsDB,
  'd|debug!'        => \$OptDebug,
  'groupsdb=s'      => \$OptGroupsDB,
  'hierarchy=s'     => \$OptTLH,
  'hostsdb=s'       => \$OptHostsDB,
  'm|month=s'       => \$OptMonth,
  'rawdb=s'         => \$OptRawDB,
  's|stats=s'       => \$OptStatsType,
  't|test!'         => \$OptTest,
  'conffile=s'      => \$OptConfFile,
  'h|help'          => \&ShowPOD,
  'V|version'       => \&ShowVersion,
) or exit 1;

### read configuration
my %Conf = %{ ReadConfig($OptConfFile) };

### override configuration via commandline options
# Map each configuration key to the option that may replace it; only
# options carrying a true value are handed on to OverrideConfig().
my %OptionFor = (
  'DBTableRaw'   => $OptRawDB,
  'DBTableGrps'  => $OptGroupsDB,
  'DBTableClnts' => $OptClientsDB,
  'DBTableHosts' => $OptHostsDB,
  'TLH'          => $OptTLH,
);
my %ConfOverride = map  { $_ => $OptionFor{$_} }
                   grep { $OptionFor{$_} }
                   keys %OptionFor;
&OverrideConfig(\%Conf, \%ConfOverride);

### get type of information to gather, defaulting to 'all'
$OptStatsType ||= 'all';
unless (exists($LegalStats{$OptStatsType})) {
  &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType));
}

### get time period from --month
# get verbal description of time period, drop SQL code
# (only the first element of the returned list is kept)
my ($Period) = &GetTimePeriod($OptMonth);
# bail out if --month is invalid or set to 'ALL';
# we don't support the latter
if (!$Period or $Period eq 'all time') {
  &Bleat(2, "--month option has an invalid format - please use 'YYYY-MM' or ".
            "'YYYY-MM:YYYY-MM'!");
}
2832c235 | 80 | |
### reformat $Conf{'TLH'}
# Turns the configured hierarchy list into a regular expression fragment:
# "a:b" becomes "(a\.)|(b\.)", a single entry "a" becomes "a\.".
# $TLH stays undef when no TLH is configured.
my $TLH;
if ($Conf{'TLH'}) {
  # $Conf{'TLH'} is parsed as an array by Config::Auto;
  # make a flat list again, separated by :
  $TLH = ref($Conf{'TLH'}) eq 'ARRAY'
       ? join(':', @{ $Conf{'TLH'} })
       : $Conf{'TLH'};
  # strip whitespace
  $TLH =~ s/\s//g;
  # make sure every entry ends in a dot: first the entries followed by a
  # colon, then the last one (negative look-behind skips existing dots)
  $TLH =~ s/(?<!\.):/.:/g;
  $TLH =~ s/(?<!\.)$/./;
  # check for illegal characters
  unless ($TLH =~ /^[a-zA-Z0-9:+.-]+$/) {
    &Bleat(2,'Config error - illegal characters in TLH definition!');
  }
  # escape dots so they match literally
  $TLH =~ s/\./\\./g;
  # more than one entry: reformat from a:b to (a)|(b),
  # e.g. replace ':' by ')|(' and wrap the result in parentheses
  if ($TLH =~ /:/) {
    $TLH =~ s/:/)|(/g;
    $TLH = '(' . $TLH . ')';
  }
}
109 | ||
### init database
my $DBHandle = InitDB(\%Conf,1);

### get data for each month
# For every month in $Period: count postings per newsgroup / hierarchy from
# the raw table, optionally add valid-but-empty groups from the checkgroups
# file, and replace that month's rows in the groups table (unless --test).
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    # read list of newsgroups from --checkgroups into a hash
    # (declaration and conditional assignment are kept separate: the
    #  behaviour of "my ... if COND" is undefined, see perlsyn)
    my %ValidGroups;
    if ($OptCheckgroupsFile) {
      %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))};
    }

    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableRaw'}))
      or &Bleat(2,sprintf("Can't prepare query for groups data for %s from ".
                          "%s.%s: %s\n",$Month,
                          $Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));
    # $DBI::errstr is passed as an argument, not interpolated into the
    # format, so a '%' in the driver's message cannot garble the output
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
                          "%s\n",$Month,
                          $Conf{'DBDatabase'},$Conf{'DBTableRaw'},
                          $DBI::errstr));

    # count postings per group
    my %Postings;
    while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) {
      # get list of newsgroups and hierarchies from Newsgroups:
      my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$TLH,
                                      $OptCheckgroupsFile ? \%ValidGroups : '');
      # count each newsgroup and hierarchy once
      $Postings{$_}++ foreach keys %Newsgroups;
    }

    # add valid but empty groups if --checkgroups is set
    if (%ValidGroups) {
      foreach my $ValidGroup (sort keys %ValidGroups) {
        next if defined($Postings{$ValidGroup});
        # add current newsgroup as empty group
        $Postings{$ValidGroup} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$ValidGroup));
        # add empty hierarchies for current newsgroup as needed
        foreach my $Level (ParseHierarchies($ValidGroup)) {
          my $Hierarchy = $Level . '.ALL';
          next if defined($Postings{$Hierarchy});
          $Postings{$Hierarchy} = 0;
          warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
        }
      }
    }

    # statement handle for writing results; stays undef in test mode
    my $DBInsert;
    if (!$OptTest) {
      # delete old data for that month
      $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
                    undef,$Month)
        or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
                            "%s\n",$Month,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
      # prepare the INSERT once per month instead of once per newsgroup
      $DBInsert = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
                                             "(month,newsgroup,postings) ".
                                             "VALUES (?, ?, ?)",
                                             $Conf{'DBDatabase'},
                                             $Conf{'DBTableGrps'}))
        or &Bleat(2,sprintf("Can't prepare writing groups data for %s to ".
                            "%s.%s: %s\n",$Month,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    }

    print "----- GroupStats -----\n" if $OptDebug;
    foreach my $Newsgroup (sort keys %Postings) {
      print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
      next if $OptTest;
      # write to database
      $DBInsert->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
                            "%s\n",$Month,$Newsgroup,
                            $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
                            $DBI::errstr));
    }
    $DBInsert->finish if $DBInsert;
  } else {
    # other types of information go here - later on
  }
}

### close handles
$DBHandle->disconnect;
202 | ||
203 | __END__ | |
204 | ||
205 | ################################ Documentation ################################# | |
206 | ||
207 | =head1 NAME | |
208 | ||
209 | gatherstats - process statistical data from a raw source | |
210 | ||
211 | =head1 SYNOPSIS | |
212 | ||
e39d4207 | 213 | B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>] |
2832c235 TH |
214 | |
215 | =head1 REQUIREMENTS | |
216 | ||
880c3eb2 | 217 | See L<doc/README>. |
2832c235 TH |
218 | |
219 | =head1 DESCRIPTION | |
220 | ||
221 | This script will extract and process statistical information from a | |
222 | database table which is fed from F<feedlog.pl> for a given time period | |
313610f6 | 223 | and write its results to (an)other database table(s). Entries marked |
880c3eb2 TH |
224 | with I<'disregard'> in the database will be ignored; currently, you |
225 | have to set this flag yourself, using your database management tools. | |
226 | You can exclude erroneous entries that way (e.g. automatic reposts | |
227 | (think of cancels flood and resurrectors); spam; ...). | |
2832c235 TH |
228 | |
229 | The time period to act on defaults to last month; you can assign | |
880c3eb2 TH |
230 | another time period or a single month via the B<--month> option (see |
231 | below). | |
2832c235 TH |
232 | |
233 | By default B<gatherstats> will process all types of information; you | |
880c3eb2 TH |
234 | can change that using the B<--stats> option and assigning the type of |
235 | information to process. Currently that doesn't matter yet as only | |
236 | processing of the number of postings per group per month is | |
237 | implemented anyway. | |
2832c235 TH |
238 | |
239 | Possible information types include: | |
240 | ||
241 | =over 3 | |
242 | ||
243 | =item B<groups> (postings per group per month) | |
244 | ||
245 | B<gatherstats> will examine Newsgroups: headers. Crosspostings will be | |
246 | counted for each single group they appear in. Groups not in I<TLH> | |
247 | will be ignored. | |
248 | ||
249 | B<gatherstats> will also add up the number of postings for each | |
250 | hierarchy level, but only count each posting once. A posting to | |
251 | de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, | |
252 | respectively. A crossposting to de.alt.test and de.alt.admin, on the | |
253 | other hand, will be counted for de.alt.test and de.alt.admin each, but | |
254 | only once for de.alt.ALL and de.ALL. | |
255 | ||
880c3eb2 TH |
256 | Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can |
257 | override that default through the B<--groupsdb> option. | |
2832c235 TH |
258 | |
259 | =back | |
260 | ||
261 | =head2 Configuration | |
262 | ||
880c3eb2 | 263 | B<gatherstats> will read its configuration from F<newsstats.conf> |
44c19709 TH |
264 | which should be present in etc/ via Config::Auto or from a configuration file |
265 | submitted by the B<--conffile> option. | |
2832c235 | 266 | |
880c3eb2 | 267 | See L<doc/INSTALL> for an overview of possible configuration options. |
2832c235 | 268 | |
880c3eb2 TH |
269 | You can override configuration options via the B<--hierarchy>, |
270 | B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, | |
271 | respectively. | |
2832c235 TH |
272 | |
273 | =head1 OPTIONS | |
274 | ||
275 | =over 3 | |
276 | ||
880c3eb2 | 277 | =item B<-V>, B<--version> |
2832c235 | 278 | |
880c3eb2 | 279 | Print out version and copyright information and exit. |
2832c235 | 280 | |
880c3eb2 | 281 | =item B<-h>, B<--help> |
2832c235 TH |
282 | |
283 | Print this man page and exit. | |
284 | ||
880c3eb2 | 285 | =item B<-d>, B<--debug> |
2832c235 TH |
286 | |
287 | Output debugging information to STDOUT while processing (number of | |
288 | postings per group). | |
289 | ||
880c3eb2 | 290 | =item B<-t>, B<--test> |
2832c235 | 291 | |
880c3eb2 TH |
292 | Do not write results to database. You should use B<--debug> in |
293 | conjunction with B<--test> ... everything else seems a bit pointless. | |
2832c235 | 294 | |
880c3eb2 | 295 | =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]> |
2832c235 | 296 | |
880c3eb2 TH |
297 | Set processing period to a single month in YYYY-MM format or to a time |
298 | period between two months in YYYY-MM:YYYY-MM format (two months, separated | |
dfc2b81c | 299 | by a colon). |
2832c235 | 300 | |
880c3eb2 | 301 | =item B<-s>, B<--stats> I<type> |
2832c235 TH |
302 | |
303 | Set processing type to one of I<all> and I<groups>. Defaults to all | |
304 | (and is currently rather pointless as only I<groups> has been | |
305 | implemented). | |
306 | ||
93c8eae2 TH |
307 | =item B<-c>, B<--checkgroups> I<filename template> |
308 | ||
309 | Check each group against a list of valid newsgroups read from a file, | |
310 | one group on each line and ignoring everything after the first | |
311 | whitespace (so you can use a file in checkgroups format or (part of) | |
312 | your INN active file). | |
313 | ||
95d9fe2c TH |
314 | The filename is taken from I<filename template>, amended by each |
315 | B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>, | |
316 | so that | |
93c8eae2 TH |
317 | |
318 | gatherstats -m 2010-01:2010-12 -c checkgroups | |
ad609792 | 319 | |
93c8eae2 TH |
320 | will check against F<checkgroups-2010-01> for January 2010, against |
321 | F<checkgroups-2010-02> for February 2010 and so on. | |
ad609792 | 322 | |
93c8eae2 TH |
323 | Newsgroups not found in the checkgroups file will be dropped (and |
324 | logged to STDERR), and newsgroups found there but having no postings | |
ad609792 TH |
325 | will be added with a count of 0 (and logged to STDERR). |
326 | ||
880c3eb2 | 327 | =item B<--hierarchy> I<TLH> (newsgroup hierarchy) |
2832c235 TH |
328 | |
329 | Override I<TLH> from F<newsstats.conf>. | |
330 | ||
880c3eb2 | 331 | =item B<--rawdb> I<table> (raw data table) |
2832c235 TH |
332 | |
333 | Override I<DBTableRaw> from F<newsstats.conf>. | |
334 | ||
880c3eb2 | 335 | =item B<--groupsdb> I<table> (postings per group table) |
2832c235 TH |
336 | |
337 | Override I<DBTableGrps> from F<newsstats.conf>. | |
338 | ||
880c3eb2 | 339 | =item B<--clientsdb> I<table> (client data table) |
2832c235 TH |
340 | |
341 | Override I<DBTableClnts> from F<newsstats.conf>. | |
342 | ||
880c3eb2 | 343 | =item B<--hostsdb> I<table> (host data table) |
2832c235 TH |
344 | |
345 | Override I<DBTableHosts> from F<newsstats.conf>. | |
346 | ||
23ab67a0 TH |
347 | =item B<--conffile> I<filename> |
348 | ||
349 | Load configuration from I<filename> instead of F<newsstats.conf>. | |
350 | ||
2832c235 TH |
351 | =back |
352 | ||
353 | =head1 INSTALLATION | |
354 | ||
880c3eb2 | 355 | See L<doc/INSTALL>. |
2832c235 TH |
356 | |
357 | =head1 EXAMPLES | |
358 | ||
359 | Process all types of information for last month: | |
360 | ||
361 | gatherstats | |
362 | ||
363 | Do a dry run, showing results of processing: | |
364 | ||
880c3eb2 | 365 | gatherstats --debug --test |
2832c235 TH |
366 | |
367 | Process all types of information for January of 2010: | |
368 | ||
880c3eb2 | 369 | gatherstats --month 2010-01 |
2832c235 | 370 | |
ad609792 | 371 | Process only number of postings for the year of 2010, |
93c8eae2 | 372 | checking against checkgroups-*: |
2832c235 | 373 | |
93c8eae2 | 374 | gatherstats -m 2010-01:2010-12 -s groups -c checkgroups |
2832c235 TH |
375 | |
376 | =head1 FILES | |
377 | ||
378 | =over 4 | |
379 | ||
2ad99c20 | 380 | =item F<bin/gatherstats.pl> |
2832c235 TH |
381 | |
382 | The script itself. | |
383 | ||
2ad99c20 | 384 | =item F<lib/NewsStats.pm> |
2832c235 TH |
385 | |
386 | Library functions for the NewsStats package. | |
387 | ||
2ad99c20 | 388 | =item F<etc/newsstats.conf> |
2832c235 | 389 | |
880c3eb2 | 390 | Runtime configuration file. |
2832c235 TH |
391 | |
392 | =back | |
393 | ||
394 | =head1 BUGS | |
395 | ||
396 | Please report any bugs or feature requests to the author or use the | |
397 | bug tracker at L<http://bugs.th-h.de/>! | |
398 | ||
399 | =head1 SEE ALSO | |
400 | ||
401 | =over 2 | |
402 | ||
403 | =item - | |
404 | ||
880c3eb2 | 405 | L<doc/README> |
2832c235 TH |
406 | |
407 | =item - | |
408 | ||
880c3eb2 | 409 | L<doc/INSTALL> |
2832c235 TH |
410 | |
411 | =back | |
412 | ||
413 | This script is part of the B<NewsStats> package. | |
414 | ||
415 | =head1 AUTHOR | |
416 | ||
417 | Thomas Hochstein <thh@inter.net> | |
418 | ||
419 | =head1 COPYRIGHT AND LICENSE | |
420 | ||
28717921 | 421 | Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net> |
2832c235 TH |
422 | |
423 | This program is free software; you may redistribute it and/or modify it | |
424 | under the same terms as Perl itself. | |
425 | ||
426 | =cut |