5 # This script will get statistical data on newgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
24 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList);
27 use Getopt::Long qw(GetOptions);
28 Getopt::Long::config ('bundling');
30 ################################# Main program #################################
32 ### read commandline options
33 my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments,
34 $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth,
35 $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound,$OptConfFile);
36 GetOptions ('b|boundary=s' => \$OptBoundType,
37 'c|captions!' => \$OptCaptions,
38 'checkgroups=s' => \$OptCheckgroupsFile,
39 'comments!' => \$OptComments,
40 'filetemplate=s' => \$OptFileTemplate,
41 'f|format=s' => \$OptFormat,
42 'g|group-by=s' => \$OptGroupBy,
43 'groupsdb=s' => \$OptGroupsDB,
44 'l|lower=i' => \$LowBound,
45 'm|month=s' => \$OptMonth,
46 'n|newsgroups=s' => \$OptNewsgroups,
47 'o|order-by=s' => \$OptOrderBy,
48 'r|report=s' => \$OptReportType,
49 's|sums!' => \$OptSums,
50 'u|upper=i' => \$UppBound,
51 'conffile=s' => \$OptConfFile,
52 'h|help' => \&ShowPOD,
53 'V|version' => \&ShowVersion) or exit 1;
55 # $OptComments defaults to TRUE
56 $OptComments = 1 if (!defined($OptComments));
57 # force --nocomments when --filetemplate is used
58 $OptComments = 0 if ($OptFileTemplate);
61 if ($OptBoundType =~ /level/i) {
62 $OptBoundType = 'level';
63 } elsif ($OptBoundType =~ /av(era)?ge?/i) {
64 $OptBoundType = 'average';
65 } elsif ($OptBoundType =~ /sums?/i) {
66 $OptBoundType = 'sum';
68 $OptBoundType = 'default';
71 # parse $OptReportType
73 if ($OptReportType =~ /av(era)?ge?/i) {
74 $OptReportType = 'average';
75 } elsif ($OptReportType =~ /sums?/i) {
76 $OptReportType = 'sum';
78 $OptReportType = 'default';
81 # read list of newsgroups from --checkgroups
82 # into a hash reference
83 my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile;
85 ### read configuration
86 my %Conf = %{ReadConfig($OptConfFile)};
88 ### override configuration via commandline options
90 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
91 &OverrideConfig(\%Conf,\%ConfOverride);
94 my $DBHandle = InitDB(\%Conf,1);
96 ### get time period and newsgroups, prepare SQL 'WHERE' clause
98 # and set caption for output and expression for SQL 'WHERE' clause
99 my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth);
100 # bail out if --month is invalid
101 &Bleat(2,"--month option has an invalid format - ".
102 "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod;
103 # get list of newsgroups and set expression for SQL 'WHERE' clause
104 # with placeholders as well as a list of newsgroup to bind to them
105 my ($SQLWhereNewsgroups,@SQLBindNewsgroups);
106 if ($OptNewsgroups) {
107 ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups);
108 # bail out if --newsgroups is invalid
109 &Bleat(2,"--newsgroups option has an invalid format!")
110 if !$SQLWhereNewsgroups;
113 ### build SQL WHERE clause (and HAVING clause, if needed)
114 my ($SQLWhereClause,$SQLHavingClause);
115 # $OptBoundType 'level'
116 if ($OptBoundType and $OptBoundType ne 'default') {
117 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
118 $SQLWhereNewsgroups,&SQLHierarchies($OptSums));
119 $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType,
120 $LowBound,$UppBound));
121 # $OptBoundType 'threshold' / 'default' or none
123 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
124 $SQLWhereNewsgroups,&SQLHierarchies($OptSums),
125 &SQLSetBounds('default',$LowBound,$UppBound));
128 ### get sort order and build SQL 'ORDER BY' clause
129 # default to 'newsgroup' for $OptBoundType 'level' or 'average'
130 $OptGroupBy = 'newsgroup' if (!$OptGroupBy and
131 $OptBoundType and $OptBoundType ne 'default');
132 # force to 'month' for $OptReportType 'average' or 'sum'
133 $OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default');
134 # parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause
135 my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy);
136 # $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy)
137 # set it to 'month' or 'key' for OutputData()
138 $GroupBy = ($GroupBy eq 'month') ? 'month' : 'key';
140 ### get report type and build SQL 'SELECT' query
142 my $SQLGroupClause = '';
143 my $Precision = 0; # number of digits right of decimal point for output
144 if ($OptReportType and $OptReportType ne 'default') {
145 $SQLGroupClause = 'GROUP BY newsgroup';
146 # change $SQLOrderClause: replace everything before 'postings'
147 $SQLOrderClause =~ s/BY.+postings/BY postings/;
148 if ($OptReportType eq 'average') {
149 $SQLSelect = "'All months',newsgroup,AVG(postings)";
151 # change $SQLOrderClause: replace 'postings' with 'AVG(postings)'
152 $SQLOrderClause =~ s/postings/AVG(postings)/;
153 } elsif ($OptReportType eq 'sum') {
154 $SQLSelect = "'All months',newsgroup,SUM(postings)";
155 # change $SQLOrderClause: replace 'postings' with 'SUM(postings)'
156 $SQLOrderClause =~ s/postings/SUM(postings)/;
159 $SQLSelect = 'month,newsgroup,postings';
162 ### get length of longest newsgroup name delivered by query
163 ### for formatting purposes
164 my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
165 my ($MaxLength,$MaxValLength) = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},
166 $Field,'postings',$SQLWhereClause,
170 ### build and execute SQL query
172 # special query preparation for $OptBoundType 'level', 'average' or 'sums'
173 if ($OptBoundType and $OptBoundType ne 'default') {
174 # prepare and execute first query:
175 # get list of newsgroups meeting level conditions
176 $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '.
177 'GROUP BY newsgroup %s',
178 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
179 $SQLWhereClause,$SQLHavingClause));
180 $DBQuery->execute(@SQLBindNewsgroups)
181 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
182 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
184 # add newsgroups to a comma-seperated list ready for IN(...) query
186 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
187 $GroupList .= ',' if $GroupList;
188 $GroupList .= "'$Newsgroup'";
190 # enhance $WhereClause
192 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,
193 sprintf('newsgroup IN (%s)',$GroupList));
195 # condition cannot be satisfied;
196 # force query to fail by adding '0=1'
197 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1');
202 $DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s',
204 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
205 $SQLWhereClause,$SQLGroupClause,
209 $DBQuery->execute(@SQLBindNewsgroups)
210 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
211 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
215 # set default to 'pretty'
216 $OptFormat = 'pretty' if !$OptFormat;
217 # print captions if --caption is set
218 if ($OptCaptions && $OptComments) {
219 # print time period with report type
220 my $CaptionReportType= '(number of postings for each month)';
221 if ($OptReportType and $OptReportType ne 'default') {
222 $CaptionReportType= '(average number of postings for each month)'
223 if $OptReportType eq 'average';
224 $CaptionReportType= '(number of all postings for that time period)'
225 if $OptReportType eq 'sum';
227 printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType);
228 # print newsgroup list if --newsgroups is set
229 printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups)))
231 # print boundaries, if set
232 my $CaptionBoundary= '(counting only month fulfilling this condition)';
233 if ($OptBoundType and $OptBoundType ne 'default') {
234 $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level';
235 $CaptionBoundary= '(on average)' if $OptBoundType eq 'average';
236 $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum';
238 printf("# ----- Threshold: %s %s x %s %s %s\n",
239 $LowBound ? $LowBound : '',$LowBound ? '=>' : '',
240 $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary)
241 if ($LowBound or $UppBound);
242 # print primary and secondary sort order
243 printf("# ----- Grouped by %s (%s), sorted %s%s\n",
244 ($GroupBy eq 'month') ? 'Months' : 'Newsgroups',
245 ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending',
246 ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '',
247 ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending');
251 &OutputData($OptFormat,$OptComments,$GroupBy,$Precision,
252 $OptCheckgroupsFile ? $ValidGroups : '',
253 $OptFileTemplate,$DBQuery,$MaxLength,$MaxValLength);
256 $DBHandle->disconnect;
260 ################################ Documentation #################################
264 groupstats - create reports on newsgroup usage
268 B<groupstats> [B<-Vhcs> B<--comments>] [B<-m> I<YYYY-MM>[:I<YYYY-MM>] | I<all>] [B<-n> I<newsgroup(s)>] [B<--checkgroups> I<checkgroups file>] [B<-r> I<report type>] [B<-l> I<lower boundary>] [B<-u> I<upper boundary>] [B<-b> I<boundary type>] [B<-g> I<group by>] [B<-o> I<order by>] [B<-f> I<output format>] [B<--filetemplate> I<filename template>] [B<--groupsdb> I<database table>] [--conffile I<filename>]
276 This script create reports on newsgroup usage (number of postings per
277 group per month) taken from result tables created by
280 =head2 Features and options
282 =head3 Time period and newsgroups
284 The time period to act on defaults to last month; you can assign another
285 time period or a single month (or drop all time constraints) via the
286 B<--month> option (see below).
288 B<groupstats> will process all newsgroups by default; you can limit
289 processing to only some newsgroups by supplying a list of those groups via
290 B<--newsgroups> option (see below). You can include hierarchy levels in
291 the output by adding the B<--sums> switch (see below). Optionally
292 newsgroups not present in a checkgroups file can be excluded from output,
293 sse B<--checkgroups> below.
297 You can choose between different B<--report> types: postings per month,
298 average postings per month or all postings summed up; for details, see
301 =head3 Upper and lower boundaries
303 Furthermore you can set an upper and/or lower boundary to exclude some
304 results from output via the B<--lower> and B<--upper> options,
305 respectively. By default, all newsgroups with more and/or less postings
306 per month will be excluded from the result set (i.e. not shown and not
307 considered for average and sum reports). You can change the meaning of
308 those boundaries with the B<--boundary> option. For details, please see
311 =head3 Sorting and formatting the output
313 By default, all results are grouped by month; you can group results by
314 newsgroup instead via the B<--groupy-by> option. Within those groups, the
315 list of newsgroups (or months) is sorted alphabetically (or
316 chronologically, respectively) ascending. You can change that order (and
317 sort by number of postings) with the B<--order-by> option. For details and
318 exceptions, please see below.
320 The results will be formatted as a kind of table; you can change the
321 output format to a simple list or just a list of newsgroups and number of
322 postings with the B<--format> option. Captions will be added by means of
323 the B<--caption> option; all comments (and captions) can be supressed by
324 using B<--nocomments>.
326 Last but not least you can redirect all output to a number of files, e.g.
327 one for each month, by submitting the B<--filetemplate> option, see below.
328 Captions and comments are automatically disabled in this case.
332 B<groupstats> will read its configuration from F<newsstats.conf>
333 which should be present in the same directory via Config::Auto.
335 See doc/INSTALL for an overview of possible configuration options.
337 You can override some configuration options via the B<--groupsdb> option.
343 =item B<-V>, B<--version>
345 Print out version and copyright information and exit.
347 =item B<-h>, B<--help>
349 Print this man page and exit.
351 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]|all>
353 Set processing period to a single month in YYYY-MM format or to a time
354 period between two month in YYYY-MM:YYYY-MM format (two month, separated
355 by a colon). By using the keyword I<all> instead, you can set no
356 processing period to process the whole database.
358 =item B<-n>, B<--newsgroups> I<newsgroup(s)>
360 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
361 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
362 (de.alt.*) or a list of either of these, separated by colons, for
365 de.test:de.alt.test:de.newusers.*
367 =item B<-s>, B<--sums|--nosums> (sum per hierarchy level)
369 Include "virtual" groups for every hierarchy level in output, for
376 See the B<gatherstats> man page for details.
378 =item B<--checkgroups> I<filename>
380 Restrict output to those newgroups present in a file in checkgroups format
381 (one newgroup name per line; everything after the first whitespace on each
382 line is ignored). All other newsgroups will be removed from output.
384 Contrary to B<gatherstats>, I<filename> is not a template, but refers to
385 a single file in checkgroups format.
387 =item B<-r>, B<--report> I<default|average|sums>
389 Choose the report type: I<default>, I<average> or I<sums>
391 By default, B<groupstats> will report the number of postings for each
392 newsgroup in each month. But it can also report the average number of
393 postings per group for all months or the total sum of postings per group
396 For report types I<average> and I<sums>, the B<group-by> option has no
397 meaning and will be silently ignored (see below).
399 =item B<-l>, B<--lower> I<lower boundary>
401 Set the lower boundary. See B<--boundary> below.
403 =item B<-l>, B<--upper> I<upper boundary>
405 Set the upper boundary. See B<--boundary> below.
407 =item B<-b>, B<--boundary> I<boundary type>
409 Set the boundary type to one of I<default>, I<level>, I<average> or
412 By default, all newsgroups with more postings per month than the upper
413 boundary and/or less postings per month than the lower boundary will be
414 excluded from further processing. For the default report that means each
415 month only newsgroups with a number of postings between the boundaries
416 will be displayed. For the other report types, newsgroups with a number of
417 postings exceeding the boundaries in all (!) months will not be
420 For example, lets take a list of newsgroups like this:
423 de.comp.datenbanken.misc 6
424 de.comp.datenbanken.ms-access 84
425 de.comp.datenbanken.mysql 88
427 de.comp.datenbanken.misc 8
428 de.comp.datenbanken.ms-access 126
429 de.comp.datenbanken.mysql 21
431 de.comp.datenbanken.misc 24
432 de.comp.datenbanken.ms-access 83
433 de.comp.datenbanken.mysql 36
435 With C<groupstats --month 2012-01:2012-03 --lower 25 --report sums>,
436 you'll get the following result:
439 de.comp.datenbanken.ms-access 293
440 de.comp.datenbanken.mysql 124
442 de.comp.datenbanken.misc has not been considered even though it has 38
443 postings in total, because it has less than 25 postings in every single
444 month. If you want to list all newsgroups with more than 25 postings
445 I<in total>, you'll have to set the boundary type to I<sum>, see below.
447 A boundary type of I<level> will show only those newsgroups - at all -
448 that satisfy the boundaries in each and every single month. With the above
449 list of newsgroups and
450 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary level --report sums>,
451 you'll get this result:
454 de.comp.datenbanken.ms-access 293
456 de.comp.datenbanken.mysql has not been considered because it had less than
457 25 postings in 2012-02 (only).
459 You can use that to get a list of newsgroups that have more (or less) then
460 x postings in every month during the whole reporting period.
462 A boundary type of I<average> will show only those newsgroups - at all -that
463 satisfy the boundaries on average. With the above list of newsgroups and
464 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary avg --report sums>,
465 you'll get this result:
468 de.comp.datenbanken.ms-access 293
469 de.comp.datenbanken.mysql 145
471 The average number of postings in the three groups is:
473 de.comp.datenbanken.misc 12.67
474 de.comp.datenbanken.ms-access 97.67
475 de.comp.datenbanken.mysql 48.33
477 Last but not least, a boundary type of I<sums> will show only those
478 newsgroups - at all - that satisfy the boundaries with the total sum of
479 all postings during the reporting period. With the above list of
481 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary sum --report sums>,
482 you'll finally get this result:
485 de.comp.datenbanken.misc 38
486 de.comp.datenbanken.ms-access 293
487 de.comp.datenbanken.mysql 145
490 =item B<-g>, B<--group-by> I<month[-desc]|newsgroups[-desc]>
492 By default, all results are grouped by month, sorted chronologically in
493 ascending order, like this:
496 de.comp.datenbanken.ms-access 84
497 de.comp.datenbanken.mysql 88
499 de.comp.datenbanken.ms-access 126
500 de.comp.datenbanken.mysql 21
502 The results can be grouped by newsgroups instead via
503 B<--group-by> I<newsgroup>:
505 ----- de.comp.datenbanken.ms-access:
508 ----- de.comp.datenbanken.mysql:
512 By appending I<-desc> to the group-by option parameter, you can reverse
513 the sort order - e.g. B<--group-by> I<month-desc> will give:
516 de.comp.datenbanken.ms-access 126
517 de.comp.datenbanken.mysql 21
519 de.comp.datenbanken.ms-access 84
520 de.comp.datenbanken.mysql 88
522 Average and sums reports (see above) will always be grouped by months;
523 this option will therefore be ignored.
525 =item B<-o>, B<--order-by> I<default[-desc]|postings[-desc]>
527 Within each group (a single month or single newsgroup, see above), the
528 report will be sorted by newsgroup names in ascending alphabetical order
529 by default. You can change the sort order to descending or sort by number
532 =item B<-f>, B<--format> I<pretty|list|dump>
534 Select the output format, I<pretty> being the default:
537 de.comp.datenbanken.ms-access 84
538 de.comp.datenbanken.mysql 88
540 de.comp.datenbanken.ms-access 126
541 de.comp.datenbanken.mysql 21
543 I<list> format looks like this:
545 2012-01 de.comp.datenbanken.ms-access 84
546 2012-01 de.comp.datenbanken.mysql 88
547 2012-02 de.comp.datenbanken.ms-access 126
548 2012-02 de.comp.datenbanken.mysql 21
550 And I<dump> format looks like this:
553 de.comp.datenbanken.ms-access 84
554 de.comp.datenbanken.mysql 88
556 de.comp.datenbanken.ms-access 126
557 de.comp.datenbanken.mysql 21
559 You can remove the comments by using B<--nocomments>, see below.
561 =item B<-c>, B<--captions|--nocaptions>
563 Add captions to output, like this:
565 ----- Report for 2012-01 to 2012-02 (number of postings for each month)
566 ----- Newsgroups: de.comp.datenbanken.*
567 ----- Threshold: 10 => x <= 20 (on average)
568 ----- Grouped by Newsgroups (ascending), sorted by number of postings descending
572 =item B<--comments|--nocomments>
574 Add comments (group headers) to I<dump> and I<pretty> output. True by default.
576 Use I<--nocomments> to suppress anything except newsgroup names/months and
577 numbers of postings. This is enforced when using B<--filetemplate>, see below.
579 =item B<--filetemplate> I<filename template>
581 Save output to file(s) instead of dumping it to STDOUT. B<groupstats> will
582 create one file for each month (or each newsgroup, accordant to the
583 setting of B<--group-by>, see above), with filenames composed by adding
584 year and month (or newsgroup names) to the I<filename template>, for
585 example with B<--filetemplate> I<stats>:
591 B<--nocomments> is enforced, see above.
593 =item B<--groupsdb> I<database table>
595 Override I<DBTableGrps> from F<newsstats.conf>.
597 =item B<--conffile> I<filename>
599 Load configuration from I<filename> instead of F<newsstats.conf>.
609 Show number of postings per group for lasth month in I<pretty> format:
613 Show that report for January of 2010 and de.alt.* plus de.test,
614 including display of hierarchy levels:
616 groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums
618 Only show newsgroups with 30 postings or less last month, ordered
619 by number of postings, descending, in I<pretty> format:
621 groupstats --upper 30 --order-by postings-desc
623 Show the total of all postings for the year of 2010 for all groups that
624 had 30 postings or less in every single month in that year, ordered by
625 number of postings in descending order:
627 groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc
629 The same for the average number of postings in the year of 2010:
631 groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc
633 List number of postings per group for eacht month of 2010 and redirect
634 output to one file for each month, namend stats-2010-01 and so on, in
635 machine-readable form (without formatting):
637 groupstats -m 2010-01:2010-12 -f dump --filetemplate stats
644 =item F<bin/groupstats.pl>
648 =item F<lib/NewsStats.pm>
650 Library functions for the NewsStats package.
652 =item F<etc/newsstats.conf>
654 Runtime configuration file.
660 Please report any bugs or feature requests to the author or use the
661 bug tracker at L<http://bugs.th-h.de/>!
681 This script is part of the B<NewsStats> package.
685 Thomas Hochstein <thh@inter.net>
687 =head1 COPYRIGHT AND LICENSE
689 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
691 This program is free software; you may redistribute it and/or modify it
692 under the same terms as Perl itself.