5 # This script will get statistical data on newgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2013 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 # we're in .../bin, so our module is in ../lib
19 push(@INC, dirname($0).'/../lib');
24 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList);
27 use Getopt::Long qw(GetOptions);
28 Getopt::Long::config ('bundling');
30 ################################# Main program #################################
32 ### read commandline options
33 my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments,
34 $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth,
35 $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound);
36 GetOptions ('b|boundary=s' => \$OptBoundType,
37 'c|captions!' => \$OptCaptions,
38 'checkgroups=s' => \$OptCheckgroupsFile,
39 'comments!' => \$OptComments,
40 'filetemplate=s' => \$OptFileTemplate,
41 'f|format=s' => \$OptFormat,
42 'g|group-by=s' => \$OptGroupBy,
43 'groupsdb=s' => \$OptGroupsDB,
44 'l|lower=i' => \$LowBound,
45 'm|month=s' => \$OptMonth,
46 'n|newsgroups=s' => \$OptNewsgroups,
47 'o|order-by=s' => \$OptOrderBy,
48 'r|report=s' => \$OptReportType,
49 's|sums!' => \$OptSums,
50 'u|upper=i' => \$UppBound,
51 'h|help' => \&ShowPOD,
52 'V|version' => \&ShowVersion) or exit 1;
54 # $OptComments defaults to TRUE
55 $OptComments = 1 if (!defined($OptComments));
56 # force --nocomments when --filetemplate is used
57 $OptComments = 0 if ($OptFileTemplate);
60 if ($OptBoundType =~ /level/i) {
61 $OptBoundType = 'level';
62 } elsif ($OptBoundType =~ /av(era)?ge?/i) {
63 $OptBoundType = 'average';
64 } elsif ($OptBoundType =~ /sums?/i) {
65 $OptBoundType = 'sum';
67 $OptBoundType = 'default';
70 # parse $OptReportType
72 if ($OptReportType =~ /av(era)?ge?/i) {
73 $OptReportType = 'average';
74 } elsif ($OptReportType =~ /sums?/i) {
75 $OptReportType = 'sum';
77 $OptReportType = 'default';
80 # read list of newsgroups from --checkgroups
81 # into a hash reference
82 my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile;
84 ### read configuration
85 my %Conf = %{ReadConfig('')};
87 ### override configuration via commandline options
89 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
90 &OverrideConfig(\%Conf,\%ConfOverride);
93 my $DBHandle = InitDB(\%Conf,1);
95 ### get time period and newsgroups, prepare SQL 'WHERE' clause
97 # and set caption for output and expression for SQL 'WHERE' clause
98 my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth);
99 # bail out if --month is invalid
100 &Bleat(2,"--month option has an invalid format - ".
101 "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod;
102 # get list of newsgroups and set expression for SQL 'WHERE' clause
103 # with placeholders as well as a list of newsgroup to bind to them
104 my ($SQLWhereNewsgroups,@SQLBindNewsgroups);
105 if ($OptNewsgroups) {
106 ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups);
107 # bail out if --newsgroups is invalid
108 &Bleat(2,"--newsgroups option has an invalid format!")
109 if !$SQLWhereNewsgroups;
112 ### build SQL WHERE clause (and HAVING clause, if needed)
113 my ($SQLWhereClause,$SQLHavingClause);
114 # $OptBoundType 'level'
115 if ($OptBoundType and $OptBoundType ne 'default') {
116 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
117 $SQLWhereNewsgroups,&SQLHierarchies($OptSums));
118 $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType,
119 $LowBound,$UppBound));
120 # $OptBoundType 'threshold' / 'default' or none
122 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
123 $SQLWhereNewsgroups,&SQLHierarchies($OptSums),
124 &SQLSetBounds('default',$LowBound,$UppBound));
127 ### get sort order and build SQL 'ORDER BY' clause
128 # default to 'newsgroup' for $OptBoundType 'level' or 'average'
129 $OptGroupBy = 'newsgroup' if (!$OptGroupBy and
130 $OptBoundType and $OptBoundType ne 'default');
131 # force to 'month' for $OptReportType 'average' or 'sum'
132 $OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default');
133 # parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause
134 my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy);
135 # $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy)
136 # set it to 'month' or 'key' for OutputData()
137 $GroupBy = ($GroupBy eq 'month') ? 'month' : 'key';
139 ### get report type and build SQL 'SELECT' query
141 my $SQLGroupClause = '';
142 my $Precision = 0; # number of digits right of decimal point for output
143 if ($OptReportType and $OptReportType ne 'default') {
144 $SQLGroupClause = 'GROUP BY newsgroup';
145 # change $SQLOrderClause: replace everything before 'postings'
146 $SQLOrderClause =~ s/BY.+postings/BY postings/;
147 if ($OptReportType eq 'average') {
148 $SQLSelect = "'All months',newsgroup,AVG(postings)";
150 # change $SQLOrderClause: replace 'postings' with 'AVG(postings)'
151 $SQLOrderClause =~ s/postings/AVG(postings)/;
152 } elsif ($OptReportType eq 'sum') {
153 $SQLSelect = "'All months',newsgroup,SUM(postings)";
154 # change $SQLOrderClause: replace 'postings' with 'SUM(postings)'
155 $SQLOrderClause =~ s/postings/SUM(postings)/;
158 $SQLSelect = 'month,newsgroup,postings';
161 ### get length of longest newsgroup name delivered by query
162 ### for formatting purposes
163 my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
164 my ($MaxLength,$MaxValLength) = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},
165 $Field,'postings',$SQLWhereClause,
169 ### build and execute SQL query
171 # special query preparation for $OptBoundType 'level', 'average' or 'sums'
172 if ($OptBoundType and $OptBoundType ne 'default') {
173 # prepare and execute first query:
174 # get list of newsgroups meeting level conditions
175 $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '.
176 'GROUP BY newsgroup %s',
177 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
178 $SQLWhereClause,$SQLHavingClause));
179 $DBQuery->execute(@SQLBindNewsgroups)
180 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
181 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
183 # add newsgroups to a comma-seperated list ready for IN(...) query
185 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
186 $GroupList .= ',' if $GroupList;
187 $GroupList .= "'$Newsgroup'";
189 # enhance $WhereClause
191 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,
192 sprintf('newsgroup IN (%s)',$GroupList));
194 # condition cannot be satisfied;
195 # force query to fail by adding '0=1'
196 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1');
201 $DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s',
203 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
204 $SQLWhereClause,$SQLGroupClause,
208 $DBQuery->execute(@SQLBindNewsgroups)
209 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
210 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
214 # set default to 'pretty'
215 $OptFormat = 'pretty' if !$OptFormat;
216 # print captions if --caption is set
217 if ($OptCaptions && $OptComments) {
218 # print time period with report type
219 my $CaptionReportType= '(number of postings for each month)';
220 if ($OptReportType and $OptReportType ne 'default') {
221 $CaptionReportType= '(average number of postings for each month)'
222 if $OptReportType eq 'average';
223 $CaptionReportType= '(number of all postings for that time period)'
224 if $OptReportType eq 'sum';
226 printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType);
227 # print newsgroup list if --newsgroups is set
228 printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups)))
230 # print boundaries, if set
231 my $CaptionBoundary= '(counting only month fulfilling this condition)';
232 if ($OptBoundType and $OptBoundType ne 'default') {
233 $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level';
234 $CaptionBoundary= '(on average)' if $OptBoundType eq 'average';
235 $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum';
237 printf("# ----- Threshold: %s %s x %s %s %s\n",
238 $LowBound ? $LowBound : '',$LowBound ? '=>' : '',
239 $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary)
240 if ($LowBound or $UppBound);
241 # print primary and secondary sort order
242 printf("# ----- Grouped by %s (%s), sorted %s%s\n",
243 ($GroupBy eq 'month') ? 'Months' : 'Newsgroups',
244 ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending',
245 ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '',
246 ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending');
250 &OutputData($OptFormat,$OptComments,$GroupBy,$Precision,
251 $OptCheckgroupsFile ? $ValidGroups : '',
252 $OptFileTemplate,$DBQuery,$MaxLength,$MaxValLength);
255 $DBHandle->disconnect;
259 ################################ Documentation #################################
263 groupstats - create reports on newsgroup usage
267 B<groupstats> [B<-Vhcs> B<--comments>] [B<-m> I<YYYY-MM>[:I<YYYY-MM>] | I<all>] [B<-n> I<newsgroup(s)>] [B<--checkgroups> I<checkgroups file>] [B<-r> I<report type>] [B<-l> I<lower boundary>] [B<-u> I<upper boundary>] [B<-b> I<boundary type>] [B<-g> I<group by>] [B<-o> I<order by>] [B<-f> I<output format>] [B<--filetemplate> I<filename template>] [B<--groupsdb> I<database table>]
275 This script create reports on newsgroup usage (number of postings per
276 group per month) taken from result tables created by
279 =head2 Features and options
281 =head3 Time period and newsgroups
283 The time period to act on defaults to last month; you can assign another
284 time period or a single month (or drop all time constraints) via the
285 B<--month> option (see below).
287 B<groupstats> will process all newsgroups by default; you can limit
288 processing to only some newsgroups by supplying a list of those groups via
289 B<--newsgroups> option (see below). You can include hierarchy levels in
290 the output by adding the B<--sums> switch (see below). Optionally
291 newsgroups not present in a checkgroups file can be excluded from output,
292 sse B<--checkgroups> below.
296 You can choose between different B<--report> types: postings per month,
297 average postings per month or all postings summed up; for details, see
300 =head3 Upper and lower boundaries
302 Furthermore you can set an upper and/or lower boundary to exclude some
303 results from output via the B<--lower> and B<--upper> options,
304 respectively. By default, all newsgroups with more and/or less postings
305 per month will be excluded from the result set (i.e. not shown and not
306 considered for average and sum reports). You can change the meaning of
307 those boundaries with the B<--boundary> option. For details, please see
310 =head3 Sorting and formatting the output
312 By default, all results are grouped by month; you can group results by
313 newsgroup instead via the B<--groupy-by> option. Within those groups, the
314 list of newsgroups (or months) is sorted alphabetically (or
315 chronologically, respectively) ascending. You can change that order (and
316 sort by number of postings) with the B<--order-by> option. For details and
317 exceptions, please see below.
319 The results will be formatted as a kind of table; you can change the
320 output format to a simple list or just a list of newsgroups and number of
321 postings with the B<--format> option. Captions will be added by means of
322 the B<--caption> option; all comments (and captions) can be supressed by
323 using B<--nocomments>.
325 Last but not least you can redirect all output to a number of files, e.g.
326 one for each month, by submitting the B<--filetemplate> option, see below.
327 Captions and comments are automatically disabled in this case.
331 B<groupstats> will read its configuration from F<newsstats.conf>
332 which should be present in the same directory via Config::Auto.
334 See doc/INSTALL for an overview of possible configuration options.
336 You can override some configuration options via the B<--groupsdb> option.
342 =item B<-V>, B<--version>
344 Print out version and copyright information and exit.
346 =item B<-h>, B<--help>
348 Print this man page and exit.
350 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]|all>
352 Set processing period to a single month in YYYY-MM format or to a time
353 period between two month in YYYY-MM:YYYY-MM format (two month, separated
354 by a colon). By using the keyword I<all> instead, you can set no
355 processing period to process the whole database.
357 =item B<-n>, B<--newsgroups> I<newsgroup(s)>
359 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
360 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
361 (de.alt.*) or a list of either of these, separated by colons, for
364 de.test:de.alt.test:de.newusers.*
366 =item B<-s>, B<--sums|--nosums> (sum per hierarchy level)
368 Include "virtual" groups for every hierarchy level in output, for
375 See the B<gatherstats> man page for details.
377 =item B<--checkgroups> I<filename>
379 Restrict output to those newgroups present in a file in checkgroups format
380 (one newgroup name per line; everything after the first whitespace on each
381 line is ignored). All other newsgroups will be removed from output.
383 Contrary to B<gatherstats>, I<filename> is not a template, but refers to
384 a single file in checkgroups format.
386 =item B<-r>, B<--report> I<default|average|sums>
388 Choose the report type: I<default>, I<average> or I<sums>
390 By default, B<groupstats> will report the number of postings for each
391 newsgroup in each month. But it can also report the average number of
392 postings per group for all months or the total sum of postings per group
395 For report types I<average> and I<sums>, the B<group-by> option has no
396 meaning and will be silently ignored (see below).
398 =item B<-l>, B<--lower> I<lower boundary>
400 Set the lower boundary. See B<--boundary> below.
402 =item B<-l>, B<--upper> I<upper boundary>
404 Set the upper boundary. See B<--boundary> below.
406 =item B<-b>, B<--boundary> I<boundary type>
408 Set the boundary type to one of I<default>, I<level>, I<average> or
411 By default, all newsgroups with more postings per month than the upper
412 boundary and/or less postings per month than the lower boundary will be
413 excluded from further processing. For the default report that means each
414 month only newsgroups with a number of postings between the boundaries
415 will be displayed. For the other report types, newsgroups with a number of
416 postings exceeding the boundaries in all (!) months will not be
419 For example, lets take a list of newsgroups like this:
422 de.comp.datenbanken.misc 6
423 de.comp.datenbanken.ms-access 84
424 de.comp.datenbanken.mysql 88
426 de.comp.datenbanken.misc 8
427 de.comp.datenbanken.ms-access 126
428 de.comp.datenbanken.mysql 21
430 de.comp.datenbanken.misc 24
431 de.comp.datenbanken.ms-access 83
432 de.comp.datenbanken.mysql 36
434 With C<groupstats --month 2012-01:2012-03 --lower 25 --report sums>,
435 you'll get the following result:
438 de.comp.datenbanken.ms-access 293
439 de.comp.datenbanken.mysql 124
441 de.comp.datenbanken.misc has not been considered even though it has 38
442 postings in total, because it has less than 25 postings in every single
443 month. If you want to list all newsgroups with more than 25 postings
444 I<in total>, you'll have to set the boundary type to I<sum>, see below.
446 A boundary type of I<level> will show only those newsgroups - at all -
447 that satisfy the boundaries in each and every single month. With the above
448 list of newsgroups and
449 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary level --report sums>,
450 you'll get this result:
453 de.comp.datenbanken.ms-access 293
455 de.comp.datenbanken.mysql has not been considered because it had less than
456 25 postings in 2012-02 (only).
458 You can use that to get a list of newsgroups that have more (or less) then
459 x postings in every month during the whole reporting period.
461 A boundary type of I<average> will show only those newsgroups - at all -that
462 satisfy the boundaries on average. With the above list of newsgroups and
463 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary avg --report sums>,
464 you'll get this result:
467 de.comp.datenbanken.ms-access 293
468 de.comp.datenbanken.mysql 145
470 The average number of postings in the three groups is:
472 de.comp.datenbanken.misc 12.67
473 de.comp.datenbanken.ms-access 97.67
474 de.comp.datenbanken.mysql 48.33
476 Last but not least, a boundary type of I<sums> will show only those
477 newsgroups - at all - that satisfy the boundaries with the total sum of
478 all postings during the reporting period. With the above list of
480 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary sum --report sums>,
481 you'll finally get this result:
484 de.comp.datenbanken.misc 38
485 de.comp.datenbanken.ms-access 293
486 de.comp.datenbanken.mysql 145
489 =item B<-g>, B<--group-by> I<month[-desc]|newsgroups[-desc]>
491 By default, all results are grouped by month, sorted chronologically in
492 ascending order, like this:
495 de.comp.datenbanken.ms-access 84
496 de.comp.datenbanken.mysql 88
498 de.comp.datenbanken.ms-access 126
499 de.comp.datenbanken.mysql 21
501 The results can be grouped by newsgroups instead via
502 B<--group-by> I<newsgroup>:
504 ----- de.comp.datenbanken.ms-access:
507 ----- de.comp.datenbanken.mysql:
511 By appending I<-desc> to the group-by option parameter, you can reverse
512 the sort order - e.g. B<--group-by> I<month-desc> will give:
515 de.comp.datenbanken.ms-access 126
516 de.comp.datenbanken.mysql 21
518 de.comp.datenbanken.ms-access 84
519 de.comp.datenbanken.mysql 88
521 Average and sums reports (see above) will always be grouped by months;
522 this option will therefore be ignored.
524 =item B<-o>, B<--order-by> I<default[-desc]|postings[-desc]>
526 Within each group (a single month or single newsgroup, see above), the
527 report will be sorted by newsgroup names in ascending alphabetical order
528 by default. You can change the sort order to descending or sort by number
531 =item B<-f>, B<--format> I<pretty|list|dump>
533 Select the output format, I<pretty> being the default:
536 de.comp.datenbanken.ms-access 84
537 de.comp.datenbanken.mysql 88
539 de.comp.datenbanken.ms-access 126
540 de.comp.datenbanken.mysql 21
542 I<list> format looks like this:
544 2012-01 de.comp.datenbanken.ms-access 84
545 2012-01 de.comp.datenbanken.mysql 88
546 2012-02 de.comp.datenbanken.ms-access 126
547 2012-02 de.comp.datenbanken.mysql 21
549 And I<dump> format looks like this:
552 de.comp.datenbanken.ms-access 84
553 de.comp.datenbanken.mysql 88
555 de.comp.datenbanken.ms-access 126
556 de.comp.datenbanken.mysql 21
558 You can remove the comments by using B<--nocomments>, see below.
560 =item B<-c>, B<--captions|--nocaptions>
562 Add captions to output, like this:
564 ----- Report for 2012-01 to 2012-02 (number of postings for each month)
565 ----- Newsgroups: de.comp.datenbanken.*
566 ----- Threshold: 10 => x <= 20 (on average)
567 ----- Grouped by Newsgroups (ascending), sorted by number of postings descending
571 =item B<--comments|--nocomments>
573 Add comments (group headers) to I<dump> and I<pretty> output. True by default.
575 Use I<--nocomments> to suppress anything except newsgroup names/months and
576 numbers of postings. This is enforced when using B<--filetemplate>, see below.
578 =item B<--filetemplate> I<filename template>
580 Save output to file(s) instead of dumping it to STDOUT. B<groupstats> will
581 create one file for each month (or each newsgroup, accordant to the
582 setting of B<--group-by>, see above), with filenames composed by adding
583 year and month (or newsgroup names) to the I<filename template>, for
584 example with B<--filetemplate> I<stats>:
590 B<--nocomments> is enforced, see above.
592 =item B<--groupsdb> I<database table>
594 Override I<DBTableGrps> from F<newsstats.conf>.
604 Show number of postings per group for lasth month in I<pretty> format:
608 Show that report for January of 2010 and de.alt.* plus de.test,
609 including display of hierarchy levels:
611 groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums
613 Only show newsgroups with 30 postings or less last month, ordered
614 by number of postings, descending, in I<pretty> format:
616 groupstats --upper 30 --order-by postings-desc
618 Show the total of all postings for the year of 2010 for all groups that
619 had 30 postings or less in every single month in that year, ordered by
620 number of postings in descending order:
622 groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc
624 The same for the average number of postings in the year of 2010:
626 groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc
628 List number of postings per group for eacht month of 2010 and redirect
629 output to one file for each month, namend stats-2010-01 and so on, in
630 machine-readable form (without formatting):
632 groupstats -m 2010-01:2010-12 -f dump --filetemplate stats
639 =item F<bin/groupstats.pl>
643 =item F<lib/NewsStats.pm>
645 Library functions for the NewsStats package.
647 =item F<etc/newsstats.conf>
649 Runtime configuration file.
655 Please report any bugs or feature requests to the author or use the
656 bug tracker at L<http://bugs.th-h.de/>!
676 This script is part of the B<NewsStats> package.
680 Thomas Hochstein <thh@inter.net>
682 =head1 COPYRIGHT AND LICENSE
684 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
686 This program is free software; you may redistribute it and/or modify it
687 under the same terms as Perl itself.