5 # This script will get statistical data on newgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList);
25 use Getopt::Long qw(GetOptions);
26 Getopt::Long::config ('bundling');
28 ################################# Main program #################################
30 ### read commandline options
31 my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments,
32 $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth,
33 $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound);
34 GetOptions ('b|boundary=s' => \$OptBoundType,
35 'c|captions!' => \$OptCaptions,
36 'checkgroups=s' => \$OptCheckgroupsFile,
37 'comments!' => \$OptComments,
38 'filetemplate=s' => \$OptFileTemplate,
39 'f|format=s' => \$OptFormat,
40 'g|group-by=s' => \$OptGroupBy,
41 'groupsdb=s' => \$OptGroupsDB,
42 'l|lower=i' => \$LowBound,
43 'm|month=s' => \$OptMonth,
44 'n|newsgroups=s' => \$OptNewsgroups,
45 'o|order-by=s' => \$OptOrderBy,
46 'r|report=s' => \$OptReportType,
47 's|sums!' => \$OptSums,
48 'u|upper=i' => \$UppBound,
49 'h|help' => \&ShowPOD,
50 'V|version' => \&ShowVersion) or exit 1;
52 # $OptComments defaults to TRUE
53 $OptComments = 1 if (!defined($OptComments));
54 # force --nocomments when --filetemplate is used
55 $OptComments = 0 if ($OptFileTemplate);
58 if ($OptBoundType =~ /level/i) {
59 $OptBoundType = 'level';
60 } elsif ($OptBoundType =~ /av(era)?ge?/i) {
61 $OptBoundType = 'average';
62 } elsif ($OptBoundType =~ /sums?/i) {
63 $OptBoundType = 'sum';
65 $OptBoundType = 'default';
68 # parse $OptReportType
70 if ($OptReportType =~ /av(era)?ge?/i) {
71 $OptReportType = 'average';
72 } elsif ($OptReportType =~ /sums?/i) {
73 $OptReportType = 'sum';
75 $OptReportType = 'default';
78 # read list of newsgroups from --checkgroups
79 # into a hash reference
80 my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile;
82 ### read configuration
83 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
85 ### override configuration via commandline options
87 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
88 &OverrideConfig(\%Conf,\%ConfOverride);
91 my $DBHandle = InitDB(\%Conf,1);
93 ### get time period and newsgroups, prepare SQL 'WHERE' clause
95 # and set caption for output and expression for SQL 'WHERE' clause
96 my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth);
97 # bail out if --month is invalid
98 &Bleat(2,"--month option has an invalid format - ".
99 "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod;
100 # get list of newsgroups and set expression for SQL 'WHERE' clause
101 # with placeholders as well as a list of newsgroup to bind to them
102 my ($SQLWhereNewsgroups,@SQLBindNewsgroups);
103 if ($OptNewsgroups) {
104 ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups);
105 # bail out if --newsgroups is invalid
106 &Bleat(2,"--newsgroups option has an invalid format!")
107 if !$SQLWhereNewsgroups;
110 ### build SQL WHERE clause (and HAVING clause, if needed)
111 my ($SQLWhereClause,$SQLHavingClause);
112 # $OptBoundType 'level'
113 if ($OptBoundType and $OptBoundType ne 'default') {
114 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
115 $SQLWhereNewsgroups,&SQLHierarchies($OptSums));
116 $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType,
117 $LowBound,$UppBound));
118 # $OptBoundType 'threshold' / 'default' or none
120 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
121 $SQLWhereNewsgroups,&SQLHierarchies($OptSums),
122 &SQLSetBounds('default',$LowBound,$UppBound));
125 ### get sort order and build SQL 'ORDER BY' clause
126 # default to 'newsgroup' for $OptBoundType 'level' or 'average'
127 $OptGroupBy = 'newsgroup' if (!$OptGroupBy and
128 $OptBoundType and $OptBoundType ne 'default');
129 # force to 'month' for $OptReportType 'average' or 'sum'
130 $OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default');
131 # parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause
132 my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy);
133 # $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy)
134 # set it to 'month' or 'key' for OutputData()
135 $GroupBy = ($GroupBy eq 'month') ? 'month' : 'key';
137 ### get report type and build SQL 'SELECT' query
139 my $SQLGroupClause = '';
140 my $Precision = 0; # number of digits right of decimal point for output
141 if ($OptReportType and $OptReportType ne 'default') {
142 $SQLGroupClause = 'GROUP BY newsgroup';
143 # change $SQLOrderClause: replace everything before 'postings'
144 $SQLOrderClause =~ s/BY.+postings/BY postings/;
145 if ($OptReportType eq 'average') {
146 $SQLSelect = "'All months',newsgroup,AVG(postings)";
148 # change $SQLOrderClause: replace 'postings' with 'AVG(postings)'
149 $SQLOrderClause =~ s/postings/AVG(postings)/;
150 } elsif ($OptReportType eq 'sum') {
151 $SQLSelect = "'All months',newsgroup,SUM(postings)";
152 # change $SQLOrderClause: replace 'postings' with 'SUM(postings)'
153 $SQLOrderClause =~ s/postings/SUM(postings)/;
156 $SQLSelect = 'month,newsgroup,postings';
159 ### get length of longest newsgroup name delivered by query
160 ### for formatting purposes
161 my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
162 my $MaxLength = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},
163 $Field,$SQLWhereClause,$SQLHavingClause,
166 ### build and execute SQL query
168 # special query preparation for $OptBoundType 'level', 'average' or 'sums'
169 if ($OptBoundType and $OptBoundType ne 'default') {
170 # prepare and execute first query:
171 # get list of newsgroups meeting level conditions
172 $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '.
173 'GROUP BY newsgroup %s',
174 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
175 $SQLWhereClause,$SQLHavingClause));
176 $DBQuery->execute(@SQLBindNewsgroups)
177 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
178 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
180 # add newsgroups to a comma-seperated list ready for IN(...) query
182 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
183 $GroupList .= ',' if $GroupList;
184 $GroupList .= "'$Newsgroup'";
186 # enhance $WhereClause
188 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,
189 sprintf('newsgroup IN (%s)',$GroupList));
191 # condition cannot be satisfied;
192 # force query to fail by adding '0=1'
193 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1');
198 $DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s',
200 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
201 $SQLWhereClause,$SQLGroupClause,
205 $DBQuery->execute(@SQLBindNewsgroups)
206 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
207 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
211 # set default to 'pretty'
212 $OptFormat = 'pretty' if !$OptFormat;
213 # print captions if --caption is set
214 if ($OptCaptions && $OptComments) {
215 # print time period with report type
216 my $CaptionReportType= '(number of postings for each month)';
217 if ($OptReportType and $OptReportType ne 'default') {
218 $CaptionReportType= '(average number of postings for each month)'
219 if $OptReportType eq 'average';
220 $CaptionReportType= '(number of all postings for that time period)'
221 if $OptReportType eq 'sum';
223 printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType);
224 # print newsgroup list if --newsgroups is set
225 printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups)))
227 # print boundaries, if set
228 my $CaptionBoundary= '(counting only month fulfilling this condition)';
229 if ($OptBoundType and $OptBoundType ne 'default') {
230 $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level';
231 $CaptionBoundary= '(on average)' if $OptBoundType eq 'average';
232 $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum';
234 printf("# ----- Threshold: %s %s x %s %s %s\n",
235 $LowBound ? $LowBound : '',$LowBound ? '=>' : '',
236 $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary)
237 if ($LowBound or $UppBound);
238 # print primary and secondary sort order
239 printf("# ----- Grouped by %s (%s), sorted %s%s\n",
240 ($GroupBy eq 'month') ? 'Months' : 'Newsgroups',
241 ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending',
242 ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '',
243 ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending');
247 &OutputData($OptFormat,$OptComments,$GroupBy,$Precision,
248 $OptCheckgroupsFile ? $ValidGroups : '',
249 $OptFileTemplate,$DBQuery,$MaxLength);
252 $DBHandle->disconnect;
256 ################################ Documentation #################################
260 groupstats - create reports on newsgroup usage
264 B<groupstats> [B<-Vhcs> B<--comments>] [B<-m> I<YYYY-MM>[:I<YYYY-MM>] | I<all>] [B<-n> I<newsgroup(s)>] [B<--checkgroups> I<checkgroups file>] [B<-r> I<report type>] [B<-l> I<lower boundary>] [B<-u> I<upper boundary>] [B<-b> I<boundary type>] [B<-g> I<group by>] [B<-o> I<order by>] [B<-f> I<output format>] [B<--filetemplate> I<filename template>] [B<--groupsdb> I<database table>]
272 This script create reports on newsgroup usage (number of postings per
273 group per month) taken from result tables created by
276 =head2 Features and options
278 =head3 Time period and newsgroups
280 The time period to act on defaults to last month; you can assign another
281 time period or a single month (or drop all time constraints) via the
282 B<--month> option (see below).
284 B<groupstats> will process all newsgroups by default; you can limit
285 processing to only some newsgroups by supplying a list of those groups via
286 B<--newsgroups> option (see below). You can include hierarchy levels in
287 the output by adding the B<--sums> switch (see below). Optionally
288 newsgroups not present in a checkgroups file can be excluded from output,
289 sse B<--checkgroups> below.
293 You can choose between different B<--report> types: postings per month,
294 average postings per month or all postings summed up; for details, see
297 =head3 Upper and lower boundaries
299 Furthermore you can set an upper and/or lower boundary to exclude some
300 results from output via the B<--lower> and B<--upper> options,
301 respectively. By default, all newsgroups with more and/or less postings
302 per month will be excluded from the result set (i.e. not shown and not
303 considered for average and sum reports). You can change the meaning of
304 those boundaries with the B<--boundary> option. For details, please see
307 =head3 Sorting and formatting the output
309 By default, all results are grouped by month; you can group results by
310 newsgroup instead via the B<--groupy-by> option. Within those groups, the
311 list of newsgroups (or months) is sorted alphabetically (or
312 chronologically, respectively) ascending. You can change that order (and
313 sort by number of postings) with the B<--order-by> option. For details and
314 exceptions, please see below.
316 The results will be formatted as a kind of table; you can change the
317 output format to a simple list or just a list of newsgroups and number of
318 postings with the B<--format> option. Captions will be added by means of
319 the B<--caption> option; all comments (and captions) can be supressed by
320 using B<--nocomments>.
322 Last but not least you can redirect all output to a number of files, e.g.
323 one for each month, by submitting the B<--filetemplate> option, see below.
324 Captions and comments are automatically disabled in this case.
328 B<groupstats> will read its configuration from F<newsstats.conf>
329 which should be present in the same directory via Config::Auto.
331 See doc/INSTALL for an overview of possible configuration options.
333 You can override some configuration options via the B<--groupsdb> option.
339 =item B<-V>, B<--version>
341 Print out version and copyright information and exit.
343 =item B<-h>, B<--help>
345 Print this man page and exit.
347 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]|all>
349 Set processing period to a single month in YYYY-MM format or to a time
350 period between two month in YYYY-MM:YYYY-MM format (two month, separated
351 by a colon). By using the keyword I<all> instead, you can set no
352 processing period to process the whole database.
354 =item B<-n>, B<--newsgroups> I<newsgroup(s)>
356 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
357 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
358 (de.alt.*) or a list of either of these, separated by colons, for
361 de.test:de.alt.test:de.newusers.*
363 =item B<-s>, B<--sums|--nosums> (sum per hierarchy level)
365 Include "virtual" groups for every hierarchy level in output, for
372 See the B<gatherstats> man page for details.
374 =item B<--checkgroups> I<filename>
376 Restrict output to those newgroups present in a file in checkgroups format
377 (one newgroup name per line; everything after the first whitespace on each
378 line is ignored). All other newsgroups will be removed from output.
380 =item B<-r>, B<--report> I<default|average|sums>
382 Choose the report type: I<default>, I<average> or I<sums>
384 By default, B<groupstats> will report the number of postings for each
385 newsgroup in each month. But it can also report the average number of
386 postings per group for all months or the total sum of postings per group
389 For report types I<average> and I<sums>, the B<group-by> option has no
390 meaning and will be silently ignored (see below).
392 =item B<-l>, B<--lower> I<lower boundary>
394 Set the lower boundary. See B<--boundary> below.
396 =item B<-l>, B<--upper> I<upper boundary>
398 Set the upper boundary. See B<--boundary> below.
400 =item B<-b>, B<--boundary> I<boundary type>
402 Set the boundary type to one of I<default>, I<level>, I<average> or
405 By default, all newsgroups with more postings per month than the upper
406 boundary and/or less postings per month than the lower boundary will be
407 excluded from further processing. For the default report that means each
408 month only newsgroups with a number of postings between the boundaries
409 will be displayed. For the other report types, newsgroups with a number of
410 postings exceeding the boundaries in all (!) months will not be
413 For example, lets take a list of newsgroups like this:
416 de.comp.datenbanken.misc 6
417 de.comp.datenbanken.ms-access 84
418 de.comp.datenbanken.mysql 88
420 de.comp.datenbanken.misc 8
421 de.comp.datenbanken.ms-access 126
422 de.comp.datenbanken.mysql 21
424 de.comp.datenbanken.misc 24
425 de.comp.datenbanken.ms-access 83
426 de.comp.datenbanken.mysql 36
428 With C<groupstats --month 2012-01:2012-03 --lower 25 --report sums>,
429 you'll get the following result:
432 de.comp.datenbanken.ms-access 293
433 de.comp.datenbanken.mysql 124
435 de.comp.datenbanken.misc has not been considered even though it has 38
436 postings in total, because it has less than 25 postings in every single
437 month. If you want to list all newsgroups with more than 25 postings U<in
438 total>, you'll have to set the boundary type to I<sum>, see below.
440 A boundary type of I<level> will show only those newsgroups - at all -
441 that satisfy the boundaries in each and every single month. With the above
442 list of newsgroups and
443 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary level --report sums>,
444 you'll get this result:
447 de.comp.datenbanken.ms-access 293
449 de.comp.datenbanken.mysql has not been considered because it had less than
450 25 postings in 2012-02.
452 You can use that to get a list of newsgroups that have more (or less) then
453 x postings during the whole reporting period.
455 A boundary type of I<average> will show only those newsgroups - at all -that
456 satisfy the boundaries on average. With the above list of newsgroups and
457 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary avg --report sums>,
458 you'll get this result:
461 de.comp.datenbanken.ms-access 293
462 de.comp.datenbanken.mysql 145
464 The average number of postings in the three groups is:
466 de.comp.datenbanken.misc 12.67
467 de.comp.datenbanken.ms-access 97.67
468 de.comp.datenbanken.mysql 48.33
470 Last but not least, a boundary type of I<sums> will show only those
471 newsgroups - at all - that satisfy the boundaries with the total sum of
472 all postings during the reporting period. With the above list of
474 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary sum --report sums>,
475 you'll finally get this result:
478 de.comp.datenbanken.misc 38
479 de.comp.datenbanken.ms-access 293
480 de.comp.datenbanken.mysql 145
483 =item B<-g>, B<--group-by> I<month[-desc]|newsgroups[-desc]>
485 By default, all results are grouped by month, sorted chronologically in
486 ascending order, like this:
489 de.comp.datenbanken.ms-access 84
490 de.comp.datenbanken.mysql 88
492 de.comp.datenbanken.ms-access 126
493 de.comp.datenbanken.mysql 21
495 The results can be grouped by newsgroups instead via
496 B<--group-by> I<newsgroup>:
498 ----- de.comp.datenbanken.ms-access:
501 ----- de.comp.datenbanken.mysql:
505 By appending I<-desc> to the group-by option parameter, you can reverse
506 the sort order - e.g. B<--group-by> I<month-desc> will give:
509 de.comp.datenbanken.ms-access 126
510 de.comp.datenbanken.mysql 21
512 de.comp.datenbanken.ms-access 84
513 de.comp.datenbanken.mysql 88
515 Average and sums reports (see above) will always be grouped by months;
516 this option will therefore be ignored.
518 =item B<-o>, B<--order-by> I<default[-desc]|postings[-desc]>
520 Within each group (a single month or single newsgroup, see above), the
521 report will be sorted by newsgroup names in ascending alphabetical order
522 by default. You can change the sort order to descending or sort by number
525 =item B<-f>, B<--format> I<pretty|list|dump>
527 Select the output format, I<pretty> being the default:
530 de.comp.datenbanken.ms-access 84
531 de.comp.datenbanken.mysql 88
533 de.comp.datenbanken.ms-access 126
534 de.comp.datenbanken.mysql 21
536 I<list> format looks like this:
538 2012-01 de.comp.datenbanken.ms-access 84
539 2012-01 de.comp.datenbanken.mysql 88
540 2012-02 de.comp.datenbanken.ms-access 126
541 2012-02 de.comp.datenbanken.mysql 21
543 And I<dump> format looks like this:
546 de.comp.datenbanken.ms-access 84
547 de.comp.datenbanken.mysql 88
549 de.comp.datenbanken.ms-access 126
550 de.comp.datenbanken.mysql 21
552 You can remove the comments by using B<--nocomments>, see below.
554 =item B<-c>, B<--captions|--nocaptions>
556 Add captions to output, like this:
558 ----- Report for 2012-01 to 2012-02 (number of postings for each month)
559 ----- Newsgroups: de.comp.datenbanken.*
560 ----- Threshold: 10 => x <= 20 (on average)
561 ----- Grouped by Newsgroups (ascending), sorted by number of postings descending
565 =item B<--comments|--nocomments>
567 Add comments (group headers) to I<dump> and I<pretty> output. True by default.
569 Use I<--nocomments> to suppress anything except newsgroup names/months and
570 numbers of postings. This is enforced when using B<--filetemplate>, see below.
572 =item B<--filetemplate> I<filename template>
574 Save output to file(s) instead of dumping it to STDOUT. B<groupstats> will
575 create one file for each month (or each newsgroup, accordant to the
576 setting of B<--group-by>, see above), with filenames composed by adding
577 year and month (or newsgroup names) to the I<filename template>, for
578 example with B<--filetemplate> I<stats>:
584 B<--nocomments> is enforced, see above.
586 =item B<--groupsdb> I<database table>
588 Override I<DBTableGrps> from F<newsstats.conf>.
598 Show number of postings per group for lasth month in I<pretty> format:
602 Show that report for January of 2010 and de.alt.* plus de.test,
603 including display of hierarchy levels:
605 groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums
607 Only show newsgroups with 30 postings or less last month, ordered
608 by number of postings, descending, in I<pretty> format:
610 groupstats --upper 30 --order-by postings-desc
612 Show the total of all postings for the year of 2010 for all groups that
613 had 30 postings or less in every single month in that year, ordered by
614 number of postings in descending order:
616 groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc
618 The same for the average number of postings in the year of 2010:
620 groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc
622 List number of postings per group for eacht month of 2010 and redirect
623 output to one file for each month, namend stats-2010-01 and so on, in
624 machine-readable form (without formatting):
626 groupstats -m 2010-01:2010-12 -f dump --filetemplate stats
633 =item F<groupstats.pl>
637 =item F<NewsStats.pm>
639 Library functions for the NewsStats package.
641 =item F<newsstats.conf>
643 Runtime configuration file.
649 Please report any bugs or feature requests to the author or use the
650 bug tracker at L<http://bugs.th-h.de/>!
670 This script is part of the B<NewsStats> package.
674 Thomas Hochstein <thh@inter.net>
676 =head1 COPYRIGHT AND LICENSE
678 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
680 This program is free software; you may redistribute it and/or modify it
681 under the same terms as Perl itself.