5 # This script will get statistical data on newsgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList);
25 use Getopt::Long qw(GetOptions);
26 Getopt::Long::config ('bundling');
28 ################################# Main program #################################
30 ### read commandline options
# All option variables start out undefined; GetOptions() fills in only those
# that were actually given on the command line.
31 my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments,
32 $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth,
33 $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound);
# --help and --version are bound to the ShowPOD / ShowVersion callbacks
# (not defined in this file; presumably imported from NewsStats - confirm).
# If GetOptions() reports a parsing failure, the script exits with status 1.
34 GetOptions ('b|boundary=s' => \$OptBoundType,
35 'c|captions!' => \$OptCaptions,
36 'checkgroups=s' => \$OptCheckgroupsFile,
37 'comments!' => \$OptComments,
38 'filetemplate=s' => \$OptFileTemplate,
39 'f|format=s' => \$OptFormat,
40 'g|group-by=s' => \$OptGroupBy,
41 'groupsdb=s' => \$OptGroupsDB,
42 'l|lower=i' => \$LowBound,
43 'm|month=s' => \$OptMonth,
44 'n|newsgroups=s' => \$OptNewsgroups,
45 'o|order-by=s' => \$OptOrderBy,
46 'r|report=s' => \$OptReportType,
47 's|sums!' => \$OptSums,
48 'u|upper=i' => \$UppBound,
49 'h|help' => \&ShowPOD,
50 'V|version' => \&ShowVersion) or exit 1;
52 # $OptComments defaults to TRUE
# (only when neither --comments nor --nocomments was given)
53 $OptComments = 1 if (!defined($OptComments));
54 # force --nocomments when --filetemplate is used
55 $OptComments = 0 if ($OptFileTemplate);
# Normalize $OptBoundType to one of 'level', 'average', 'sum' or 'default'.
# The patterns are unanchored and case-insensitive, so abbreviations such as
# 'avg' select 'average'.
# NOTE(review): the lines carrying the 'else' keywords and closing braces are
# not visible in this excerpt; 'default' is presumably the fallback - confirm.
# NOTE(review): no defined-check is visible before these matches, so an
# unset --boundary / --report would be matched as undef - confirm.
58 if ($OptBoundType =~ /level/i) {
59 $OptBoundType = 'level';
60 } elsif ($OptBoundType =~ /av(era)?ge?/i) {
61 $OptBoundType = 'average';
62 } elsif ($OptBoundType =~ /sums?/i) {
63 $OptBoundType = 'sum';
65 $OptBoundType = 'default';
68 # parse $OptReportType
# Same normalization as above, but only 'average', 'sum' and 'default' are
# recognized for the report type.
70 if ($OptReportType =~ /av(era)?ge?/i) {
71 $OptReportType = 'average';
72 } elsif ($OptReportType =~ /sums?/i) {
73 $OptReportType = 'sum';
75 $OptReportType = 'default';
78 # read list of newsgroups from --checkgroups
79 # into a hash reference
# NOTE(review): perlsyn documents 'my ... if COND' as undefined behaviour;
# consider declaring $ValidGroups first and assigning it conditionally.
80 my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile;
82 ### read configuration
# $HomePath is not declared in this file (presumably exported by NewsStats -
# confirm). The value returned by ReadConfig() is dereferenced as a hash
# reference and copied into %Conf.
83 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
85 ### override configuration via commandline options
# NOTE(review): the declaration of %ConfOverride is not visible in this excerpt.
# Only 'DBTableGrps' can be overridden here, via --groupsdb.
87 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
88 &OverrideConfig(\%Conf,\%ConfOverride);
# InitDB() returns the handle that is used below for prepare() and, at the
# end of the script, disconnect().
91 my $DBHandle = InitDB(\%Conf,1);
93 ### get time period and newsgroups, prepare SQL 'WHERE' clause
95 # and set caption for output and expression for SQL 'WHERE' clause
96 my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth);
97 # bail out if --month is invalid
# (an invalid --month is detected by GetTimePeriod() returning a false caption)
98 &Bleat(2,"--month option has an invalid format - ".
99 "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod;
100 # get list of newsgroups and set expression for SQL 'WHERE' clause
101 # with placeholders as well as a list of newsgroups to bind to them
# NOTE(review): the end of the following statement is not visible in this
# excerpt (no terminating ';' on this line).
102 my ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups)
105 ### build SQL WHERE clause (and HAVING clause, if needed)
106 my ($SQLWhereClause,$SQLHavingClause);
107 # $OptBoundType 'level', 'average' or 'sum':
# the boundaries go into a HAVING clause, the WHERE clause only restricts
# period, newsgroups and hierarchy levels
108 if ($OptBoundType and $OptBoundType ne 'default') {
109 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
110 $SQLWhereNewsgroups,&SQLHierarchies($OptSums));
111 $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType,
112 $LowBound,$UppBound));
113 # $OptBoundType 'threshold' / 'default' or none
# the boundaries become part of the WHERE clause itself; no HAVING clause
# NOTE(review): the '} else {' line and the closing brace are not visible here.
115 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
116 $SQLWhereNewsgroups,&SQLHierarchies($OptSums),
117 &SQLSetBounds('default',$LowBound,$UppBound));
120 ### get sort order and build SQL 'ORDER BY' clause
121 # default to 'newsgroup' for any non-default $OptBoundType
# (only if --group-by was not given explicitly)
122 $OptGroupBy = 'newsgroup' if (!$OptGroupBy and
123 $OptBoundType and $OptBoundType ne 'default');
124 # force to 'month' for $OptReportType 'average' or 'sum'
# (this overrides both --group-by and the default set just above)
125 $OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default');
126 # parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause
127 my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy);
128 # $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy)
129 # set it to 'month' or 'key' for OutputData()
130 $GroupBy = ($GroupBy eq 'month') ? 'month' : 'key';
132 ### get report type and build SQL 'SELECT' query
# NOTE(review): the declaration of $SQLSelect is not visible in this excerpt.
134 my $SQLGroupClause = '';
135 my $Precision = 0; # number of digits right of decimal point for output
# 'average' and 'sum' reports aggregate over all months per newsgroup
136 if ($OptReportType and $OptReportType ne 'default') {
137 $SQLGroupClause = 'GROUP BY newsgroup';
138 # change $SQLOrderClause: replace everything before 'postings'
# (only has an effect if the clause contains 'postings' after 'BY')
139 $SQLOrderClause =~ s/BY.+postings/BY postings/;
140 if ($OptReportType eq 'average') {
# the literal 'All months' takes the place of the month column
141 $SQLSelect = "'All months',newsgroup,AVG(postings)";
143 # change $SQLOrderClause: replace 'postings' with 'AVG(postings)'
144 $SQLOrderClause =~ s/postings/AVG(postings)/;
145 } elsif ($OptReportType eq 'sum') {
146 $SQLSelect = "'All months',newsgroup,SUM(postings)";
147 # change $SQLOrderClause: replace 'postings' with 'SUM(postings)'
148 $SQLOrderClause =~ s/postings/SUM(postings)/;
# default report: plain per-month rows, no aggregation
# NOTE(review): the enclosing '} else {' and closing braces are not visible here.
151 $SQLSelect = 'month,newsgroup,postings';
154 ### get length of longest newsgroup name delivered by query
155 ### for formatting purposes
# The measured column is the one listed inside each group: 'newsgroup' when
# grouping by month, 'month' otherwise.
156 my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
# NOTE(review): the remaining arguments of this call are not visible in this excerpt.
157 my $MaxLength = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},
158 $Field,$SQLWhereClause,$SQLHavingClause,
161 ### build and execute SQL query
# NOTE(review): the declarations of $DBQuery and $GroupList are not visible
# in this excerpt.
163 # special query preparation for $OptBoundType 'level', 'average' or 'sum'
164 if ($OptBoundType and $OptBoundType ne 'default') {
165 # prepare and execute first query:
166 # get list of newsgroups meeting level conditions
167 $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '.
168 'GROUP BY newsgroup %s',
169 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
170 $SQLWhereClause,$SQLHavingClause));
# the newsgroup names from --newsgroups are bound to the placeholders
171 $DBQuery->execute(@SQLBindNewsgroups)
172 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
173 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
175 # add newsgroups to a comma-separated list ready for IN(...) query
# NOTE(review): names are interpolated into the SQL text in single quotes
# rather than bound as placeholders; they come from the first query's result.
177 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
178 $GroupList .= ',' if $GroupList;
179 $GroupList .= "'$Newsgroup'";
181 # enhance $SQLWhereClause
# restrict the main query to the newsgroups found by the first query
183 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,
184 sprintf('newsgroup IN (%s)',$GroupList));
186 # condition cannot be satisfied;
187 # force query to fail by adding '0=1'
188 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1');
# main query: SELECT list, WHERE, GROUP BY and (presumably) ORDER BY clauses
# built above; part of this statement is not visible in this excerpt
193 $DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s',
195 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
196 $SQLWhereClause,$SQLGroupClause,$
200 $DBQuery->execute(@SQLBindNewsgroups)
201 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
202 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
206 # set default to 'pretty'
207 $OptFormat = 'pretty' if !$OptFormat;
208 # print captions if --captions is set
# (captions are comments, so they are also suppressed by --nocomments and
# therefore by --filetemplate)
209 if ($OptCaptions && $OptComments) {
210 # print time period with report type
211 my $CaptionReportType= '(number of postings for each month)';
212 if ($OptReportType and $OptReportType ne 'default') {
213 $CaptionReportType= '(average number of postings for each month)'
214 if $OptReportType eq 'average';
215 $CaptionReportType= '(number of all postings for that time period)'
216 if $OptReportType eq 'sum';
218 printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType);
219 # print newsgroup list if --newsgroups is set
# (the colon-separated option value is shown comma-separated)
# NOTE(review): the end of the following statement is not visible in this excerpt.
220 printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups)))
222 # print boundaries, if set
223 my $CaptionBoundary= '(counting only month fulfilling this condition)';
224 if ($OptBoundType and $OptBoundType ne 'default') {
225 $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level';
226 $CaptionBoundary= '(on average)' if $OptBoundType eq 'average';
227 $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum';
# NOTE(review): the boundaries are tested for truth, so a boundary of 0 is
# treated like an unset boundary in this caption.
229 printf("# ----- Threshold: %s %s x %s %s %s\n",
230 $LowBound ? $LowBound : '',$LowBound ? '=>' : '',
231 $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary)
232 if ($LowBound or $UppBound);
233 # print primary and secondary sort order
# (direction is taken from a trailing 'desc' on the option values)
234 printf("# ----- Grouped by %s (%s), sorted %s%s\n",
235 ($GroupBy eq 'month') ? 'Months' : 'Newsgroups',
236 ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending',
237 ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '',
238 ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending');
# hand the executed statement handle to OutputData(); the list of valid
# groups is only passed on when --checkgroups was given
242 &OutputData($OptFormat,$OptComments,$GroupBy,$Precision,
243 $OptCheckgroupsFile ? $ValidGroups : '',
244 $OptFileTemplate,$DBQuery,$MaxLength);
# close the database connection
247 $DBHandle->disconnect;
251 ################################ Documentation #################################
255 groupstats - create reports on newsgroup usage
259 B<groupstats> [B<-Vhcs> B<--comments>] [B<-m> I<YYYY-MM>[:I<YYYY-MM>] | I<all>] [B<-n> I<newsgroup(s)>] [B<--checkgroups> I<checkgroups file>] [B<-r> I<report type>] [B<-l> I<lower boundary>] [B<-u> I<upper boundary>] [B<-b> I<boundary type>] [B<-g> I<group by>] [B<-o> I<order by>] [B<-f> I<output format>] [B<--filetemplate> I<filename template>] [B<--groupsdb> I<database table>]
267 This script creates reports on newsgroup usage (number of postings per
268 group per month) taken from result tables created by
271 =head2 Features and options
273 =head3 Time period and newsgroups
275 The time period to act on defaults to last month; you can assign another
276 time period or a single month (or drop all time constraints) via the
277 B<--month> option (see below).
279 B<groupstats> will process all newsgroups by default; you can limit
280 processing to only some newsgroups by supplying a list of those groups via
281 B<--newsgroups> option (see below). You can include hierarchy levels in
282 the output by adding the B<--sums> switch (see below). Optionally
283 newsgroups not present in a checkgroups file can be excluded from output,
284 see B<--checkgroups> below.
288 You can choose between different B<--report> types: postings per month,
289 average postings per month or all postings summed up; for details, see
292 =head3 Upper and lower boundaries
294 Furthermore you can set an upper and/or lower boundary to exclude some
295 results from output via the B<--lower> and B<--upper> options,
296 respectively. By default, all newsgroups with more and/or less postings
297 per month will be excluded from the result set (i.e. not shown and not
298 considered for average and sum reports). You can change the meaning of
299 those boundaries with the B<--boundary> option. For details, please see
302 =head3 Sorting and formatting the output
304 By default, all results are grouped by month; you can group results by
305 newsgroup instead via the B<--group-by> option. Within those groups, the
306 list of newsgroups (or months) is sorted alphabetically (or
307 chronologically, respectively) ascending. You can change that order (and
308 sort by number of postings) with the B<--order-by> option. For details and
309 exceptions, please see below.
311 The results will be formatted as a kind of table; you can change the
312 output format to a simple list or just a list of newsgroups and number of
313 postings with the B<--format> option. Captions will be added by means of
314 the B<--captions> option; all comments (and captions) can be suppressed by
315 using B<--nocomments>.
317 Last but not least you can redirect all output to a number of files, e.g.
318 one for each month, by submitting the B<--filetemplate> option, see below.
319 Captions and comments are automatically disabled in this case.
323 B<groupstats> will read its configuration via Config::Auto from
324 F<newsstats.conf>, which should be present in the same directory.
326 See doc/INSTALL for an overview of possible configuration options.
328 You can override some configuration options via the B<--groupsdb> option.
334 =item B<-V>, B<--version>
336 Print out version and copyright information and exit.
338 =item B<-h>, B<--help>
340 Print this man page and exit.
342 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]|all>
344 Set processing period to a single month in YYYY-MM format or to a time
345 period between two months in YYYY-MM:YYYY-MM format (two months, separated
346 by a colon). By using the keyword I<all> instead, you can set no
347 processing period to process the whole database.
349 =item B<-n>, B<--newsgroups> I<newsgroup(s)>
351 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
352 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
353 (de.alt.*) or a list of either of these, separated by colons, for
356 de.test:de.alt.test:de.newusers.*
358 =item B<-s>, B<--sums|--nosums> (sum per hierarchy level)
360 Include "virtual" groups for every hierarchy level in output, for
367 See the B<gatherstats> man page for details.
369 =item B<--checkgroups> I<filename>
371 Restrict output to those newsgroups present in a file in checkgroups format
372 (one newsgroup name per line; everything after the first whitespace on each
373 line is ignored). All other newsgroups will be removed from output.
375 Contrary to B<gatherstats>, I<filename> is not a template, but refers to
376 a single file in checkgroups format.
378 =item B<-r>, B<--report> I<default|average|sums>
380 Choose the report type: I<default>, I<average> or I<sums>
382 By default, B<groupstats> will report the number of postings for each
383 newsgroup in each month. But it can also report the average number of
384 postings per group for all months or the total sum of postings per group
387 For report types I<average> and I<sums>, the B<--group-by> option has no
388 meaning and will be silently ignored (see below).
390 =item B<-l>, B<--lower> I<lower boundary>
392 Set the lower boundary. See B<--boundary> below.
394 =item B<-u>, B<--upper> I<upper boundary>
396 Set the upper boundary. See B<--boundary> below.
398 =item B<-b>, B<--boundary> I<boundary type>
400 Set the boundary type to one of I<default>, I<level>, I<average> or
403 By default, all newsgroups with more postings per month than the upper
404 boundary and/or less postings per month than the lower boundary will be
405 excluded from further processing. For the default report that means each
406 month only newsgroups with a number of postings between the boundaries
407 will be displayed. For the other report types, newsgroups with a number of
408 postings exceeding the boundaries in all (!) months will not be
411 For example, let's take a list of newsgroups like this:
414 de.comp.datenbanken.misc 6
415 de.comp.datenbanken.ms-access 84
416 de.comp.datenbanken.mysql 88
418 de.comp.datenbanken.misc 8
419 de.comp.datenbanken.ms-access 126
420 de.comp.datenbanken.mysql 21
422 de.comp.datenbanken.misc 24
423 de.comp.datenbanken.ms-access 83
424 de.comp.datenbanken.mysql 36
426 With C<groupstats --month 2012-01:2012-03 --lower 25 --report sums>,
427 you'll get the following result:
430 de.comp.datenbanken.ms-access 293
431 de.comp.datenbanken.mysql 124
433 de.comp.datenbanken.misc has not been considered even though it has 38
434 postings in total, because it has less than 25 postings in every single
435 month. If you want to list all newsgroups with more than 25 postings
436 I<in total>, you'll have to set the boundary type to I<sum>, see below.
438 A boundary type of I<level> will show only those newsgroups - at all -
439 that satisfy the boundaries in each and every single month. With the above
440 list of newsgroups and
441 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary level --report sums>,
442 you'll get this result:
445 de.comp.datenbanken.ms-access 293
447 de.comp.datenbanken.mysql has not been considered because it had less than
448 25 postings in 2012-02 (only).
450 You can use that to get a list of newsgroups that have more (or less) than
451 x postings in every month during the whole reporting period.
453 A boundary type of I<average> will show only those newsgroups - at all - that
454 satisfy the boundaries on average. With the above list of newsgroups and
455 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary avg --report sums>,
456 you'll get this result:
459 de.comp.datenbanken.ms-access 293
460 de.comp.datenbanken.mysql 145
462 The average number of postings in the three groups is:
464 de.comp.datenbanken.misc 12.67
465 de.comp.datenbanken.ms-access 97.67
466 de.comp.datenbanken.mysql 48.33
468 Last but not least, a boundary type of I<sums> will show only those
469 newsgroups - at all - that satisfy the boundaries with the total sum of
470 all postings during the reporting period. With the above list of
472 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary sum --report sums>,
473 you'll finally get this result:
476 de.comp.datenbanken.misc 38
477 de.comp.datenbanken.ms-access 293
478 de.comp.datenbanken.mysql 145
481 =item B<-g>, B<--group-by> I<month[-desc]|newsgroups[-desc]>
483 By default, all results are grouped by month, sorted chronologically in
484 ascending order, like this:
487 de.comp.datenbanken.ms-access 84
488 de.comp.datenbanken.mysql 88
490 de.comp.datenbanken.ms-access 126
491 de.comp.datenbanken.mysql 21
493 The results can be grouped by newsgroups instead via
494 B<--group-by> I<newsgroup>:
496 ----- de.comp.datenbanken.ms-access:
499 ----- de.comp.datenbanken.mysql:
503 By appending I<-desc> to the group-by option parameter, you can reverse
504 the sort order - e.g. B<--group-by> I<month-desc> will give:
507 de.comp.datenbanken.ms-access 126
508 de.comp.datenbanken.mysql 21
510 de.comp.datenbanken.ms-access 84
511 de.comp.datenbanken.mysql 88
513 Average and sums reports (see above) will always be grouped by months;
514 this option will therefore be ignored.
516 =item B<-o>, B<--order-by> I<default[-desc]|postings[-desc]>
518 Within each group (a single month or single newsgroup, see above), the
519 report will be sorted by newsgroup names in ascending alphabetical order
520 by default. You can change the sort order to descending or sort by number
523 =item B<-f>, B<--format> I<pretty|list|dump>
525 Select the output format, I<pretty> being the default:
528 de.comp.datenbanken.ms-access 84
529 de.comp.datenbanken.mysql 88
531 de.comp.datenbanken.ms-access 126
532 de.comp.datenbanken.mysql 21
534 I<list> format looks like this:
536 2012-01 de.comp.datenbanken.ms-access 84
537 2012-01 de.comp.datenbanken.mysql 88
538 2012-02 de.comp.datenbanken.ms-access 126
539 2012-02 de.comp.datenbanken.mysql 21
541 And I<dump> format looks like this:
544 de.comp.datenbanken.ms-access 84
545 de.comp.datenbanken.mysql 88
547 de.comp.datenbanken.ms-access 126
548 de.comp.datenbanken.mysql 21
550 You can remove the comments by using B<--nocomments>, see below.
552 =item B<-c>, B<--captions|--nocaptions>
554 Add captions to output, like this:
556 ----- Report for 2012-01 to 2012-02 (number of postings for each month)
557 ----- Newsgroups: de.comp.datenbanken.*
558 ----- Threshold: 10 => x <= 20 (on average)
559 ----- Grouped by Newsgroups (ascending), sorted by number of postings descending
563 =item B<--comments|--nocomments>
565 Add comments (group headers) to I<dump> and I<pretty> output. True by default.
567 Use I<--nocomments> to suppress anything except newsgroup names/months and
568 numbers of postings. This is enforced when using B<--filetemplate>, see below.
570 =item B<--filetemplate> I<filename template>
572 Save output to file(s) instead of dumping it to STDOUT. B<groupstats> will
573 create one file for each month (or each newsgroup, according to the
574 setting of B<--group-by>, see above), with filenames composed by adding
575 year and month (or newsgroup names) to the I<filename template>, for
576 example with B<--filetemplate> I<stats>:
582 B<--nocomments> is enforced, see above.
584 =item B<--groupsdb> I<database table>
586 Override I<DBTableGrps> from F<newsstats.conf>.
596 Show number of postings per group for last month in I<pretty> format:
600 Show that report for January of 2010 and de.alt.* plus de.test,
601 including display of hierarchy levels:
603 groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums
605 Only show newsgroups with 30 postings or less last month, ordered
606 by number of postings, descending, in I<pretty> format:
608 groupstats --upper 30 --order-by postings-desc
610 Show the total of all postings for the year of 2010 for all groups that
611 had 30 postings or less in every single month in that year, ordered by
612 number of postings in descending order:
614 groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc
616 The same for the average number of postings in the year of 2010:
618 groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc
620 List number of postings per group for each month of 2010 and redirect
621 output to one file for each month, named stats-2010-01 and so on, in
622 machine-readable form (without formatting):
624 groupstats -m 2010-01:2010-12 -f dump --filetemplate stats
631 =item F<groupstats.pl>
635 =item F<NewsStats.pm>
637 Library functions for the NewsStats package.
639 =item F<newsstats.conf>
641 Runtime configuration file.
647 Please report any bugs or feature requests to the author or use the
648 bug tracker at L<http://bugs.th-h.de/>!
668 This script is part of the B<NewsStats> package.
672 Thomas Hochstein <thh@inter.net>
674 =head1 COPYRIGHT AND LICENSE
676 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
678 This program is free software; you may redistribute it and/or modify it
679 under the same terms as Perl itself.