5 # This script will get statistical data on newgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
23 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList);
26 use Getopt::Long qw(GetOptions);
27 Getopt::Long::config ('bundling');
29 ################################# Main program #################################
31 ### read commandline options
32 my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments,
33 $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth,
34 $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound);
35 GetOptions ('b|boundary=s' => \$OptBoundType,
36 'c|captions!' => \$OptCaptions,
37 'checkgroups=s' => \$OptCheckgroupsFile,
38 'comments!' => \$OptComments,
39 'filetemplate=s' => \$OptFileTemplate,
40 'f|format=s' => \$OptFormat,
41 'g|group-by=s' => \$OptGroupBy,
42 'groupsdb=s' => \$OptGroupsDB,
43 'l|lower=i' => \$LowBound,
44 'm|month=s' => \$OptMonth,
45 'n|newsgroups=s' => \$OptNewsgroups,
46 'o|order-by=s' => \$OptOrderBy,
47 'r|report=s' => \$OptReportType,
48 's|sums!' => \$OptSums,
49 'u|upper=i' => \$UppBound,
50 'h|help' => \&ShowPOD,
51 'V|version' => \&ShowVersion) or exit 1;
53 # $OptComments defaults to TRUE
54 $OptComments = 1 if (!defined($OptComments));
55 # force --nocomments when --filetemplate is used
56 $OptComments = 0 if ($OptFileTemplate);
59 if ($OptBoundType =~ /level/i) {
60 $OptBoundType = 'level';
61 } elsif ($OptBoundType =~ /av(era)?ge?/i) {
62 $OptBoundType = 'average';
63 } elsif ($OptBoundType =~ /sums?/i) {
64 $OptBoundType = 'sum';
66 $OptBoundType = 'default';
69 # parse $OptReportType
71 if ($OptReportType =~ /av(era)?ge?/i) {
72 $OptReportType = 'average';
73 } elsif ($OptReportType =~ /sums?/i) {
74 $OptReportType = 'sum';
76 $OptReportType = 'default';
79 # read list of newsgroups from --checkgroups
80 # into a hash reference
81 my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile;
83 ### read configuration
84 my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
86 ### override configuration via commandline options
88 $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
89 &OverrideConfig(\%Conf,\%ConfOverride);
92 my $DBHandle = InitDB(\%Conf,1);
94 ### get time period and newsgroups, prepare SQL 'WHERE' clause
96 # and set caption for output and expression for SQL 'WHERE' clause
97 my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth);
98 # bail out if --month is invalid
99 &Bleat(2,"--month option has an invalid format - ".
100 "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod;
101 # get list of newsgroups and set expression for SQL 'WHERE' clause
102 # with placeholders as well as a list of newsgroup to bind to them
103 my ($SQLWhereNewsgroups,@SQLBindNewsgroups);
104 if ($OptNewsgroups) {
105 ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups);
106 # bail out if --newsgroups is invalid
107 &Bleat(2,"--newsgroups option has an invalid format!")
108 if !$SQLWhereNewsgroups;
111 ### build SQL WHERE clause (and HAVING clause, if needed)
112 my ($SQLWhereClause,$SQLHavingClause);
113 # $OptBoundType 'level'
114 if ($OptBoundType and $OptBoundType ne 'default') {
115 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
116 $SQLWhereNewsgroups,&SQLHierarchies($OptSums));
117 $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType,
118 $LowBound,$UppBound));
119 # $OptBoundType 'threshold' / 'default' or none
121 $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,
122 $SQLWhereNewsgroups,&SQLHierarchies($OptSums),
123 &SQLSetBounds('default',$LowBound,$UppBound));
126 ### get sort order and build SQL 'ORDER BY' clause
127 # default to 'newsgroup' for $OptBoundType 'level' or 'average'
128 $OptGroupBy = 'newsgroup' if (!$OptGroupBy and
129 $OptBoundType and $OptBoundType ne 'default');
130 # force to 'month' for $OptReportType 'average' or 'sum'
131 $OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default');
132 # parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause
133 my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy);
134 # $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy)
135 # set it to 'month' or 'key' for OutputData()
136 $GroupBy = ($GroupBy eq 'month') ? 'month' : 'key';
138 ### get report type and build SQL 'SELECT' query
140 my $SQLGroupClause = '';
141 my $Precision = 0; # number of digits right of decimal point for output
142 if ($OptReportType and $OptReportType ne 'default') {
143 $SQLGroupClause = 'GROUP BY newsgroup';
144 # change $SQLOrderClause: replace everything before 'postings'
145 $SQLOrderClause =~ s/BY.+postings/BY postings/;
146 if ($OptReportType eq 'average') {
147 $SQLSelect = "'All months',newsgroup,AVG(postings)";
149 # change $SQLOrderClause: replace 'postings' with 'AVG(postings)'
150 $SQLOrderClause =~ s/postings/AVG(postings)/;
151 } elsif ($OptReportType eq 'sum') {
152 $SQLSelect = "'All months',newsgroup,SUM(postings)";
153 # change $SQLOrderClause: replace 'postings' with 'SUM(postings)'
154 $SQLOrderClause =~ s/postings/SUM(postings)/;
157 $SQLSelect = 'month,newsgroup,postings';
160 ### get length of longest newsgroup name delivered by query
161 ### for formatting purposes
162 my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
163 my ($MaxLength,$MaxValLength) = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},
164 $Field,'postings',$SQLWhereClause,
168 ### build and execute SQL query
170 # special query preparation for $OptBoundType 'level', 'average' or 'sums'
171 if ($OptBoundType and $OptBoundType ne 'default') {
172 # prepare and execute first query:
173 # get list of newsgroups meeting level conditions
174 $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '.
175 'GROUP BY newsgroup %s',
176 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
177 $SQLWhereClause,$SQLHavingClause));
178 $DBQuery->execute(@SQLBindNewsgroups)
179 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
180 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
182 # add newsgroups to a comma-seperated list ready for IN(...) query
184 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
185 $GroupList .= ',' if $GroupList;
186 $GroupList .= "'$Newsgroup'";
188 # enhance $WhereClause
190 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,
191 sprintf('newsgroup IN (%s)',$GroupList));
193 # condition cannot be satisfied;
194 # force query to fail by adding '0=1'
195 $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1');
200 $DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s',
202 $Conf{'DBDatabase'},$Conf{'DBTableGrps'},
203 $SQLWhereClause,$SQLGroupClause,
207 $DBQuery->execute(@SQLBindNewsgroups)
208 or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n",
209 $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},
213 # set default to 'pretty'
214 $OptFormat = 'pretty' if !$OptFormat;
215 # print captions if --caption is set
216 if ($OptCaptions && $OptComments) {
217 # print time period with report type
218 my $CaptionReportType= '(number of postings for each month)';
219 if ($OptReportType and $OptReportType ne 'default') {
220 $CaptionReportType= '(average number of postings for each month)'
221 if $OptReportType eq 'average';
222 $CaptionReportType= '(number of all postings for that time period)'
223 if $OptReportType eq 'sum';
225 printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType);
226 # print newsgroup list if --newsgroups is set
227 printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups)))
229 # print boundaries, if set
230 my $CaptionBoundary= '(counting only month fulfilling this condition)';
231 if ($OptBoundType and $OptBoundType ne 'default') {
232 $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level';
233 $CaptionBoundary= '(on average)' if $OptBoundType eq 'average';
234 $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum';
236 printf("# ----- Threshold: %s %s x %s %s %s\n",
237 $LowBound ? $LowBound : '',$LowBound ? '=>' : '',
238 $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary)
239 if ($LowBound or $UppBound);
240 # print primary and secondary sort order
241 printf("# ----- Grouped by %s (%s), sorted %s%s\n",
242 ($GroupBy eq 'month') ? 'Months' : 'Newsgroups',
243 ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending',
244 ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '',
245 ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending');
249 &OutputData($OptFormat,$OptComments,$GroupBy,$Precision,
250 $OptCheckgroupsFile ? $ValidGroups : '',
251 $OptFileTemplate,$DBQuery,$MaxLength,$MaxValLength);
254 $DBHandle->disconnect;
258 ################################ Documentation #################################
262 groupstats - create reports on newsgroup usage
266 B<groupstats> [B<-Vhcs> B<--comments>] [B<-m> I<YYYY-MM>[:I<YYYY-MM>] | I<all>] [B<-n> I<newsgroup(s)>] [B<--checkgroups> I<checkgroups file>] [B<-r> I<report type>] [B<-l> I<lower boundary>] [B<-u> I<upper boundary>] [B<-b> I<boundary type>] [B<-g> I<group by>] [B<-o> I<order by>] [B<-f> I<output format>] [B<--filetemplate> I<filename template>] [B<--groupsdb> I<database table>]
274 This script create reports on newsgroup usage (number of postings per
275 group per month) taken from result tables created by
278 =head2 Features and options
280 =head3 Time period and newsgroups
282 The time period to act on defaults to last month; you can assign another
283 time period or a single month (or drop all time constraints) via the
284 B<--month> option (see below).
286 B<groupstats> will process all newsgroups by default; you can limit
287 processing to only some newsgroups by supplying a list of those groups via
288 B<--newsgroups> option (see below). You can include hierarchy levels in
289 the output by adding the B<--sums> switch (see below). Optionally
290 newsgroups not present in a checkgroups file can be excluded from output,
291 sse B<--checkgroups> below.
295 You can choose between different B<--report> types: postings per month,
296 average postings per month or all postings summed up; for details, see
299 =head3 Upper and lower boundaries
301 Furthermore you can set an upper and/or lower boundary to exclude some
302 results from output via the B<--lower> and B<--upper> options,
303 respectively. By default, all newsgroups with more and/or less postings
304 per month will be excluded from the result set (i.e. not shown and not
305 considered for average and sum reports). You can change the meaning of
306 those boundaries with the B<--boundary> option. For details, please see
309 =head3 Sorting and formatting the output
311 By default, all results are grouped by month; you can group results by
312 newsgroup instead via the B<--groupy-by> option. Within those groups, the
313 list of newsgroups (or months) is sorted alphabetically (or
314 chronologically, respectively) ascending. You can change that order (and
315 sort by number of postings) with the B<--order-by> option. For details and
316 exceptions, please see below.
318 The results will be formatted as a kind of table; you can change the
319 output format to a simple list or just a list of newsgroups and number of
320 postings with the B<--format> option. Captions will be added by means of
321 the B<--caption> option; all comments (and captions) can be supressed by
322 using B<--nocomments>.
324 Last but not least you can redirect all output to a number of files, e.g.
325 one for each month, by submitting the B<--filetemplate> option, see below.
326 Captions and comments are automatically disabled in this case.
330 B<groupstats> will read its configuration from F<newsstats.conf>
331 which should be present in the same directory via Config::Auto.
333 See doc/INSTALL for an overview of possible configuration options.
335 You can override some configuration options via the B<--groupsdb> option.
341 =item B<-V>, B<--version>
343 Print out version and copyright information and exit.
345 =item B<-h>, B<--help>
347 Print this man page and exit.
349 =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]|all>
351 Set processing period to a single month in YYYY-MM format or to a time
352 period between two month in YYYY-MM:YYYY-MM format (two month, separated
353 by a colon). By using the keyword I<all> instead, you can set no
354 processing period to process the whole database.
356 =item B<-n>, B<--newsgroups> I<newsgroup(s)>
358 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
359 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
360 (de.alt.*) or a list of either of these, separated by colons, for
363 de.test:de.alt.test:de.newusers.*
365 =item B<-s>, B<--sums|--nosums> (sum per hierarchy level)
367 Include "virtual" groups for every hierarchy level in output, for
374 See the B<gatherstats> man page for details.
376 =item B<--checkgroups> I<filename>
378 Restrict output to those newgroups present in a file in checkgroups format
379 (one newgroup name per line; everything after the first whitespace on each
380 line is ignored). All other newsgroups will be removed from output.
382 Contrary to B<gatherstats>, I<filename> is not a template, but refers to
383 a single file in checkgroups format.
385 =item B<-r>, B<--report> I<default|average|sums>
387 Choose the report type: I<default>, I<average> or I<sums>
389 By default, B<groupstats> will report the number of postings for each
390 newsgroup in each month. But it can also report the average number of
391 postings per group for all months or the total sum of postings per group
394 For report types I<average> and I<sums>, the B<group-by> option has no
395 meaning and will be silently ignored (see below).
397 =item B<-l>, B<--lower> I<lower boundary>
399 Set the lower boundary. See B<--boundary> below.
401 =item B<-l>, B<--upper> I<upper boundary>
403 Set the upper boundary. See B<--boundary> below.
405 =item B<-b>, B<--boundary> I<boundary type>
407 Set the boundary type to one of I<default>, I<level>, I<average> or
410 By default, all newsgroups with more postings per month than the upper
411 boundary and/or less postings per month than the lower boundary will be
412 excluded from further processing. For the default report that means each
413 month only newsgroups with a number of postings between the boundaries
414 will be displayed. For the other report types, newsgroups with a number of
415 postings exceeding the boundaries in all (!) months will not be
418 For example, lets take a list of newsgroups like this:
421 de.comp.datenbanken.misc 6
422 de.comp.datenbanken.ms-access 84
423 de.comp.datenbanken.mysql 88
425 de.comp.datenbanken.misc 8
426 de.comp.datenbanken.ms-access 126
427 de.comp.datenbanken.mysql 21
429 de.comp.datenbanken.misc 24
430 de.comp.datenbanken.ms-access 83
431 de.comp.datenbanken.mysql 36
433 With C<groupstats --month 2012-01:2012-03 --lower 25 --report sums>,
434 you'll get the following result:
437 de.comp.datenbanken.ms-access 293
438 de.comp.datenbanken.mysql 124
440 de.comp.datenbanken.misc has not been considered even though it has 38
441 postings in total, because it has less than 25 postings in every single
442 month. If you want to list all newsgroups with more than 25 postings
443 I<in total>, you'll have to set the boundary type to I<sum>, see below.
445 A boundary type of I<level> will show only those newsgroups - at all -
446 that satisfy the boundaries in each and every single month. With the above
447 list of newsgroups and
448 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary level --report sums>,
449 you'll get this result:
452 de.comp.datenbanken.ms-access 293
454 de.comp.datenbanken.mysql has not been considered because it had less than
455 25 postings in 2012-02 (only).
457 You can use that to get a list of newsgroups that have more (or less) then
458 x postings in every month during the whole reporting period.
460 A boundary type of I<average> will show only those newsgroups - at all -that
461 satisfy the boundaries on average. With the above list of newsgroups and
462 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary avg --report sums>,
463 you'll get this result:
466 de.comp.datenbanken.ms-access 293
467 de.comp.datenbanken.mysql 145
469 The average number of postings in the three groups is:
471 de.comp.datenbanken.misc 12.67
472 de.comp.datenbanken.ms-access 97.67
473 de.comp.datenbanken.mysql 48.33
475 Last but not least, a boundary type of I<sums> will show only those
476 newsgroups - at all - that satisfy the boundaries with the total sum of
477 all postings during the reporting period. With the above list of
479 C<groupstats --month 2012-01:2012-03 --lower 25 --boundary sum --report sums>,
480 you'll finally get this result:
483 de.comp.datenbanken.misc 38
484 de.comp.datenbanken.ms-access 293
485 de.comp.datenbanken.mysql 145
488 =item B<-g>, B<--group-by> I<month[-desc]|newsgroups[-desc]>
490 By default, all results are grouped by month, sorted chronologically in
491 ascending order, like this:
494 de.comp.datenbanken.ms-access 84
495 de.comp.datenbanken.mysql 88
497 de.comp.datenbanken.ms-access 126
498 de.comp.datenbanken.mysql 21
500 The results can be grouped by newsgroups instead via
501 B<--group-by> I<newsgroup>:
503 ----- de.comp.datenbanken.ms-access:
506 ----- de.comp.datenbanken.mysql:
510 By appending I<-desc> to the group-by option parameter, you can reverse
511 the sort order - e.g. B<--group-by> I<month-desc> will give:
514 de.comp.datenbanken.ms-access 126
515 de.comp.datenbanken.mysql 21
517 de.comp.datenbanken.ms-access 84
518 de.comp.datenbanken.mysql 88
520 Average and sums reports (see above) will always be grouped by months;
521 this option will therefore be ignored.
523 =item B<-o>, B<--order-by> I<default[-desc]|postings[-desc]>
525 Within each group (a single month or single newsgroup, see above), the
526 report will be sorted by newsgroup names in ascending alphabetical order
527 by default. You can change the sort order to descending or sort by number
530 =item B<-f>, B<--format> I<pretty|list|dump>
532 Select the output format, I<pretty> being the default:
535 de.comp.datenbanken.ms-access 84
536 de.comp.datenbanken.mysql 88
538 de.comp.datenbanken.ms-access 126
539 de.comp.datenbanken.mysql 21
541 I<list> format looks like this:
543 2012-01 de.comp.datenbanken.ms-access 84
544 2012-01 de.comp.datenbanken.mysql 88
545 2012-02 de.comp.datenbanken.ms-access 126
546 2012-02 de.comp.datenbanken.mysql 21
548 And I<dump> format looks like this:
551 de.comp.datenbanken.ms-access 84
552 de.comp.datenbanken.mysql 88
554 de.comp.datenbanken.ms-access 126
555 de.comp.datenbanken.mysql 21
557 You can remove the comments by using B<--nocomments>, see below.
559 =item B<-c>, B<--captions|--nocaptions>
561 Add captions to output, like this:
563 ----- Report for 2012-01 to 2012-02 (number of postings for each month)
564 ----- Newsgroups: de.comp.datenbanken.*
565 ----- Threshold: 10 => x <= 20 (on average)
566 ----- Grouped by Newsgroups (ascending), sorted by number of postings descending
570 =item B<--comments|--nocomments>
572 Add comments (group headers) to I<dump> and I<pretty> output. True by default.
574 Use I<--nocomments> to suppress anything except newsgroup names/months and
575 numbers of postings. This is enforced when using B<--filetemplate>, see below.
577 =item B<--filetemplate> I<filename template>
579 Save output to file(s) instead of dumping it to STDOUT. B<groupstats> will
580 create one file for each month (or each newsgroup, accordant to the
581 setting of B<--group-by>, see above), with filenames composed by adding
582 year and month (or newsgroup names) to the I<filename template>, for
583 example with B<--filetemplate> I<stats>:
589 B<--nocomments> is enforced, see above.
591 =item B<--groupsdb> I<database table>
593 Override I<DBTableGrps> from F<newsstats.conf>.
603 Show number of postings per group for lasth month in I<pretty> format:
607 Show that report for January of 2010 and de.alt.* plus de.test,
608 including display of hierarchy levels:
610 groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums
612 Only show newsgroups with 30 postings or less last month, ordered
613 by number of postings, descending, in I<pretty> format:
615 groupstats --upper 30 --order-by postings-desc
617 Show the total of all postings for the year of 2010 for all groups that
618 had 30 postings or less in every single month in that year, ordered by
619 number of postings in descending order:
621 groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc
623 The same for the average number of postings in the year of 2010:
625 groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc
627 List number of postings per group for eacht month of 2010 and redirect
628 output to one file for each month, namend stats-2010-01 and so on, in
629 machine-readable form (without formatting):
631 groupstats -m 2010-01:2010-12 -f dump --filetemplate stats
638 =item F<groupstats.pl>
642 =item F<NewsStats.pm>
644 Library functions for the NewsStats package.
646 =item F<newsstats.conf>
648 Runtime configuration file.
654 Please report any bugs or feature requests to the author or use the
655 bug tracker at L<http://bugs.th-h.de/>!
675 This script is part of the B<NewsStats> package.
679 Thomas Hochstein <thh@inter.net>
681 =head1 COPYRIGHT AND LICENSE
683 Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
685 This program is free software; you may redistribute it and/or modify it
686 under the same terms as Perl itself.