5 # This script will get statistical data on newgroup usage
8 # It is part of the NewsStats package.
10 # Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
12 # It can be redistributed and/or modified under the same terms under
13 # which Perl itself is published.
16 our $VERSION = "0.01";
18 push(@INC, dirname($0));
22 use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper);
26 ################################# Main program #################################
28 ### read commandline options
29 my %Options = &ReadOptions('m:p:an:o:t:l:b:iscqdg:');
31 ### read configuration
32 my %Conf = %{ReadConfig('newsstats.conf')};
34 ### override configuration via commandline options
36 $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'};
37 &OverrideConfig(\%Conf,\%ConfOverride);
39 ### check for incompatible command line options
40 # you can't mix '-t', '-b' and '-l'
41 # -b/-l take preference over -t, and -b takes preference over -l
42 if ($Options{'b'} or $Options{'l'}) {
45 warn ("$MySelf: W: You cannot combine thresholds (-t) and top lists (-b) or levels (-l). Threshold '-t $Options{'t'}' was ignored.\n");
48 if ($Options{'b'} and $Options{'l'}) {
50 warn ("$MySelf: W: You cannot combine top lists (-b) and levels (-l). Level '-l $Options{'l'}' was ignored.\n");
53 # -q/-d don't work with -b or -l
54 warn ("$MySelf: W: Sorting by number of postings (-q) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'q'};
55 warn ("$MySelf: W: Reverse sorting (-d) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'d'};
59 # default output type to 'dump'
60 $Options{'o'} = 'dump' if !$Options{'o'};
61 # fail if more than one newsgroup is combined with 'dumpgroup' type
62 die ("$MySelf: E: You cannot combine newsgroup lists (-n) with more than one group with '-o dumpgroup'!\n") if ($Options{'o'} eq 'dumpgroup' and defined($Options{'n'}) and $Options{'n'} =~ /:|\*/);
63 # accept 'dumpgroup' only with -n
64 if ($Options{'o'} eq 'dumpgroup' and !defined($Options{'n'})) {
65 $Options{'o'} = 'dump';
66 warn ("$MySelf: W: You must submit exactly one newsgroup ('-n news.group') for '-o dumpgroup'. Output type was set to 'dump'.\n");
68 # set output type to 'pretty' for -l
70 $Options{'o'} = 'pretty';
71 warn ("$MySelf: W: Output type forced to '-o pretty' due to usage of '-l'.\n");
75 my $DBHandle = InitDB(\%Conf,1);
78 my ($StartMonth,$EndMonth);
79 # if '-a' is set, set start/end month from database
80 # FIXME - it doesn't make that much sense to get first/last month from database to query it
81 # with a time period that equals no time period ...
85 my $DBQuery = $DBHandle->prepare(sprintf("SELECT MIN(month),MAX(month) FROM %s.%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
86 $DBQuery->execute or die sprintf("$MySelf: E: Can't get MIN/MAX month from %s.%s: %s\n",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
87 ($StartMonth,$EndMonth) = $DBQuery->fetchrow_array;
89 ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
91 # if time period is more than one month: set output type to '-o pretty' or '-o dumpgroup'
92 if ($Options{'o'} eq 'dump' and ($Options{'p'} or $Options{'a'})) {
93 if (defined($Options{'n'}) and $Options{'n'} !~ /:|\*/) {
94 warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o dumpgroup'.\n");
95 $Options{'o'} = 'dumpgroup';
97 warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o pretty'.\n");
98 $Options{'o'} = 'pretty';
103 # get list of newsgroups (-n)
104 my ($QueryGroupList,$QueryThreshold,@GroupList,@Params);
105 my $Newsgroups = $Options{'n'};
107 # explode list of newsgroups for WHERE clause
108 ($QueryGroupList,@GroupList) = &SQLGroupList($Newsgroups);
110 # set to dummy value (always true)
115 if (defined($Options{'t'})) {
117 # -i: list groups below threshold
118 $QueryThreshold .= ' postings < ?';
120 # default: list groups above threshold
121 $QueryThreshold .= ' postings > ?';
123 # push threshold to Params
124 push @Params,$Options{'t'};
126 # set to dummy value (always true)
130 # construct WHERE clause
131 # $QueryGroupList is "list of newsgroup" (or 1),
132 # $QueryThreshold is threshold definition (or 1),
133 # &SQLHierarchies() takes care of the exclusion of hierarchy levels (.ALL)
134 # according to setting of -s
135 my $WhereClause = sprintf('month BETWEEN ? AND ? AND %s AND %s %s',$QueryGroupList,$QueryThreshold,&SQLHierarchies($Options{'s'}));
137 # get length of longest newsgroup delivered by query for formatting purposes
139 my $MaxLength = &GetMaxLenght($DBHandle,$Conf{'DBTableGrps'},'newsgroup',$WhereClause,$StartMonth,$EndMonth,(@GroupList,@Params));
141 my ($OrderClause,$DBQuery);
142 # -b (best of / top list) defined?
143 if (!defined($Options{'b'}) and !defined($Options{'l'})) {
144 # default: neither -b nor -l
145 # set ordering (ORDER BY) to "newsgroups" or "postings", "ASC" or "DESC"
146 # according to -q and -d
147 $OrderClause = 'newsgroup';
148 $OrderClause = 'postings' if $Options{'q'};
149 $OrderClause .= ' DESC' if $Options{'d'};
150 # prepare query: get number of postings per group from groups table for given months and newsgroups
151 $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE %s ORDER BY month,%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause));
152 } elsif ($Options{'b'}) {
153 # -b is set (then -l can't be!)
154 # set sorting order (-i): top or flop list?
156 $OrderClause = 'postings';
158 $OrderClause = 'postings DESC';
160 # set -b to 10 if < 1 (Top 10)
161 $Options{'b'} = 10 if $Options{'b'} !~ /^\d*$/ or $Options{'b'} < 1;
162 # push LIMIT to Params
163 push @Params,$Options{'b'};
164 # prepare query: get sum of postings per group from groups table for given months and newsgroups with LIMIT
165 $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup,SUM(postings) AS postings FROM %s.%s WHERE %s GROUP BY newsgroup ORDER BY %s,newsgroup LIMIT ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause));
167 # -l must be set now, as all other cases have been taken care of
168 # which kind of level (-i): more than -l x or less than -l x?
175 # prepare and execute query: get list of newsgroups meeting level condition
176 $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup FROM %s.%s WHERE %s GROUP BY newsgroup HAVING MAX(postings) %s ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$Level));
177 $DBQuery->execute($StartMonth,$EndMonth,@GroupList,$Options{'l'})
178 or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
179 # add newsgroups to a comma-seperated list ready for IN(...) query
181 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
182 $GroupList .= ',' if (defined($GroupList) and $GroupList ne '');
183 $GroupList .= "'$Newsgroup'";
185 $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE newsgroup IN (%s) AND %s ORDER BY newsgroup,month",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$GroupList,$WhereClause));
189 $DBQuery->execute($StartMonth,$EndMonth,@GroupList,@Params)
190 or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
193 # print caption (-c) with time period if -m or -p is set
196 printf ("----- Report from %s to %s\n",$StartMonth,$EndMonth);
197 } elsif ($Options{'m'}) {
198 printf ("----- Report for %s\n",$StartMonth);
201 # print caption (-c) with newsgroup list if -n is set
202 printf ("----- Newsgroups: %s\n",join(',',split(/:/,$Newsgroups))) if $Options{'c'} and $Options{'n'};
203 # print caption (-c) with threshold if -t is set, taking -i in account
204 printf ("----- Threshold: %s %u\n",$Options{'i'} ? '<' : '>',$Options{'t'}) if $Options{'c'} and $Options{'t'};
205 if (!defined($Options{'b'}) and !defined($Options{'l'})) {
206 # default: neither -b nor -l
207 &OutputData($Options{'o'},$DBQuery,$MaxLength);
208 } elsif ($Options{'b'}) {
209 # -b is set (then -l can't be!)
210 # we have to read in the query results ourselves, as they do not have standard layout
211 while (my ($Newsgroup,$Postings) = $DBQuery->fetchrow_array) {
212 # we just assign "top x" or "bottom x" instead of a month for the caption and force an output type of pretty
213 print &FormatOutput('pretty', ($Options{'i'} ? 'Bottom ' : 'Top ').$Options{'b'}, $Newsgroup, $Postings, $MaxLength);
216 # -l must be set now, as all other cases have been taken care of
217 # print caption (-c) with level, taking -i in account
218 printf ("----- Newsgroups with %s than %u postings over the whole time period\n",$Options{'i'} ? 'less' : 'more',$Options{'l'}) if $Options{'c'};
219 # we have to read in the query results ourselves, as they do not have standard layout
220 while (my ($Month,$Newsgroup,$Postings) = $DBQuery->fetchrow_array) {
221 # we just switch $Newsgroups and $Month for output generation
222 print &FormatOutput($Options{'o'}, $Newsgroup, $Month, $Postings, 7);
227 $DBHandle->disconnect;
231 ################################ Documentation #################################
235 groupstats - create reports on newsgroup usage
239 B<groupstats> [B<-Vhiscqd>] [B<-m> I<YYYY-MM> | B<-p> I<YYYY-MM:YYYY-MM> | B<-a>] [B<-n> I<newsgroup(s)>] [B<-t> I<threshold>] [B<-l> I<level>] [B<-b> I<number>] [B<-o> I<output type>] [B<-g> I<database table>]
243 See doc/README: Perl 5.8.x itself and the following modules from CPAN:
259 This script create reports on newsgroup usage (number of postings per
260 group per month) taken from result tables created by
263 The time period to act on defaults to last month; you can assign
264 another month via the B<-m> switch or a time period via the B<-p>
265 switch; the latter takes preference.
267 B<groupstats> will process all newsgroups by default; you can limit
268 that to only some newsgroups by supplying a list of those groups via
269 B<-n> (see below). You can include hierarchy levels in the output by
270 adding the B<-s> switch (see below).
272 Furthermore you can set a threshold via B<-t> so that only newsgroups
273 with more postings per month will be included in the report. You can
274 invert that by the B<-i> switch so only newsgroups with less than
275 I<threshold> postings per month will be included.
277 You can sort the output by number of postings per month instead of the
278 default (alphabetical list of newsgroups) by using B<-q>; you can
279 reverse the sorting order (from highest to lowest or in reversed
280 alphabetical order) by using B<-d>.
282 Furthermore, you can create a list of newsgroups that had consistently
283 more (or less) than x postings per month during the whole report
284 period by using B<-l> (together with B<i> as needed).
286 Last but not least you can create a "best of" list of the top x
287 newsgroups via B<-b> (or a "worst of" list by adding B<i>).
289 By default, B<groupstats> will dump a very simple alphabetical list of
290 newsgroups, one per line, followed by the number of postings in that
291 month. This output format of course cannot sensibly be combined with
292 time periods, so you can set the output format by using B<-o> (see
293 below). Captions can be added by setting the B<-c> switch.
297 B<groupstats> will read its configuration from F<newsstats.conf>
298 which should be present in the same directory via Config::Auto.
300 See doc/INSTALL for an overview of possible configuration options.
302 You can override configuration options via the B<-g> switch.
308 =item B<-V> (version)
310 Print out version and copyright information on B<yapfaq> and exit.
314 Print this man page and exit.
316 =item B<-m> I<YYYY-MM> (month)
318 Set processing period to a month in YYYY-MM format. Ignored if B<-p>
321 =item B<-p> I<YYYY-MM:YYYY-MM> (period)
323 Set processing period to a time period between two month, each in
324 YYYY-MM format, separated by a colon. Overrides B<-m>. Ignored if
329 Set no processing period (process whole database). Overrides B<-m>
332 =item B<-n> I<newsgroup(s)> (newsgroups)
334 Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
335 be a single newsgroup name (de.alt.test), a newsgroup hierarchy
336 (de.alt.*) or a list of either of these, separated by colons, for
339 de.test:de.alt.test:de.newusers.*
341 =item B<-t> I<threshold> (threshold)
343 Only include newsgroups with more than I<threshold> postings per
344 month. Can be inverted by the B<-i> switch so that only newsgroups
345 with less than I<threshold> postings will be included.
347 This setting will be ignored if B<-l> or B<-b> is set.
349 =item B<-l> I<level> (level)
351 Only include newsgroups with more than I<level> postings per
352 month, every month during the whole reporting period. Can be inverted
353 by the B<-i> switch so that only newsgroups with less than I<level>
354 postings every single month will be included. Output will be ordered
355 by newsgroup name, followed by month.
357 This setting will be ignored if B<-b> is set. Overrides B<-t> and
358 can't be used together with B<-q> or B<-d>.
360 =item B<-b> I<n> (best of)
362 Create a list of the I<n> newsgroups with the most postings over the
363 whole reporting period. Can be inverted by the B<-i> switch so that a
364 list of the I<n> newsgroups with the least postings over the whole
365 period is generated. Output will be ordered by sum of postings.
367 Overrides B<-t> and B<-l> and can't be used together with B<-q> or
368 B<-d>. Output format is set to I<pretty> (see below).
372 Used in conjunction with B<-t>, B<-l> or B<-b> to set a lower
373 threshold or level or generate a "bottom list" instead of a top list.
375 =item B<-s> (sum per hierarchy level)
377 Include "virtual" groups for every hierarchy level in output, for
384 See the B<gatherstats> man page for details.
386 =item B<-o> I<output type> (output format)
388 Set output format. Default is I<dump>, consisting of an alphabetical
389 list of newsgroups, each on a new line, followed by the number of
390 postings in that month. This default format can't be used with time
391 periods of more than one month.
393 I<list> format is like I<dump>, but will print the month in front of
396 I<dumpgroup> format can only be use with a group list (see B<-n>) of
397 exactly one newsgroup and is like I<dump>, but will output months,
398 followed by the number of postings.
400 If you don't need easily parsable output, you'll mostly use I<pretty>
401 format, which will print a header for each new month and try to align
402 newsgroup names and posting counts. Usage of B<-b> will force this
405 =item B<-c> (captions)
407 Add captions to output (reporting period, newsgroups list, threshold).
409 =item B<-q> (quantity of postings)
411 Sort by number of postings instead of by newsgroup names.
413 Cannot be used with B<-l> or B<-b>.
415 =item B<-d> (descending)
417 Change sort order to descending.
419 Cannot be used with B<-l> or B<-b>.
421 =item B<-g> I<table> (postings per group table)
423 Override I<DBTableGrps> from F<newsstats.conf>.
433 Show number of postings per group for lasth month in I<dump> format:
437 Show that report for January of 2010 and de.alt.* plus de.test,
438 including display of hierarchy levels:
440 groupstats -m 2010-01 -n de.alt.*:de.test -s
442 Show that report for the year of 2010 in I<pretty> format:
444 groupstats -p 2010-01:2010-12 -o pretty
446 Only show newsgroups with less than 30 postings last month, ordered
447 by number of postings, descending, in I<pretty> format:
449 groupstats -iqdt 30 -o pretty
451 Show top 10 for the first half-year of of 2010 in I<pretty> format:
453 groupstats -p 2010-01:2010-06 -b 10 -o pretty
455 Report all groups that had less than 30 postings every singele month
456 in the year of 2010 (I<pretty> format is forced)
458 groupstats -p 2010-01:2010-12 -il 30
464 =item F<groupstats.pl>
468 =item F<NewsStats.pm>
470 Library functions for the NewsStats package.
472 =item F<newsstats.conf>
474 Runtime configuration file for B<yapfaq>.
480 Please report any bugs or feature requests to the author or use the
481 bug tracker at L<http://bugs.th-h.de/>!
501 This script is part of the B<NewsStats> package.
505 Thomas Hochstein <thh@inter.net>
507 =head1 COPYRIGHT AND LICENSE
509 Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
511 This program is free software; you may redistribute it and/or modify it
512 under the same terms as Perl itself.