groupstats.pl: Redo level (-l) query, gain speedup.
[usenet/newsstats.git] / groupstats.pl
... / ...
CommitLineData
1#! /usr/bin/perl -W
2#
3# groupstats.pl
4#
5# This script will get statistical data on newgroup usage
6# form a database.
7#
8# It is part of the NewsStats package.
9#
10# Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
11#
12# It can be redistributed and/or modified under the same terms under
13# which Perl itself is published.
14
15BEGIN {
16 our $VERSION = "0.01";
17 use File::Basename;
18 push(@INC, dirname($0));
19}
20use strict;
21
22use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper);
23
24use DBI;
25
26################################# Main program #################################
27
28### read commandline options
29my %Options = &ReadOptions('m:p:n:o:t:l:b:iscqdg:');
30
31### read configuration
32my %Conf = %{ReadConfig('newsstats.conf')};
33
34### override configuration via commandline options
35my %ConfOverride;
36$ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'};
37&OverrideConfig(\%Conf,\%ConfOverride);
38
39### check for incompatible command line options
40# you can't mix '-t', '-b' and '-l'
41# -b/-l take preference over -t, and -b takes preference over -l
42if ($Options{'b'} or $Options{'l'}) {
43 if ($Options{'t'}) {
44 # drop -t
45 warn ("$MySelf: W: You cannot combine thresholds (-t) and top lists (-b) or levels (-l). Threshold '-t $Options{'t'}' was ignored.\n");
46 undef($Options{'t'});
47 };
48 if ($Options{'b'} and $Options{'l'}) {
49 # drop -l
50 warn ("$MySelf: W: You cannot combine top lists (-b) and levels (-l). Level '-l $Options{'l'}' was ignored.\n");
51 undef($Options{'l'});
52 };
53 # -q/-d don't work with -b or -l
54 warn ("$MySelf: W: Sorting by number of postings (-q) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'q'};
55 warn ("$MySelf: W: Reverse sorting (-d) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'d'};
56};
57
58### check output type
59# default output type to 'dump'
60$Options{'o'} = 'dump' if !$Options{'o'};
61# fail if more than one newsgroup is combined with 'dumpgroup' type
62die ("$MySelf: E: You cannot combine newsgroup lists (-n) with more than one group with '-o dumpgroup'!\n") if ($Options{'o'} eq 'dumpgroup' and defined($Options{'n'}) and $Options{'n'} =~ /:|\*/);
63# accept 'dumpgroup' only with -n
64if ($Options{'o'} eq 'dumpgroup' and !defined($Options{'n'})) {
65 $Options{'o'} = 'dump';
66 warn ("$MySelf: W: You must submit exactly one newsgroup ('-n news.group') for '-o dumpgroup'. Output type was set to 'dump'.\n");
67};
68# set output type to 'pretty' for -l
69if ($Options{'l'}) {
70 $Options{'o'} = 'pretty';
71 warn ("$MySelf: W: Output type forced to '-o pretty' due to usage of '-l'.\n");
72};
73
74### get time period
75my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
76# reset to one month for 'dump' output type
77if ($Options{'o'} eq 'dump' and $Options{'p'}) {
78 warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o pretty'.\n");
79 $Options{'o'} = 'pretty';
80};
81
82### init database
83my $DBHandle = InitDB(\%Conf,1);
84
85### create report
86# get list of newsgroups (-n)
87my ($QueryGroupList,$QueryThreshold,@GroupList,@Params);
88my $Newsgroups = $Options{'n'};
89if ($Newsgroups) {
90 # explode list of newsgroups for WHERE clause
91 ($QueryGroupList,@GroupList) = &SQLGroupList($Newsgroups);
92} else {
93 # set to dummy value (always true)
94 $QueryGroupList = 1;
95};
96
97# manage thresholds
98if (defined($Options{'t'})) {
99 if ($Options{'i'}) {
100 # -i: list groups below threshold
101 $QueryThreshold .= ' postings < ?';
102 } else {
103 # default: list groups above threshold
104 $QueryThreshold .= ' postings > ?';
105 };
106 # push threshold to Params
107 push @Params,$Options{'t'};
108} else {
109 # set to dummy value (always true)
110 $QueryThreshold = 1;
111}
112
113# construct WHERE clause
114# $QueryGroupList is "list of newsgroup" (or 1),
115# $QueryThreshold is threshold definition (or 1),
116# &SQLHierarchies() takes care of the exclusion of hierarchy levels (.ALL)
117# according to setting of -s
118my $WhereClause = sprintf('month BETWEEN ? AND ? AND %s AND %s %s',$QueryGroupList,$QueryThreshold,&SQLHierarchies($Options{'s'}));
119
120# get length of longest newsgroup delivered by query for formatting purposes
121# FIXME
122my $MaxLength = &GetMaxLenght($DBHandle,$Conf{'DBTableGrps'},'newsgroup',$WhereClause,$StartMonth,$EndMonth,(@GroupList,@Params));
123
124my ($OrderClause,$DBQuery);
125# -b (best of / top list) defined?
126if (!defined($Options{'b'}) and !defined($Options{'l'})) {
127 # default: neither -b nor -l
128 # set ordering (ORDER BY) to "newsgroups" or "postings", "ASC" or "DESC"
129 # according to -q and -d
130 $OrderClause = 'newsgroup';
131 $OrderClause = 'postings' if $Options{'q'};
132 $OrderClause .= ' DESC' if $Options{'d'};
133 # prepare query: get number of postings per group from groups table for given months and newsgroups
134 $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE %s ORDER BY month,%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause));
135} elsif ($Options{'b'}) {
136 # -b is set (then -l can't be!)
137 # set sorting order (-i): top or flop list?
138 if ($Options{'i'}) {
139 $OrderClause = 'postings';
140 } else {
141 $OrderClause = 'postings DESC';
142 };
143 # set -b to 10 if < 1 (Top 10)
144 $Options{'b'} = 10 if $Options{'b'} !~ /^\d*$/ or $Options{'b'} < 1;
145 # push LIMIT to Params
146 push @Params,$Options{'b'};
147 # prepare query: get sum of postings per group from groups table for given months and newsgroups with LIMIT
148 $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup,SUM(postings) AS postings FROM %s.%s WHERE %s GROUP BY newsgroup ORDER BY %s,newsgroup LIMIT ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause));
149} else {
150 # -l must be set now, as all other cases have been taken care of
151 # which kind of level (-i): more than -l x or less than -l x?
152 my ($Level);
153 if ($Options{'i'}) {
154 $Level = '<';
155 } else {
156 $Level = '>';
157 };
158 # prepare and execute query: get list of newsgroups meeting level condition
159 $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup FROM %s.%s WHERE %s GROUP BY newsgroup HAVING MAX(postings) %s ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$Level));
160 $DBQuery->execute($StartMonth,$EndMonth,@GroupList,$Options{'l'})
161 or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
162 # add newsgroups to a comma-seperated list ready for IN(...) query
163 my $GroupList;
164 while (my ($Newsgroup) = $DBQuery->fetchrow_array) {
165 $GroupList .= ',' if (defined($GroupList) and $GroupList ne '');
166 $GroupList .= "'$Newsgroup'";
167 };
168 $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE newsgroup IN (%s) AND %s ORDER BY newsgroup,month",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$GroupList,$WhereClause));
169};
170
171# execute query
172$DBQuery->execute($StartMonth,$EndMonth,@GroupList,@Params)
173 or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr);
174
175# output results
176# print caption (-c) with time period if -m or -p is set
177# FIXME - month or period should handled differently
178printf ("----- Report from %s to %s\n",$StartMonth,$EndMonth) if $Options{'c'} and ($Options{'m'} or $Options{'p'});
179# print caption (-c) with newsgroup list if -n is set
180printf ("----- Newsgroups: %s\n",join(',',split(/:/,$Newsgroups))) if $Options{'c'} and $Options{'n'};
181# print caption (-c) with threshold if -t is set, taking -i in account
182printf ("----- Threshold: %s %u\n",$Options{'i'} ? '<' : '>',$Options{'t'}) if $Options{'c'} and $Options{'t'};
183if (!defined($Options{'b'}) and !defined($Options{'l'})) {
184 # default: neither -b nor -l
185 &OutputData($Options{'o'},$DBQuery,$MaxLength);
186} elsif ($Options{'b'}) {
187 # -b is set (then -l can't be!)
188 # we have to read in the query results ourselves, as they do not have standard layout
189 while (my ($Newsgroup,$Postings) = $DBQuery->fetchrow_array) {
190 # we just assign "top x" or "bottom x" instead of a month for the caption
191 # FIXME
192 print &FormatOutput($Options{'o'}, ($Options{'i'} ? 'Bottom ' : 'Top ').$Options{'b'}, $Newsgroup, $Postings, $MaxLength);
193 };
194} else {
195 # -l must be set now, as all other cases have been taken care of
196 # we have to read in the query results ourselves, as they do not have standard layout
197 while (my ($Month,$Newsgroup,$Postings) = $DBQuery->fetchrow_array) {
198 # we just switch $Newsgroups and $Month for output generation
199 # FIXME
200 print &FormatOutput($Options{'o'}, $Newsgroup, $Month, $Postings, 7);
201 };
202};
203
204### close handles
205$DBHandle->disconnect;
206
207__END__
208
209################################ Documentation #################################
210
211=head1 NAME
212
213groupstats - create reports on newsgroup usage
214
215=head1 SYNOPSIS
216
217B<groupstats> [B<-Vhiscqd>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-n> I<newsgroup(s)>] [B<-t> I<threshold>] [B<-l> I<level>] [B<-b> I<number>] [B<-o> I<output type>] [B<-g> I<database table>]
218
219=head1 REQUIREMENTS
220
221See doc/README: Perl 5.8.x itself and the following modules from CPAN:
222
223=over 2
224
225=item -
226
227Config::Auto
228
229=item -
230
231DBI
232
233=back
234
235=head1 DESCRIPTION
236
237This script create reports on newsgroup usage (number of postings per
238group per month) taken from result tables created by
239F<gatherstats.pl>.
240
241The time period to act on defaults to last month; you can assign
242another month via the B<-m> switch or a time period via the B<-p>
243switch; the latter takes preference.
244
245B<groupstats> will process all newsgroups by default; you can limit
246that to only some newsgroups by supplying a list of those groups via
247B<-n> (see below). You can include hierarchy levels in the output by
248adding the B<-s> switch (see below).
249
250Furthermore you can set a threshold via B<-t> so that only newsgroups
251with more postings per month will be included in the report. You can
252invert that by the B<-i> switch so only newsgroups with less than
253I<threshold> postings per month will be included.
254
255You can sort the output by number of postings per month instead of the
256default (alphabetical list of newsgroups) by using B<-q>; you can
257reverse the sorting order (from highest to lowest or in reversed
258alphabetical order) by using B<-d>.
259
260Furthermore, you can create a list of newsgroups that had consistently
261more (or less) than x postings per month during the whole report
262period by using B<-l> (together with B<i> as needed).
263
264Last but not least you can create a "best of" list of the top x
265newsgroups via B<-b> (or a "worst of" list by adding B<i>).
266
267By default, B<groupstats> will dump a very simple alphabetical list of
268newsgroups, one per line, followed by the number of postings in that
269month. This output format of course cannot sensibly be combined with
270time periods, so you can set the output format by using B<-o> (see
271below). Captions can be added by setting the B<-c> switch.
272
273=head2 Configuration
274
275F<groupstats.pl> will read its configuration from F<newsstats.conf>
276which should be present in the same directory via Config::Auto.
277
278See doc/INSTALL for an overview of possible configuration options.
279
280You can override configuration options via the B<-g> switch.
281
282=head1 OPTIONS
283
284=over 3
285
286=item B<-V> (version)
287
288Print out version and copyright information on B<yapfaq> and exit.
289
290=item B<-h> (help)
291
292Print this man page and exit.
293
294=item B<-m> I<YYYY-MM> (month)
295
296Set processing period to a month in YYYY-MM format. Ignored if B<-p>
297is set.
298
299=item B<-p> I<YYYY-MM:YYYY-MM> (period)
300
301Set processing period to a time period between two month, each in
302YYYY-MM format, separated by a colon. Overrides B<-m>.
303
304=item B<-n> I<newsgroup(s)> (newsgroups)
305
306Limit processing to a certain set of newsgroups. I<newsgroup(s)> can
307be a single newsgroup name (de.alt.test), a newsgroup hierarchy
308(de.alt.*) or a list of either of these, separated by colons, for
309example
310
311 de.test:de.alt.test:de.newusers.*
312
313=item B<-t> I<threshold> (threshold)
314
315Only include newsgroups with more than I<threshold> postings per
316month. Can be inverted by the B<-i> switch so that only newsgroups
317with less than I<threshold> postings will be included.
318
319This setting will be ignored if B<-l> or B<-b> is set.
320
321=item B<-l> I<level> (level)
322
323Only include newsgroups with more than I<level> postings per
324month, every month during the whole reporting period. Can be inverted
325by the B<-i> switch so that only newsgroups with less than I<level>
326postings every single month will be included. Output will be ordered
327by newsgroup name, followed by month.
328
329This setting will be ignored if B<-b> is set. Overrides B<-t> and
330can't be used together with B<-q> or B<-d>.
331
332=item B<-b> I<n> (best of)
333
334Create a list of the I<n> newsgroups with the most postings over the
335whole reporting period. Can be inverted by the B<-i> switch so that a
336list of the I<n> newsgroups with the least postings over the whole
337period is generated. Output will be ordered by sum of postings.
338
339Overrides B<-t> and B<-l> and can't be used together with B<-q> or
340B<-d>. Output format is set to I<pretty> (see below).
341
342=item B<-i> (invert)
343
344Used in conjunction with B<-t>, B<-l> or B<-b> to set a lower
345threshold or level or generate a "bottom list" instead of a top list.
346
347=item B<-s> (sum per hierarchy level)
348
349Include "virtual" groups for every hierarchy level in output, for
350example:
351
352 de.alt.ALL 10
353 de.alt.test 5
354 de.alt.admin 7
355
356See the B<gatherstats> man page for details.
357
358=item B<-o> I<output type> (output format)
359
360Set output format. Default is I<dump>, consisting of an alphabetical
361list of newsgroups, each on a new line, followed by the number of
362postings in that month. This default format can't be used with time
363periods of more than one month.
364
365I<list> format is like I<dump>, but will print the month in front of
366the newsgroup name.
367
368I<dumpgroup> format can only be use with a group list (see B<-n>) of
369exactly one newsgroup and is like I<dump>, but will output months,
370followed by the number of postings.
371
372If you don't need easily parsable output, you'll mostly use I<pretty>
373format, which will print a header for each new month and try to align
374newsgroup names and posting counts. Usage of B<-b> will force this
375format.
376
377=item B<-c> (captions)
378
379Add captions to output (reporting period, newsgroups list, threshold).
380
381=item B<-q> (quantity of postings)
382
383Sort by number of postings instead of by newsgroup names.
384
385Cannot be used with B<-l> or B<-b>.
386
387=item B<-d> (descending)
388
389Change sort order to descending.
390
391Cannot be used with B<-l> or B<-b>.
392
393=item B<-g> I<table> (postings per group table)
394
395Override I<DBTableGrps> from F<newsstats.conf>.
396
397=back
398
399=head1 INSTALLATION
400
401See doc/INSTALL.
402
403=head1 EXAMPLES
404
405Show number of postings per group for lasth month in I<dump> format:
406
407 groupstats
408
409Show that report for January of 2010 and de.alt.* plus de.test,
410including display of hierarchy levels:
411
412 groupstats -m 2010-01 -n de.alt.*:de.test -s
413
414Show that report for the year of 2010 in I<pretty> format:
415
416 groupstats -p 2010-01:2010-12 -o pretty
417
418Only show newsgroups with less than 30 postings last month, ordered
419by number of postings, descending, in I<pretty> format:
420
421 groupstats -iqdt 30 -o pretty
422
423Show top 10 for the first half-year of of 2010 in I<pretty> format:
424
425 groupstats -p 2010-01:2010-06 -b 10 -o pretty
426
427Report all groups that had less than 30 postings every singele month
428in the year of 2010 (I<pretty> format is forced)
429
430 groupstats -p 2010-01:2010-12 -il 30
431
432=head1 FILES
433
434=over 4
435
436=item F<groupstats.pl>
437
438The script itself.
439
440=item F<NewsStats.pm>
441
442Library functions for the NewsStats package.
443
444=item F<newsstats.conf>
445
446Runtime configuration file for B<yapfaq>.
447
448=back
449
450=head1 BUGS
451
452Please report any bugs or feature requests to the author or use the
453bug tracker at L<http://bugs.th-h.de/>!
454
455=head1 SEE ALSO
456
457=over 2
458
459=item -
460
461doc/README
462
463=item -
464
465doc/INSTALL
466
467=item -
468
469gatherstats -h
470
471=back
472
473This script is part of the B<NewsStats> package.
474
475=head1 AUTHOR
476
477Thomas Hochstein <thh@inter.net>
478
479=head1 COPYRIGHT AND LICENSE
480
481Copyright (c) 2010 Thomas Hochstein <thh@inter.net>
482
483This program is free software; you may redistribute it and/or modify it
484under the same terms as Perl itself.
485
486=cut
This page took 0.011804 seconds and 4 git commands to generate.