Commit | Line | Data |
---|---|---|
2832c235 TH |
1 | #! /usr/bin/perl -W |
2 | # | |
3 | # groupstats.pl | |
4 | # | |
5 | # This script will get statistical data on newsgroup usage | |
6 | # from a database. | |
7 | # | |
8 | # It is part of the NewsStats package. | |
9 | # | |
10 | # Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
11 | # | |
12 | # It can be redistributed and/or modified under the same terms under | |
13 | # which Perl itself is published. | |
14 | ||
15 | BEGIN { | |
16 | our $VERSION = "0.01"; | |
17 | use File::Basename; | |
18 | push(@INC, dirname($0)); | |
19 | } | |
20 | use strict; | |
21 | ||
22 | use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper); | |
23 | ||
24 | use DBI; | |
25 | ||
26 | ################################# Main program ################################# | |
27 | ||
28 | ### read commandline options | |
29 | my %Options = &ReadOptions('m:p:n:o:t:l:b:iscqdg:'); | |
30 | ||
31 | ### read configuration | |
32 | my %Conf = %{ReadConfig('newsstats.conf')}; | |
33 | ||
34 | ### override configuration via commandline options | |
35 | my %ConfOverride; | |
36 | $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; | |
37 | &OverrideConfig(\%Conf,\%ConfOverride); | |
38 | ||
39 | ### check for incompatible command line options | |
40 | # you can't mix '-t', '-b' and '-l' | |
41 | # -b/-l take preference over -t, and -b takes preference over -l | |
42 | if ($Options{'b'} or $Options{'l'}) { | |
43 | if ($Options{'t'}) { | |
44 | # drop -t | |
45 | warn ("$MySelf: W: You cannot combine thresholds (-t) and top lists (-b) or levels (-l). Threshold '-t $Options{'t'}' was ignored.\n"); | |
46 | undef($Options{'t'}); | |
47 | }; | |
48 | if ($Options{'b'} and $Options{'l'}) { | |
49 | # drop -l | |
50 | warn ("$MySelf: W: You cannot combine top lists (-b) and levels (-l). Level '-l $Options{'l'}' was ignored.\n"); | |
51 | undef($Options{'l'}); | |
52 | }; | |
53 | # -q/-d don't work with -b or -l | |
54 | warn ("$MySelf: W: Sorting by number of postings (-q) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'q'}; | |
55 | warn ("$MySelf: W: Reverse sorting (-d) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'d'}; | |
56 | }; | |
57 | ||
58 | ### check output type | |
59 | # default output type to 'dump' | |
60 | $Options{'o'} = 'dump' if !$Options{'o'}; | |
61 | # fail if more than one newsgroup is combined with 'dumpgroup' type | |
62 | die ("$MySelf: E: You cannot combine newsgroup lists (-n) with more than one group with '-o dumpgroup'!\n") if ($Options{'o'} eq 'dumpgroup' and defined($Options{'n'}) and $Options{'n'} =~ /:|\*/); | |
63 | # accept 'dumpgroup' only with -n | |
64 | if ($Options{'o'} eq 'dumpgroup' and !defined($Options{'n'})) { | |
65 | $Options{'o'} = 'dump'; | |
66 | warn ("$MySelf: W: You must submit exactly one newsgroup ('-n news.group') for '-o dumpgroup'. Output type was set to 'dump'.\n"); | |
67 | }; | |
68 | # set output type to 'pretty' for -l | |
69 | if ($Options{'l'}) { | |
70 | $Options{'o'} = 'pretty'; | |
71 | warn ("$MySelf: W: Output type forced to '-o pretty' due to usage of '-l'.\n"); | |
72 | }; | |
73 | ||
74 | ### get time period | |
75 | my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); | |
76 | # 'dump' output type cannot be combined with a time period (-p): switch to 'pretty' | |
77 | if ($Options{'o'} eq 'dump' and $Options{'p'}) { | |
d8695b1c TH |
78 | warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o pretty'.\n"); |
79 | $Options{'o'} = 'pretty'; | |
2832c235 TH |
80 | }; |
81 | ||
82 | ### init database | |
83 | my $DBHandle = InitDB(\%Conf,1); | |
84 | ||
85 | ### create report | |
86 | # get list of newsgroups (-n) | |
87 | my ($QueryPart,@GroupList); | |
88 | my $Newsgroups = $Options{'n'}; | |
89 | if ($Newsgroups) { | |
90 | # explode list of newsgroups for WHERE clause | |
91 | ($QueryPart,@GroupList) = &SQLGroupList($Newsgroups); | |
92 | } else { | |
93 | # set to dummy value (always true) | |
94 | $QueryPart = 1; | |
95 | }; | |
96 | ||
97 | # manage thresholds | |
98 | if (defined($Options{'t'})) { | |
99 | if ($Options{'i'}) { | |
100 | # -i: list groups below threshold | |
101 | $QueryPart .= ' AND postings < ?'; | |
102 | } else { | |
103 | # default: list groups above threshold | |
104 | $QueryPart .= ' AND postings > ?'; | |
105 | }; | |
106 | # push threshold to GroupList to match number of binding vars for DBQuery->execute | |
107 | push @GroupList,$Options{'t'}; | |
108 | } | |
109 | ||
110 | # construct WHERE clause | |
111 | # $QueryPart is "list of newsgroups" (or 1), | |
112 | # &SQLHierarchies() takes care of the exclusion of hierarchy levels (.ALL) | |
113 | # according to setting of -s | |
114 | my $WhereClause = sprintf('month BETWEEN ? AND ? AND %s %s',$QueryPart,&SQLHierarchies($Options{'s'})); | |
115 | ||
116 | # get length of longest newsgroup delivered by query for formatting purposes | |
117 | # FIXME | |
118 | my $MaxLength = &GetMaxLenght($DBHandle,$Conf{'DBTableGrps'},'newsgroup',$WhereClause,$StartMonth,$EndMonth,@GroupList); | |
119 | ||
120 | my ($OrderClause,$DBQuery); | |
121 | # -b (best of / top list) or -l (level) defined? | |
122 | if (!defined($Options{'b'}) and !defined($Options{'l'})) { | |
123 | # default: neither -b nor -l | |
124 | # set ordering (ORDER BY) to "newsgroup" or "postings", "ASC" or "DESC" | |
125 | # according to -q and -d | |
126 | $OrderClause = 'newsgroup'; | |
127 | $OrderClause = 'postings' if $Options{'q'}; | |
128 | $OrderClause .= ' DESC' if $Options{'d'}; | |
129 | # prepare query: get number of postings per group from groups table for given months and newsgroups | |
130 | $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE %s ORDER BY month,%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause)); | |
131 | } elsif ($Options{'b'}) { | |
132 | # -b is set (then -l can't be!) | |
133 | # set sorting order (-i) | |
134 | if ($Options{'i'}) { | |
135 | $OrderClause = 'postings'; | |
136 | } else { | |
137 | $OrderClause = 'postings DESC'; | |
138 | }; | |
d8695b1c TH |
139 | # set -b to 10 (Top 10) if it is not a plain number or is < 1 |
140 | $Options{'b'} = 10 if $Options{'b'} !~ /^\d*$/ or $Options{'b'} < 1; | |
2832c235 TH |
141 | # push LIMIT to GroupList to match number of binding vars for DBQuery->execute |
142 | push @GroupList,$Options{'b'}; | |
143 | # prepare query: get sum of postings per group from groups table for given months and newsgroups with LIMIT | |
144 | $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup,SUM(postings) AS postings FROM %s.%s WHERE %s GROUP BY newsgroup ORDER BY %s,newsgroup LIMIT ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause)); | |
145 | } else { | |
146 | # -l must be set now, as all other cases have been taken care of | |
147 | # set comparison operator for the HAVING clause (-i); $OrderClause is reused to hold it | |
148 | if ($Options{'i'}) { | |
149 | $OrderClause = '<'; | |
150 | } else { | |
151 | $OrderClause = '>'; | |
152 | }; | |
153 | # push level and $StartMonth,$EndMonth - again - to GroupList to match number of binding vars for DBQuery->execute | |
154 | # FIXME -- together with the query (see below) | |
155 | push @GroupList,$Options{'l'}; | |
156 | push @GroupList,$StartMonth,$EndMonth; | |
157 | # prepare query: get number of postings per group from groups table for given months and those newsgroups selected by the level subquery | |
158 | # FIXME -- this query is ... in dire need of improvement. NOTE(review): 'HAVING MAX(postings) > ?' matches groups with at least ONE month above the level; the documented "every month" behaviour would presumably need MIN(postings) for '>' -- confirm. Also, $WhereClause is used twice, but threshold/group bind values are only supplied once via @GroupList -- verify with -n. | |
159 | $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE newsgroup IN (SELECT newsgroup FROM %s.%s WHERE %s GROUP BY newsgroup HAVING MAX(postings) %s ?) AND %s ORDER BY newsgroup,month",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause,$WhereClause)); | |
160 | }; | |
161 | ||
162 | # execute query | |
163 | $DBQuery->execute($StartMonth,$EndMonth,@GroupList) | |
164 | or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr); | |
165 | ||
166 | # output results | |
167 | # print caption (-c) with time period if -m or -p is set | |
168 | # FIXME - month or period should be handled differently | |
169 | printf ("----- Report from %s to %s\n",$StartMonth,$EndMonth) if $Options{'c'} and ($Options{'m'} or $Options{'p'}); | |
170 | # print caption (-c) with newsgroup list if -n is set | |
171 | printf ("----- Newsgroups: %s\n",join(',',split(/:/,$Newsgroups))) if $Options{'c'} and $Options{'n'}; | |
172 | # print caption (-c) with threshold if -t is set, taking -i into account | |
173 | printf ("----- Threshold: %s %u\n",$Options{'i'} ? '<' : '>',$Options{'t'}) if $Options{'c'} and $Options{'t'}; | |
174 | if (!defined($Options{'b'}) and !defined($Options{'l'})) { | |
175 | # default: neither -b nor -l | |
176 | &OutputData($Options{'o'},$DBQuery,$MaxLength); | |
177 | } elsif ($Options{'b'}) { | |
178 | # -b is set (then -l can't be!) | |
179 | # we have to read in the query results ourselves, as they do not have standard layout | |
180 | while (my ($Newsgroup,$Postings) = $DBQuery->fetchrow_array) { | |
181 | # we just assign "top x" or "bottom x" instead of a month for the caption | |
182 | # FIXME | |
183 | print &FormatOutput($Options{'o'}, ($Options{'i'} ? 'Bottom ' : 'Top ').$Options{'b'}, $Newsgroup, $Postings, $MaxLength); | |
184 | }; | |
185 | } else { | |
186 | # -l must be set now, as all other cases have been taken care of | |
187 | # we have to read in the query results ourselves, as they do not have standard layout | |
188 | while (my ($Month,$Newsgroup,$Postings) = $DBQuery->fetchrow_array) { | |
189 | # we just switch $Newsgroup and $Month for output generation | |
190 | # FIXME | |
191 | print &FormatOutput($Options{'o'}, $Newsgroup, $Month, $Postings, 7); | |
192 | }; | |
193 | }; | |
194 | ||
195 | ### close handles | |
196 | $DBHandle->disconnect; | |
197 | ||
198 | __END__ | |
199 | ||
200 | ################################ Documentation ################################# | |
201 | ||
202 | =head1 NAME | |
203 | ||
204 | groupstats - create reports on newsgroup usage | |
205 | ||
206 | =head1 SYNOPSIS | |
207 | ||
208 | B<groupstats> [B<-Vhiscqd>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-n> I<newsgroup(s)>] [B<-t> I<threshold>] [B<-l> I<level>] [B<-b> I<number>] [B<-o> I<output type>] [B<-g> I<database table>] | |
209 | ||
210 | =head1 REQUIREMENTS | |
211 | ||
212 | See doc/README: Perl 5.8.x itself and the following modules from CPAN: | |
213 | ||
214 | =over 2 | |
215 | ||
216 | =item - | |
217 | ||
218 | Config::Auto | |
219 | ||
220 | =item - | |
221 | ||
222 | DBI | |
223 | ||
224 | =back | |
225 | ||
226 | =head1 DESCRIPTION | |
227 | ||
228 | This script creates reports on newsgroup usage (number of postings per | |
229 | group per month) taken from result tables created by | |
230 | F<gatherstats.pl>. | |
231 | ||
232 | The time period to act on defaults to last month; you can assign | |
233 | another month via the B<-m> switch or a time period via the B<-p> | |
234 | switch; the latter takes preference. | |
235 | ||
236 | B<groupstats> will process all newsgroups by default; you can limit | |
237 | that to only some newsgroups by supplying a list of those groups via | |
238 | B<-n> (see below). You can include hierarchy levels in the output by | |
239 | adding the B<-s> switch (see below). | |
240 | ||
241 | Furthermore you can set a threshold via B<-t> so that only newsgroups | |
242 | with more postings per month will be included in the report. You can | |
243 | invert that by the B<-i> switch so only newsgroups with less than | |
244 | I<threshold> postings per month will be included. | |
245 | ||
246 | You can sort the output by number of postings per month instead of the | |
247 | default (alphabetical list of newsgroups) by using B<-q>; you can | |
248 | reverse the sorting order (from highest to lowest or in reversed | |
249 | alphabetical order) by using B<-d>. | |
250 | ||
251 | Furthermore, you can create a list of newsgroups that had consistently | |
252 | more (or less) than x postings per month during the whole report | |
253 | period by using B<-l> (together with B<-i> as needed). | |
254 | ||
255 | Last but not least you can create a "best of" list of the top x | |
256 | newsgroups via B<-b> (or a "worst of" list by adding B<-i>). | |
257 | ||
258 | By default, B<groupstats> will dump a very simple alphabetical list of | |
259 | newsgroups, one per line, followed by the number of postings in that | |
260 | month. This output format of course cannot sensibly be combined with | |
261 | time periods, so you can set the output format by using B<-o> (see | |
262 | below). Captions can be added by setting the B<-c> switch. | |
263 | ||
264 | =head2 Configuration | |
265 | ||
266 | F<groupstats.pl> will read its configuration from F<newsstats.conf> | |
267 | which should be present in the same directory via Config::Auto. | |
268 | ||
269 | See doc/INSTALL for an overview of possible configuration options. | |
270 | ||
271 | You can override configuration options via the B<-g> switch. | |
272 | ||
273 | =head1 OPTIONS | |
274 | ||
275 | =over 3 | |
276 | ||
277 | =item B<-V> (version) | |
278 | ||
279 | Print out version and copyright information on B<groupstats> and exit. | |
280 | ||
281 | =item B<-h> (help) | |
282 | ||
283 | Print this man page and exit. | |
284 | ||
285 | =item B<-m> I<YYYY-MM> (month) | |
286 | ||
287 | Set processing period to a month in YYYY-MM format. Ignored if B<-p> | |
288 | is set. | |
289 | ||
290 | =item B<-p> I<YYYY-MM:YYYY-MM> (period) | |
291 | ||
292 | Set processing period to a time period between two months, each in | |
293 | YYYY-MM format, separated by a colon. Overrides B<-m>. | |
294 | ||
295 | =item B<-n> I<newsgroup(s)> (newsgroups) | |
296 | ||
297 | Limit processing to a certain set of newsgroups. I<newsgroup(s)> can | |
298 | be a single newsgroup name (de.alt.test), a newsgroup hierarchy | |
299 | (de.alt.*) or a list of either of these, separated by colons, for | |
300 | example | |
301 | ||
302 | de.test:de.alt.test:de.newusers.* | |
303 | ||
304 | =item B<-t> I<threshold> (threshold) | |
305 | ||
306 | Only include newsgroups with more than I<threshold> postings per | |
307 | month. Can be inverted by the B<-i> switch so that only newsgroups | |
308 | with less than I<threshold> postings will be included. | |
309 | ||
310 | This setting will be ignored if B<-l> or B<-b> is set. | |
311 | ||
312 | =item B<-l> I<level> (level) | |
313 | ||
314 | Only include newsgroups with more than I<level> postings per | |
315 | month, every month during the whole reporting period. Can be inverted | |
316 | by the B<-i> switch so that only newsgroups with less than I<level> | |
317 | postings every single month will be included. Output will be ordered | |
318 | by newsgroup name, followed by month. | |
319 | ||
320 | This setting will be ignored if B<-b> is set. Overrides B<-t> and | |
321 | can't be used together with B<-q> or B<-d>. | |
322 | ||
323 | =item B<-b> I<n> (best of) | |
324 | ||
325 | Create a list of the I<n> newsgroups with the most postings over the | |
326 | whole reporting period. Can be inverted by the B<-i> switch so that a | |
327 | list of the I<n> newsgroups with the least postings over the whole | |
328 | period is generated. Output will be ordered by sum of postings. | |
329 | ||
330 | Overrides B<-t> and B<-l> and can't be used together with B<-q> or | |
331 | B<-d>. Output format is not forced; combine with B<-o> I<pretty> (see below) as needed. | |
332 | ||
333 | =item B<-i> (invert) | |
334 | ||
335 | Used in conjunction with B<-t>, B<-l> or B<-b> to set a lower | |
336 | threshold or level or generate a "bottom list" instead of a top list. | |
337 | ||
338 | =item B<-s> (sum per hierarchy level) | |
339 | ||
340 | Include "virtual" groups for every hierarchy level in output, for | |
341 | example: | |
342 | ||
343 | de.alt.ALL 10 | |
344 | de.alt.test 5 | |
345 | de.alt.admin 7 | |
346 | ||
347 | See the B<gatherstats> man page for details. | |
348 | ||
349 | =item B<-o> I<output type> (output format) | |
350 | ||
351 | Set output format. Default is I<dump>, consisting of an alphabetical | |
352 | list of newsgroups, each on a new line, followed by the number of | |
353 | postings in that month. This default format can't be used with time | |
354 | periods of more than one month. | |
355 | ||
356 | I<list> format is like I<dump>, but will print the month in front of | |
357 | the newsgroup name. | |
358 | ||
359 | I<dumpgroup> format can only be used with a group list (see B<-n>) of | |
360 | exactly one newsgroup and is like I<dump>, but will output months, | |
361 | followed by the number of postings. | |
362 | ||
363 | If you don't need easily parsable output, you'll mostly use I<pretty> | |
364 | format, which will print a header for each new month and try to align | |
365 | newsgroup names and posting counts. Usage of B<-l> will force this | |
366 | format. | |
367 | ||
368 | =item B<-c> (captions) | |
369 | ||
370 | Add captions to output (reporting period, newsgroups list, threshold). | |
371 | ||
372 | =item B<-q> (quantity of postings) | |
373 | ||
374 | Sort by number of postings instead of by newsgroup names. | |
375 | ||
376 | Cannot be used with B<-l> or B<-b>. | |
377 | ||
378 | =item B<-d> (descending) | |
379 | ||
380 | Change sort order to descending. | |
381 | ||
382 | Cannot be used with B<-l> or B<-b>. | |
383 | ||
384 | =item B<-g> I<table> (postings per group table) | |
385 | ||
386 | Override I<DBTableGrps> from F<newsstats.conf>. | |
387 | ||
388 | =back | |
389 | ||
390 | =head1 INSTALLATION | |
391 | ||
392 | See doc/INSTALL. | |
393 | ||
394 | =head1 EXAMPLES | |
395 | ||
396 | Show number of postings per group for last month in I<dump> format: | |
397 | ||
398 | groupstats | |
399 | ||
400 | Show that report for January of 2010 and de.alt.* plus de.test, | |
401 | including display of hierarchy levels: | |
402 | ||
403 | groupstats -m 2010-01 -n de.alt.*:de.test -s | |
404 | ||
405 | Show that report for the year of 2010 in I<pretty> format: | |
406 | ||
407 | groupstats -p 2010-01:2010-12 -o pretty | |
408 | ||
409 | Only show newsgroups with less than 30 postings last month, ordered | |
410 | by number of postings, descending, in I<pretty> format: | |
411 | ||
412 | groupstats -iqdt 30 -o pretty | |
413 | ||
414 | Show top 10 for the first half-year of 2010 in I<pretty> format: | |
415 | ||
416 | groupstats -p 2010-01:2010-06 -b 10 -o pretty | |
417 | ||
418 | Report all groups that had less than 30 postings every single month | |
419 | in the year of 2010 (I<pretty> format is forced) | |
420 | ||
421 | groupstats -p 2010-01:2010-12 -il 30 | |
422 | ||
423 | =head1 FILES | |
424 | ||
425 | =over 4 | |
426 | ||
427 | =item F<groupstats.pl> | |
428 | ||
429 | The script itself. | |
430 | ||
431 | =item F<NewsStats.pm> | |
432 | ||
433 | Library functions for the NewsStats package. | |
434 | ||
435 | =item F<newsstats.conf> | |
436 | ||
437 | Runtime configuration file for the B<NewsStats> package. | |
438 | ||
439 | =back | |
440 | ||
441 | =head1 BUGS | |
442 | ||
443 | Please report any bugs or feature requests to the author or use the | |
444 | bug tracker at L<http://bugs.th-h.de/>! | |
445 | ||
446 | =head1 SEE ALSO | |
447 | ||
448 | =over 2 | |
449 | ||
450 | =item - | |
451 | ||
452 | doc/README | |
453 | ||
454 | =item - | |
455 | ||
456 | doc/INSTALL | |
457 | ||
458 | =item - | |
459 | ||
460 | gatherstats -h | |
461 | ||
462 | =back | |
463 | ||
464 | This script is part of the B<NewsStats> package. | |
465 | ||
466 | =head1 AUTHOR | |
467 | ||
468 | Thomas Hochstein <thh@inter.net> | |
469 | ||
470 | =head1 COPYRIGHT AND LICENSE | |
471 | ||
472 | Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
473 | ||
474 | This program is free software; you may redistribute it and/or modify it | |
475 | under the same terms as Perl itself. | |
476 | ||
477 | =cut |