3 # Library functions for the NewsStats package.
5 # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
7 # This module can be redistributed and/or modified under the same terms under
8 # which Perl itself is published.
14 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
46 %EXPORT_TAGS = ( TimePeriods => [qw(GetTimePeriod LastMonth SplitPeriod
48 Output => [qw(OutputData FormatOutput)],
49 SQLHelper => [qw(SQLHierarchies SQLSortOrder SQLGroupList
50 SQLSetBounds SQLBuildClause GetMaxLength)]);
52 our $PackageVersion = '0.01';
60 #####-------------------------------- Vars --------------------------------#####
64 our $HomePath = dirname($0);
67 our $MyVersion = "$0 $::VERSION (NewsStats.pm $VERSION)";
69 #####------------------------------- Basics -------------------------------#####
71 ################################################################################
73 ################################################################################
75 ################################################################################
76 ### display version and exit
77 print "NewsStats v$PackageVersion\n$MyVersion\n";
78 print "Copyright (c) 2010-2012 Thomas Hochstein <thh\@inter.net>\n";
79 print "This program is free software; you may redistribute it ".
80 "and/or modify it under the same terms as Perl itself.\n";
83 ################################################################################
85 ################################################################################
87 ################################################################################
88 ### feed myself to perldoc and exit
89 exec('perldoc', $FullPath);
92 ################################################################################
94 ################################################################################
96 ################################################################################
97 ### read config via Config::Auto
98 ### IN : $ConfFile: config filename
99 ### OUT: reference to a hash containing the configuration
101 return Config::Auto::parse($ConfFile, format => 'equal');
103 ################################################################################
105 ################################################################################
107 ################################################################################
108 ### override configuration values
109 ### IN : $ConfigR : reference to configuration hash
110 ### $OverrideR: reference to a hash containing overrides
111 my ($ConfigR,$OverrideR) = @_;
112 my %Override = %$OverrideR;
114 &Bleat(1,"Empty configuration hash passed to OverrideConfig()")
115 if ( keys %$ConfigR < 1);
116 # return if no overrides
117 return if (keys %Override < 1 or keys %$ConfigR < 1);
118 foreach my $Key (keys %Override) {
119 $$ConfigR{$Key} = $Override{$Key};
122 ################################################################################
124 ################################################################################
126 ################################################################################
127 ### initialise database connection
128 ### IN : $ConfigR: reference to configuration hash
129 ### $Die : if TRUE, die if connection fails
131 my ($ConfigR,$Die) = @_;
132 my %Conf = %$ConfigR;
133 my $DBHandle = DBI->connect(sprintf('DBI:%s:database=%s;host=%s',
134 $Conf{'DBDriver'},$Conf{'DBDatabase'},
135 $Conf{'DBHost'}), $Conf{'DBUser'},
136 $Conf{'DBPw'}, { PrintError => 0 });
138 &Bleat(2,$DBI::errstr) if (defined($Die) and $Die);
139 &Bleat(1,$DBI::errstr);
143 ################################################################################
145 ################################################################################
147 ################################################################################
148 ### print warning or error messages and terminate in case of error
149 ### IN : $Level : 1 = warning, 2 = error
150 ### $Message: warning or error message
151 my ($Level,$Message) = @_;
153 warn "$0 W: $Message\n"
154 } elsif ($Level == 2) {
155 die "$0 E: $Message\n"
157 print "$0: $Message\n"
160 ################################################################################
162 #####------------------------------ GetStats ------------------------------#####
164 ################################################################################
166 ################################################################################
167 ### explode a (scalar) list of newsgroup names to a list of newsgroup and
168 ### hierarchy names where every newsgroup and hierarchy appears only once:
169 ### de.alt.test,de.alt.admin -> de.ALL, de.alt.ALL, de.alt.test, de.alt.admin
170 ### IN : $Newsgroups : a list of newsgroups (content of Newsgroups: header)
171 ### $TLH : top level hierarchy (all other newsgroups are ignored)
172 ### $ValidGroupsR: reference to a hash containing all valid newsgroups
174 ### OUT: %Newsgroups : hash containing all newsgroup and hierarchy names as keys
175 my ($Newsgroups,$TLH,$ValidGroupsR) = @_;
176 my %ValidGroups = %{$ValidGroupsR} if $ValidGroupsR;
179 # remove whitespace from contents of Newsgroups:
180 $Newsgroups =~ s/\s//;
181 # call &HierarchyCount for each newsgroup in $Newsgroups:
182 for (split /,/, $Newsgroups) {
183 # don't count newsgroup/hierarchy in wrong TLH
184 next if($TLH and !/^$TLH/);
185 # don't count invalid newsgroups
186 if(%ValidGroups and !defined($ValidGroups{$_})) {
187 &Bleat(1,sprintf("DROPPED: %s",$_));
190 # add original newsgroup to %Newsgroups
192 # add all hierarchy elements to %Newsgroups, amended by '.ALL',
193 # i.e. de.alt.ALL and de.ALL
194 foreach (ParseHierarchies($_)) {
195 $Newsgroups{$_.'.ALL'} = 1;
201 ################################################################################
202 sub ParseHierarchies {
203 ################################################################################
204 ### return a list of all hierarchy levels a newsgroup belongs to
205 ### (for de.alt.test.moderated that would be de/de.alt/de.alt.test)
206 ### IN : $Newsgroup : a newsgroup name
207 ### OUT: @Hierarchies: array containing all hierarchies the newsgroup belongs to
208 my ($Newsgroup) = @_;
210 # strip trailing dots
211 $Newsgroup =~ s/(.+)\.+$/$1/;
212 # butcher newsgroup name by "." and add each hierarchy to @Hierarchies
213 # i.e. de.alt.test: "de.alt" and "de"
214 while ($Newsgroup =~ /\./) {
215 $Newsgroup =~ s/^((?:\.?[^.]+)*)\.[^.]+$/$1/;
216 push @Hierarchies, $Newsgroup;
221 ################################################################################
223 ################################################################################
224 ### read a list of valid newsgroups from file (each group on one line,
225 ### ignoring everything after the first whitespace and so accepting files
226 ### in checkgroups format as well as (parts of) an INN active file)
227 ### IN : $Filename : file to read
228 ### OUT: \%ValidGroups: hash containing all valid newsgroups
231 open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!");
235 $ValidGroups{$_} = '1';
238 return \%ValidGroups;
241 ################################################################################
243 #####----------------------------- TimePeriods ----------------------------#####
245 ################################################################################
247 ################################################################################
248 ### get a time period to act on from --month option;
249 ### if empty, default to last month
250 ### IN : $Month: may be empty, 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'all'
251 ### OUT: $Verbal,$SQL: verbal description and WHERE-clause
252 ### of the chosen time period
254 # define result variables
256 # define a regular expression for a month
257 my $REMonth = '\d{4}-\d{2}';
259 # default to last month if option is not set
264 # check for valid input
265 if ($Month =~ /^$REMonth$/) {
266 # single month (YYYY-MM)
267 ($Month) = &CheckMonth($Month);
269 $SQL = sprintf("month = '%s'",$Month);
270 } elsif ($Month =~ /^$REMonth:$REMonth$/) {
271 # time period (YYYY-MM:YYYY-MM)
272 $Verbal = sprintf('%s to %s',&SplitPeriod($Month));
273 $SQL = sprintf("month BETWEEN '%s' AND '%s'",&SplitPeriod($Month));
274 } elsif ($Month =~ /^all$/i) {
276 $Verbal = 'all time';
280 return (undef,undef);
283 return ($Verbal,$SQL);
286 ################################################################################
288 ################################################################################
289 ### get last month from todays date in YYYY-MM format
290 ### OUT: last month as YYYY-MM
292 my (undef,undef,undef,undef,$Month,$Year,undef,undef,undef) = localtime(time);
293 # $Month is already defined from 0 to 11, so no need to decrease it by 1
300 return sprintf('%4d-%02d',$Year,$Month);
303 ################################################################################
305 ################################################################################
306 ### check if input (in YYYY-MM form) is valid with MM between 01 and 12;
307 ### otherwise, fix it
308 ### IN : @Month: array of month
309 ### OUT: @Month: a valid month
311 foreach my $Month (@Month) {
312 my ($OldMonth) = $Month;
313 my ($CalMonth) = substr ($Month, -2);
314 if ($CalMonth < 1 or $CalMonth > 12) {
315 $CalMonth = '12' if $CalMonth > 12;
316 $CalMonth = '01' if $CalMonth < 1;
317 substr($Month, -2) = $CalMonth;
318 &Bleat(1,sprintf("'%s' is an invalid date (MM must be between '01' ".
319 "and '12'), set to '%s'.",$OldMonth,$Month));
325 ################################################################################
327 ################################################################################
328 ### split a time period denoted by YYYY-MM:YYYY-MM into start and end month
329 ### IN : $Period: time period
330 ### OUT: $StartMonth, $EndMonth
332 my ($StartMonth, $EndMonth) = split /:/, $Period;
333 ($StartMonth,$EndMonth) = CheckMonth($StartMonth,$EndMonth);
334 # switch parameters as necessary
335 if ($EndMonth gt $StartMonth) {
336 return ($StartMonth, $EndMonth);
338 return ($EndMonth, $StartMonth);
342 ################################################################################
344 ################################################################################
345 ### return a list of months (YYYY-MM) between start and end month
346 ### IN : $MonthExpression ('YYYY-MM' or 'YYYY-MM to YYYY-MM')
347 ### OUT: @Months: array containing all months from $MonthExpression enumerated
348 my ($MonthExpression )= @_;
349 # return if single month
350 return ($MonthExpression) if ($MonthExpression =~ /^\d{4}-\d{2}$/);
351 # parse $MonthExpression
352 my ($StartMonth, $EndMonth) = split(' to ',$MonthExpression);
353 # set $Year, $Month from $StartMonth
354 my ($Year, $Month) = split /-/, $StartMonth;
357 until ("$Year-$Month" gt $EndMonth) {
358 push @Months, "$Year-$Month";
359 $Month = "$Month"; # force string context
369 #####---------------------------- OutputFormats ---------------------------#####
371 ################################################################################
373 ################################################################################
374 ### read database query results from DBHandle and print results with formatting
375 ### IN : $Format : format specifier
376 ### $Comments : print or suppress all comments for machine-readable output
377 ### $GroupBy : primary sorting order (month or key)
378 ### $Precision: number of digits right of decimal point (0 or 2)
379 ### $ValidKeys: reference to a hash containing all valid keys
380 ### $FileTempl: file name template (--filetemplate): filetempl-YYYY-MM
381 ### $DBQuery : database query handle with executed query,
382 ### containing $Month, $Key, $Value
383 ### $PadGroup : padding length for key field (optional) for 'pretty'
384 my ($Format, $Comments, $GroupBy, $Precision, $ValidKeys, $FileTempl,
385 $DBQuery, $PadGroup) = @_;
386 my %ValidKeys = %{$ValidKeys} if $ValidKeys;
387 my ($FileName, $Handle, $OUT);
390 # define output types
392 @LegalOutput{('dump',,'list','pretty')} = ();
393 # bail out if format is unknown
394 &Bleat(2,"Unknown output type '$Format'!") if !exists($LegalOutput{$Format});
396 while (my ($Month, $Key, $Value) = $DBQuery->fetchrow_array) {
397 # don't display invalid keys
398 if(%ValidKeys and !defined($ValidKeys{$Key})) {
400 # &Bleat(1,sprintf("DROPPED: %s",$Key));
403 # care for correct sorting order and abstract from month and keys:
404 # $Caption will be $Month or $Key, according to sorting order,
405 # and $Key will be $Key or $Month, respectively
407 if ($GroupBy eq 'key') {
413 # set output file handle
415 $Handle = *STDOUT{IO}; # set $Handle to a reference to STDOUT
416 } elsif (!defined($LastIteration) or $LastIteration ne $Caption) {
417 close $OUT if ($LastIteration);
418 # safeguards for filename creation:
419 # replace potential problem characters with '_'
420 $FileName = sprintf('%s-%s',$FileTempl,$Caption);
421 $FileName =~ s/[^a-zA-Z0-9_-]+/_/g;
422 open ($OUT,">$FileName")
423 or &Bleat(2,sprintf("Cannot open output file '%s': $!",
427 print $Handle &FormatOutput($Format, $Comments, $Caption, $Key, $Value,
428 $Precision, $PadGroup);
429 $LastIteration = $Caption;
431 close $OUT if ($FileTempl);
434 ################################################################################
436 ################################################################################
437 ### format information for output according to format specifier
438 ### IN : $Format : format specifier
439 ### $Comments : print or suppress all comments for machine-readable output
440 ### $Caption : month (as YYYY-MM) or $Key, according to sorting order
441 ### $Key : newsgroup, client, ... or $Month, as above
442 ### $Value : number of postings with that attribute
443 ### $Precision: number of digits right of decimal point (0 or 2)
444 ### $PadGroup : padding length for key field (optional) for 'pretty'
445 ### OUT: $Output: formatted output
446 my ($Format, $Comments, $Caption, $Key, $Value, $Precision, $PadGroup) = @_;
448 # keep last caption in mind
449 our ($LastIteration);
450 # create one line of output
451 if ($Format eq 'dump') {
452 # output as dump (key value)
453 $Output = sprintf ("# %s:\n",$Caption)
454 if ($Comments and (!defined($LastIteration) or $Caption ne $LastIteration));
455 $Output .= sprintf ("%s %u\n",$Key,$Value);
456 } elsif ($Format eq 'list') {
457 # output as list (caption key value)
458 $Output = sprintf ("%s %s %u\n",$Caption,$Key,$Value);
459 } elsif ($Format eq 'pretty') {
461 $Output = sprintf ("# ----- %s:\n",$Caption)
462 if ($Comments and (!defined($LastIteration) or $Caption ne $LastIteration));
463 $Output .= sprintf ($PadGroup ? sprintf("%%-%us %%10.*f\n",$PadGroup) :
464 "%s %.*f\n",$Key,$Precision,$Value);
469 #####------------------------- QueryModifications -------------------------#####
471 ################################################################################
473 ################################################################################
474 ### add exclusion of hierarchy levels (de.alt.ALL) from SQL query by
475 ### amending the WHERE clause if $ShowHierarchies is false (or don't, if it is
476 ### true, accordingly)
477 ### IN : $ShowHierarchies: boolean value
479 my ($ShowHierarchies) = @_;
480 return $ShowHierarchies ? '' : "newsgroup NOT LIKE '%.ALL'";
483 ################################################################################
485 ################################################################################
486 ### get length of longest field in future query result
487 ### IN : $DBHandle : database handel
488 ### $Table : table to query
489 ### $Field : field to check
490 ### $WhereClause : WHERE clause
491 ### $HavingClause: HAVING clause
492 ### @BindVars : bind variables for WHERE clause
493 ### OUT: $Length: length of longest instnace of $Field
494 my ($DBHandle,$Table,$Field,$WhereClause,$HavingClause,@BindVars) = @_;
495 my $DBQuery = $DBHandle->prepare(sprintf("SELECT MAX(LENGTH(%s)) ".
496 "FROM %s %s %s",$Field,$Table,
497 $WhereClause,$HavingClause ?
498 'GROUP BY newsgroup' . $HavingClause .
499 ' ORDER BY LENGTH(newsgroup) '.
500 'DESC LIMIT 1': ''));
501 $DBQuery->execute(@BindVars) or &Bleat(1,sprintf("Can't get field length ".
502 "for '%s' from table '%s': ".
503 "$DBI::errstr",$Field,$Table));
504 my ($Length) = $DBQuery->fetchrow_array;
508 ################################################################################
510 ################################################################################
511 ### build a SQL 'ORDER BY' clause from $OptGroupBy (primary sorting) and
512 ### $OptOrderBy (secondary sorting), both ascending or descending;
513 ### descending sorting order is done by adding '-desc'
514 ### IN : $GroupBy: primary sort by 'month' (default) or 'newsgroups'
515 ### $OrderBy: secondary sort by month/newsgroups (default)
516 ### or number of 'postings'
517 ### OUT: a SQL ORDER BY clause
518 my ($GroupBy,$OrderBy) = @_;
519 my ($GroupSort,$OrderSort) = ('','');
520 # $GroupBy (primary sorting)
524 ($GroupBy, $GroupSort) = SQLParseOrder($GroupBy);
525 if ($GroupBy =~ /group/i) {
526 $GroupBy = 'newsgroup';
531 my $Secondary = ($GroupBy eq 'month') ? 'newsgroup' : 'month';
532 # $OrderBy (secondary sorting)
534 $OrderBy = $Secondary;
536 ($OrderBy, $OrderSort) = SQLParseOrder($OrderBy);
537 if ($OrderBy =~ /posting/i) {
538 $OrderBy = "postings $OrderSort, $Secondary";
540 $OrderBy = "$Secondary $OrderSort";
543 return ($GroupBy,&SQLBuildClause('order',"$GroupBy $GroupSort",$OrderBy));
546 ################################################################################
548 ################################################################################
549 ### parse $OptGroupBy or $OptOrderBy option of the form param[-desc], e.g.
550 ### 'month', 'month-desc', 'newsgroups-desc', but also just 'desc'
551 ### IN : $OrderOption: order option (see above)
552 ### OUT: parameter to sort by,
553 ### sort order ('DESC' or nothing, meaning 'ASC')
554 my ($OrderOption) = @_;
556 if ($OrderOption =~ s/-?desc$//i) {
559 $OrderOption =~ s/-?asc$//i
561 return ($OrderOption,$SortOrder);
564 ################################################################################
566 ################################################################################
567 ### explode list of newsgroups separated by : (with wildcards)
568 ### to a SQL 'WHERE' expression
569 ### IN : $Newsgroups: list of newsgroups (group.one.*:group.two:group.three.*)
570 ### OUT: SQL code to become part of a 'WHERE' clause,
571 ### list of newsgroups for SQL bindings
572 my ($Newsgroups) = @_;
573 # substitute '*' wildcard with SQL wildcard character '%'
574 $Newsgroups =~ s/\*/%/g;
575 # just one newsgroup?
576 return (SQLGroupWildcard($Newsgroups),$Newsgroups) if $Newsgroups !~ /:/;
577 # list of newsgroups separated by ':'
579 my @GroupList = split /:/, $Newsgroups;
580 foreach (@GroupList) {
581 $SQL .= ' OR ' if $SQL gt '(';
582 $SQL .= SQLGroupWildcard($_);
585 return ($SQL,@GroupList);
588 ################################################################################
589 sub SQLGroupWildcard {
590 ################################################################################
591 ### build a valid SQL 'WHERE' expression with or without wildcards
592 ### IN : $Newsgroup: newsgroup expression, probably with wildcard
593 ### (group.name or group.name.%)
594 ### OUT: SQL code to become part of a 'WHERE' clause
595 my ($Newsgroup) = @_;
596 # FIXME: check for validity
597 if ($Newsgroup !~ /%/) {
598 return 'newsgroup = ?';
600 return 'newsgroup LIKE ?';
604 ################################################################################
606 ################################################################################
607 ### set upper and/or lower boundary (number of postings)
608 ### IN : $Type: 'level', 'average', 'sum' or 'default'
609 ### $LowBound,$UppBound: lower/upper boundary, respectively
610 ### OUT: SQL code to become part of a WHERE or HAVING clause
611 my ($Type,$LowBound,$UppBound) = @_;
612 ($LowBound,$UppBound) = SQLCheckNumber($LowBound,$UppBound);
613 if($LowBound and $UppBound and $LowBound > $UppBound) {
614 &Bleat(1,"Lower boundary $LowBound is larger than Upper boundary ".
615 "$UppBound, exchanging boundaries.");
616 ($LowBound,$UppBound) = ($UppBound,$LowBound);
618 # default to 'default'
619 my $WhereHavingFunction = 'postings';
620 # set $LowBound to SQL statement:
621 # 'WHERE postings >=', 'HAVING MIN(postings) >=' or 'HAVING AVG(postings) >='
622 if ($Type eq 'level') {
623 $WhereHavingFunction = 'MIN(postings)'
624 } elsif ($Type eq 'average') {
625 $WhereHavingFunction = 'AVG(postings)'
626 } elsif ($Type eq 'sum') {
627 $WhereHavingFunction = 'SUM(postings)'
629 $LowBound = sprintf('%s >= '.$LowBound,$WhereHavingFunction) if ($LowBound);
630 # set $LowBound to SQL statement:
631 # 'WHERE postings <=', 'HAVING MAX(postings) <=' or 'HAVING AVG(postings) <='
632 if ($Type eq 'level') {
633 $WhereHavingFunction = 'MAX(postings)'
634 } elsif ($Type eq 'average') {
635 $WhereHavingFunction = 'AVG(postings)'
636 } elsif ($Type eq 'sum') {
637 $WhereHavingFunction = 'SUM(postings)'
639 $UppBound = sprintf('%s <= '.$UppBound,$WhereHavingFunction) if ($UppBound);
640 return ($LowBound,$UppBound);
643 ################################################################################
645 ################################################################################
646 ### check if input is a valid positive integer; otherwise, make it one
647 ### IN : @Numbers: array of parameters
648 ### OUT: @Numbers: a valid positive integer
650 foreach my $Number (@Numbers) {
651 if ($Number and $Number < 0) {
652 &Bleat(1,"Boundary $Number is < 0, set to ".-$Number);
655 $Number = '' if ($Number and $Number !~ /^\d+$/);
660 ################################################################################
662 ################################################################################
663 ### build a valid SQL WHERE, GROUP BY, ORDER BY or HAVING clause
664 ### from multiple expressions which *may* be empty
665 ### IN : $Type: 'where', 'having', 'group' or 'order'
666 ### @Expressions: array of expressions
667 ### OUT: $SQLClause: a SQL clause
668 my ($Type,@Expressions) = @_;
669 my ($SQLClause,$Separator,$Statement);
670 # set separator ('AND' or ',')
671 if ($Type eq 'where' or $Type eq 'having') {
677 if ($Type eq 'where') {
678 $Statement = 'WHERE';
679 } elsif ($Type eq 'order') {
680 $Statement = 'ORDER BY';
681 } elsif ($Type eq 'having') {
682 $Statement = 'HAVING';
684 $Statement = 'GROUP BY';
686 # build query from expressions with separators
687 foreach my $Expression (@Expressions) {
689 $SQLClause .= " $Separator " if ($SQLClause);
690 $SQLClause .= $Expression;
693 # add statement in front if not already present
694 $SQLClause = " $Statement " . $SQLClause
695 if ($SQLClause and $SQLClause !~ /$Statement/);
700 #####------------------------------- done ---------------------------------#####