-#! /usr/bin/perl -W\r
-#\r
-# gatherstats.pl\r
-#\r
-# This script will gather statistical information from a database\r
-# containing headers and other information from a INN feed.\r
-# \r
-# It is part of the NewsStats package.\r
-#\r
-# Copyright (c) 2010 Thomas Hochstein <thh@inter.net>\r
-#\r
-# It can be redistributed and/or modified under the same terms under \r
-# which Perl itself is published.\r
-\r
-BEGIN {\r
- our $VERSION = "0.01";\r
- use File::Basename;\r
- push(@INC, dirname($0));\r
-}\r
-use strict;\r
-\r
-use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups);\r
-\r
-use DBI;\r
-\r
-################################# Definitions ##################################\r
-\r
-# define types of information that can be gathered\r
-# all / groups (/ clients / hosts)\r
-my %LegalTypes;\r
-@LegalTypes{('all','groups')} = ();\r
-\r
-################################# Main program #################################\r
-\r
-### read commandline options\r
-my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:');\r
-\r
-### read configuration\r
-my %Conf = %{ReadConfig('newsstats.conf')};\r
-\r
-### override configuration via commandline options\r
-my %ConfOverride;\r
-$ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'};\r
-$ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'};\r
-$ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'};\r
-$ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'};\r
-$ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'};\r
-&OverrideConfig(\%Conf,\%ConfOverride);\r
-\r
-### get type of information to gather, default to 'all'\r
-$Options{'t'} = 'all' if !$Options{'t'};\r
-die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}});\r
-\r
-### get time period\r
-my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});\r
-\r
-### init database\r
-my $DBHandle = InitDB(\%Conf,1);\r
-\r
-### get data for each month\r
-warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'};\r
-foreach my $Month (&ListMonth($StartMonth,$EndMonth)) {\r
-\r
- print "---------- $Month ----------\n" if $Options{'d'};\r
-\r
- if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') {\r
- ### ----------------------------------------------\r
- ### get groups data (number of postings per group)\r
- # get groups data from raw table for given month\r
- my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));\r
- $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'});\r
-\r
- # count postings per group\r
- my %Postings;\r
-\r
- while (($_) = $DBQuery->fetchrow_array) {\r
- # get list oft newsgroups and hierarchies from Newsgroups:\r
- my %Newsgroups = ListNewsgroups($_);\r
- # count each newsgroup and hierarchy once\r
- foreach (sort keys %Newsgroups) {\r
- # don't count newsgroup/hierarchy in wrong TLH\r
- next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/);\r
- $Postings{$_}++;\r
- };\r
- };\r
-\r
- print "----- GroupStats -----\n" if $Options{'d'};\r
- foreach my $Newsgroup (sort keys %Postings) {\r
- print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};\r
- if (!$Options{'o'}) {\r
- # write to database\r
- $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}));\r
- $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'});\r
- $DBQuery->finish;\r
- };\r
- };\r
- };\r
-};\r
-\r
-### close handles\r
-$DBHandle->disconnect;\r
-\r
+#! /usr/bin/perl -W
+#
+# gatherstats.pl
+#
+# This script will gather statistical information from a database
+# containing headers and other information from a INN feed.
+#
+# It is part of the NewsStats package.
+#
+# Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
+#
+# It can be redistributed and/or modified under the same terms under
+# which Perl itself is published.
+
+BEGIN {
+ our $VERSION = "0.01";
+ use File::Basename;
+ push(@INC, dirname($0));
+}
+use strict;
+
+use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
+
+use DBI;
+use Getopt::Long qw(GetOptions);
+Getopt::Long::config ('bundling');
+
+################################# Definitions ##################################
+
+# define types of information that can be gathered
+# all / groups (/ clients / hosts)
+my %LegalStats;
+@LegalStats{('all','groups')} = ();
+
+################################# Main program #################################
+
+### read commandline options
+my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
+ $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest);
+GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
+ 'clientsdb=s' => \$OptClientsDB,
+ 'd|debug!' => \$OptDebug,
+ 'groupsdb=s' => \$OptGroupsDB,
+ 'hierarchy=s' => \$OptTLH,
+ 'hostsdb=s' => \$OptHostsDB,
+ 'm|month=s' => \$OptMonth,
+ 'rawdb=s' => \$OptRawDB,
+ 's|stats=s' => \$OptStatsType,
+ 't|test!' => \$OptTest,
+ 'h|help' => \&ShowPOD,
+ 'V|version' => \&ShowVersion) or exit 1;
+
+### read configuration
+my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')};
+
+### override configuration via commandline options
+my %ConfOverride;
+$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
+$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
+$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
+$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
+$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
+&OverrideConfig(\%Conf,\%ConfOverride);
+
+### get type of information to gather, defaulting to 'all'
+$OptStatsType = 'all' if !$OptStatsType;
+&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
+ if !exists($LegalStats{$OptStatsType});
+
+### get time period from --month
+# get verbal description of time period, drop SQL code
+my ($Period) = &GetTimePeriod($OptMonth);
+&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
+ "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
+
+### reformat $Conf{'TLH'}
+my $TLH;
+if ($Conf{'TLH'}) {
+ # $Conf{'TLH'} is parsed as an array by Config::Auto;
+ # make a flat list again, separated by :
+ if (ref($Conf{'TLH'}) eq 'ARRAY') {
+ $TLH = join(':',@{$Conf{'TLH'}});
+ } else {
+ $TLH = $Conf{'TLH'};
+ }
+ # strip whitespace
+ $TLH =~ s/\s//g;
+ # add trailing dots if none are present yet
+ # (using negative look-behind assertions)
+ $TLH =~ s/(?<!\.):/.:/g;
+ $TLH =~ s/(?<!\.)$/./;
+ # check for illegal characters
+ &Bleat(2,'Config error - illegal characters in TLH definition!')
+ if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
+ # escape dots
+ $TLH =~ s/\./\\./g;
+ if ($TLH =~ /:/) {
+ # reformat $TLH from a:b to (a)|(b),
+ # e.g. replace ':' by ')|('
+ $TLH =~ s/:/)|(/g;
+ $TLH = '(' . $TLH . ')';
+ };
+};
+
+# read list of newsgroups from --checkgroups
+# into a hash
+my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
+
+### init database
+my $DBHandle = InitDB(\%Conf,1);
+
+### get data for each month
+&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
+foreach my $Month (&ListMonth($Period)) {
+
+ print "---------- $Month ----------\n" if $OptDebug;
+
+ if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
+ ### ----------------------------------------------
+ ### get groups data (number of postings per group)
+ # get groups data from raw table for given month
+ my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
+ "WHERE day LIKE ? AND NOT disregard",
+ $Conf{'DBDatabase'},
+ $Conf{'DBTableRaw'}));
+ $DBQuery->execute($Month.'-%')
+ or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
+ "$DBI::errstr\n",$Month,
+ $Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
+
+ # count postings per group
+ my %Postings;
+ while (($_) = $DBQuery->fetchrow_array) {
+ # get list oft newsgroups and hierarchies from Newsgroups:
+ my %Newsgroups = ListNewsgroups($_,$TLH,
+ $OptCheckgroupsFile ? \%ValidGroups : '');
+ # count each newsgroup and hierarchy once
+ foreach (sort keys %Newsgroups) {
+ $Postings{$_}++;
+ };
+ };
+
+ # add valid but empty groups if --checkgroups is set
+ if (%ValidGroups) {
+ foreach (sort keys %ValidGroups) {
+ if (!defined($Postings{$_})) {
+ $Postings{$_} = 0 ;
+ warn (sprintf("ADDED: %s as empty group\n",$_));
+ }
+ };
+ };
+
+ # delete old data for that month
+ if (!$OptTest) {
+ $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",
+ $Conf{'DBDatabase'},$Conf{'DBTableGrps'}),
+ undef,$Month)
+ or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ".
+ "$DBI::errstr\n",$Month,
+ $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
+ };
+
+ print "----- GroupStats -----\n" if $OptDebug;
+ foreach my $Newsgroup (sort keys %Postings) {
+ print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug;
+ if (!$OptTest) {
+ # write to database
+ $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ".
+ "(month,newsgroup,postings) ".
+ "VALUES (?, ?, ?)",
+ $Conf{'DBDatabase'},
+ $Conf{'DBTableGrps'}));
+ $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
+ or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ".
+ "$DBI::errstr\n",$Month,$Newsgroup,
+ $Conf{'DBDatabase'},$Conf{'DBTableGrps'}));
+ $DBQuery->finish;
+ };
+ };
+ } else {
+ # other types of information go here - later on
+ };
+};
+
+### close handles
+$DBHandle->disconnect;
+
+__END__
+
+################################ Documentation #################################
+
+=head1 NAME
+
+gatherstats - process statistical data from a raw source
+
+=head1 SYNOPSIS
+
+B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats] [B<-c> I<checkgroups file>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
+
+=head1 REQUIREMENTS
+
+See L<doc/README>.
+
+=head1 DESCRIPTION
+
+This script will extract and process statistical information from a
+database table which is fed from F<feedlog.pl> for a given time period
+and write its results to (an)other database table(s). Entries marked
+with I<'disregard'> in the database will be ignored; currently, you
+have to set this flag yourself, using your database management tools.
+You can exclude erroneous entries that way (e.g. automatic reposts
+(think of cancels flood and resurrectors); spam; ...).
+
+The time period to act on defaults to last month; you can assign
+another time period or a single month via the B<--month> option (see
+below).
+
+By default B<gatherstats> will process all types of information; you
+can change that using the B<--stats> option and assigning the type of
+information to process. Currently that doesn't matter yet as only
+processing of the number of postings per group per month is
+implemented anyway.
+
+Possible information types include:
+
+=over 3
+
+=item B<groups> (postings per group per month)
+
+B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
+counted for each single group they appear in. Groups not in I<TLH>
+will be ignored.
+
+B<gatherstats> will also add up the number of postings for each
+hierarchy level, but only count each posting once. A posting to
+de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
+respectively. A crossposting to de.alt.test and de.alt.admin, on the
+other hand, will be counted for de.alt.test and de.alt.admin each, but
+only once for de.alt.ALL and de.ALL.
+
+Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
+override that default through the B<--groupsdb> option.
+
+=back
+
+=head2 Configuration
+
+B<gatherstats> will read its configuration from F<newsstats.conf>
+which should be present in the same directory via Config::Auto.
+
+See L<doc/INSTALL> for an overview of possible configuration options.
+
+You can override configuration options via the B<--hierarchy>,
+B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
+respectively.
+
+=head1 OPTIONS
+
+=over 3
+
+=item B<-V>, B<--version>
+
+Print out version and copyright information and exit.
+
+=item B<-h>, B<--help>
+
+Print this man page and exit.
+
+=item B<-d>, B<--debug>
+
+Output debugging information to STDOUT while processing (number of
+postings per group).
+
+=item B<-t>, B<--test>
+
+Do not write results to database. You should use B<--debug> in
+conjunction with B<--test> ... everything else seems a bit pointless.
+
+=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
+
+Set processing period to a single month in YYYY-MM format or to a time
+period between two month in YYYY-MM:YYYY-MM format (two month, separated
+by a colon).
+
+
+=item B<-s>, B<--stats> I<type>
+
+Set processing type to one of I<all> and I<groups>. Defaults to all
+(and is currently rather pointless as only I<groups> has been
+implemented).
+
+=item B<-c>, B<--checkgroups> I<filename>
+
+Check each group against a list of valid newsgroups read from
+I<filename>, one group on each line and ignoring everything after the
+first whitespace (so you can use a file in checkgroups format or (part
+of) your INN active file).
+
+Newsgroups not found in I<filename> will be dropped (and logged to
+STDERR), and newsgroups found in I<filename> but having no postings
+will be added with a count of 0 (and logged to STDERR).
+
+=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
+
+Override I<TLH> from F<newsstats.conf>.
+
+=item B<--rawdb> I<table> (raw data table)
+
+Override I<DBTableRaw> from F<newsstats.conf>.
+
+=item B<--groupsdb> I<table> (postings per group table)
+
+Override I<DBTableGrps> from F<newsstats.conf>.
+
+=item B<--clientsdb> I<table> (client data table)
+
+Override I<DBTableClnts> from F<newsstats.conf>.
+
+=item B<--hostsdb> I<table> (host data table)
+
+Override I<DBTableHosts> from F<newsstats.conf>.
+
+=back
+
+=head1 INSTALLATION
+
+See L<doc/INSTALL>.
+
+=head1 EXAMPLES
+
+Process all types of information for lasth month:
+
+ gatherstats
+
+Do a dry run, showing results of processing:
+
+ gatherstats --debug --test
+
+Process all types of information for January of 2010:
+
+ gatherstats --month 2010-01
+
+Process only number of postings for the year of 2010,
+checking against checkgroups-2010.txt:
+
+ gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
+
+=head1 FILES
+
+=over 4
+
+=item F<gatherstats.pl>
+
+The script itself.
+
+=item F<NewsStats.pm>
+
+Library functions for the NewsStats package.
+
+=item F<newsstats.conf>
+
+Runtime configuration file.
+
+=back
+
+=head1 BUGS
+
+Please report any bugs or feature requests to the author or use the
+bug tracker at L<http://bugs.th-h.de/>!
+
+=head1 SEE ALSO
+
+=over 2
+
+=item -
+
+L<doc/README>
+
+=item -
+
+L<doc/INSTALL>
+
+=back
+
+This script is part of the B<NewsStats> package.
+
+=head1 AUTHOR
+
+Thomas Hochstein <thh@inter.net>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net>
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut