X-Git-Url: https://code.th-h.de/?p=usenet%2Fnewsstats.git;a=blobdiff_plain;f=gatherstats.pl;h=f1fe7deef8deb57ab5e682358b2aee9cfe56622d;hp=09157d160f3570c430cb30914fce0463dffdc78e;hb=07c0b2589af779c33d5d35b6a7fa0e7883201674;hpb=741336c210429f37bebfd9882b9461e824320cd0 diff --git a/gatherstats.pl b/gatherstats.pl index 09157d1..f1fe7de 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -1,102 +1,417 @@ -#! /usr/bin/perl -W -# -# gatherstats.pl -# -# This script will gather statistical information from a database -# containing headers and other information from a INN feed. -# -# It is part of the NewsStats package. -# -# Copyright (c) 2010 Thomas Hochstein -# -# It can be redistributed and/or modified under the same terms under -# which Perl itself is published. - -BEGIN { - our $VERSION = "0.01"; - use File::Basename; - push(@INC, dirname($0)); -} -use strict; - -use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups); - -use DBI; - -################################# Definitions ################################## - -# define types of information that can be gathered -# all / groups (/ clients / hosts) -my %LegalTypes; -@LegalTypes{('all','groups')} = (); - -################################# Main program ################################# - -### read commandline options -my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:'); - -### read configuration -my %Conf = %{ReadConfig('newsstats.conf')}; - -### override configuration via commandline options -my %ConfOverride; -$ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'}; -$ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; -$ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'}; -$ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'}; -$ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'}; -&OverrideConfig(\%Conf,\%ConfOverride); - -### get type of information to gather, default to 'all' -$Options{'t'} = 'all' if !$Options{'t'}; -die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}}); - -### get time period -my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); - -### init database -my $DBHandle = InitDB(\%Conf,1); - -### get data for each month -warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'}; -foreach my $Month (&ListMonth($StartMonth,$EndMonth)) { - - print "---------- $Month ----------\n" if $Options{'d'}; - - if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') { - ### ---------------------------------------------- - ### get groups data (number of postings per group) - # get groups data from raw table for given month - my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'})); - $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'}); - - # count postings per group - my %Postings; - - while (($_) = $DBQuery->fetchrow_array) { - # get list oft newsgroups and hierarchies from Newsgroups: - my %Newsgroups = ListNewsgroups($_); - # count each newsgroup and hierarchy once - foreach (sort keys %Newsgroups) { - # don't count newsgroup/hierarchy in wrong TLH - next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/); - $Postings{$_}++; - }; - }; - - print "----- GroupStats -----\n" if $Options{'d'}; - foreach my $Newsgroup (sort keys %Postings) { - print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; - if (!$Options{'o'}) { - # write to database - $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); - $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); - $DBQuery->finish; - }; - }; - }; -}; - -### close handles -$DBHandle->disconnect; - +#! /usr/bin/perl +# +# gatherstats.pl +# +# This script will gather statistical information from a database +# containing headers and other information from a INN feed. +# +# It is part of the NewsStats package. +# +# Copyright (c) 2010-2013 Thomas Hochstein +# +# It can be redistributed and/or modified under the same terms under +# which Perl itself is published. + +BEGIN { + our $VERSION = "0.01"; + use File::Basename; + push(@INC, dirname($0)); +} +use strict; +use warnings; + +use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList); + +use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); + +################################# Definitions ################################## + +# define types of information that can be gathered +# all / groups (/ clients / hosts) +my %LegalStats; +@LegalStats{('all','groups')} = (); + +################################# Main program ################################# + +### read commandline options +my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH, + $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest); +GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, + 'clientsdb=s' => \$OptClientsDB, + 'd|debug!' => \$OptDebug, + 'groupsdb=s' => \$OptGroupsDB, + 'hierarchy=s' => \$OptTLH, + 'hostsdb=s' => \$OptHostsDB, + 'm|month=s' => \$OptMonth, + 'rawdb=s' => \$OptRawDB, + 's|stats=s' => \$OptStatsType, + 't|test!' => \$OptTest, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; + +### read configuration +my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')}; + +### override configuration via commandline options +my %ConfOverride; +$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; +$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB; +$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB; +$ConfOverride{'TLH'} = $OptTLH if $OptTLH; +&OverrideConfig(\%Conf,\%ConfOverride); + +### get type of information to gather, defaulting to 'all' +$OptStatsType = 'all' if !$OptStatsType; +&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType)) + if !exists($LegalStats{$OptStatsType}); + +### get time period from --month +# get verbal description of time period, drop SQL code +my ($Period) = &GetTimePeriod($OptMonth); +&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ". + "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time'); + +### reformat $Conf{'TLH'} +my $TLH; +if ($Conf{'TLH'}) { + # $Conf{'TLH'} is parsed as an array by Config::Auto; + # make a flat list again, separated by : + if (ref($Conf{'TLH'}) eq 'ARRAY') { + $TLH = join(':',@{$Conf{'TLH'}}); + } else { + $TLH = $Conf{'TLH'}; + } + # strip whitespace + $TLH =~ s/\s//g; + # add trailing dots if none are present yet + # (using negative look-behind assertions) + $TLH =~ s/(?prepare(sprintf("SELECT newsgroups FROM %s.%s ". + "WHERE day LIKE ? AND NOT disregard", + $Conf{'DBDatabase'}, + $Conf{'DBTableRaw'})); + $DBQuery->execute($Month.'-%') + or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ". + "$DBI::errstr\n",$Month, + $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + + # count postings per group + my %Postings; + while (($_) = $DBQuery->fetchrow_array) { + # get list of newsgroups and hierarchies from Newsgroups: + my %Newsgroups = ListNewsgroups($_,$TLH, + $OptCheckgroupsFile ? \%ValidGroups : ''); + # count each newsgroup and hierarchy once + foreach (sort keys %Newsgroups) { + $Postings{$_}++; + }; + }; + + # add valid but empty groups if --checkgroups is set + if (%ValidGroups) { + foreach (sort keys %ValidGroups) { + if (!defined($Postings{$_})) { + # add current newsgroup as empty group + $Postings{$_} = 0; + warn (sprintf("ADDED: %s as empty group\n",$_)); + # add empty hierarchies for current newsgroup as needed + foreach (ParseHierarchies($_)) { + my $Hierarchy = $_ . '.ALL'; + if (!defined($Postings{$Hierarchy})) { + $Postings{$Hierarchy} = 0; + warn (sprintf("ADDED: %s as empty group\n",$Hierarchy)); + }; + }; + } + }; + }; + + # delete old data for that month + if (!$OptTest) { + $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?", + $Conf{'DBDatabase'},$Conf{'DBTableGrps'}), + undef,$Month) + or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ". + "$DBI::errstr\n",$Month, + $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); + }; + + print "----- GroupStats -----\n" if $OptDebug; + foreach my $Newsgroup (sort keys %Postings) { + print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug; + if (!$OptTest) { + # write to database + $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ". + "(month,newsgroup,postings) ". + "VALUES (?, ?, ?)", + $Conf{'DBDatabase'}, + $Conf{'DBTableGrps'})); + $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) + or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ". + "$DBI::errstr\n",$Month,$Newsgroup, + $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); + $DBQuery->finish; + }; + }; + } else { + # other types of information go here - later on + }; +}; + +### close handles +$DBHandle->disconnect; + +__END__ + +################################ Documentation ################################# + +=head1 NAME + +gatherstats - process statistical data from a raw source + +=head1 SYNOPSIS + +B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] + +=head1 REQUIREMENTS + +See L. + +=head1 DESCRIPTION + +This script will extract and process statistical information from a +database table which is fed from F for a given time period +and write its results to (an)other database table(s). Entries marked +with I<'disregard'> in the database will be ignored; currently, you +have to set this flag yourself, using your database management tools. +You can exclude erroneous entries that way (e.g. automatic reposts +(think of cancels flood and resurrectors); spam; ...). + +The time period to act on defaults to last month; you can assign +another time period or a single month via the B<--month> option (see +below). + +By default B will process all types of information; you +can change that using the B<--stats> option and assigning the type of +information to process. Currently that doesn't matter yet as only +processing of the number of postings per group per month is +implemented anyway. + +Possible information types include: + +=over 3 + +=item B (postings per group per month) + +B will examine Newsgroups: headers. Crosspostings will be +counted for each single group they appear in. Groups not in I +will be ignored. + +B will also add up the number of postings for each +hierarchy level, but only count each posting once. A posting to +de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, +respectively. A crossposting to de.alt.test and de.alt.admin, on the +other hand, will be counted for de.alt.test and de.alt.admin each, but +only once for de.alt.ALL and de.ALL. + +Data is written to I (see L); you can +override that default through the B<--groupsdb> option. + +=back + +=head2 Configuration + +B will read its configuration from F +which should be present in the same directory via Config::Auto. + +See L for an overview of possible configuration options. + +You can override configuration options via the B<--hierarchy>, +B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, +respectively. + +=head1 OPTIONS + +=over 3 + +=item B<-V>, B<--version> + +Print out version and copyright information and exit. + +=item B<-h>, B<--help> + +Print this man page and exit. + +=item B<-d>, B<--debug> + +Output debugging information to STDOUT while processing (number of +postings per group). + +=item B<-t>, B<--test> + +Do not write results to database. You should use B<--debug> in +conjunction with B<--test> ... everything else seems a bit pointless. + +=item B<-m>, B<--month> I + +Set processing period to a single month in YYYY-MM format or to a time +period between two month in YYYY-MM:YYYY-MM format (two month, separated +by a colon). + +=item B<-s>, B<--stats> I + +Set processing type to one of I and I. Defaults to all +(and is currently rather pointless as only I has been +implemented). + +=item B<-c>, B<--checkgroups> I + +Check each group against a list of valid newsgroups read from a file, +one group on each line and ignoring everything after the first +whitespace (so you can use a file in checkgroups format or (part of) +your INN active file). + +The filename is taken from I, amended by each +B<--month> B is processing in the form of I, +so that + + gatherstats -m 2010-01:2010-12 -c checkgroups + +will check against F for January 2010, against +F for February 2010 and so on. + +Newsgroups not found in the checkgroups file will be dropped (and +logged to STDERR), and newsgroups found there but having no postings +will be added with a count of 0 (and logged to STDERR). + +=item B<--hierarchy> I (newsgroup hierarchy) + +Override I from F. + +=item B<--rawdb> I (raw data table) + +Override I from F. + +=item B<--groupsdb> I
(postings per group table) + +Override I from F. + +=item B<--clientsdb> I
(client data table) + +Override I from F. + +=item B<--hostsdb> I
(host data table) + +Override I from F. + +=back + +=head1 INSTALLATION + +See L. + +=head1 EXAMPLES + +Process all types of information for lasth month: + + gatherstats + +Do a dry run, showing results of processing: + + gatherstats --debug --test + +Process all types of information for January of 2010: + + gatherstats --month 2010-01 + +Process only number of postings for the year of 2010, +checking against checkgroups-*: + + gatherstats -m 2010-01:2010-12 -s groups -c checkgroups + +=head1 FILES + +=over 4 + +=item F + +The script itself. + +=item F + +Library functions for the NewsStats package. + +=item F + +Runtime configuration file. + +=back + +=head1 BUGS + +Please report any bugs or feature requests to the author or use the +bug tracker at L! + +=head1 SEE ALSO + +=over 2 + +=item - + +L + +=item - + +L + +=back + +This script is part of the B package. + +=head1 AUTHOR + +Thomas Hochstein + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2010-2012 Thomas Hochstein + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +=cut