X-Git-Url: https://code.th-h.de/?p=usenet%2Fnewsstats.git;a=blobdiff_plain;f=gatherstats.pl;fp=gatherstats.pl;h=0000000000000000000000000000000000000000;hp=f1fe7deef8deb57ab5e682358b2aee9cfe56622d;hb=1af57a53905788c99a088bb6b17088293c0224d1;hpb=07c0b2589af779c33d5d35b6a7fa0e7883201674 diff --git a/gatherstats.pl b/gatherstats.pl deleted file mode 100755 index f1fe7de..0000000 --- a/gatherstats.pl +++ /dev/null @@ -1,417 +0,0 @@ -#! /usr/bin/perl -# -# gatherstats.pl -# -# This script will gather statistical information from a database -# containing headers and other information from a INN feed. -# -# It is part of the NewsStats package. -# -# Copyright (c) 2010-2013 Thomas Hochstein -# -# It can be redistributed and/or modified under the same terms under -# which Perl itself is published. - -BEGIN { - our $VERSION = "0.01"; - use File::Basename; - push(@INC, dirname($0)); -} -use strict; -use warnings; - -use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList); - -use DBI; -use Getopt::Long qw(GetOptions); -Getopt::Long::config ('bundling'); - -################################# Definitions ################################## - -# define types of information that can be gathered -# all / groups (/ clients / hosts) -my %LegalStats; -@LegalStats{('all','groups')} = (); - -################################# Main program ################################# - -### read commandline options -my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH, - $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest); -GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, - 'clientsdb=s' => \$OptClientsDB, - 'd|debug!' => \$OptDebug, - 'groupsdb=s' => \$OptGroupsDB, - 'hierarchy=s' => \$OptTLH, - 'hostsdb=s' => \$OptHostsDB, - 'm|month=s' => \$OptMonth, - 'rawdb=s' => \$OptRawDB, - 's|stats=s' => \$OptStatsType, - 't|test!' => \$OptTest, - 'h|help' => \&ShowPOD, - 'V|version' => \&ShowVersion) or exit 1; - -### read configuration -my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')}; - -### override configuration via commandline options -my %ConfOverride; -$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; -$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; -$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB; -$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB; -$ConfOverride{'TLH'} = $OptTLH if $OptTLH; -&OverrideConfig(\%Conf,\%ConfOverride); - -### get type of information to gather, defaulting to 'all' -$OptStatsType = 'all' if !$OptStatsType; -&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType)) - if !exists($LegalStats{$OptStatsType}); - -### get time period from --month -# get verbal description of time period, drop SQL code -my ($Period) = &GetTimePeriod($OptMonth); -&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ". - "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time'); - -### reformat $Conf{'TLH'} -my $TLH; -if ($Conf{'TLH'}) { - # $Conf{'TLH'} is parsed as an array by Config::Auto; - # make a flat list again, separated by : - if (ref($Conf{'TLH'}) eq 'ARRAY') { - $TLH = join(':',@{$Conf{'TLH'}}); - } else { - $TLH = $Conf{'TLH'}; - } - # strip whitespace - $TLH =~ s/\s//g; - # add trailing dots if none are present yet - # (using negative look-behind assertions) - $TLH =~ s/(?prepare(sprintf("SELECT newsgroups FROM %s.%s ". - "WHERE day LIKE ? AND NOT disregard", - $Conf{'DBDatabase'}, - $Conf{'DBTableRaw'})); - $DBQuery->execute($Month.'-%') - or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ". - "$DBI::errstr\n",$Month, - $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); - - # count postings per group - my %Postings; - while (($_) = $DBQuery->fetchrow_array) { - # get list of newsgroups and hierarchies from Newsgroups: - my %Newsgroups = ListNewsgroups($_,$TLH, - $OptCheckgroupsFile ? \%ValidGroups : ''); - # count each newsgroup and hierarchy once - foreach (sort keys %Newsgroups) { - $Postings{$_}++; - }; - }; - - # add valid but empty groups if --checkgroups is set - if (%ValidGroups) { - foreach (sort keys %ValidGroups) { - if (!defined($Postings{$_})) { - # add current newsgroup as empty group - $Postings{$_} = 0; - warn (sprintf("ADDED: %s as empty group\n",$_)); - # add empty hierarchies for current newsgroup as needed - foreach (ParseHierarchies($_)) { - my $Hierarchy = $_ . '.ALL'; - if (!defined($Postings{$Hierarchy})) { - $Postings{$Hierarchy} = 0; - warn (sprintf("ADDED: %s as empty group\n",$Hierarchy)); - }; - }; - } - }; - }; - - # delete old data for that month - if (!$OptTest) { - $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?", - $Conf{'DBDatabase'},$Conf{'DBTableGrps'}), - undef,$Month) - or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ". - "$DBI::errstr\n",$Month, - $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); - }; - - print "----- GroupStats -----\n" if $OptDebug; - foreach my $Newsgroup (sort keys %Postings) { - print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug; - if (!$OptTest) { - # write to database - $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ". - "(month,newsgroup,postings) ". - "VALUES (?, ?, ?)", - $Conf{'DBDatabase'}, - $Conf{'DBTableGrps'})); - $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) - or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ". - "$DBI::errstr\n",$Month,$Newsgroup, - $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); - $DBQuery->finish; - }; - }; - } else { - # other types of information go here - later on - }; -}; - -### close handles -$DBHandle->disconnect; - -__END__ - -################################ Documentation ################################# - -=head1 NAME - -gatherstats - process statistical data from a raw source - -=head1 SYNOPSIS - -B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] - -=head1 REQUIREMENTS - -See L. - -=head1 DESCRIPTION - -This script will extract and process statistical information from a -database table which is fed from F for a given time period -and write its results to (an)other database table(s). Entries marked -with I<'disregard'> in the database will be ignored; currently, you -have to set this flag yourself, using your database management tools. -You can exclude erroneous entries that way (e.g. automatic reposts -(think of cancels flood and resurrectors); spam; ...). - -The time period to act on defaults to last month; you can assign -another time period or a single month via the B<--month> option (see -below). - -By default B will process all types of information; you -can change that using the B<--stats> option and assigning the type of -information to process. Currently that doesn't matter yet as only -processing of the number of postings per group per month is -implemented anyway. - -Possible information types include: - -=over 3 - -=item B (postings per group per month) - -B will examine Newsgroups: headers. Crosspostings will be -counted for each single group they appear in. Groups not in I -will be ignored. - -B will also add up the number of postings for each -hierarchy level, but only count each posting once. A posting to -de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, -respectively. A crossposting to de.alt.test and de.alt.admin, on the -other hand, will be counted for de.alt.test and de.alt.admin each, but -only once for de.alt.ALL and de.ALL. - -Data is written to I (see L); you can -override that default through the B<--groupsdb> option. - -=back - -=head2 Configuration - -B will read its configuration from F -which should be present in the same directory via Config::Auto. - -See L for an overview of possible configuration options. - -You can override configuration options via the B<--hierarchy>, -B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, -respectively. - -=head1 OPTIONS - -=over 3 - -=item B<-V>, B<--version> - -Print out version and copyright information and exit. - -=item B<-h>, B<--help> - -Print this man page and exit. - -=item B<-d>, B<--debug> - -Output debugging information to STDOUT while processing (number of -postings per group). - -=item B<-t>, B<--test> - -Do not write results to database. You should use B<--debug> in -conjunction with B<--test> ... everything else seems a bit pointless. - -=item B<-m>, B<--month> I - -Set processing period to a single month in YYYY-MM format or to a time -period between two month in YYYY-MM:YYYY-MM format (two month, separated -by a colon). - -=item B<-s>, B<--stats> I - -Set processing type to one of I and I. Defaults to all -(and is currently rather pointless as only I has been -implemented). - -=item B<-c>, B<--checkgroups> I - -Check each group against a list of valid newsgroups read from a file, -one group on each line and ignoring everything after the first -whitespace (so you can use a file in checkgroups format or (part of) -your INN active file). - -The filename is taken from I, amended by each -B<--month> B is processing in the form of I, -so that - - gatherstats -m 2010-01:2010-12 -c checkgroups - -will check against F for January 2010, against -F for February 2010 and so on. - -Newsgroups not found in the checkgroups file will be dropped (and -logged to STDERR), and newsgroups found there but having no postings -will be added with a count of 0 (and logged to STDERR). - -=item B<--hierarchy> I (newsgroup hierarchy) - -Override I from F. - -=item B<--rawdb> I (raw data table) - -Override I from F. - -=item B<--groupsdb> I
(postings per group table) - -Override I from F. - -=item B<--clientsdb> I
(client data table) - -Override I from F. - -=item B<--hostsdb> I
(host data table) - -Override I from F. - -=back - -=head1 INSTALLATION - -See L. - -=head1 EXAMPLES - -Process all types of information for lasth month: - - gatherstats - -Do a dry run, showing results of processing: - - gatherstats --debug --test - -Process all types of information for January of 2010: - - gatherstats --month 2010-01 - -Process only number of postings for the year of 2010, -checking against checkgroups-*: - - gatherstats -m 2010-01:2010-12 -s groups -c checkgroups - -=head1 FILES - -=over 4 - -=item F - -The script itself. - -=item F - -Library functions for the NewsStats package. - -=item F - -Runtime configuration file. - -=back - -=head1 BUGS - -Please report any bugs or feature requests to the author or use the -bug tracker at L! - -=head1 SEE ALSO - -=over 2 - -=item - - -L - -=item - - -L - -=back - -This script is part of the B package. - -=head1 AUTHOR - -Thomas Hochstein - -=head1 COPYRIGHT AND LICENSE - -Copyright (c) 2010-2012 Thomas Hochstein - -This program is free software; you may redistribute it and/or modify it -under the same terms as Perl itself. - -=cut