X-Git-Url: https://code.th-h.de/?p=usenet%2Fnewsstats.git;a=blobdiff_plain;f=gatherstats.pl;h=e9ae0f8268af50f20d98aa3a4976dce67c57b717;hp=bcb8ba0b9170774c901d041e576cfb15119a6b6c;hb=fe46be2168065d5bbbebb04e9350cc7fcd5cbeeb;hpb=2832c235b2497a02713b12197ed97fbde3a91e15 diff --git a/gatherstats.pl b/gatherstats.pl index bcb8ba0..e9ae0f8 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -19,7 +19,7 @@ BEGIN { } use strict; -use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups); +use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList); use DBI; @@ -33,7 +33,7 @@ my %LegalTypes; ################################# Main program ################################# ### read commandline options -my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:'); +my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:'); ### read configuration my %Conf = %{ReadConfig('newsstats.conf')}; @@ -54,6 +54,30 @@ die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Opt ### get time period (-m or -p) my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); +### reformat $Conf{'TLH'} +my $TLH; +if ($Conf{'TLH'}) { + # $Conf{'TLH'} is parsed as an array by Config::Auto; + # make a flat list again, separated by : + if (ref($Conf{'TLH'}) eq 'ARRAY') { + $TLH = join(':',@{$Conf{'TLH'}}); + } else { + $TLH = $Conf{'TLH'}; + } + # strip whitespace + $TLH =~ s/\s//g; + # check for illegal characters + die "$MySelf: E: Config error - illegal characters in TLH definition\n" if ($TLH !~ /^[a-zA-Z0-9:]+$/); + if ($TLH =~ /:/) { + # reformat $TLH from a:b to (a)|(b) + $TLH =~ s/:/)|(/g; + $TLH = '(' . $TLH .
')'; + }; +}; + +### read newsgroups list from -l +my %ValidGroups = $Options{'l'} ? %{&ReadGroupList($Options{'l'})} : (); + ### init database my $DBHandle = InitDB(\%Conf,1); @@ -72,24 +96,38 @@ foreach my $Month (&ListMonth($StartMonth,$EndMonth)) { # count postings per group my %Postings; - while (($_) = $DBQuery->fetchrow_array) { # get list oft newsgroups and hierarchies from Newsgroups: - my %Newsgroups = ListNewsgroups($_); + my %Newsgroups = ListNewsgroups($_,$TLH,$Options{'l'} ? \%ValidGroups : ''); # count each newsgroup and hierarchy once foreach (sort keys %Newsgroups) { - # don't count newsgroup/hierarchy in wrong TLH - next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/); $Postings{$_}++; }; }; + # add valid but empty groups if -l is set + if (%ValidGroups) { + foreach (sort keys %ValidGroups) { + if (!defined($Postings{$_})) { + $Postings{$_} = 0 ; + warn (sprintf("ADDED: %s as empty group\n",$_)); + } + }; + }; + + # delete old data for that month + if (!$Options{'o'}) { + $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}),undef,$Month) + or warn sprintf("$MySelf: E: Can't delete old groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); + }; + print "----- GroupStats -----\n" if $Options{'d'}; foreach my $Newsgroup (sort keys %Postings) { print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; if (!$Options{'o'}) { # write to database - $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); + $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); + # $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die
sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); $DBQuery->finish; }; @@ -112,7 +150,7 @@ gatherstats - process statistical data from a raw source =head1 SYNOPSIS -B<gatherstats> [B<-Vhdo>] [B<-m> I] [B<-p> I] [B<-t> I] [B<-n> I] [B<-r> I] [B<-g> I] [B<-c> I] [B<-s> I] +B<gatherstats> [B<-Vhdo>] [B<-m> I] [B<-p> I] [B<-t> I] [B<-l> I<file>] [B<-n> I] [B<-r> I] [B<-g> I] [B<-c> I] [B<-s> I] =head1 REQUIREMENTS @@ -134,7 +172,11 @@ DBI This script will extract and process statistical information from a database table which is fed from F for a given time period -and write its results to (an)other database table(s). +and write its results to (an)other database table(s). Entries marked +with I<'disregard'> in the database will be ignored; currently, you have +to set this flag yourself, using your database management tools. You +can exclude erroneous entries that way (e.g. automatic reposts (think +of cancel floods and resurrectors); spam; ...). The time period to act on defaults to last month; you can assign another month via the B<-m> switch or a time period via the B<-p> @@ -215,6 +257,17 @@ Set processing type to one of I and I. Defaults to all (and is currently rather pointless as only I has been implemented). +=item B<-l> I<file> (check against list) + +Check each group against a list of valid newsgroups read from +I<file>, one group on each line and ignoring everything after the +first whitespace (so you can use a file in checkgroups format or (part +of) your INN active file). + +Newsgroups not found in I<file> will be dropped (and logged to +STDERR), and newsgroups found in I<file> but having no postings +will be added with a count of 0 (and logged to STDERR). + =item B<-n> I (newsgroup hierarchy) Override I from F.
@@ -255,9 +308,10 @@ Process all types of information for January of 2010: gatherstats -m 2010-01 -Process only number of postings for the year of 2010: +Process only number of postings for the year of 2010, +checking against checkgroups-2010.txt: - gatherstats -p 2010-01:2010-12 -t groups + gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt =head1 FILES