SplitPeriod
ListMonth
ListNewsgroups
+ ReadGroupList
OutputData
FormatOutput
SQLHierarchies
### explode a (scalar) list of newsgroup names to a list of newsgroup and
### hierarchy names where every newsgroup and hierarchy appears only once:
### de.alt.test,de.alt.admin -> de.ALL, de.alt.ALL, de.alt.test, de.alt.admin
-### IN : $Newsgroups: a list of newsgroups (content of Newsgroups: header)
-### OUT: %Newsgroups: hash containing all newsgroup and hierarchy names as keys
- my ($Newsgroups) = @_;
+### IN : $Newsgroups : a list of newsgroups (content of Newsgroups: header)
+### $ValidGroupsR: reference to a hash containing all valid newsgroups
+### as keys
+### OUT: %Newsgroups : hash containing all newsgroup and hierarchy names as keys
+ my ($Newsgroups,$ValidGroupsR) = @_;
+ my %ValidGroups = %{$ValidGroupsR} if $ValidGroupsR;
my %Newsgroups;
chomp($Newsgroups);
# remove whitespace from contents of Newsgroups:
$Newsgroups =~ s/\s//;
# call &HierarchyCount for each newsgroup in $Newsgroups:
for (split /,/, $Newsgroups) {
+ # don't count invalid newsgroups
+ if(%ValidGroups and !defined($ValidGroups{$_})) {
+ warn (sprintf("DROPPED: %s\n",$_));
+ next;
+ }
# add original newsgroup to %Newsgroups
$Newsgroups{$_} = 1;
# add all hierarchy elements to %Newsgroups, amended by '.ALL',
return @Hierarchies;
};
+################################################################################
+sub ReadGroupList {
+################################################################################
+### read a list of valid newsgroups from file (each group on one line,
+### ignoring everything after the first whitespace and so accepting files
+### in checkgroups format as well as (parts of) an INN active file);
+### lines that do not start with a group name (blank lines, lines with
+### leading whitespace) are skipped instead of creating bogus entries
+### IN : $Filename    : file to read
+### OUT: \%ValidGroups: reference to a hash containing all valid newsgroups
+###                     as keys (each with a value of '1')
+### dies if $Filename cannot be opened for reading
+  my ($Filename) = @_;
+  my %ValidGroups;
+  # three-argument open: the file name is never interpreted as mode or pipe
+  open (my $LIST,'<',$Filename) or die "$MySelf: E: Cannot read $Filename: $!\n";
+  # lexical loop variable, so the caller's $_ is left untouched
+  while (my $Line = <$LIST>) {
+    # the group name is the first whitespace-delimited token of the line
+    next unless $Line =~ /^(\S+)/;
+    $ValidGroups{$1} = '1';
+  };
+  close $LIST;
+  return \%ValidGroups;
+};
+
################################################################################
#####----------------------------- TimePeriods ----------------------------#####
}
use strict;
-use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups);
+use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
use DBI;
################################# Main program #################################
### read commandline options
-my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:');
+my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:');
### read configuration
my %Conf = %{ReadConfig('newsstats.conf')};
### get time period (-m or -p)
my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
+### read newsgroups list from -l (empty hash if -l is not set)
+### declared unconditionally: the behaviour of "my ... if ..." is undefined
+my %ValidGroups = $Options{'l'} ? %{&ReadGroupList($Options{'l'})} : ();
+
### init database
my $DBHandle = InitDB(\%Conf,1);
# count postings per group
my %Postings;
-
while (($_) = $DBQuery->fetchrow_array) {
# get list oft newsgroups and hierarchies from Newsgroups:
- my %Newsgroups = ListNewsgroups($_);
+ my %Newsgroups = ListNewsgroups($_,$Options{'l'} ? \%ValidGroups : '');
# count each newsgroup and hierarchy once
foreach (sort keys %Newsgroups) {
- # don't count newsgroup/hierarchy in wrong TLH
- next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/);
$Postings{$_}++;
};
};
+  # with -l set, every listed group shall show up in the result:
+  # groups without any counted posting are entered with a count of 0
+  if (%ValidGroups) {
+    for my $Group (sort keys %ValidGroups) {
+      # group already has a count, nothing to add
+      next if defined($Postings{$Group});
+      $Postings{$Group} = 0;
+      # report the addition on STDERR
+      warn (sprintf("ADDED: %s as empty group\n",$Group));
+    };
+  };
+
print "----- GroupStats -----\n" if $Options{'d'};
foreach my $Newsgroup (sort keys %Postings) {
print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
=head1 SYNOPSIS
-B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
+B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
=head1 REQUIREMENTS
(and is currently rather pointless as only I<groups> has been
implemented).
+=item B<-l> I<filename> (check against list)
+
+Check each group against a list of valid newsgroups read from
+I<filename>, one group on each line and ignoring everything after the
+first whitespace (so you can use a file in checkgroups format or (part
+of) your INN active file).
+
+Newsgroups not found in I<filename> will be dropped (and logged to
+STDERR), and newsgroups found in I<filename> but having no postings
+will be added with a count of 0 (and logged to STDERR).
+
=item B<-n> I<TLH> (newsgroup hierarchy)
Override I<TLH> from F<newsstats.conf>.
gatherstats -m 2010-01
-Process only number of postings for the year of 2010:
+Process only number of postings for the year of 2010,
+checking against checkgroups-2010.txt:
- gatherstats -p 2010-01:2010-12 -t groups
+ gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
=head1 FILES