| 1 | #! /usr/bin/perl -W |
| 2 | # |
| 3 | # gatherstats.pl |
| 4 | # |
| 5 | # This script will gather statistical information from a database |
| 6 | # containing headers and other information from a INN feed. |
| 7 | # |
| 8 | # It is part of the NewsStats package. |
| 9 | # |
| 10 | # Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net> |
| 11 | # |
| 12 | # It can be redistributed and/or modified under the same terms under |
| 13 | # which Perl itself is published. |
| 14 | |
| 15 | BEGIN { |
| 16 | our $VERSION = "0.01"; |
| 17 | use File::Basename; |
| 18 | push(@INC, dirname($0)); |
| 19 | } |
| 20 | use strict; |
| 21 | |
| 22 | use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList); |
| 23 | |
| 24 | use DBI; |
| 25 | use Getopt::Long qw(GetOptions); |
| 26 | Getopt::Long::config ('bundling'); |
| 27 | |
| 28 | ################################# Definitions ################################## |
| 29 | |
| 30 | # define types of information that can be gathered |
| 31 | # all / groups (/ clients / hosts) |
| 32 | my %LegalStats; |
| 33 | @LegalStats{('all','groups')} = (); |
| 34 | |
| 35 | ################################# Main program ################################# |
| 36 | |
| 37 | ### read commandline options |
| 38 | my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH, |
| 39 | $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest); |
| 40 | GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, |
| 41 | 'clientsdb=s' => \$OptClientsDB, |
| 42 | 'd|debug!' => \$OptDebug, |
| 43 | 'groupsdb=s' => \$OptGroupsDB, |
| 44 | 'hierarchy=s' => \$OptTLH, |
| 45 | 'hostsdb=s' => \$OptHostsDB, |
| 46 | 'm|month=s' => \$OptMonth, |
| 47 | 'rawdb=s' => \$OptRawDB, |
| 48 | 's|stats=s' => \$OptStatsType, |
| 49 | 't|test!' => \$OptTest, |
| 50 | 'h|help' => \&ShowPOD, |
| 51 | 'V|version' => \&ShowVersion) or exit 1; |
| 52 | |
| 53 | ### read configuration |
| 54 | my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')}; |
| 55 | |
| 56 | ### override configuration via commandline options |
| 57 | my %ConfOverride; |
| 58 | $ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; |
| 59 | $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; |
| 60 | $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB; |
| 61 | $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB; |
| 62 | $ConfOverride{'TLH'} = $OptTLH if $OptTLH; |
| 63 | &OverrideConfig(\%Conf,\%ConfOverride); |
| 64 | |
| 65 | ### get type of information to gather, defaulting to 'all' |
| 66 | $OptStatsType = 'all' if !$OptStatsType; |
| 67 | &Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType)) |
| 68 | if !exists($LegalStats{$OptStatsType}); |
| 69 | |
| 70 | ### get time period from --month |
| 71 | # get verbal description of time period, drop SQL code |
| 72 | my ($Period) = &GetTimePeriod($OptMonth); |
| 73 | &Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ". |
| 74 | "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time'); |
| 75 | |
| 76 | ### reformat $Conf{'TLH'} |
| 77 | my $TLH; |
| 78 | if ($Conf{'TLH'}) { |
| 79 | # $Conf{'TLH'} is parsed as an array by Config::Auto; |
| 80 | # make a flat list again, separated by : |
| 81 | if (ref($Conf{'TLH'}) eq 'ARRAY') { |
| 82 | $TLH = join(':',@{$Conf{'TLH'}}); |
| 83 | } else { |
| 84 | $TLH = $Conf{'TLH'}; |
| 85 | } |
| 86 | # strip whitespace |
| 87 | $TLH =~ s/\s//g; |
| 88 | # add trailing dots if none are present yet |
| 89 | # (using negative look-behind assertions) |
| 90 | $TLH =~ s/(?<!\.):/.:/g; |
| 91 | $TLH =~ s/(?<!\.)$/./; |
| 92 | # check for illegal characters |
| 93 | &Bleat(2,'Config error - illegal characters in TLH definition!') |
| 94 | if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/); |
| 95 | # escape dots |
| 96 | $TLH =~ s/\./\\./g; |
| 97 | if ($TLH =~ /:/) { |
| 98 | # reformat $TLH from a:b to (a)|(b), |
| 99 | # e.g. replace ':' by ')|(' |
| 100 | $TLH =~ s/:/)|(/g; |
| 101 | $TLH = '(' . $TLH . ')'; |
| 102 | }; |
| 103 | }; |
| 104 | |
| 105 | ### init database |
| 106 | my $DBHandle = InitDB(\%Conf,1); |
| 107 | |
| 108 | ### get data for each month |
| 109 | &Bleat(1,'Test mode. Database is not updated.') if $OptTest; |
| 110 | foreach my $Month (&ListMonth($Period)) { |
| 111 | |
| 112 | print "---------- $Month ----------\n" if $OptDebug; |
| 113 | |
| 114 | if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') { |
| 115 | # read list of newsgroups from --checkgroups |
| 116 | # into a hash |
| 117 | my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))} |
| 118 | if $OptCheckgroupsFile; |
| 119 | |
| 120 | ### ---------------------------------------------- |
| 121 | ### get groups data (number of postings per group) |
| 122 | # get groups data from raw table for given month |
| 123 | my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ". |
| 124 | "WHERE day LIKE ? AND NOT disregard", |
| 125 | $Conf{'DBDatabase'}, |
| 126 | $Conf{'DBTableRaw'})); |
| 127 | $DBQuery->execute($Month.'-%') |
| 128 | or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ". |
| 129 | "$DBI::errstr\n",$Month, |
| 130 | $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); |
| 131 | |
| 132 | # count postings per group |
| 133 | my %Postings; |
| 134 | while (($_) = $DBQuery->fetchrow_array) { |
| 135 | # get list of newsgroups and hierarchies from Newsgroups: |
| 136 | my %Newsgroups = ListNewsgroups($_,$TLH, |
| 137 | $OptCheckgroupsFile ? \%ValidGroups : ''); |
| 138 | # count each newsgroup and hierarchy once |
| 139 | foreach (sort keys %Newsgroups) { |
| 140 | $Postings{$_}++; |
| 141 | }; |
| 142 | }; |
| 143 | |
| 144 | # add valid but empty groups if --checkgroups is set |
| 145 | if (%ValidGroups) { |
| 146 | foreach (sort keys %ValidGroups) { |
| 147 | if (!defined($Postings{$_})) { |
| 148 | # add current newsgroup as empty group |
| 149 | $Postings{$_} = 0; |
| 150 | warn (sprintf("ADDED: %s as empty group\n",$_)); |
| 151 | # add empty hierarchies for current newsgroup as needed |
| 152 | foreach (ParseHierarchies($_)) { |
| 153 | my $Hierarchy = $_ . '.ALL'; |
| 154 | if (!defined($Postings{$Hierarchy})) { |
| 155 | $Postings{$Hierarchy} = 0; |
| 156 | warn (sprintf("ADDED: %s as empty group\n",$Hierarchy)); |
| 157 | }; |
| 158 | }; |
| 159 | } |
| 160 | }; |
| 161 | }; |
| 162 | |
| 163 | # delete old data for that month |
| 164 | if (!$OptTest) { |
| 165 | $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?", |
| 166 | $Conf{'DBDatabase'},$Conf{'DBTableGrps'}), |
| 167 | undef,$Month) |
| 168 | or &Bleat(2,sprintf("Can't delete old groups data for %s from %s.%s: ". |
| 169 | "$DBI::errstr\n",$Month, |
| 170 | $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); |
| 171 | }; |
| 172 | |
| 173 | print "----- GroupStats -----\n" if $OptDebug; |
| 174 | foreach my $Newsgroup (sort keys %Postings) { |
| 175 | print "$Newsgroup => $Postings{$Newsgroup}\n" if $OptDebug; |
| 176 | if (!$OptTest) { |
| 177 | # write to database |
| 178 | $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s ". |
| 179 | "(month,newsgroup,postings) ". |
| 180 | "VALUES (?, ?, ?)", |
| 181 | $Conf{'DBDatabase'}, |
| 182 | $Conf{'DBTableGrps'})); |
| 183 | $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) |
| 184 | or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s.%s: ". |
| 185 | "$DBI::errstr\n",$Month,$Newsgroup, |
| 186 | $Conf{'DBDatabase'},$Conf{'DBTableGrps'})); |
| 187 | $DBQuery->finish; |
| 188 | }; |
| 189 | }; |
| 190 | } else { |
| 191 | # other types of information go here - later on |
| 192 | }; |
| 193 | }; |
| 194 | |
| 195 | ### close handles |
| 196 | $DBHandle->disconnect; |
| 197 | |
| 198 | __END__ |
| 199 | |
| 200 | ################################ Documentation ################################# |
| 201 | |
| 202 | =head1 NAME |
| 203 | |
| 204 | gatherstats - process statistical data from a raw source |
| 205 | |
| 206 | =head1 SYNOPSIS |
| 207 | |
| 208 | B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] |
| 209 | |
| 210 | =head1 REQUIREMENTS |
| 211 | |
| 212 | See L<doc/README>. |
| 213 | |
| 214 | =head1 DESCRIPTION |
| 215 | |
| 216 | This script will extract and process statistical information from a |
| 217 | database table which is fed from F<feedlog.pl> for a given time period |
| 218 | and write its results to (an)other database table(s). Entries marked |
| 219 | with I<'disregard'> in the database will be ignored; currently, you |
| 220 | have to set this flag yourself, using your database management tools. |
| 221 | You can exclude erroneous entries that way (e.g. automatic reposts |
| 222 | (think of cancels flood and resurrectors); spam; ...). |
| 223 | |
| 224 | The time period to act on defaults to last month; you can assign |
| 225 | another time period or a single month via the B<--month> option (see |
| 226 | below). |
| 227 | |
| 228 | By default B<gatherstats> will process all types of information; you |
| 229 | can change that using the B<--stats> option and assigning the type of |
| 230 | information to process. Currently that doesn't matter yet as only |
| 231 | processing of the number of postings per group per month is |
| 232 | implemented anyway. |
| 233 | |
| 234 | Possible information types include: |
| 235 | |
| 236 | =over 3 |
| 237 | |
| 238 | =item B<groups> (postings per group per month) |
| 239 | |
| 240 | B<gatherstats> will examine Newsgroups: headers. Crosspostings will be |
| 241 | counted for each single group they appear in. Groups not in I<TLH> |
| 242 | will be ignored. |
| 243 | |
| 244 | B<gatherstats> will also add up the number of postings for each |
| 245 | hierarchy level, but only count each posting once. A posting to |
| 246 | de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, |
| 247 | respectively. A crossposting to de.alt.test and de.alt.admin, on the |
| 248 | other hand, will be counted for de.alt.test and de.alt.admin each, but |
| 249 | only once for de.alt.ALL and de.ALL. |
| 250 | |
| 251 | Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can |
| 252 | override that default through the B<--groupsdb> option. |
| 253 | |
| 254 | =back |
| 255 | |
| 256 | =head2 Configuration |
| 257 | |
| 258 | B<gatherstats> will read its configuration from F<newsstats.conf> |
| 259 | which should be present in the same directory via Config::Auto. |
| 260 | |
| 261 | See L<doc/INSTALL> for an overview of possible configuration options. |
| 262 | |
| 263 | You can override configuration options via the B<--hierarchy>, |
| 264 | B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, |
| 265 | respectively. |
| 266 | |
| 267 | =head1 OPTIONS |
| 268 | |
| 269 | =over 3 |
| 270 | |
| 271 | =item B<-V>, B<--version> |
| 272 | |
| 273 | Print out version and copyright information and exit. |
| 274 | |
| 275 | =item B<-h>, B<--help> |
| 276 | |
| 277 | Print this man page and exit. |
| 278 | |
| 279 | =item B<-d>, B<--debug> |
| 280 | |
| 281 | Output debugging information to STDOUT while processing (number of |
| 282 | postings per group). |
| 283 | |
| 284 | =item B<-t>, B<--test> |
| 285 | |
| 286 | Do not write results to database. You should use B<--debug> in |
| 287 | conjunction with B<--test> ... everything else seems a bit pointless. |
| 288 | |
| 289 | =item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]> |
| 290 | |
| 291 | Set processing period to a single month in YYYY-MM format or to a time |
| 292 | period between two month in YYYY-MM:YYYY-MM format (two month, separated |
| 293 | by a colon). |
| 294 | |
| 295 | |
| 296 | =item B<-s>, B<--stats> I<type> |
| 297 | |
| 298 | Set processing type to one of I<all> and I<groups>. Defaults to all |
| 299 | (and is currently rather pointless as only I<groups> has been |
| 300 | implemented). |
| 301 | |
| 302 | =item B<-c>, B<--checkgroups> I<filename template> |
| 303 | |
| 304 | Check each group against a list of valid newsgroups read from a file, |
| 305 | one group on each line and ignoring everything after the first |
| 306 | whitespace (so you can use a file in checkgroups format or (part of) |
| 307 | your INN active file). |
| 308 | |
| 309 | The filename is taken from I<filename template>, amended by each B<-- |
| 310 | month> B<gatherstats> is processing, so that |
| 311 | |
| 312 | gatherstats -m 2010-01:2010-12 -c checkgroups |
| 313 | |
| 314 | will check against F<checkgroups-2010-01> for January 2010, against |
| 315 | F<checkgroups-2010-02> for February 2010 and so on. |
| 316 | |
| 317 | Newsgroups not found in the checkgroups file will be dropped (and |
| 318 | logged to STDERR), and newsgroups found there but having no postings |
| 319 | will be added with a count of 0 (and logged to STDERR). |
| 320 | |
| 321 | =item B<--hierarchy> I<TLH> (newsgroup hierarchy) |
| 322 | |
| 323 | Override I<TLH> from F<newsstats.conf>. |
| 324 | |
| 325 | =item B<--rawdb> I<table> (raw data table) |
| 326 | |
| 327 | Override I<DBTableRaw> from F<newsstats.conf>. |
| 328 | |
| 329 | =item B<--groupsdb> I<table> (postings per group table) |
| 330 | |
| 331 | Override I<DBTableGrps> from F<newsstats.conf>. |
| 332 | |
| 333 | =item B<--clientsdb> I<table> (client data table) |
| 334 | |
| 335 | Override I<DBTableClnts> from F<newsstats.conf>. |
| 336 | |
| 337 | =item B<--hostsdb> I<table> (host data table) |
| 338 | |
| 339 | Override I<DBTableHosts> from F<newsstats.conf>. |
| 340 | |
| 341 | =back |
| 342 | |
| 343 | =head1 INSTALLATION |
| 344 | |
| 345 | See L<doc/INSTALL>. |
| 346 | |
| 347 | =head1 EXAMPLES |
| 348 | |
| 349 | Process all types of information for lasth month: |
| 350 | |
| 351 | gatherstats |
| 352 | |
| 353 | Do a dry run, showing results of processing: |
| 354 | |
| 355 | gatherstats --debug --test |
| 356 | |
| 357 | Process all types of information for January of 2010: |
| 358 | |
| 359 | gatherstats --month 2010-01 |
| 360 | |
| 361 | Process only number of postings for the year of 2010, |
| 362 | checking against checkgroups-*: |
| 363 | |
| 364 | gatherstats -m 2010-01:2010-12 -s groups -c checkgroups |
| 365 | |
| 366 | =head1 FILES |
| 367 | |
| 368 | =over 4 |
| 369 | |
| 370 | =item F<gatherstats.pl> |
| 371 | |
| 372 | The script itself. |
| 373 | |
| 374 | =item F<NewsStats.pm> |
| 375 | |
| 376 | Library functions for the NewsStats package. |
| 377 | |
| 378 | =item F<newsstats.conf> |
| 379 | |
| 380 | Runtime configuration file. |
| 381 | |
| 382 | =back |
| 383 | |
| 384 | =head1 BUGS |
| 385 | |
| 386 | Please report any bugs or feature requests to the author or use the |
| 387 | bug tracker at L<http://bugs.th-h.de/>! |
| 388 | |
| 389 | =head1 SEE ALSO |
| 390 | |
| 391 | =over 2 |
| 392 | |
| 393 | =item - |
| 394 | |
| 395 | L<doc/README> |
| 396 | |
| 397 | =item - |
| 398 | |
| 399 | L<doc/INSTALL> |
| 400 | |
| 401 | =back |
| 402 | |
| 403 | This script is part of the B<NewsStats> package. |
| 404 | |
| 405 | =head1 AUTHOR |
| 406 | |
| 407 | Thomas Hochstein <thh@inter.net> |
| 408 | |
| 409 | =head1 COPYRIGHT AND LICENSE |
| 410 | |
| 411 | Copyright (c) 2010-2012 Thomas Hochstein <thh@inter.net> |
| 412 | |
| 413 | This program is free software; you may redistribute it and/or modify it |
| 414 | under the same terms as Perl itself. |
| 415 | |
| 416 | =cut |