Commit | Line | Data |
---|---|---|
2832c235 TH |
1 | #! /usr/bin/perl -W |
2 | # | |
3 | # gatherstats.pl | |
4 | # | |
5 | # This script will gather statistical information from a database | |
6 | # containing headers and other information from a INN feed. | |
7 | # | |
8 | # It is part of the NewsStats package. | |
9 | # | |
10 | # Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
11 | # | |
12 | # It can be redistributed and/or modified under the same terms under | |
13 | # which Perl itself is published. | |
14 | ||
15 | BEGIN { | |
16 | our $VERSION = "0.01"; | |
17 | use File::Basename; | |
18 | push(@INC, dirname($0)); | |
19 | } | |
20 | use strict; | |
21 | ||
ad609792 | 22 | use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList); |
2832c235 TH |
23 | |
24 | use DBI; | |
25 | ||
26 | ################################# Definitions ################################## | |
27 | ||
28 | # define types of information that can be gathered | |
29 | # all / groups (/ clients / hosts) | |
30 | my %LegalTypes; | |
31 | @LegalTypes{('all','groups')} = (); | |
32 | ||
33 | ################################# Main program ################################# | |
34 | ||
35 | ### read commandline options | |
ad609792 | 36 | my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:'); |
2832c235 TH |
37 | |
38 | ### read configuration | |
39 | my %Conf = %{ReadConfig('newsstats.conf')}; | |
40 | ||
41 | ### override configuration via commandline options | |
42 | my %ConfOverride; | |
43 | $ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'}; | |
44 | $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; | |
45 | $ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'}; | |
46 | $ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'}; | |
47 | $ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'}; | |
48 | &OverrideConfig(\%Conf,\%ConfOverride); | |
49 | ||
50 | ### get type of information to gather, defaulting to 'all' | |
51 | $Options{'t'} = 'all' if !$Options{'t'}; | |
52 | die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}}); | |
53 | ||
54 | ### get time period (-m or -p) | |
55 | my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); | |
56 | ||
17ffbeba TH |
57 | ### reformat $Conf{'TLH'} |
58 | my $TLH; | |
59 | if ($Conf{'TLH'}) { | |
60 | # $Conf{'TLH'} is parsed as an array by Config::Auto; | |
61 | # make a flat list again, separated by : | |
62 | if (ref($TLH) eq 'ARRAY') { | |
63 | $TLH = join(':',@{$Conf{'TLH'}}); | |
64 | } else { | |
65 | $TLH = $Conf{'TLH'}; | |
66 | } | |
67 | # strip whitespace | |
68 | $TLH =~ s/\s//g; | |
69 | # check for illegal characters | |
70 | die "$MySelf: E: Config error - illegal characters in TLH definition\n" if ($TLH !~ /^[a-zA-Z0-9:]+$/); | |
71 | if ($TLH =~ /:/) { | |
72 | # reformat $TLH form a:b to (a)|(b) | |
73 | $TLH =~ s/:/)|(/g; | |
74 | $TLH = '(' . $TLH . ')'; | |
75 | }; | |
76 | }; | |
77 | ||
ad609792 TH |
78 | ### read newsgroups list from -l |
79 | my %ValidGroups = %{&ReadGroupList($Options{'l'})} if $Options{'l'}; | |
80 | ||
2832c235 TH |
81 | ### init database |
82 | my $DBHandle = InitDB(\%Conf,1); | |
83 | ||
84 | ### get data for each month | |
85 | warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'}; | |
86 | foreach my $Month (&ListMonth($StartMonth,$EndMonth)) { | |
87 | ||
88 | print "---------- $Month ----------\n" if $Options{'d'}; | |
89 | ||
90 | if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') { | |
91 | ### ---------------------------------------------- | |
92 | ### get groups data (number of postings per group) | |
93 | # get groups data from raw table for given month | |
94 | my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'})); | |
95 | $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'}); | |
96 | ||
97 | # count postings per group | |
98 | my %Postings; | |
2832c235 TH |
99 | while (($_) = $DBQuery->fetchrow_array) { |
100 | # get list oft newsgroups and hierarchies from Newsgroups: | |
17ffbeba | 101 | my %Newsgroups = ListNewsgroups($_,$TLH,$Options{'l'} ? \%ValidGroups : ''); |
2832c235 TH |
102 | # count each newsgroup and hierarchy once |
103 | foreach (sort keys %Newsgroups) { | |
2832c235 TH |
104 | $Postings{$_}++; |
105 | }; | |
106 | }; | |
107 | ||
ad609792 TH |
108 | # add valid but empty groups if -l is set |
109 | if (%ValidGroups) { | |
110 | foreach (sort keys %ValidGroups) { | |
111 | if (!defined($Postings{$_})) { | |
112 | $Postings{$_} = 0 ; | |
113 | warn (sprintf("ADDED: %s as empty group\n",$_)); | |
114 | } | |
115 | }; | |
116 | }; | |
117 | ||
71f0178b TH |
118 | # delete old data for that month |
119 | if (!$Options{'o'}) { | |
120 | $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}),undef,$Month) | |
121 | or warn sprintf("$MySelf: E: Can't delete old groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); | |
122 | }; | |
123 | ||
2832c235 TH |
124 | print "----- GroupStats -----\n" if $Options{'d'}; |
125 | foreach my $Newsgroup (sort keys %Postings) { | |
126 | print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; | |
127 | if (!$Options{'o'}) { | |
128 | # write to database | |
71f0178b TH |
129 | $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); |
130 | # $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); | |
2832c235 TH |
131 | $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); |
132 | $DBQuery->finish; | |
133 | }; | |
134 | }; | |
135 | } else { | |
136 | # other types of information go here - later on | |
137 | }; | |
138 | }; | |
139 | ||
140 | ### close handles | |
141 | $DBHandle->disconnect; | |
142 | ||
143 | __END__ | |
144 | ||
145 | ################################ Documentation ################################# | |
146 | ||
147 | =head1 NAME | |
148 | ||
149 | gatherstats - process statistical data from a raw source | |
150 | ||
151 | =head1 SYNOPSIS | |
152 | ||
ad609792 | 153 | B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>] |
2832c235 TH |
154 | |
155 | =head1 REQUIREMENTS | |
156 | ||
157 | See doc/README: Perl 5.8.x itself and the following modules from CPAN: | |
158 | ||
159 | =over 2 | |
160 | ||
161 | =item - | |
162 | ||
163 | Config::Auto | |
164 | ||
165 | =item - | |
166 | ||
167 | DBI | |
168 | ||
169 | =back | |
170 | ||
171 | =head1 DESCRIPTION | |
172 | ||
173 | This script will extract and process statistical information from a | |
174 | database table which is fed from F<feedlog.pl> for a given time period | |
313610f6 TH |
175 | and write its results to (an)other database table(s). Entries marked |
176 | with I<'disregard'> in the database will be ignored; currently, you have | |
177 | to set this flag yourself, using your database management tools. You | |
178 | can exclude erroneous entries that way (e.g. automatic reposts (think | |
179 | of cancels flood and resurrectors); spam; ...). | |
2832c235 TH |
180 | |
181 | The time period to act on defaults to last month; you can assign | |
182 | another month via the B<-m> switch or a time period via the B<-p> | |
183 | switch; the latter takes preference. | |
184 | ||
185 | By default B<gatherstats> will process all types of information; you | |
186 | can change that using the B<-t> switch and assigning the type of | |
187 | information to process. Currently only processing of the number of | |
188 | postings per group per month is implemented anyway, so that doesn't | |
189 | matter yet. | |
190 | ||
191 | Possible information types include: | |
192 | ||
193 | =over 3 | |
194 | ||
195 | =item B<groups> (postings per group per month) | |
196 | ||
197 | B<gatherstats> will examine Newsgroups: headers. Crosspostings will be | |
198 | counted for each single group they appear in. Groups not in I<TLH> | |
199 | will be ignored. | |
200 | ||
201 | B<gatherstats> will also add up the number of postings for each | |
202 | hierarchy level, but only count each posting once. A posting to | |
203 | de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, | |
204 | respectively. A crossposting to de.alt.test and de.alt.admin, on the | |
205 | other hand, will be counted for de.alt.test and de.alt.admin each, but | |
206 | only once for de.alt.ALL and de.ALL. | |
207 | ||
208 | Data is written to I<DBTableGrps> (see doc/INSTALL). | |
209 | ||
210 | =back | |
211 | ||
212 | =head2 Configuration | |
213 | ||
214 | F<gatherstats.pl> will read its configuration from F<newsstats.conf> | |
215 | which should be present in the same directory via Config::Auto. | |
216 | ||
217 | See doc/INSTALL for an overview of possible configuration options. | |
218 | ||
219 | You can override configuration options via the B<-n>, B<-r>, B<-g>, | |
220 | B<-c> and B<-s> switches, respectively. | |
221 | ||
222 | =head1 OPTIONS | |
223 | ||
224 | =over 3 | |
225 | ||
226 | =item B<-V> (version) | |
227 | ||
228 | Print out version and copyright information on B<yapfaq> and exit. | |
229 | ||
230 | =item B<-h> (help) | |
231 | ||
232 | Print this man page and exit. | |
233 | ||
234 | =item B<-d> (debug) | |
235 | ||
236 | Output debugging information to STDOUT while processing (number of | |
237 | postings per group). | |
238 | ||
239 | =item B<-o> (output only) | |
240 | ||
241 | Do not write results to database. You should use B<-d> in conjunction | |
242 | with B<-o> ... everything else seems a bit pointless. | |
243 | ||
244 | =item B<-m> I<YYYY-MM> (month) | |
245 | ||
246 | Set processing period to a month in YYYY-MM format. Ignored if B<-p> | |
247 | is set. | |
248 | ||
249 | =item B<-p> I<YYYY-MM:YYYY-MM> (period) | |
250 | ||
251 | Set processing period to a time period between two month, each in | |
252 | YYYY-MM format, separated by a colon. Overrides B<-m>. | |
253 | ||
254 | =item B<-t> I<type> (type) | |
255 | ||
256 | Set processing type to one of I<all> and I<groups>. Defaults to all | |
257 | (and is currently rather pointless as only I<groups> has been | |
258 | implemented). | |
259 | ||
ad609792 TH |
260 | =item B<-l> I<filename> (check against list) |
261 | ||
262 | Check each group against a list of valid newsgroups read from | |
263 | I<filename>, one group on each line and ignoring everything after the | |
264 | first whitespace (so you can use a file in checkgroups format or (part | |
265 | of) your INN active file). | |
266 | ||
267 | Newsgroups not found in I<filename> will be dropped (and logged to | |
268 | STDERR), and newsgroups found in I<filename> but having no postings | |
269 | will be added with a count of 0 (and logged to STDERR). | |
270 | ||
2832c235 TH |
271 | =item B<-n> I<TLH> (newsgroup hierarchy) |
272 | ||
273 | Override I<TLH> from F<newsstats.conf>. | |
274 | ||
275 | =item B<-r> I<table> (raw data table) | |
276 | ||
277 | Override I<DBTableRaw> from F<newsstats.conf>. | |
278 | ||
279 | =item B<-g> I<table> (postings per group table) | |
280 | ||
281 | Override I<DBTableGrps> from F<newsstats.conf>. | |
282 | ||
283 | =item B<-c> I<table> (client data table) | |
284 | ||
285 | Override I<DBTableClnts> from F<newsstats.conf>. | |
286 | ||
287 | =item B<-s> I<table> (server/host data table) | |
288 | ||
289 | Override I<DBTableHosts> from F<newsstats.conf>. | |
290 | ||
291 | =back | |
292 | ||
293 | =head1 INSTALLATION | |
294 | ||
295 | See doc/INSTALL. | |
296 | ||
297 | =head1 EXAMPLES | |
298 | ||
299 | Process all types of information for lasth month: | |
300 | ||
301 | gatherstats | |
302 | ||
303 | Do a dry run, showing results of processing: | |
304 | ||
305 | gatherstats -do | |
306 | ||
307 | Process all types of information for January of 2010: | |
308 | ||
309 | gatherstats -m 2010-01 | |
310 | ||
ad609792 TH |
311 | Process only number of postings for the year of 2010, |
312 | checking against checkgroups-2010.txt: | |
2832c235 | 313 | |
ad609792 | 314 | gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt |
2832c235 TH |
315 | |
316 | =head1 FILES | |
317 | ||
318 | =over 4 | |
319 | ||
320 | =item F<gatherstats.pl> | |
321 | ||
322 | The script itself. | |
323 | ||
324 | =item F<NewsStats.pm> | |
325 | ||
326 | Library functions for the NewsStats package. | |
327 | ||
328 | =item F<newsstats.conf> | |
329 | ||
330 | Runtime configuration file for B<yapfaq>. | |
331 | ||
332 | =back | |
333 | ||
334 | =head1 BUGS | |
335 | ||
336 | Please report any bugs or feature requests to the author or use the | |
337 | bug tracker at L<http://bugs.th-h.de/>! | |
338 | ||
339 | =head1 SEE ALSO | |
340 | ||
341 | =over 2 | |
342 | ||
343 | =item - | |
344 | ||
345 | doc/README | |
346 | ||
347 | =item - | |
348 | ||
349 | doc/INSTALL | |
350 | ||
351 | =back | |
352 | ||
353 | This script is part of the B<NewsStats> package. | |
354 | ||
355 | =head1 AUTHOR | |
356 | ||
357 | Thomas Hochstein <thh@inter.net> | |
358 | ||
359 | =head1 COPYRIGHT AND LICENSE | |
360 | ||
361 | Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
362 | ||
363 | This program is free software; you may redistribute it and/or modify it | |
364 | under the same terms as Perl itself. | |
365 | ||
366 | =cut |