Commit | Line | Data |
---|---|---|
2832c235 TH |
1 | #! /usr/bin/perl -W |
2 | # | |
3 | # gatherstats.pl | |
4 | # | |
5 | # This script will gather statistical information from a database | |
6 | # containing headers and other information from an INN feed. | |
7 | # | |
8 | # It is part of the NewsStats package. | |
9 | # | |
10 | # Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
11 | # | |
12 | # It can be redistributed and/or modified under the same terms under | |
13 | # which Perl itself is published. | |
14 | ||
15 | BEGIN { # executed at compile time, i.e. before the 'use' statements further down are processed | |
16 | our $VERSION = "0.01"; # package-global script version | |
17 | use File::Basename; # provides dirname() | |
18 | push(@INC, dirname($0)); # also search the script's own directory for modules | |
19 | } | |
20 | use strict; | |
21 | ||
ad609792 | 22 | use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList); |
2832c235 TH |
23 | |
24 | use DBI; | |
25 | ||
26 | ################################# Definitions ################################## | |
27 | ||
28 | # define types of information that can be gathered (the legal values for -t) | |
29 | # all / groups (/ clients / hosts - mentioned here, but not enabled below) | |
30 | my %LegalTypes; # used as a set: only the existence of a key is checked | |
31 | @LegalTypes{('all','groups')} = (); # hash slice: creates both keys with undef values | |
32 | ||
33 | ################################# Main program ################################# | |
34 | ||
35 | ### read commandline options (NOTE(review): spec looks getopt-style - 'd' and 'o' flags, letters followed by ':' take a value; confirm in NewsStats.pm) | |
ad609792 | 36 | my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:'); |
2832c235 TH |
37 | |
38 | ### read configuration (ReadConfig returns a hash reference which is copied into %Conf) | |
39 | my %Conf = %{ReadConfig('newsstats.conf')}; | |
40 | ||
41 | ### override configuration via commandline options (-r, -g, -c, -s, -n); unset or false options leave the configuration untouched | |
42 | my %ConfOverride; | |
43 | $ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'}; | |
44 | $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; | |
45 | $ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'}; | |
46 | $ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'}; | |
47 | $ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'}; | |
48 | &OverrideConfig(\%Conf,\%ConfOverride); | |
49 | ||
50 | ### get type of information to gather, defaulting to 'all'; abort if the type is not a key of %LegalTypes | |
51 | $Options{'t'} = 'all' if !$Options{'t'}; | |
52 | die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}}); | |
53 | ||
54 | ### get time period (-m or -p); GetTimePeriod returns the first and the last month to process | |
55 | my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); | |
56 | ||
ad609792 TH |
57 | ### read newsgroups list from -l; %ValidGroups is explicitly empty without -l (a 'my ... if' declaration has undefined behaviour, see perlsyn) |
58 | my %ValidGroups = $Options{'l'} ? %{&ReadGroupList($Options{'l'})} : (); | |
59 | ||
2832c235 TH |
60 | ### init database |
61 | my $DBHandle = InitDB(\%Conf,1); | |
62 | ||
63 | ### get data for each month: read raw Newsgroups: headers, count postings per group/hierarchy, write the counts back unless -o is set | |
64 | warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'}; | |
65 | foreach my $Month (&ListMonth($StartMonth,$EndMonth)) { | |
66 | ||
67 | print "---------- $Month ----------\n" if $Options{'d'}; | |
68 | ||
69 | if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') { | |
70 | ### ---------------------------------------------- | |
71 | ### get groups data (number of postings per group) | |
72 | # get groups data from raw table for given month (all rows whose day starts with "$Month-" and that are not flagged 'disregard') | |
73 | my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'})); | |
74 | $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'}); | |
75 | ||
76 | # count postings per group (key: newsgroup or hierarchy name, value: number of postings in this month) | |
77 | my %Postings; | |
2832c235 TH |
78 | while (my ($NewsgroupsHeader) = $DBQuery->fetchrow_array) { |
79 | # get list of newsgroups and hierarchies from Newsgroups: (checked against %ValidGroups if -l is set) | |
ad609792 | 80 | my %Newsgroups = ListNewsgroups($NewsgroupsHeader,$Options{'l'} ? \%ValidGroups : ''); |
2832c235 TH |
81 | # count each newsgroup and hierarchy once per posting (order is irrelevant for counting, so no sort) |
82 | foreach (keys %Newsgroups) { | |
2832c235 TH |
83 | $Postings{$_}++; |
84 | }; | |
85 | }; | |
86 | ||
ad609792 TH |
87 | # add valid but empty groups if -l is set, so they are reported with a count of 0 |
88 | if (%ValidGroups) { | |
89 | foreach (sort keys %ValidGroups) { | |
90 | if (!defined($Postings{$_})) { | |
91 | $Postings{$_} = 0 ; | |
92 | warn (sprintf("ADDED: %s as empty group\n",$_)); | |
93 | } | |
94 | }; | |
95 | }; | |
96 | ||
2832c235 TH |
97 | print "----- GroupStats -----\n" if $Options{'d'}; |
98 | foreach my $Newsgroup (sort keys %Postings) { | |
99 | print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; | |
100 | if (!$Options{'o'}) { | |
101 | # write to database; prepare_cached reuses one statement handle instead of re-preparing the same SQL for every group and month | |
102 | $DBQuery = $DBHandle->prepare_cached(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); | |
103 | $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); | |
104 | $DBQuery->finish; | |
105 | }; | |
106 | }; | |
107 | } else { | |
108 | # other types of information go here - later on | |
109 | }; | |
110 | }; | |
111 | ||
112 | ### close database connection | |
113 | $DBHandle->disconnect; | |
114 | ||
115 | __END__ | |
116 | ||
117 | ################################ Documentation ################################# | |
118 | ||
119 | =head1 NAME | |
120 | ||
121 | gatherstats - process statistical data from a raw source | |
122 | ||
123 | =head1 SYNOPSIS | |
124 | ||
ad609792 | 125 | B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>] |
2832c235 TH |
126 | |
127 | =head1 REQUIREMENTS | |
128 | ||
129 | See doc/README: Perl 5.8.x itself and the following modules from CPAN: | |
130 | ||
131 | =over 2 | |
132 | ||
133 | =item - | |
134 | ||
135 | Config::Auto | |
136 | ||
137 | =item - | |
138 | ||
139 | DBI | |
140 | ||
141 | =back | |
142 | ||
143 | =head1 DESCRIPTION | |
144 | ||
145 | This script will extract and process statistical information from a | |
146 | database table which is fed from F<feedlog.pl> for a given time period | |
313610f6 TH |
147 | and write its results to (an)other database table(s). Entries marked |
148 | with I<'disregard'> in the database will be ignored; currently, you have | |
149 | to set this flag yourself, using your database management tools. You | |
150 | can exclude erroneous entries that way (e.g. automatic reposts (think | |
151 | of cancel floods and resurrectors); spam; ...). | |
2832c235 TH |
152 | |
153 | The time period to act on defaults to last month; you can assign | |
154 | another month via the B<-m> switch or a time period via the B<-p> | |
155 | switch; the latter takes precedence. | |
156 | ||
157 | By default B<gatherstats> will process all types of information; you | |
158 | can change that using the B<-t> switch and assigning the type of | |
159 | information to process. Currently only processing of the number of | |
160 | postings per group per month is implemented anyway, so that doesn't | |
161 | matter yet. | |
162 | ||
163 | Possible information types include: | |
164 | ||
165 | =over 3 | |
166 | ||
167 | =item B<groups> (postings per group per month) | |
168 | ||
169 | B<gatherstats> will examine Newsgroups: headers. Crosspostings will be | |
170 | counted for each single group they appear in. Groups not in I<TLH> | |
171 | will be ignored. | |
172 | ||
173 | B<gatherstats> will also add up the number of postings for each | |
174 | hierarchy level, but only count each posting once. A posting to | |
175 | de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, | |
176 | respectively. A crossposting to de.alt.test and de.alt.admin, on the | |
177 | other hand, will be counted for de.alt.test and de.alt.admin each, but | |
178 | only once for de.alt.ALL and de.ALL. | |
179 | ||
180 | Data is written to I<DBTableGrps> (see doc/INSTALL). | |
181 | ||
182 | =back | |
183 | ||
184 | =head2 Configuration | |
185 | ||
186 | F<gatherstats.pl> will read its configuration via Config::Auto from | |
187 | F<newsstats.conf>, which should be present in the same directory. | |
188 | ||
189 | See doc/INSTALL for an overview of possible configuration options. | |
190 | ||
191 | You can override configuration options via the B<-n>, B<-r>, B<-g>, | |
192 | B<-c> and B<-s> switches, respectively. | |
193 | ||
194 | =head1 OPTIONS | |
195 | ||
196 | =over 3 | |
197 | ||
198 | =item B<-V> (version) | |
199 | ||
200 | Print out version and copyright information on B<gatherstats> and exit. | |
201 | ||
202 | =item B<-h> (help) | |
203 | ||
204 | Print this man page and exit. | |
205 | ||
206 | =item B<-d> (debug) | |
207 | ||
208 | Output debugging information to STDOUT while processing (number of | |
209 | postings per group). | |
210 | ||
211 | =item B<-o> (output only) | |
212 | ||
213 | Do not write results to database. You should use B<-d> in conjunction | |
214 | with B<-o> ... everything else seems a bit pointless. | |
215 | ||
216 | =item B<-m> I<YYYY-MM> (month) | |
217 | ||
218 | Set processing period to a month in YYYY-MM format. Ignored if B<-p> | |
219 | is set. | |
220 | ||
221 | =item B<-p> I<YYYY-MM:YYYY-MM> (period) | |
222 | ||
223 | Set processing period to a time period between two months, each in | |
224 | YYYY-MM format, separated by a colon. Overrides B<-m>. | |
225 | ||
226 | =item B<-t> I<type> (type) | |
227 | ||
228 | Set processing type to one of I<all> and I<groups>. Defaults to all | |
229 | (and is currently rather pointless as only I<groups> has been | |
230 | implemented). | |
231 | ||
ad609792 TH |
232 | =item B<-l> I<filename> (check against list) |
233 | ||
234 | Check each group against a list of valid newsgroups read from | |
235 | I<filename>, one group on each line and ignoring everything after the | |
236 | first whitespace (so you can use a file in checkgroups format or (part | |
237 | of) your INN active file). | |
238 | ||
239 | Newsgroups not found in I<filename> will be dropped (and logged to | |
240 | STDERR), and newsgroups found in I<filename> but having no postings | |
241 | will be added with a count of 0 (and logged to STDERR). | |
242 | ||
2832c235 TH |
243 | =item B<-n> I<TLH> (newsgroup hierarchy) |
244 | ||
245 | Override I<TLH> from F<newsstats.conf>. | |
246 | ||
247 | =item B<-r> I<table> (raw data table) | |
248 | ||
249 | Override I<DBTableRaw> from F<newsstats.conf>. | |
250 | ||
251 | =item B<-g> I<table> (postings per group table) | |
252 | ||
253 | Override I<DBTableGrps> from F<newsstats.conf>. | |
254 | ||
255 | =item B<-c> I<table> (client data table) | |
256 | ||
257 | Override I<DBTableClnts> from F<newsstats.conf>. | |
258 | ||
259 | =item B<-s> I<table> (server/host data table) | |
260 | ||
261 | Override I<DBTableHosts> from F<newsstats.conf>. | |
262 | ||
263 | =back | |
264 | ||
265 | =head1 INSTALLATION | |
266 | ||
267 | See doc/INSTALL. | |
268 | ||
269 | =head1 EXAMPLES | |
270 | ||
271 | Process all types of information for last month: | |
272 | ||
273 | gatherstats | |
274 | ||
275 | Do a dry run, showing results of processing: | |
276 | ||
277 | gatherstats -do | |
278 | ||
279 | Process all types of information for January of 2010: | |
280 | ||
281 | gatherstats -m 2010-01 | |
282 | ||
ad609792 TH |
283 | Process only number of postings for the year of 2010, |
284 | checking against checkgroups-2010.txt: | |
2832c235 | 285 | |
ad609792 | 286 | gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt |
2832c235 TH |
287 | |
288 | =head1 FILES | |
289 | ||
290 | =over 4 | |
291 | ||
292 | =item F<gatherstats.pl> | |
293 | ||
294 | The script itself. | |
295 | ||
296 | =item F<NewsStats.pm> | |
297 | ||
298 | Library functions for the NewsStats package. | |
299 | ||
300 | =item F<newsstats.conf> | |
301 | ||
302 | Runtime configuration file for B<NewsStats>. | |
303 | ||
304 | =back | |
305 | ||
306 | =head1 BUGS | |
307 | ||
308 | Please report any bugs or feature requests to the author or use the | |
309 | bug tracker at L<http://bugs.th-h.de/>! | |
310 | ||
311 | =head1 SEE ALSO | |
312 | ||
313 | =over 2 | |
314 | ||
315 | =item - | |
316 | ||
317 | doc/README | |
318 | ||
319 | =item - | |
320 | ||
321 | doc/INSTALL | |
322 | ||
323 | =back | |
324 | ||
325 | This script is part of the B<NewsStats> package. | |
326 | ||
327 | =head1 AUTHOR | |
328 | ||
329 | Thomas Hochstein <thh@inter.net> | |
330 | ||
331 | =head1 COPYRIGHT AND LICENSE | |
332 | ||
333 | Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
334 | ||
335 | This program is free software; you may redistribute it and/or modify it | |
336 | under the same terms as Perl itself. | |
337 | ||
338 | =cut |