Commit | Line | Data |
---|---|---|
2832c235 TH |
1 | #! /usr/bin/perl -W |
2 | # | |
3 | # gatherstats.pl | |
4 | # | |
5 | # This script will gather statistical information from a database | |
6 | # containing headers and other information from an INN feed. | |
7 | # | |
8 | # It is part of the NewsStats package. | |
9 | # | |
10 | # Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
11 | # | |
12 | # It can be redistributed and/or modified under the same terms under | |
13 | # which Perl itself is published. | |
14 | ||
15 | BEGIN { | |
16 | our $VERSION = "0.01"; | |
17 | use File::Basename; | |
18 | push(@INC, dirname($0)); | |
19 | } | |
20 | use strict; | |
21 | ||
ad609792 | 22 | use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList); |
2832c235 TH |
23 | |
24 | use DBI; | |
25 | ||
26 | ################################# Definitions ################################## | |
27 | ||
28 | # define types of information that can be gathered | |
29 | # all / groups (/ clients / hosts) | |
30 | my %LegalTypes; | |
31 | @LegalTypes{('all','groups')} = (); | |
32 | ||
33 | ################################# Main program ################################# | |
34 | ||
35 | ### read commandline options | |
ad609792 | 36 | my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:'); |
2832c235 TH |
37 | |
38 | ### read configuration | |
39 | my %Conf = %{ReadConfig('newsstats.conf')}; | |
40 | ||
41 | ### override configuration via commandline options | |
42 | my %ConfOverride; | |
43 | $ConfOverride{'DBTableRaw'} = $Options{'r'} if $Options{'r'}; | |
44 | $ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; | |
45 | $ConfOverride{'DBTableClnts'} = $Options{'c'} if $Options{'c'}; | |
46 | $ConfOverride{'DBTableHosts'} = $Options{'s'} if $Options{'s'}; | |
47 | $ConfOverride{'TLH'} = $Options{'n'} if $Options{'n'}; | |
48 | &OverrideConfig(\%Conf,\%ConfOverride); | |
49 | ||
50 | ### get type of information to gather, defaulting to 'all' | |
51 | $Options{'t'} = 'all' if !$Options{'t'}; | |
52 | die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Options{'t'}}); | |
53 | ||
54 | ### get time period (-m or -p) | |
55 | my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); | |
56 | ||
ad609792 TH |
57 | ### read newsgroups list from -l |
58 | my %ValidGroups = %{&ReadGroupList($Options{'l'})} if $Options{'l'}; | |
59 | ||
2832c235 TH |
60 | ### init database |
61 | my $DBHandle = InitDB(\%Conf,1); | |
62 | ||
63 | ### get data for each month | |
64 | warn "$MySelf: W: Output only mode. Database is not updated.\n" if $Options{'o'}; | |
65 | foreach my $Month (&ListMonth($StartMonth,$EndMonth)) { | |
66 | ||
67 | print "---------- $Month ----------\n" if $Options{'d'}; | |
68 | ||
69 | if ($Options{'t'} eq 'all' or $Options{'t'} eq 'groups') { | |
70 | ### ---------------------------------------------- | |
71 | ### get groups data (number of postings per group) | |
72 | # get groups data from raw table for given month | |
73 | my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s WHERE day LIKE ? AND NOT disregard",$Conf{'DBDatabase'},$Conf{'DBTableRaw'})); | |
74 | $DBQuery->execute($Month.'-%') or die sprintf("$MySelf: E: Can't get groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableRaw'}); | |
75 | ||
76 | # count postings per group | |
77 | my %Postings; | |
2832c235 TH |
78 | while (($_) = $DBQuery->fetchrow_array) { |
79 | # get list of newsgroups and hierarchies from Newsgroups: | |
89db2f90 | 80 | my %Newsgroups = ListNewsgroups($_,$Conf{'TLH'},$Options{'l'} ? \%ValidGroups : ''); |
2832c235 TH |
81 | # count each newsgroup and hierarchy once |
82 | foreach (sort keys %Newsgroups) { | |
2832c235 TH |
83 | $Postings{$_}++; |
84 | }; | |
85 | }; | |
86 | ||
ad609792 TH |
87 | # add valid but empty groups if -l is set |
88 | if (%ValidGroups) { | |
89 | foreach (sort keys %ValidGroups) { | |
90 | if (!defined($Postings{$_})) { | |
91 | $Postings{$_} = 0 ; | |
92 | warn (sprintf("ADDED: %s as empty group\n",$_)); | |
93 | } | |
94 | }; | |
95 | }; | |
96 | ||
71f0178b TH |
97 | # delete old data for that month |
98 | if (!$Options{'o'}) { | |
99 | $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'}),undef,$Month) | |
100 | or warn sprintf("$MySelf: E: Can't delete old groups data for %s from %s.%s: $DBI::errstr\n",$Month,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); | |
101 | }; | |
102 | ||
2832c235 TH |
103 | print "----- GroupStats -----\n" if $Options{'d'}; |
104 | foreach my $Newsgroup (sort keys %Postings) { | |
105 | print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; | |
106 | if (!$Options{'o'}) { | |
107 | # write to database | |
71f0178b TH |
108 | $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); |
109 | # $DBQuery = $DBHandle->prepare(sprintf("REPLACE INTO %s.%s (month,newsgroup,postings) VALUES (?, ?, ?)",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); | |
2832c235 TH |
110 | $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup}) or die sprintf("$MySelf: E: Can't write groups data for %s/%s to %s.%s: $DBI::errstr\n",$Month,$Newsgroup,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); |
111 | $DBQuery->finish; | |
112 | }; | |
113 | }; | |
114 | } else { | |
115 | # other types of information go here - later on | |
116 | }; | |
117 | }; | |
118 | ||
119 | ### close handles | |
120 | $DBHandle->disconnect; | |
121 | ||
122 | __END__ | |
123 | ||
124 | ################################ Documentation ################################# | |
125 | ||
126 | =head1 NAME | |
127 | ||
128 | gatherstats - process statistical data from a raw source | |
129 | ||
130 | =head1 SYNOPSIS | |
131 | ||
ad609792 | 132 | B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>] |
2832c235 TH |
133 | |
134 | =head1 REQUIREMENTS | |
135 | ||
136 | See doc/README: Perl 5.8.x itself and the following modules from CPAN: | |
137 | ||
138 | =over 2 | |
139 | ||
140 | =item - | |
141 | ||
142 | Config::Auto | |
143 | ||
144 | =item - | |
145 | ||
146 | DBI | |
147 | ||
148 | =back | |
149 | ||
150 | =head1 DESCRIPTION | |
151 | ||
152 | This script will extract and process statistical information from a | |
153 | database table which is fed from F<feedlog.pl> for a given time period | |
313610f6 TH |
154 | and write its results to (an)other database table(s). Entries marked |
155 | with I<'disregard'> in the database will be ignored; currently, you have | |
156 | to set this flag yourself, using your database management tools. You | |
157 | can exclude erroneous entries that way (e.g. automatic reposts (think | |
158 | of cancel floods and resurrectors); spam; ...). | |
2832c235 TH |
159 | |
160 | The time period to act on defaults to last month; you can assign | |
161 | another month via the B<-m> switch or a time period via the B<-p> | |
162 | switch; the latter takes precedence. | |
163 | ||
164 | By default B<gatherstats> will process all types of information; you | |
165 | can change that using the B<-t> switch and assigning the type of | |
166 | information to process. Currently only processing of the number of | |
167 | postings per group per month is implemented anyway, so that doesn't | |
168 | matter yet. | |
169 | ||
170 | Possible information types include: | |
171 | ||
172 | =over 3 | |
173 | ||
174 | =item B<groups> (postings per group per month) | |
175 | ||
176 | B<gatherstats> will examine Newsgroups: headers. Crosspostings will be | |
177 | counted for each single group they appear in. Groups not in I<TLH> | |
178 | will be ignored. | |
179 | ||
180 | B<gatherstats> will also add up the number of postings for each | |
181 | hierarchy level, but only count each posting once. A posting to | |
182 | de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL, | |
183 | respectively. A crossposting to de.alt.test and de.alt.admin, on the | |
184 | other hand, will be counted for de.alt.test and de.alt.admin each, but | |
185 | only once for de.alt.ALL and de.ALL. | |
186 | ||
187 | Data is written to I<DBTableGrps> (see doc/INSTALL). | |
188 | ||
189 | =back | |
190 | ||
191 | =head2 Configuration | |
192 | ||
193 | F<gatherstats.pl> will read its configuration via Config::Auto from | |
194 | F<newsstats.conf>, which should be present in the same directory. | |
195 | ||
196 | See doc/INSTALL for an overview of possible configuration options. | |
197 | ||
198 | You can override configuration options via the B<-n>, B<-r>, B<-g>, | |
199 | B<-c> and B<-s> switches, respectively. | |
200 | ||
201 | =head1 OPTIONS | |
202 | ||
203 | =over 3 | |
204 | ||
205 | =item B<-V> (version) | |
206 | ||
207 | Print out version and copyright information on B<gatherstats> and exit. | |
208 | ||
209 | =item B<-h> (help) | |
210 | ||
211 | Print this man page and exit. | |
212 | ||
213 | =item B<-d> (debug) | |
214 | ||
215 | Output debugging information to STDOUT while processing (number of | |
216 | postings per group). | |
217 | ||
218 | =item B<-o> (output only) | |
219 | ||
220 | Do not write results to database. You should use B<-d> in conjunction | |
221 | with B<-o> ... everything else seems a bit pointless. | |
222 | ||
223 | =item B<-m> I<YYYY-MM> (month) | |
224 | ||
225 | Set processing period to a month in YYYY-MM format. Ignored if B<-p> | |
226 | is set. | |
227 | ||
228 | =item B<-p> I<YYYY-MM:YYYY-MM> (period) | |
229 | ||
230 | Set processing period to a time period between two months, each in | |
231 | YYYY-MM format, separated by a colon. Overrides B<-m>. | |
232 | ||
233 | =item B<-t> I<type> (type) | |
234 | ||
235 | Set processing type to one of I<all> and I<groups>. Defaults to all | |
236 | (and is currently rather pointless as only I<groups> has been | |
237 | implemented). | |
238 | ||
ad609792 TH |
239 | =item B<-l> I<filename> (check against list) |
240 | ||
241 | Check each group against a list of valid newsgroups read from | |
242 | I<filename>, one group on each line and ignoring everything after the | |
243 | first whitespace (so you can use a file in checkgroups format or (part | |
244 | of) your INN active file). | |
245 | ||
246 | Newsgroups not found in I<filename> will be dropped (and logged to | |
247 | STDERR), and newsgroups found in I<filename> but having no postings | |
248 | will be added with a count of 0 (and logged to STDERR). | |
249 | ||
2832c235 TH |
250 | =item B<-n> I<TLH> (newsgroup hierarchy) |
251 | ||
252 | Override I<TLH> from F<newsstats.conf>. | |
253 | ||
254 | =item B<-r> I<table> (raw data table) | |
255 | ||
256 | Override I<DBTableRaw> from F<newsstats.conf>. | |
257 | ||
258 | =item B<-g> I<table> (postings per group table) | |
259 | ||
260 | Override I<DBTableGrps> from F<newsstats.conf>. | |
261 | ||
262 | =item B<-c> I<table> (client data table) | |
263 | ||
264 | Override I<DBTableClnts> from F<newsstats.conf>. | |
265 | ||
266 | =item B<-s> I<table> (server/host data table) | |
267 | ||
268 | Override I<DBTableHosts> from F<newsstats.conf>. | |
269 | ||
270 | =back | |
271 | ||
272 | =head1 INSTALLATION | |
273 | ||
274 | See doc/INSTALL. | |
275 | ||
276 | =head1 EXAMPLES | |
277 | ||
278 | Process all types of information for last month: | |
279 | ||
280 | gatherstats | |
281 | ||
282 | Do a dry run, showing results of processing: | |
283 | ||
284 | gatherstats -do | |
285 | ||
286 | Process all types of information for January of 2010: | |
287 | ||
288 | gatherstats -m 2010-01 | |
289 | ||
ad609792 TH |
290 | Process only number of postings for the year of 2010, |
291 | checking against checkgroups-2010.txt: | |
2832c235 | 292 | |
ad609792 | 293 | gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt |
2832c235 TH |
294 | |
295 | =head1 FILES | |
296 | ||
297 | =over 4 | |
298 | ||
299 | =item F<gatherstats.pl> | |
300 | ||
301 | The script itself. | |
302 | ||
303 | =item F<NewsStats.pm> | |
304 | ||
305 | Library functions for the NewsStats package. | |
306 | ||
307 | =item F<newsstats.conf> | |
308 | ||
309 | Runtime configuration file for B<gatherstats>. | |
310 | ||
311 | =back | |
312 | ||
313 | =head1 BUGS | |
314 | ||
315 | Please report any bugs or feature requests to the author or use the | |
316 | bug tracker at L<http://bugs.th-h.de/>! | |
317 | ||
318 | =head1 SEE ALSO | |
319 | ||
320 | =over 2 | |
321 | ||
322 | =item - | |
323 | ||
324 | doc/README | |
325 | ||
326 | =item - | |
327 | ||
328 | doc/INSTALL | |
329 | ||
330 | =back | |
331 | ||
332 | This script is part of the B<NewsStats> package. | |
333 | ||
334 | =head1 AUTHOR | |
335 | ||
336 | Thomas Hochstein <thh@inter.net> | |
337 | ||
338 | =head1 COPYRIGHT AND LICENSE | |
339 | ||
340 | Copyright (c) 2010 Thomas Hochstein <thh@inter.net> | |
341 | ||
342 | This program is free software; you may redistribute it and/or modify it | |
343 | under the same terms as Perl itself. | |
344 | ||
345 | =cut |