# Define a log format nicknamed "robo-cop": request time (%t), remote host
# (%h), the User-agent request header in double quotes, and the requested
# URL path (%U), separated by single spaces.
LogFormat "%t %h \"%{User-agent}i\" %U" robo-cop
# Log every request to log/robot_log using the robo-cop format above.
CustomLog log/robot_log robo-cop
[08/Feb/1998:12:28:35 -0500] phila249-pri.voicenet.com "Mozilla/3.01 (Win95; U)" /cgi-bin/fortune
#!/usr/local/bin/perl
use Time::ParseDate;
use strict 'vars';
# Gaps of 30 minutes or more between two hits from the same client are
# treated as the start of a new session and are left out of the
# mean-interval statistics.
use constant MAX_INTERVAL => 60*30;

# Per-client tallies, keyed by "host:agent":
#   %HITS            - number of counted document hits
#   %INT_NUMERATOR   - sum of the intervals (seconds) between successive hits
#   %INT_DENOMINATOR - number of intervals summed in %INT_NUMERATOR
#   %POLITE          - number of times the client fetched /robots.txt
#   %LAST            - time (epoch seconds) of the client's previous hit
# $HITS is the total number of counted hits across all clients.
my (%HITS,%INT_NUMERATOR,%INT_DENOMINATOR,%POLITE,%LAST,$HITS);

my $file = shift;
die "Usage: $0 logfile[.gz]\n" unless defined $file && length $file;

# Open the log.  Gzipped logs are read through zcat using the list form of
# a piped open, so the filename is never interpreted by a shell.  "-" keeps
# its traditional meaning of standard input.
my $in;
if ($file eq '-') {
    $in = \*STDIN;
}
elsif ($file =~ /\.gz$/) {
    open($in, '-|', 'zcat', $file) or die "Can't open file/pipe: $!";
}
else {
    open($in, '<', $file) or die "Can't open file/pipe: $!";
}

while (my $line = <$in>) {
    # Expected line: [date] host "user agent" /url/path
    my ($date,$host,$agent,$URL) = $line =~ /^\[(.+)\] (\S+) "(.*)" (\S+)$/;
    next unless defined $URL;                    # not a robo-cop log line
    next unless $URL =~ /\.(html|htm|txt)$/;     # only count text documents

    $HITS++;
    my $client = "$host:$agent";                 # concatenate host and agent
    $HITS{$client}++;

    # parsedate() yields undef for a date it cannot understand; such a hit
    # is still counted, but it must not take part in interval arithmetic.
    my $seconds = parsedate($date);
    if (defined $seconds) {
        if (defined $LAST{$client}) {
            my $interval = $seconds - $LAST{$client};
            if ($interval < MAX_INTERVAL) {
                $INT_NUMERATOR{$client} += $interval;
                $INT_DENOMINATOR{$client}++;
            }
        }
        $LAST{$client} = $seconds;
    }

    $POLITE{$client}++ if $URL eq '/robots.txt';

    # Progress indicator on STDERR every 1000 counted hits.
    print STDERR $HITS,"\n" if ($HITS % 1000) == 0;
}

# For a pipe, close() also reports a non-zero exit from zcat.  Warn rather
# than die so that whatever was read is still reported.
close($in) or warn "Problem closing $file: ", ($! || "exit status $?"), "\n";
# Print a tab-separated report, one row per client, sorted by hit count in
# descending order.  Columns:
#   Client      - "host:agent" key
#   Robot       - 'yes' if the client ever requested /robots.txt
#   Hits        - counted hits from this client
#   Interval    - mean number of seconds between successive hits
#   Hit_Percent - this client's share of all counted hits, in percent
#   Index       - Hit_Percent divided by Interval
print join("\t",qw/Client Robot Hits Interval Hit_Percent Index/),"\n";
foreach my $client (sort {$HITS{$b}<=>$HITS{$a}} keys %HITS) {
    next unless $HITS{$client} >= 5;                     # not enough total hits to mean much
    next unless ($INT_DENOMINATOR{$client} || 0) >= 5;   # not enough consecutive hits to mean much

    my $mean_interval = $INT_NUMERATOR{$client}/$INT_DENOMINATOR{$client};
    my $percent_hits  = 100*($HITS{$client}/$HITS);

    # A client whose hits all arrived within the same second has a mean
    # interval of 0.  Dividing by it would abort the whole report with
    # "Illegal division by zero", so report the index as 'inf' instead.
    my $index = $mean_interval != 0 ? $percent_hits/$mean_interval : 'inf';

    print join("\t",
               $client,
               $POLITE{$client} ? 'yes' : 'no',
               $HITS{$client},
               $mean_interval,
               $percent_hits,
               $index
              ),"\n";
}
|
|
| Contents | Next |