Script I.1.7: Catching Rude Robots

LogFormat Directive

LogFormat "%t %h \"%{User-agent}i\" %U" robo-cop

CustomLog log/robot_log robo-cop

 [08/Feb/1998:12:28:35 -0500] phila249-pri.voicenet.com "Mozilla/3.01 (Win95; U)" /cgi-bin/fortune

Source

 #!/usr/local/bin/perl
 
 use Time::ParseDate;
 use strict 'vars';
 
 # Gap, in seconds, after which a client's next hit is treated as the start
 # of a new session (30 minutes); such gaps are left out of the averages.
 use constant MAX_INTERVAL => 60*30;
 # Per-client tallies, all keyed by the string "host:agent":
 #   %HITS             number of page hits
 #   %INT_NUMERATOR    sum of the within-session gaps between hits (seconds)
 #   %INT_DENOMINATOR  number of gaps summed into %INT_NUMERATOR
 #   %POLITE           number of requests for /robots.txt
 #   %LAST             time of the client's previous hit, as given by parsedate
 # $HITS is the grand total of page hits over all clients.
 my (%HITS,%INT_NUMERATOR,%INT_DENOMINATOR,%POLITE,%LAST,$HITS);
 
 # The log to analyze is the first command-line argument.  A name ending in
 # .gz is decompressed on the fly through zcat; "-" reads standard input.
 my $file = shift;
 die "Usage: $0 logfile[.gz]\n" unless defined $file && length $file;
 
 # Explicit-mode open: the file name is never parsed for mode characters
 # and, in the zcat case, never passes through a shell (list-form pipe).
 my $opened = $file eq '-'     ? open(IN, '<&', \*STDIN)
            : $file =~ /\.gz$/ ? open(IN, '-|', 'zcat', $file)
            :                    open(IN, '<', $file);
 $opened || die "Can't open file/pipe: $!";
 
 # Read the log line by line and tally page hits per client.
 while (<IN>) {
     # Expected layout: [date] host "user agent" URL
     my($date,$host,$agent,$URL) = /^\[(.+)\] (\S+) "(.*)" (\S+)$/;
     # Skip lines that do not fit the layout, and anything that is not a
     # page request (.html, .htm or .txt; the latter includes /robots.txt).
     next unless defined $URL && $URL=~/\.(html|htm|txt)$/;
 
     $HITS++;
     $host = "$host:$agent"; # concatenate host and agent
     $HITS{$host}++;
     $POLITE{$host}++ if $URL eq '/robots.txt';
     # Progress indicator: running total every 1000 page hits.
     print STDERR $HITS,"\n" if ($HITS % 1000) == 0;
 
     # A timestamp that cannot be parsed still counts as a hit, but must not
     # take part in interval arithmetic: treating it as 0 would produce a
     # huge negative gap and wreck the client's mean interval.
     my $seconds = parsedate($date);
     next unless defined $seconds;
 
     if (defined $LAST{$host}) {
         my $interval = $seconds - $LAST{$host};
         # Only gaps shorter than MAX_INTERVAL belong to the same session.
         if ($interval < MAX_INTERVAL) {
             $INT_NUMERATOR{$host} += $interval;
             $INT_DENOMINATOR{$host}++;
         }
     }
     $LAST{$host} = $seconds;
 }
 
 # Print one tab-separated row per client, sorted by hits (busiest first).
 # Columns:
 #   Client       the "host:agent" key
 #   Robot        yes if the client requested /robots.txt at least once
 #   Hits         page hits by this client
 #   Interval     mean gap in seconds between hits within a session
 #   Hit_Percent  this client's share of all page hits, in percent
 #   Index        Hit_Percent divided by Interval
 print join("\t",qw/Client Robot Hits Interval Hit_Percent Index/),"\n";
 foreach my $client (sort {$HITS{$b}<=>$HITS{$a}} keys %HITS) {
     next unless $HITS{$client} >= 5;                      # not enough total hits to mean much
     next unless ($INT_DENOMINATOR{$client} || 0) >= 5;    # not enough consecutive hits to mean much
 
     my $mean_interval = $INT_NUMERATOR{$client}/$INT_DENOMINATOR{$client};
     my $percent_hits = 100*($HITS{$client}/$HITS);
     # A client whose session hits all share one timestamp has a mean
     # interval of zero; report its index as infinite rather than dying
     # with "Illegal division by zero".
     my $index = $mean_interval == 0 ? 'inf' : $percent_hits/$mean_interval;
 
     print join("\t",
                $client,
                $POLITE{$client} ? 'yes' : 'no',
                $HITS{$client},
                $mean_interval,
                $percent_hits,
                $index
                ),"\n";
 }

<< Previous Contents >> Next >>

Lincoln D. Stein, lstein@cshl.org
Cold Spring Harbor Laboratory
Last modified: Mon Aug 17 10:41:50 EDT 1998