#!/usr/bin/perl

use POSIX 'strftime';
use Time::Local;
use English;
use FileHandle;
use Math::BigInt;
use Getopt::Long;

# RCS identification keyword (expanded by the revision control system).
$RCS = '$Id$';

# Diagnostics and progress reports go to STDERR; keep them unbuffered.
STDERR->autoflush(1);

# Patterns for one log line.  Both capture, in order: host, rfc931 ident,
# auth user, [date], method, request, protocol, status code and byte count.
# The cache pattern additionally captures the rest of the line ($10), which
# the main loop splits into the extra per-request cache fields.
$cachepat = '^(\S+) (\S+) (\S+) (\[[^]]+\]) "(GET|POST|HEAD)\s+(\S+)(\s*\S*\s*)"\s+([0-9-]+)\s+([0-9-]+)\s*(\S.*)$';
$httpdpat = '^(\S+) (\S+) (\S+) (\[[^]]+\]) "(GET|POST|HEAD)\s+(\S+)(\s*\S*\s*)"\s+([0-9-]+)\s+([0-9-]+).*$';

# Defaults, overridable from the command line (--mode, --debug).
$mode = 1;              # 1 selects $cachepat, 2 selects $httpdpat
$debug = 1;
$log10 = log(10);

# Separate the arguments of every print() with a single space.
# This is the global $, variable: IO::Handle cannot set it per handle, and
# calling output_field_separator on STDOUT only produced a warning before
# changing this same global.  Note that it affects prints to STDERR as well.
$OUTPUT_FIELD_SEPARATOR = " ";

# Month abbreviations as they appear in log timestamps; the index of a name
# is its month number (0..11).
@months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# Highest number handed out so far for domains, client hosts, request hosts
# and URLs (the numbering itself is done in the main loop).
$dommax = 0;
$hostmax = 0;
$rhostmax = 0;
$urlmax = 0;

# Command line: --mode=N selects the log format, --debug=X the verbosity.
# The options are stored straight into $mode / $debug, which keep their
# preset defaults when an option is absent.  Exit status 5 on a bad option.
GetOptions("mode=i" => \$mode, "debug=f" => \$debug)
    or exit(5);

# Pick the line pattern for the selected log format.
if ($mode == 1) {
    $pat = $cachepat;
}
elsif ($mode == 2) {    # comparison, not assignment: other modes must fail
    $pat = $httpdpat;
}
else {
    die "Unknown log file type: $mode\n";
}

# Month abbreviation => month number (0..11), the form timelocal() expects.
my %month_num;
@month_num{@months} = (0 .. $#months);

# Main loop: read log lines from the files named on the command line (or
# STDIN), and print one space-separated record per valid line to STDOUT.
# Hosts, domains, request hosts and URLs are replaced by small integers
# handed out in order of first appearance.
#
# Output columns:
#    1 time (epoch seconds, log timestamp interpreted as local time)
#    2 host (number)
#    3 domain (number; three-level for *.{co,ac,...}.*, otherwise two-level)
#    4 method (G/P/H)
#    5 request host (number)
#    6 request URL without query string (number)
#    7 request type (file extension, or "{dir}")
#    8 status code
#    9 bytes
# and, in mode 1 (cache log) only:
#   10 response code, remote server
#   11 response length, remote server
#   12 POST length, client -> proxy
#   13 POST length, proxy -> remote server
#   14 header length, client -> proxy
#   15 header length, proxy -> client
#   16 header length, proxy's request to the remote server
#   17 header length, remote -> proxy
#   18 time to serve
LINE: while (defined(my $line = <>)) {
    # chomp rather than chop: a last line without a trailing newline must
    # not lose its final character.
    chomp $line;

    # Progress report every 5000 lines, at the default debug level only.
    # Shows the timestamp of the most recently converted log entry.
    if ($debug == 1 && ++$count % 5000 == 0) {
        my $stamp = defined $tim
            ? strftime("%d.%m.%y %H:%M:%S", localtime($tim))
            : "-";
        print STDERR "$count lines, $stamp   \r";
    }

    # Captures 2, 3 and 7 (rfc931 ident, auth user, protocol) are not used.
    my ($host, undef, undef, $datetime, $method, $request, undef,
        $statuscode, $bytes, $extra) = $line =~ m!$pat!o;
    unless (defined $host) {
        print STDERR "Oh?: $line\n" if $debug > 15;
        $invalid_line++;
        next LINE;
    }

    # Cache logs carry nine extra whitespace-separated fields (columns
    # 10..18).  Force exactly nine so the record width never varies.
    my @cache_fields;
    if ($mode == 1) {
        @cache_fields = split " ", $extra;
        $#cache_fields = 8;
    }

    # Number the domain and the client host on first sight.
    my $domain = host2domain($host);
    $domtab{$domain} = ++$dommax unless defined $domtab{$domain};
    my $domnum = $domtab{$domain};

    $hosttab{$host} = ++$hostmax unless defined $hosttab{$host};
    my $hostnum = $hosttab{$host};

    # Split a proxy-style request into remote host and local path.
    my ($rhost, $rhostnum, $local_path);
    if ($request =~ m!^(?:http|ftp|gopher|wais)://([^/]+)/(.*)!) {
        ($rhost, $local_path) = ($1, $2);
        $rhosttab{$rhost} = ++$rhostmax unless defined $rhosttab{$rhost};
        $rhostnum = $rhosttab{$rhost};
    }
    else {
        # NOTE(review): every request without a recognised URL gets a fresh
        # request-host number; kept as is, confirm that this is intended.
        $rhostnum = ++$rhostmax;
        # A plain server path ("/dir/file.ext") still has a file type.
        # Anything else is treated as having no path, i.e. "{dir}".
        $local_path = $request =~ m!^/(.*)! ? $1 : "";
    }
    if ($debug > 20) {
        print STDERR "request = $request\n";
        print STDERR "rhost = " . (defined $rhost ? $rhost : "") . "\n";
        print STDERR "rhostnum = $rhostnum\n";
    }

    # The URL is numbered without its query string.
    my ($base_url) = split /\?/, $request;
    $base_url = "" unless defined $base_url;
    $urltab{$base_url} = ++$urlmax unless defined $urltab{$base_url};
    my $urlnum = $urltab{$base_url};

    # Request type: "{dir}" for an empty path or one ending in "/",
    # otherwise whatever follows the last dot of the path.
    # NOTE(review): a path without any dot yields the whole path here.
    my ($path) = split /\?/, $local_path;
    $path = "" unless defined $path;
    my $fileext;
    if ($path eq "" || substr($path, -1) eq "/") {
        $fileext = "{dir}";
    }
    else {
        $fileext = (split /\./, $path)[-1];
    }

    # Timestamp "[dd/Mon/yyyy:HH:MM:SS zone]"; the zone part is ignored.
    my ($dd, $mmm, $yyyy, $HH, $MM, $SS) = $datetime =~
        m!^\[([0-9][0-9])/([A-Z][a-z][a-z])/([0-9]{4}):([0-9][0-9]):([0-9][0-9]):([0-9][0-9]).*\]!;
    unless (defined $SS) {
        printf STDERR "Malformed time: %s\n", $datetime
            if $debug > 10;
        next LINE;
    }
    $_ += 0 for ($dd, $yyyy, $HH, $MM, $SS);

    # One-entry month cache; the hit/miss counters are reported at exit.
    if (defined $lastmmm && $mmm eq $lastmmm) {
        $cache_hit++;
    }
    else {
        $cache_mis++;
        unless (defined $month_num{$mmm}) {
            # Message text means "invalid month".  Skip the line instead of
            # converting it with a month left over from an earlier line.
            print STDERR "virheellinen kk: $mmm\n"
                if $debug > 10;
            next LINE;
        }
        $mon = $month_num{$mmm};
        $lastmmm = $mmm;
    }

    # timelocal() croaks on impossible dates, so trap that instead of
    # testing for -1.  The four-digit year is passed as is: an offset from
    # 1900 below 100 would be read as a two-digit year.
    my $epoch = eval { timelocal($SS, $MM, $HH, $dd, $mon, $yyyy) };
    unless (defined $epoch) {
        if ($debug > 10) {
            print STDERR "Error from time local: $datetime\n";
            print STDERR join(":", ($SS, $MM, $HH)) . " " .
                join("/", ($dd, $mon, $yyyy)) . "\n";
        }
        next LINE;
    }
    $tim = $epoch;
    # $tim += $timetoserve;

    my @fields = (
        $tim,                       #  1 time (seconds)
        $hostnum,                   #  2 host
        $domnum,                    #  3 domain
        substr($method, 0, 1),      #  4 method (G/P/H)
        $rhostnum,                  #  5 request host
        $urlnum,                    #  6 request URL
        $fileext,                   #  7 request type
        $statuscode,                #  8 status
        $bytes,                     #  9 bytes
    );
    push @fields, @cache_fields if $mode == 1;      # 10..18

    # A single string, so the record does not depend on the global $,.
    print join(" ", map { defined $_ ? $_ : "" } @fields) . "\n";
}

# End-of-run statistics.  Everything goes to STDERR so that STDOUT carries
# nothing but the converted records.
if ($debug > 0) {
    printf STDERR "\ncache_hit = %d\n", $cache_hit;
    printf STDERR "cache_mis = %d\n", $cache_mis;
    printf STDERR "invalid_line = %d\n", $invalid_line;

    # times() returns user and system CPU time of this process first; the
    # child-process times that follow are not reported.
    my ($user, $system) = times;
    printf STDERR "used time %.1fu %.1fs total %.1f\n",
        $user, $system, $user + $system;
}

# host2domain($host)
#
# Reduce a host name to the domain it is grouped under:
#   - three labels when the second-level label is co, gov, go or ac and the
#     top-level label is not "com" (jp, uk, au etc.): "www.x.co.uk" => "x.co.uk"
#   - the first two numbers of a four-part numeric address: "1.2.3.4" => "1.2"
#   - otherwise the last two labels: "www.example.com" => "example.com"
# A host with fewer labels than a rule needs is returned as far as it goes
# ("localhost" => "localhost").
#
# Returns the domain as a string.
sub host2domain {
    my ($host) = @_;

    # The pattern must be a regex: the string "\." is just ".", which as a
    # split pattern matches every character and leaves no parts at all.
    my @hostpart = split /\./, $host;
    my $last = $#hostpart;

    # Nothing to reduce; this also keeps the negative indices below from
    # wrapping around to the other end of the list.
    return $host if $last < 1;

    my %is_generic_sld = map { $_ => 1 } qw(co gov go ac);

    # jp, uk, au etc...
    if ($last >= 2 &&
        $hostpart[$last] ne "com" &&
        $is_generic_sld{ $hostpart[$last - 1] }) {
        return join ".", @hostpart[$last - 2 .. $last];
    }

    # numerical domains
    if ($last == 3 && $hostpart[$last] =~ m/^\d+$/) {
        return join ".", @hostpart[0, 1];
    }

    return join ".", @hostpart[$last - 1, $last];
}
