#!/usr/bin/perl
#
# stoic.pl - web server log analysis tool thingy
# harrison@area.com, dell@area.com
#
# reports:
#
# - logfile totals (hits, bytes transferred, date ranges, more.)
# - daily totals (hits, bytes transferred, unique sites)
# - hourly totals (hits per hour, percent)
# - domains visiting (includes definition)
# - top n documents requested
# - top n sites visiting
# - status codes received (with notes)
# - agent statistics (browser usage, broken down by known browsers)
# - platform statistics (broken down by known platforms)
# - robots visiting
# - user site report (list of users, sites, hits, bytes on each)
# - user report (hits, bytes, last access)
#
# miscellaneous features:
#
# - aol proxies broken down into one: *.proxy.aol.com
# - major search engine requests are parsed
# - ignores sites listed in specified file (e.g. your own, IP address, etc.)
# - referrals are only collected if specified with -r switch
# - country codes are matched with domains visiting
# - browsers and platform report
# - status codes are explained
# - totals for entire logfile, hits, bytes, hits to home page, more.
# - bad lines in logfile can be dumped to a file
#
# to-do:
#
# - everything gets written to STDOUT now, individual files later (?)
# - switch to choose, order report output
# - boy that danny kaye sure can tap dance
# - ignore referrals from sites (switch pointing to file? use ignore list?)
#
# notes:
#
# - domains.txt should be in the same directory as this script,
#   otherwise specify its path with -c. a copy of domains.txt is
#   at http://www.area.com/harrison/domains.txt
# - the only required argument:
#   stoic -l [path-to-logfile]
#   e.g. stoic -l /home/web/mysite/logs/access
# - authorized users file should be in .htaccess format
# - all other switches alter the output slightly:
#   -a must be specified to show user reports
#   -c defaults to ./domains.txt if not specified
#   -d, -h and -s default to 10
#   -e defaults to all
#   -i FILE excludes domains listed in FILE (like your machine)
#   -n SITENAME puts the SITENAME in the totals report
#
# switches:
#
# -a [path]   path to list of authorized users
# -b [path]   path to database (for running totals)
# -c [path]   path to country codes file                  
# -d [n]      top n documents accessed, default is 10 
# -e [path]   path to search engine hits
# -f [path]   path to failed input
# -h [n]      top n hosts, default is 10 (top level domains, .com, .edu, etc)
# -i [path]   path to ignore file
# -l [path]   path to logfile
# -n [name]   name of this site (e.g. www.area.com)
# -p          post-process referral output
# -r [path]   path to referral output
# -s [n]      top n sites visiting, default is 10
# -t [path]   path to timing output
# -u [n]      unique sites broken down by date, last n days ("all" for every day)
# -v          dean stark switch
# -x          interesting debug stuff
# -y          optionally tie aa's to dbm files (testing only)
# -z [duh]    special cases

use Getopt::Std;

# parse the command line; Getopt::Std stores each switch in $opt_<letter>
getopt ('a:b:c:d:e:f:h:i:l:n:r:s:t:u:z:');

my $version = "v1.2a";

# turn on autoflush for the currently selected output handle
$| = 1;

# the logfile is the one switch that cannot be left out
unless ($opt_l)
	{
	print "The -l option should provide the path to the logfile.\n",
		"(This is the only option that is required.)\n\n";
	usage ();
	exit 1;
	}

if ($opt_x)
	{
	print STDERR "stoic $version\n";
	}

############
# defaults #
############

# the "top n" reports show ten rows unless told otherwise, and
# ./domains.txt is used as the country code file when -c is absent
# and that file exists.

$opt_d ||= 10;		# top n documents accessed
$opt_h ||= 10;		# top n hosts visiting
$opt_s ||= 10;		# top n sites (defaults to top ten)
$opt_c = "./domains.txt" if (! $opt_c && -f "./domains.txt");

#############
# databases #
#############

# -y together with "-z pornopolis" ties %SECTIONS to the ./sections
# dbm file. a failed dbmopen is fatal: carrying on would leave
# %SECTIONS as a plain hash and nothing would be persisted.

if ($opt_y && $opt_z && $opt_z eq "pornopolis")
	{
	dbmopen (%SECTIONS, "./sections", 0770)
		or die ("can't open dbm ./sections, $!");

	# remove undef to keep running totals
	undef %SECTIONS;
	}

# running totals:
#
# logfile size (logsize)
# hits (hits)
# hits to main page (hits-main)
# bytes transferred (bytes)
# unique sites (sites)
# time running (time)

if ($opt_b)
	{
	dbmopen (%RUNNING, "./$opt_b", 0770)
		or die ("can't open dbm ./$opt_b, $!");
	}

###########
# globals #
###########

# when this run began: epoch seconds, and the same moment as text
my ($start, $start_time) = (time, logdate ());

# counters filled in while the logfile is read
my ($hits, $lines, $hits_to_home_page) = (0, 0, 0);
my ($total_documents, $total_hosts, $total_referrals) = (0, 0, 0);
my ($ignored, $total_bytes, $fail) = (0, 0, 0);

# path of the logfile, as given with -l
my $access = $opt_l;

# date of the first and last request, when the first request was
# processed, and the days in the order they show up in the log
my ($start_date, $start_date_nice, $end_date);
my @date_order;

# browsers recognised in agents that do not claim to be "compatible".
# each entry is used both as a pattern and as the label in the report;
# the first entry that matches wins.

my @BROWSERS_REDUX =
	(
	"IBM",
	"Lynx",
	"Lotus-Notes/4",
	"Mozilla/0",
	"Mozilla/1",
	"Mozilla/2",
	"Mozilla/3",
	"Mozilla/4",
	"MSIE 4.0",
	"MSIE/2",
	"MSIE/3",
	"Opera",
	"AOL-IWENG",
	"HotJava",
	"Cyberdog",
	"IBrowse",
	"AmigaVoyager",
	);

# pattern => label for agents of the form "xxx (compatible; ...)".
# the keys are regular expressions, so they are single quoted: inside
# double quotes "\d" and "\." lose their backslash before the regex
# engine ever sees them.

%BROWSER_SHORTHAND =
	(
	'InfoSeek Sidewinder' => "InfoSeek Sidewinder",
	'PRODIGY' => "Prodigy Web Browser",
	'MSProxy/2' => "[MSProxy v2]",
	'MSIE 2\.' => "MSIE/2",
	'MSIE 3\.' => "MSIE/3",
	'MSIE 4\.' => "MSIE 4.0",
	'Microsoft Internet Explorer/4' => "MSIE 4.0",
	'SEGA Saturn' => "SEGA Saturn",
	'AOL \d+\.0' => "AOL browser",
	'Quarterdeck Mosaic' => "Quarterdeck Mosaic",
	);

# pattern => platform name, matched case-insensitively against the
# agent. the keys are regular expressions; the parentheses around
# "Windows" are escaped so that only the literal "(Windows)" matches
# instead of every agent that mentions Windows.

%PLATFORM_SHORTHAND =
	(
	'Windows;' => "Microsoft Windows 3.x",
	'Windows 3.1' => "Microsoft Windows 3.x",
	'Windows 16' => "Microsoft Windows 3.x",
	'Windows NT' => "Microsoft Windows NT",
	'Win16' => "Microsoft Windows 3.x",
	'Win32' => "Microsoft Windows 95 or NT",
	'WinNT' => "Microsoft Windows NT",
	'Win95' => "Microsoft Windows 95",
	'Windows 95' => "Microsoft Windows 95",
	'\(Windows\)' => "Microsoft Windows 3.x",
	'mac' => "Apple Macintosh",
	'PPC' => "Apple Macintosh",
	'Cyberdog' => "Apple Macintosh",
	'Java' => "Java",
	'NetBSD' => "NetBSD",
	'FreeBSD' => "FreeBSD",
	'BSD/OS' => "BSD/OS",
	'SunOS' => "SunOS",
	'HP-UX' => "HP-UX",
	'IRIX' => "IRIX",
	'DOSlynx' => "MS/DOS",
	'OmniWeb' => "NeXT OpenStep",
	'Windows x86' => "Microsoft Windows 3.x",
	'OpenVMS' => "OpenVMS",
	'wget' => "wget",
	'AIX' => "AIX",
	'Linux 1' => "Linux 1.x",
	'Linux 2' => "Linux 2.x",
	'PCN' => "PointCast Network",
	'Amiga' => "Amiga",
	'OS/2' => "OS/2",
	'IBM[- ]WebExplorer' => "OS/2",
	'WebTV' => "WebTV",
	'OSF1' => "OSF1",
	'sega' => "Sega Saturn",
	);

# status code => note shown in the status report

my %STATUS_CODES =
	(
	"200" => "Success",
	"202" => "Accepted",
	"203" => "Partial Information",
	"204" => "No response",
	"206" => "Partial Content",
	"301" => "Document moved",
	"302" => "Forward",
	"304" => "Not Modified",
	"400" => "Bad request",
	"401" => "Unauthorized",
	"402" => "Payment Required",
	"403" => "Forbidden",
	"404" => "Not Found",
	"500" => "Internal Server Error",
	"501" => "Not Implemented",
	"502" => "Bad Gateway",
	"503" => "Service Unavailable",
	);

# -i points to a file containing a list of sites to ignore,
# one per line. lines starting with "#" and blank lines are skipped;
# the line is chomped first so that an empty line really has length 0.

if ($opt_i)
	{
	open (IGNORE, '<', $opt_i) or die ("can't open $opt_i, $!");
	while (<IGNORE>)
		{
		chomp;
		next if (/^#/);
		next if (! length ($_));
		$IGNORE_SITES{$_}++;
		}
	close IGNORE;
	}

# -c points to country code file
# format: code country

if ($opt_c)
	{
	open (COUNTRIES, '<', $opt_c) or die ("can't open $opt_c, $!");
	while (<COUNTRIES>)
		{
		chomp;
		my ($code, $country) = split (/\s+/, $_, 2);

		# a blank line carries no code; don't file it under ""
		next if (! defined ($code) || ! length ($code));

		$COUNTRY{lc ($code)} = $country;
		}
	close COUNTRIES;
	}

# -a points to list of authorized users (name:password per line);
# only the name in front of the first colon is kept.

if ($opt_a)
	{
	open (HTACCESSFILE, '<', $opt_a) or die ("$opt_a not found, $!");
	while (<HTACCESSFILE>)
		{
		chomp;
		my ($who) = split (/:/, $_);
		next if (! defined ($who) || ! length ($who));
		$AUTH_USERS{$who}++;
		}
	close (HTACCESSFILE);
	}

# -f points to where you want bad output
# from the logfile dumped. $failed keeps the path and doubles
# as the "dump bad lines" flag while the log is read.

if ($opt_f)
	{
	$failed = $opt_f;
	open (FAILED, '>', $failed) or die ("can't write to $failed, $!");
	}

# -r points to referral log files

if ($opt_r)
	{
	open (REFERRALS, '>', $opt_r) or die ("can't write to $opt_r, $!");
	}

# -e points to search engine hits

if ($opt_e)
	{
	open (ENGINE_HITS, '>', $opt_e) or die ("can't write to $opt_e, $!");
	}

# process logfile
#
# every line is tried against the extended layout (with referral and
# agent) first and against the plain common log layout second. lines
# that fit neither, or that are longer than 800 characters, are
# counted in $fail and copied to the -f file when one was given.

open (FILE, "$access") or die ("can't open $access, $!");

# platform patterns overlap, and the first match wins; walking them
# longest first (ties alphabetically) keeps the result independent of
# hash ordering. computed once, outside the loop.
my @platform_patterns =
	sort { length ($b) <=> length ($a) || $a cmp $b } keys %PLATFORM_SHORTHAND;

while (<FILE>)
	{
	chomp;
	undef $host;

	$lines++;

	# $failed holds the -f path and only gates the dump;
	# the bad-line counter is $fail.
	if (length ($_) > 800)
		{
		$fail++;
		print FAILED "$_\n" if ($failed);
		next;
		}

	($host, $remote_user, $auth_user, $date, $request, $status, $bytes, $referral, $agent) =
		$_ =~ m/^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+) \"?([^"]*)\"? \"([^"]*)\"$/;

	if (! $host)
		{
		($host, $remote_user, $auth_user, $date, $request, $status, $bytes) =
			$_ =~ m/^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+)$/;
		}

	# line in access file failed (for some reason)

	if (! $host)
		{
		$fail++;
		print FAILED "$_\n" if ($failed);
		next;
		}

	# throw away netscape log header/definition/thingy
	next if (! $hits && $host =~ /^format=/);

	# skip hosts we want to ignore

	if ($opt_i && $IGNORE_SITES{$host})
		{
		$ignored++;
		next;
		}

	# special hosts we want to wrap into one (aol proxies)

	if ($host =~ /proxy\.aol\.com/)
		{
		$host = "*.proxy.aol.com";
		}

	###################
	# data collection #
	#   starts here   #
	###################

	$hits++;
	$total_bytes += $bytes;

	####################
	# debugging output #
	####################

	# a dot every 250 hits, a status block every 1000

	if ($opt_x && ! ($hits % 250))
		{
		print STDERR ".";
		}

	if ($opt_x && ! ($hits % 1000))
		{
		print STDERR
			"\nstoic: " . &commas ($lines) . " lines [" .
			&commas ($hits) . " hits " . &amount ($total_bytes) . "]\n";

		if ($last_time_around)
			{
			print STDERR
					"       seconds between debug report: " .
					(time - $last_time_around) . "\n";
			}

		# $now still holds the day of the previous hit at this point
		print STDERR
					"       pid=$$ current=$now last=$host\n";

		print STDERR
					"       hosts=" . &commas ($total_hosts) .
					" documents=" . &commas ($total_documents) .
					" referrals=" . &commas ($total_referrals) . "\n";
		print STDERR
					"       start=$start_date_nice\n" .
					"       now=" . &logdate . "\n" .
					"       elapsed=" . &commas ((time - $start)) . " seconds\n\n";

		$last_time_around = time;
		}

	# domains
	#
	# hosts without a dot have no top level domain and purely numeric
	# last parts belong to IP addresses; neither is counted.

	($domain) = $host =~ m/[A-Z1-9].*\.(\S+)$/i;
	if (defined ($domain))
		{
		$domain = lc $domain;
		$DOMAINS{$domain}++ if ($domain !~ /^\d+$/);
		}

	# start and end of log

	$start_date = $date if (! $start_date);
	$start_date_nice = &logdate if (! $start_date_nice);
	$end_date = $date;

	# unique hosts

	$HOSTS{$host}++;
	$total_hosts++;

	# request
	#
	# "GET /bullfrog/bigfrog.gif HTTP/1.0"

	undef $file;

	# parse file

	if ($status == 200 && $request ne "-")
		{
		($file) = $request =~ m/^\S+ (\S+) \S+$/;
		$file = lc ($file);

		if (! $file)
			{
			# this doesn't happen very often at all
			$fail++;
			print FAILED "$_\n" if ($failed);
			next;
			}
		elsif ($file !~ /cgi-bin/ &&
			$file !~ /\.gif/ &&
			$file !~ /\.jpg/ &&
			$file !~ /\.jpeg/ &&
			$file !~ /\.xbm/)
			{
			$DOCUMENTS{$file}++;

			if ($file eq "/" || $file =~ /index.[s]?htm[l]?/i)
				{
				$hits_to_home_page++;
				}
			$total_documents++;
			}
		}

	# status

	$STATUS{$status}++;

	# date stuff
	#
	# 13/May/1997:11:28:40 -0700

	($day, $mon, $year, $hour) =
		$date =~ m/(\d+)\/(\S+)\/(\d+):(\d+):\d+:\d+.*/;

	$HOUR_OF_DAY{$hour}++;

	$now = sprintf "%02d-%s-%s", $day, $mon, substr ($year, 2, 2);

	# a day is new exactly when it has no hits recorded yet
	push (@date_order, $now) if (! $DAILY_HITS{$now});

	$DAILY_HITS{$now}++;
	$DAILY_BYTES{$now} += $bytes;
	$DAILY_HOSTS{$now}{$host}++;

	# daily statistics

	if ($opt_u)
		{
		push (@DAILY_ORDER, $now) if (! $DAILY_VISITORS{$now});
		$DAILY_VISITORS{$now}{$host}++;
		}

	# user statistics
	#
	# counts on parse information from request
	# and date stuff above, so don't move this
	# code around.

	if ($opt_a && $AUTH_USERS{$auth_user})
		{
		$USER_BYTES{$auth_user} += $bytes;
		$USER_HITS{$auth_user}++;
		$USER_LAST{$auth_user} = $now;
		$USER_SITES_HITS{$auth_user}{$host}++;
		$USER_SITES_BYTES{$auth_user}{$host} += $bytes;
		}

	# i suppose this tracks the number of
	# pictures one looks at, but, duh?

	if ($opt_z && $opt_z eq "pornopolis" && $file !~ /thumb/)
		{
		if ($file =~ /view\.pl/)
			{
			$USER_HITS{$auth_user}++;
			}
		}

	# browser stuff

	# MSIE and other browsers mask themselves as "compatible"
	# with Mozilla. we try to catch them and parse them out.

	if ($agent =~ /compatible/)
		{
		my ($compatible, $browser) = $agent =~ m/(\S+) \(compatible; (.*)/i;

		foreach my $compat (keys %BROWSER_SHORTHAND)
			{
			if ($browser =~ /$compat/)
				{
				$BROWSERS{$BROWSER_SHORTHAND{$compat}}++;
				}
			}
		}
	elsif ($agent)
		{
		my $known_browser = 0;

		foreach my $candidate (@BROWSERS_REDUX)
			{
			if ($agent =~ /$candidate/)
				{
				$BROWSERS{$candidate}++;
				$known_browser = 1;
				last;
				}
			}
		$BROWSERS{'other'}++ if (! $known_browser);
		}

	# platforms

	if ($agent &&
			$agent ne "-" &&
			$request !~ /robots.txt/i)
		{
		$platform_identified = 0;

		foreach my $pattern (@platform_patterns)
			{
			if ($agent =~ /$pattern/i)
				{
				$PLATFORMS{$PLATFORM_SHORTHAND{$pattern}}++;
				$platform_identified = 1;
				last;
				}
			}

		if (! $platform_identified)
			{
			if ($agent =~ /Lynx/)
				{
				$PLATFORMS{'unknown [using lynx]'}++;
				}
			elsif ($agent)
				{
				$PLATFORMS{'miscellaneous / unknown'}++;
				}
			else
				{
				$PLATFORMS{'unknown'}++;
				}
			}
		}

	# referrals
	#
	# the -n test is skipped when no site name was given: an empty
	# pattern would make perl reuse the last pattern that matched.

	if ($opt_r &&
		$referral &&
		(! length ($opt_n) || $referral !~ /$opt_n/) &&
		$referral ne "-" &&
		$referral !~ /[\@\#]/ && 		# no embedded usernames or named links, plz.
		!&is_local_url ($referral))	# do special things inside &is_local_url, plz.
		{
		my ($string, $matched);

		# strip off quotes around this (some browsers..)

		$referral =~ s/^\"//g;
		$referral =~ s/\"$//g;
		$referral = lc ($referral);

		$total_referrals++;

		# search engine?

		if ($referral =~ /\?/)
			{
			$matched = 0;

			# before and after the "?"

			my ($chaff, $query) = ( $referral =~ m/(.+)\?(.+)/ );
			my @pieces = split (/\//, $chaff);

			# third piece of "http://host/path" is the host
			if ($pieces[2] && $pieces[2] =~ /\./)
				{
				$ENGINES{$pieces[2]}++;
				}

			# get pornopolis queries out of the report
			# this happens when IP# is used instead of domain name

			if ($chaff =~ /view\.pl/ || $chaff =~ /index\.pl/)
				{ $matched++; }

			foreach my $item (split (/&/, $query))
				{
				if ($item =~ /^q=/ or
						$item =~ /^query=/ or
						$item =~ /^search=/ or
						$item =~ /^searchtext=/ or
						$item =~ /^qt=/ or
						$item =~ /^general=/ or
						$item =~ /^s=/ or
						$item =~ /^p=/ or
						$item =~ /^text=/ or
						$item =~ /^MT=/i )
					{
					(undef, $string) = split (/=/, $item);
					$string =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack ("C", hex ($1))/eg;
					$string =~ s/\+/ /g;
					$string =~ s/^\'//g;
					$string =~ s/\"//g;
					$string =~ s/^\s+//g;
					chomp $string;

					next if ($string =~ /\d+/ || length ($string) < 2);

					# the handle only exists when -e was given
					print ENGINE_HITS "$string\n" if ($opt_e);
					$matched++;
					last;
					}
				}
			}
		else # not a search engine, an actual link
			{
			print REFERRALS "$referral\n";
			}
		}

	# robots

	if ($file eq "/robots.txt" && $agent !~ /compatible/ && $agent ne "-")
		{
		$ROBOTS{$agent}++;
		$ROBOTS_LAST{$agent} = $now;
		}

	#######################
	# site-specific cases #
	#######################

	if ($opt_v)
		{
		@cs = qw (surf209 enclave.org);

		foreach my $watched (@cs)
			{
			if ($host =~ /$watched/)
				{
				$CS_SITES{$host}++;
				$CS_SITES_LAST{$host} = $now;
				}
			}
		}

	if ($opt_z && $opt_z eq "pornopolis")
		{
		# /cgi-bin/view.pl?collection=current&dir=male&file=d09.jpg

		if (my ($collection, $dir) =
				$file =~ m#^/cgi-bin/view.pl\?collection=(\S+)&dir=(\S+)&file=.*$#)
			{
			$section = $dir;

			if ($collection eq "smut" || $collection eq "archive" || $collection eq "exhibits")
				{ $section = "$collection/$section"; }

			if ($collection eq "wow" || $collection eq "current")
				{ $section = "$collection/*"; }

			if ($collection =~ /^save\//)
				{ $section = "save/*"; }

			$SECTIONS{$section}++;
			}
		}
	elsif ($opt_z && $opt_z eq "rotten")
		{
		if ($file =~ /gallery/)
			{
			# the section is the directory that holds the file
			my (@pieces) = split (/\//, $file);
			$SECTIONS{$pieces[$#pieces-1]}++ unless
				($pieces[$#pieces-1] eq "gallery");
			}
		}

	# done with loop
	}

close FILE;
close FAILED if ($failed);
	
print STDERR "\n\nstoic: file complete.\n\n" if ($opt_x);

# reports are subs named after the entries of @REPORTS; the optional
# ones are appended according to the switches given.

@REPORTS = qw
	(totals daily hourly documents domains sites status agent platforms robots);

push (@REPORTS, "user_sites", "user_report") if ($opt_a);
push (@REPORTS, "referral") if ($opt_r);
push (@REPORTS, "popularity") if ($opt_z eq "pornopolis" || $opt_z eq "rotten");
push (@REPORTS, "vampyre") if ($opt_v);
push (@REPORTS, "daily_visitors") if ($opt_u);

# the report file is named after the first and last day in the log
# plus the middle part of the -n site name (www.area.com -> area).

my ($uname) = $opt_n =~ /^\S+\.(\S+)\.\S+$/;
my ($d1, $m1, $y1) = $start_date =~ /^(\d+)\/(\S+)\/(\d{4})/;
my ($d2, $m2, $y2) = $end_date =~ /^(\d+)\/(\S+)\/(\d{4})/;

# NOTE(review): the month fields are filled with the literal
# "5551212" rather than a month number derived from $m1/$m2; this
# looks like a placeholder. left as is because it decides the names
# of the files written - confirm before changing.
my $filename = sprintf
	"%04d%02d%02d-%04d%02d%02d-%s",
	$y1, "5551212", $d1,
	$y2, "5551212", $d2,
	$uname;

open (OUT, '>>', "$filename.txt") or die ("can't write to $filename.txt, $!");

# repoint the "latest report" link. -l is tested as well so that a
# dangling link (which -e does not see) is removed too.

my $link_r = "00-latest-report.txt";
unlink $link_r if (-e $link_r || -l $link_r);
symlink ("$filename.txt", $link_r)
	or warn ("stoic: can't link $link_r to $filename.txt, $!\n");

foreach my $report (@REPORTS)
	{
	print STDERR "stoic: writing report $report ..\n\n" if ($opt_x);
	&$report (\*OUT);
	print OUT "\n";
	}

close OUT;

# -p: turn the raw referral output written with -r into an html
# table, one row per distinct referral, most frequent first.

if ($opt_p)
	{
	die "duh: can't open referral output, -p needs -r ??\n" if (! $opt_r);

	# REFERRALS is still open for writing; close it so that buffered
	# lines reach the file before it is read back.
	close REFERRALS;

	open (REF_RAW, '<', $opt_r) or die "duh: can't open $opt_r ??";
	while (<REF_RAW>)
		{
		chomp;
		$REFERRAL{$_}++;
		}
	close REF_RAW;

	my $s = "$m1/$d1/$y1";
	my $e = "$m2/$d2/$y2";

	open (REF_HTML, '>>', "ref-$filename.html")
		or die ("can't write to ref-$filename.html, $!");

	# repoint the "latest referrals" link, dangling or not
	my $link_f = "00-latest-referrals.html";
	unlink $link_f if (-e $link_f || -l $link_f);
	symlink ("ref-$filename.html", $link_f)
		or warn ("stoic: can't link $link_f to ref-$filename.html, $!\n");

	print REF_HTML<<FIN;
<html>
<title>referrals for $filename</title>
<body bgcolor="#FFFFFF">
<center>
<table border=1 cellpadding=5>
<tr><td align=center colspan=2 cellpadding=2 bgcolor="#a0b8c8">$uname: $s to $e</td></tr>
FIN

	my $t = 0;
	foreach my $ref (sort { $REFERRAL{$b} <=> $REFERRAL{$a} } keys %REFERRAL)
		{
		# referrers come straight from the clients; escape them
		# before they are put into markup.
		my $safe = $ref;
		$safe =~ s/&/&amp;/g;
		$safe =~ s/</&lt;/g;
		$safe =~ s/>/&gt;/g;
		$safe =~ s/"/&quot;/g;

		print REF_HTML <<FIN;
<tr>
<td align=center bgcolor="#dcdcdc">$REFERRAL{$ref}</td>
<td align=left bgcolor="#dcdcdc"><a href=\"$safe\">$safe</a></td>
</tr>
FIN
		$t += $REFERRAL{$ref};
		}

	print REF_HTML <<FIN;
<tr><td align=center cellpadding=2><b>Total:</b><br>
$t referrals</td>
<td align=left cellpadding=2>
<font size=-2>
stoic.pl 1.2a<br>
Area Systems Confidential<br>
bugs to <a href="mailto:staff\@area.com">staff\@area.com</a>
</font>
</td></tr>
</table>
FIN

	close REF_HTML;
	}

exit 0;

# done

# usage - print the version banner followed by a summary of the
# switches. output goes to STDOUT, like the "-l is required" message
# that is printed right before this is called.
sub usage
	{
	print "stoic $version\n";
	print <<'FIN';
usage: stoic -l logfile [switches]

  -a path   path to list of authorized users
  -b path   path to database (for running totals)
  -c path   path to country codes file
  -d n      top n documents accessed, default is 10
  -e path   path to search engine hits
  -f path   path to failed input
  -h n      top n hosts, default is 10
  -i path   path to ignore file
  -l path   path to logfile
  -n name   name of this site (e.g. www.area.com)
  -p        post-process referral output
  -r path   path to referral output
  -s n      top n sites visiting, default is 10
  -t path   path to timing output
  -u n      unique sites broken down by date
  -v        dean stark switch
  -x        interesting debug stuff
  -y        optionally tie aa's to dbm files (testing only)
  -z name   special cases
FIN
	}

#####################
# support functions #
#####################

# logdate - the current local time as "mm/dd/yy hh:mm".
#
# localtime hands back the year as years since 1900, so it is reduced
# modulo 100; otherwise the year field grows to three digits from 2000
# on and the fixed-offset substr in the timing report cuts the string
# in the wrong place.
sub logdate
	{
	my (undef, $min, $hour, $mday, $mon, $year) = localtime (time);

	my $when = sprintf ("%02d/%02d/%02d %02d:%02d",
		$mon + 1, $mday, $year % 100, $hour, $min);

	return $when;
	}

# nicedate - today's local date as "m/d/yy" (month and day unpadded).
#
# localtime hands back the year as years since 1900; it is reduced to
# two digits so the result keeps its shape from 2000 onwards.
sub nicedate
	{
	my (undef, undef, undef, $mday, $mon, $year) = localtime (time);

	return sprintf ("%d/%d/%02d", $mon + 1, $mday, $year % 100);
	}

# amount - turn a byte count into a short size with a K, M or G tag.
#
# the count is divided by 1024, rounding half up. as long as the
# result is 10000 or more it is divided again and the tag moves one
# step up, ending at "G".
sub amount
	{
	my ($size) = @_;
	my @tags = ("K", "M", "G");
	my $step = 0;

	$size = int (($size + 512) / 1024);

	while ($size >= 10000 && $step < $#tags)
		{
		$size = int (($size + 512) / 1024);
		$step++;
		}

	return $size . $tags[$step];
	}

# commas - put a comma in front of every group of three digits,
# working from the right, and return the result.
sub commas
	{
	my ($text) = @_;

	# each pass adds one comma; stop when nothing is left to split
	while ($text =~ s/(.*\d)(\d\d\d)/$1,$2/)
		{
		}

	return $text;
	}

# is_local_url - returns 1 when $url points back at this site and
# should not be counted as a referral, 0 otherwise.
#
# local means: an http url on [www[n].]<-z name>[.area].com, one of
# the hard-wired pornopolis addresses when -z is "pornopolis", or any
# file: url. dots are escaped and the -z name is quoted so that they
# only match themselves.
sub is_local_url
	{
	my ($url) = @_;

	return 1 if ($opt_z && $url =~ /^http:\/\/(www([0-9]?)\.)?\Q$opt_z\E\.(area\.)?com/i );

	if ($opt_z && $opt_z eq "pornopolis")
		{
		return 1 if ( $url =~ /206\.204\.77\.23/ );		# local IP address
		return 1 if ( $url =~ /senorita\.com/i );
		}

	return 1 if ($url =~ /^file:/i);

	return 0;
	}

####################
# report functions #
####################

# referral - write the search engine hosts collected in %ENGINES to
# the handle $fh, busiest first, or "- none -" when there are none.
sub referral
	{
	my $out = shift;
	my @engines = sort { $ENGINES{$b} <=> $ENGINES{$a} } keys %ENGINES;

	print $out "engine referrals:\n\n";

	unless (@engines)
		{
		print $out "  - none -\n";
		return;
		}

	printf $out "%7s    %s\n", "count", "referral";
	printf $out "%7s    %s\n", "-----", "--------";

	foreach my $engine (@engines)
		{
		printf $out "%7s    %s\n", $ENGINES{$engine}, "$engine";
		}
	}

# agent - write the browser table to $fh: one row per entry of
# %BROWSERS with its count and its share of all counted browsers,
# largest count first.
sub agent
	{
	my $out = shift;
	my $all_browsers = 0;

	$all_browsers += $_ foreach (values %BROWSERS);

	print $out "browser wars:\n\n";
	printf $out "%7s   %7s     %s\n", "count", "percent", "browser";
	printf $out "%7s   %7s     %s\n", "-----", "-------", "-------";

	my @by_count = sort { $BROWSERS{$b} <=> $BROWSERS{$a} } keys %BROWSERS;

	foreach my $name (@by_count)
		{
		my $count = $BROWSERS{$name};

		printf $out "%7s  %7s%%     %s\n",
			&commas ($count), &percent ($count, $all_browsers), $name;
		}
	}

# percent - $sum as a whole percentage of $total.
#
# returns the integer part of the percentage plus one half, or the
# string "< 1" when that comes out as zero. a zero or missing $total
# also gives "< 1" instead of dying on the division.
sub percent
	{
	my ($sum, $total) = @_;

	return "< 1" if (! $total);

	my $pct = int ((($sum / $total) + .005) * 100);

	return $pct ? $pct : "< 1";
	}

# platforms - write the platform table to $fh: one row per entry of
# %PLATFORMS, largest count first.
sub platforms
	{
	my $out = shift;
	my $row = "%7s   %s\n";

	print $out "platforms visiting:\n\n";

	printf $out $row, "count", "platform";
	printf $out $row, "-----", "--------";

	my @by_count = sort { $PLATFORMS{$b} <=> $PLATFORMS{$a} } keys %PLATFORMS;

	foreach my $name (@by_count)
		{
		printf $out $row, &commas ($PLATFORMS{$name}), $name;
		}
	}

# robots - write the robot table to $fh: count, day of the last
# visit and agent for every entry of %ROBOTS, busiest first, or
# "- none -" when %ROBOTS is empty.
sub robots
	{
	my $out = shift;
	my $row = "%7s   %9s   %s\n";

	print $out "robots visiting:\n\n";

	unless (%ROBOTS)
		{
		print $out "  - none -\n";
		return;
		}

	printf $out $row, "count", "last", "robot";
	printf $out $row, "-----", "----", "-----";

	foreach my $robot (sort { $ROBOTS{$b} <=> $ROBOTS{$a} } keys %ROBOTS)
		{
		printf $out $row, &commas ($ROBOTS{$robot}), $ROBOTS_LAST{$robot}, $robot;
		}
	}

# daily - write one row per day to $fh, in the order the days were
# first seen (@date_order): hits, bytes (exact and abbreviated) and
# the number of distinct hosts recorded for that day.
sub daily
	{
	my $out = shift;
	my $row = "%12s %12s %12s %12s    %s\n";

	print $out "hits sorted by date:\n\n";

	printf $out $row, "date", "hits", "bytes", "transferred", "unique sites";
	printf $out $row, "----", "----", "-----", "-----------", "------------";

	foreach my $day (@date_order)
		{
		my $num_sites = scalar (keys %{$DAILY_HOSTS{$day}});

		printf $out $row,
			$day,
			&commas ($DAILY_HITS{$day}),
			&commas ($DAILY_BYTES{$day}),
			&amount ($DAILY_BYTES{$day}),
			&commas ($num_sites);
		}
	}

# domains - write the top level domain table to $fh, busiest first,
# with the country looked up in %COUNTRY. the list stops after
# $opt_h rows unless $opt_h is "all".
sub domains
	{
	my $out = shift;
	my $shown = 0;
	my $row = "%7s   %7s   %s\n";

	print $out "top level domains visiting:\n\n";
	printf $out $row, "hits", "domain", "country";
	printf $out $row, "----", "------", "-------";

	foreach my $tld (sort { $DOMAINS{$b} <=> $DOMAINS{$a} } keys %DOMAINS)
		{
		printf $out $row, &commas ($DOMAINS{$tld}), "$tld", $COUNTRY{$tld};

		next if ($opt_h eq "all");
		last if ($opt_h == ++$shown);
		}
	}

# hourly - write one row per hour of the day to $fh with the number
# of hits in that hour and its share of all hits.
#
# the share is printed with %3s: &percent returns the string "< 1"
# for small shares, which a numeric %3d would turn into 0.
sub hourly
	{
	my $fh = shift;

	print $fh "hits by time of day:\n\n";

	printf $fh "%7s  %6s    %s\n", "hour", "count", "percent";
	printf $fh "%7s  %6s    %s\n", "----", "-----", "-------";

	foreach my $hour (sort keys %HOUR_OF_DAY)
		{
		printf $fh "  %02d:00  %6s    %3s%%\n",
			$hour, &commas ($HOUR_OF_DAY{$hour}), &percent ($HOUR_OF_DAY{$hour}, $hits);
		}
	}

# documents - write the most requested documents to $fh, busiest
# first. the list stops after $opt_d rows unless $opt_d is "all",
# and the heading follows the same switch.
sub documents
	{
	my $fh = shift;
	my $count = 0;

	# -d decides the heading (-f is the bad input file and has
	# nothing to do with this report)
	if ($opt_d ne "all")
		{ print $fh "top $opt_d documents accessed:\n\n"; }
	else
		{ print $fh "documents accessed:\n\n"; }

	printf $fh "%7s   %s\n", "hits", "document";
	printf $fh "%7s   %s\n", "----", "--------";

	foreach my $doc (sort { $DOCUMENTS{$b} <=> $DOCUMENTS{$a} } keys %DOCUMENTS)
		{
		printf $fh "%7s   %s\n", &commas ($DOCUMENTS{$doc}), $doc;
		last if ($opt_d ne "all" && $opt_d == ++$count);
		}
	}

# sites - write the visiting hosts to $fh, busiest first. the list
# stops after $opt_s rows unless $opt_s is "all".
sub sites
	{
	my $out = shift;
	my $shown = 0;
	my $everything = ($opt_s eq "all");

	if ($everything)
		{
		print $out "unique sites visiting:\n\n";
		}
	else
		{
		print $out "top $opt_s sites:\n\n";
		}

	printf $out "%7s   %s\n", "hits", "remote host";
	printf $out "%7s   %s\n", "----", "-----------";

	foreach my $site (sort { $HOSTS{$b} <=> $HOSTS{$a} } keys %HOSTS)
		{
		printf $out "%7s   %s\n", &commas ($HOSTS{$site}), $site;

		next if ($everything);
		last if ($opt_s == ++$shown);
		}
	}

# status - write the status code table to $fh: count, code and the
# note from %STATUS_CODES for every code seen, most frequent first.
sub status
	{
	my $out = shift;
	my $row = "%7s   %4s  %s\n";

	print $out "status codes:\n\n";

	printf $out $row, "count", "code", "message";
	printf $out $row, "-----", "----", "-------";

	my @codes = sort { $STATUS{$b} <=> $STATUS{$a} } keys %STATUS;

	foreach my $code (@codes)
		{
		printf $out $row, &commas ($STATUS{$code}), $code, $STATUS_CODES{$code};
		}
	}

# totals - write the summary for the whole logfile to $fh.
#
# with -b the figures of this run are first added to %RUNNING and the
# running totals are printed as well. with -t a one-line timing record
# is appended to the -t file; in that case $opt_n is set to the
# logfile path when no site name was given.
sub totals
	{
	my $out = shift;
	my $elapsed = time - $start;
	my $logsize = -s $access;
	my $site_count = scalar (keys %HOSTS);

	# fold this run into the running totals
	if ($opt_b)
		{
		my %this_run =
			(
			'logsize' => $logsize,
			'hits' => $hits,
			'hits-main' => $hits_to_home_page,
			'bytes' => $total_bytes,
			'sites' => $site_count,
			);

		foreach my $what (keys %this_run)
			{
			$RUNNING{$what} += $this_run{$what};
			}
		}

	my $subject = $opt_n ? $opt_n : $access;
	my $ignored_noun = " site" . ($ignored == 1 ? "" : "s");
	my $fail_noun = " line" . ($fail == 1 ? "" : "s");

	print $out "totals for " . $subject . ":\n\n";

	print $out
		"        log start: $start_date\n",
		"       log finish: $end_date\n",
		"             hits: " . &commas ($hits) . "\n",
		"   home page hits: " . &commas ($hits_to_home_page) . "\n",
		"bytes transferred: " . &commas ($total_bytes) .
			" (" . &amount ($total_bytes) . ")\n",
		"     unique sites: " . &commas ($site_count) . "\n",
		"    sites ignored: " . &commas ($ignored) . $ignored_noun . "\n",
		"        bad input: " . &commas ($fail) . $fail_noun . "\n",
		"  time to execute: " . &commas ($elapsed) . " seconds " .
			"[$start_time - " . &logdate . "]\n",
		"          logfile: $access (" . &amount ($logsize) . ")\n",
		"\n";

	if ($opt_b)
		{
		print $out
			"running totals:\n\n",
			"             hits: " . &commas ($RUNNING{'hits'}) . "\n",
			"   home page hits: " . &commas ($RUNNING{'hits-main'}) . "\n",
			"bytes transferred: " . &commas ($RUNNING{'bytes'}) . "\n",
			"     unique sites: " . &commas ($RUNNING{'sites'}) . "\n",
			"       log totals: " . &amount ($RUNNING{'logsize'}) .
				" processed\n";
		}

	# timing file

	return if (! $opt_t);

	$opt_n ||= $access;

	# the logfile size is spliced into the format itself
	my $format = "%-27s [%s - %s] " . &amount ($logsize) . " %s seconds\n";

	open (TIMING, ">>$opt_t") or die ("can't append to $opt_t, $!");
	printf TIMING $format,
		$opt_n, $start_time, substr (&logdate, 9, 6), &commas ($elapsed);
	close TIMING;
	}

# user_sites - write, for every user in %USER_BYTES, the hosts that
# user came from together with hits and bytes per host. the user name
# appears on the first row only; further rows are indented under it.
sub user_sites
	{
	my $fh = shift;

	# declared as a list: "my $user, $count" makes only $user lexical
	my ($user, $count);

	print $fh "users and the sites they visit from:\n\n";

	printf $fh "%-12s  %-36s  %-8s  %s\n", "user", "sites", "hits", "transferred";
	printf $fh "%-12s  %-36s  %-8s  %s\n", "----", "-----", "----", "-----------";

	foreach $user (sort keys %USER_BYTES)
		{
		printf $fh "%-12s  ", $user;

		$count = 0;

		foreach my $site (sort keys %{$USER_SITES_HITS{$user}})
			{
			# 14 blanks stand in for the user column
			if ($count++)
				{
				print $fh " " x 14;
				}
			printf $fh "%-36s  %-8s  %s\n",
				$site,
				&commas ($USER_SITES_HITS{$user}{$site}),
				&amount ($USER_SITES_BYTES{$user}{$site});
			}
		}
	}

# user_report - write one numbered row per authenticated user to
# $fh, ordered by bytes transferred: last day seen, hits, and bytes
# both exact and abbreviated.
sub user_report
	{
	my $out = shift;
	my $rank = 0;
	my @users = sort { $USER_BYTES{$b} <=> $USER_BYTES{$a} } keys %USER_BYTES;

	print $out "\ntransfers per authenticated user:\n\n";

	printf $out "%3s  %-10.10s  %-11.11s %10s %14s  (%s)\n",
		"#", "user", "last", "hits", "data", "mb";
	printf $out "%3s  %-10.10s  %-11.11s %10s %14s  ----\n",
		"-", "----", "----", "----", "----";

	foreach my $user (@users)
		{
		$rank++;

		printf $out "%3s  %-10.10s  %-11.11s %10s %14s  (%s)\n",
			$rank, $user, $USER_LAST{$user}, &commas ($USER_HITS{$user}),
			&commas ($USER_BYTES{$user}), &amount ($USER_BYTES{$user});
		}
	}

# popularity - write the entries of %SECTIONS to $fh, busiest first,
# each with its hits and its share of all section hits.
sub popularity
	{
	my $out = shift;
	my $all_hits = 0;

	print $out "areas by popularity:\n\n";
	printf $out "%10s    %10s    %s\n", "hits", "percent", "area";
	printf $out "%10s    %10s    %s\n", "----", "-------", "----";

	$all_hits += $SECTIONS{$_} foreach (keys %SECTIONS);

	my @areas = sort { $SECTIONS{$b} <=> $SECTIONS{$a} } keys %SECTIONS;

	foreach my $area (@areas)
		{
		printf $out "%10s   %10s%%    %s\n",
			&commas ($SECTIONS{$area}),
			&percent ($SECTIONS{$area}, $all_hits),
			$area;
		}
	}

# vampyre - report for the -v switch, built from %CS_SITES and
# %CS_SITES_LAST (hits and last day seen per watched host).
#
# as written the sub returns right after taking its argument, so it
# prints nothing and everything below the return never runs.
# NOTE(review): looks like the report was switched off on purpose -
# confirm before removing the return or the code after it.
sub vampyre
	{
	my $fh = shift;
	return;

	# --- not reached ---

  my $total_cs_hits = 0;
  my $count = 0;

	print $fh "dean stark visits:\n\n";

	# sum of all hits, used as the base for the percent column
	foreach (keys %CS_SITES)
		{
		$total_cs_hits += $CS_SITES{$_};
		}

	if (! $total_cs_hits)
		{
		print $fh "  - none -\n";
		return;
		}

	printf $fh "%10s     %10s     %10s     %s\n", 
		"hits", "percent", "last", "site";
	printf $fh "%10s     %10s     %10s     %s\n", 
		"----", "-------", "----", "----";

	# one row per host, busiest first
	foreach $sect (sort { $CS_SITES{$b} <=> $CS_SITES{$a} } keys %CS_SITES)
		{
    printf $fh "%10s    %10s%%     %10s     %s\n", 
			&commas ($CS_SITES{$sect}), 
			&percent ($CS_SITES{$sect}, $total_cs_hits),
			$CS_SITES_LAST{$sect}, $sect;
		}
	}

# daily_visitors - write the hosts seen on each day to $fh, newest
# day first, with their hits. $opt_u limits the number of days
# shown; "all" shows every day in @DAILY_ORDER.
sub daily_visitors
	{
	my $out = shift;
	my $days_shown = 0;
	my $everything = ($opt_u eq "all");

	if ($everything)
		{
		print $out "all sites visiting each day:\n\n";
		}
	else
		{
		my $plural = ($opt_u == 1) ? "" : "s";
		print $out "visitors for the last $opt_u day" . $plural . ":\n\n";
		}

	foreach my $day (reverse @DAILY_ORDER)
		{
		$days_shown++;
		last if (! $everything && $days_shown > $opt_u);

		print $out "unique sites visiting on $day:\n\n";

		printf $out "%10s     %s\n", "hits", "site";
		printf $out "%10s     %s\n", "----", "----";

		my $visitors = $DAILY_VISITORS{$day};

		foreach my $site (sort keys %{$visitors})
			{
			printf $out "%10s     %s\n", $visitors->{$site}, $site;
			}
		print $out "\n";
		}
	}

