#!/usr/local/bin/perl -w
#
#-----------------------------------------------------------------------------
#webstats v0.3
#===============
#
#author: 		pat erler
#contact:		pat@patsplanet.com
#projects homepage:	http://patsplanet.com/software/webstats
#latest version:	http://patsplanet.com/software/webstats/download
#
#description:
#------------
#webstats is a front/backend for report magic.
#
#it helps managing the statistics created by analog and report magic for
#multiple virtual domains hosted on one server.
#
#set up properly it creates automagically analog and report magic reports 
#for every virtual domain you add to your site.
#
#additionally included is a php page which shows all existing virtual domains
#on your server with links to the appropriate site, web statistics and netcraft
#surveys.
#
#this version is tested with analog 5.01 and report magic 2.10
#
#usage: 
#------
#please refer to the documentation included in the tarball
#or at http://patsplanet.com/webstats/docs
#
#thanks!
#-------
#the sorting algorithm in the combine subroutine is derived from code
#posted by Benjamin Goldberg in c.l.p.m. thanks, benjamin!
#
#
#-----------------------------------------------------------------------------

#use diagnostics;
#use strict;
use Config::IniFiles;

#we read variables from $PREFIX/etc/webstats/webstats.ini
$cfg = new Config::IniFiles -file => "/usr/local/etc/webstats/webstats.ini";

my $VHOSTDIR=$cfg->val('rmagic', 'VHOSTDIR');
my $WWWDIR=$cfg->val('rmagic', 'WWWDIR');
my $WEBSTATSDIR=$cfg->val('rmagic', 'WEBSTATSDIR');
my $LOGSDIR=$cfg->val('rmagic', 'LOGSDIR');
my $COMBINELOGS=$cfg->val('combine', 'COMBINELOGS');
my $NEWLOGSIDENTIFIER=$cfg->val('combine','NEWLOGSIDENTIFIER');
my $WEBSTATSLOG=$cfg->val('misc','WEBSTATSLOG');
my $DEBUG=$cfg->val('misc','DEBUG');
my $ANALOG=$cfg->val('misc','ANALOG');
my $RMAGIC=$cfg->val('misc','RMAGIC');
#we open our logfile
open( LOG,">$WEBSTATSLOG" )
 or die "Could not open logfile: $WEBSTATSLOG for writing: $!\n";

debug("we are running...");
main();

    


#SUBROUTINES
######################################################################
#
#SUB DEBUG
#---------
#
#What does it do?:
#
#	if DEBUG variable in $PREFIX/etc/webstats/webstats.ini is set to "yes"
#	print some useful information to $WEBSTATSLOG
sub debug{
    if ($DEBUG eq "yes"){
	print LOG $_[0];
        print LOG " ...with these variables right now:\n";
        print LOG "\$VHOSTDIR: $VHOSTDIR\n";
        print LOG "\$WWWDIR: $WWWDIR\n";
        print LOG "\$WEBSTATSDIR: $WEBSTATSDIR\n";
        print LOG "\$LOGSDIR: $LOGSDIR\n";
        print LOG "\$COMBINELOGS: $COMBINELOGS\n";
        print LOG "\$NEWLOGSDENTIFIER: $NEWLOGSIDENTIFIER\n";
	print LOG "\$WEBSTATSLOG: $WEBSTATSLOG\n\n";
        }
}
######################################################################
#SUB MAIN
#---------------
#What does it do?:
#
#	That's the main subroutine. It checks the command line arguments
#	and either spits out a short usage message if not exactly one
#	argument is supplied, a longer help text if webstats is called with -h, 
#	runs webstats for all domains on the server when called
#	with -a, or otherwise runs webstats for the specified domain - 
#	but not without checking first that it exists on the server
#	
sub main{
    @ARGV == 1 or die "Usage:\n\t$0 [virtual domain] || [-a] || [-h]\n\n";
    my $TODO = $ARGV[0];
        if($TODO eq "-h"){
    	    help();
    	    exit;
	    }
        elsif ($TODO eq "-a"){
	    allstats();
	    exit;
	    }
        else{
	    my $VHOST=$TODO;
	    check_if_exist ($VHOST);
	    if ($COMBINELOGS eq "yes"){
		combinelogs ($VHOST);
		};
            run_analog ($VHOST);
	    run_rmagic ($VHOST);    
	    exit;
	    }
}	



######################################################################
#SUB COMBINELOGS
#---------------
#What does it do?:
#
#	Takes as argument the name of a virtual host, goes into
#	the logfile directory of that virtual host, reads all
#	new logfiles, sorts them by date, optionally excludes some
#	lines which you don't want to see in your logfiles (like 
#	checks from your monitoring software) and puts them in
#	a file (one per day) with the date appended
#	(e.g. access_log.20011203)
#
#	when we collect the list of logfiles we take care that we
#	don't count symlinks as real logfiles. take care if you don't
#	want this because you link your logfiles from somewhere else
#
#
#Important Variables:
#	NEWLOGSIDENTIFIER - the beginning of new logfiles
#	DAILYLOG - the name of the output log
#	DATE - the timestamp, appended to the output log
#
#Logformat:
#	The logformat is a standard apache logformat like this line: 
#	213.83.52.132 - - [30/Jun/2001:12:55:53 +0200] "GET / HTTP/1.0" 200 3392 "-" "check_http/1.32.2.6 (netsaint-plugins 1.2.9-4)" 
#	
#	this line is created by this apache directive:
#	LogFormat "%V %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" VLOG=%{VLOG}e"	
#
#	the logfile line above shows a line which would be /excluded/
#	from the output logfile, because it contains "check_http" which 
#	is excluded by a grep (look some lines below here) 	
sub combinelogs{
my $VHOST=$_[0];

#we need a timestamp for the daily logs
chomp (my $DATE=`date +%Y%m%d`);

#get the new logfiles
opendir(LOGDIR, "$VHOSTDIR/$VHOST/$LOGSDIR") || die "can't opendir $VHOSTDIR/$VHOST/$LOGSDIR: $!";
my @logs = grep {/^$NEWLOGSIDENTIFIER/ && -f "$VHOSTDIR/$VHOST/$LOGSDIR/$_" && ! -l "$VHOSTDIR/$VHOST/$LOGSDIR/$_"} readdir(LOGDIR);
closedir LOGDIR;

if (@logs eq ""){
    print "no new logfiles";
    exit;
    }

#combine the VLOGfiles, sort them, remove unwanted lines
#and append them to a file, one per day
open ( my $DAILYLOG,">>$VHOSTDIR/$VHOST/$LOGSDIR/access_log.$DATE" )
 or die "Could not open access_log_$DATE for w: $!\n";

my %mon;
@mon{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (0..11);
my $stabilizer = 0;

foreach (@logs) {
	open ( my $VLOG,"<$VHOSTDIR/$VHOST/$LOGSDIR/$_" )
        or die "Cannot open:$VHOSTDIR/$VHOST/$LOGSDIR/$_";
        for( sort
    	    map {
            # 213.83.52.132 - - [28/Jun/2001:20:29:43 stuff
            m/^\S* \S* \S* (\S*)/;
            my @date = unpack "xa2xa3xa4xa2xa2xa2", $1;
            pack "ncccccNa*",
             $date[2], $mon{$date[1]}, @date[0,3,4,5],
             ++$stabilizer, $_;
            } 
	    grep 
#put all lines you want to exclude from your logs 
#(for instance checks from monitoring software) here
	       !/check_http/ 
	    && !/http_ping\.html/
	    ,<$VLOG>
        ) {  print $DAILYLOG substr $_, 11 }
}
close $DAILYLOG or die "Couldn't close $DAILYLOG $!\n";



#delete VLOGfiles
foreach (@logs){
    unlink "$VHOSTDIR/$VHOST/$LOGSDIR/$_";
    }
}


######################################################################
#SUB RUN_ANALOG
#--------------
#
#What does it do?;
#	
#    first we chdir into the webstats directory of the virtual domain
#    we are working on, because we have some lines in the analog.cfg which 
#    assume that we are right there.
#    
#    then we call a shell in which we run analog with parameters which 
#    exclude all page hits from our own domain
sub run_analog{
    my $VHOST=$_[0];
    chdir "$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR";
    my @run_analog = (
	    "$ANALOG", 
	    "-G",
	    "+g$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.cfg",
	    "+O$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.dat", 
	    "+CREFREPEXCLUDE REGEXP:$VHOST",
	    "+CREFSITEEXCLUDE REGEXP:$VHOST",
	    );
    my $returnstatus = system(@run_analog);
#    my $returnstatus = system("/usr/bin/analog -G +g$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.cfg +O$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.dat +C"REFREPEXCLUDE http://*.$VHOST.*/*" +C"REFSITEEXCLUDE http://*.$VHOST.*" +C"REFSITEEXCLUDE http://$VHOST.*"/usr/bin/analog -G +g$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.cfg +O$VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/analog.dat +C"REFREPEXCLUDE http://*.$VHOST.*/*" +C"REFSITEEXCLUDE http://*.$VHOST.*" +C"REFSITEEXCLUDE http://$VHOST.*"");
    if ($returnstatus){
	print LOG "analog messed up";
	}
    }

######################################################################
#SUB RUN_RMAGIC
#--------------
#
#What does it do?;
#	
#	we call report magic. that's it.
sub run_rmagic{    
    my $VHOST=$_[0];
    my $returnstatus = system("$RMAGIC $VHOSTDIR/$VHOST/$WWWDIR/$WEBSTATSDIR/rmagic.ini");
    if ($returnstatus){
	print LOG "rmagic messed up";
	}
    }
    
    

######################################################################
#SUB ALLSTATS
#------------
#
#What does it do?;
#
#	Reads the directory with your virtual domains and creates
#	statistics for all of them...
#	
sub allstats{
    opendir(VHOSTDIR, "$VHOSTDIR") || die "can't open directory $VHOSTDIR: $!";
    my @VHOSTLIST = grep {! /^\./ && -d "$VHOSTDIR/$_" && ! -l "$VHOSTDIR/$_"} readdir(VHOSTDIR);
    closedir VHOSTDIR;
    
    
    foreach (@VHOSTLIST){
        if ($COMBINELOGS eq "yes"){
    	    combinelogs ($_);
    	    };
        run_analog ($_);
        run_rmagic ($_);    
    }
}

######################################################################
#SUB CHECK_IF_EXIST
#------------------
#
#What does it do?;
#
#	If you called webstats with a name of a domain you want
#	to create webstats for, this routine checks if you have
#	such a domain on your server	
#	
sub check_if_exist{
    my $CHECKVHOST=$_[0];
    opendir(VHOSTDIR, "$VHOSTDIR") || die "can't open directory $VHOSTDIR: $!";
    my @VHOSTLIST = grep {! /^\./ && -d "$VHOSTDIR/$_" && ! -l "$VHOSTDIR/$_"} readdir(VHOSTDIR);
    closedir VHOSTDIR;

    foreach (@VHOSTLIST){
    	if ($CHECKVHOST eq $_){
	    return;
	    }
	}
    print "There is no home directory for domain \n\n\t$CHECKVHOST\.tld\n\nin your directory $VHOSTDIR!\n\n";
    die "Usage:\n\t$0 [virtual domain] || [-a] || [-h]\n\n";

}


######################################################################
#SUB HELP
#--------
#
#What does it do?;
#	
#	print a help text

sub help{
    print "Usage:\n\t$0 [homedir of virtual domain] || [-a] || [-h]\n\n";
    print "You can use $0 to create statistic for just one \n";
    print "of your virtual domains or to create statistics for \n";
    print "all your virtual domains.\n\n";	   
    print "Example:\n\t$0 domain\n";
    print "\tcreates statistics for domain.tld (.com/.net whatever)\n\n";
    print "Note:\n\t \"domain\" in this example is the home directory of your\n";
    print "\tvirtual domain, not the domain name itself. if both are identical,\n";
    print "\tignore this note ;).\n\n";
    print "\t$0 -a\n";
    print "\tcreates serverwide statistics for all domains in $VHOSTDIR\n\n";
    print "   $0 -h prints this text.\n\n";

}		 
	   

__END__
	   
	   
