#!/usr/local/bin/ruby24 -w
# -*- ruby -*-

# apercu: reads apache logs and describes them.
# see: http://httpd.apache.org/docs/1.3/logs.html

# Combined Log Format

# Another commonly used format string is called the Combined Log Format. It can
# be used as follows.

# LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" combined
# CustomLog log/access_log combined

# This format is exactly the same as the Common Log Format, with the addition of
# two more fields. Each of the additional fields uses the percent-directive
# %{header}i, where header can be any HTTP request header. The access log under
# this format will look like:

# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"

require 'optparse'
require 'parsedate'
require 'socket'
require 'time'

# Program name and release number; the version is printed by the -v option.
$PACKAGE = 'apercu'
$VERSION = '1.0.1'

# Helpers for calendar arithmetic on local Time values.
module Timish

  # Days per month in a non-leap year, January first.
  DAYS_IN_MONTH = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]

  # Returns [ year, mon, day, hour, min, sec ] for the given Time.
  def self.time_to_array(time)
    time.to_a[0 .. 5].reverse
  end

  # Returns the number of days in the given month (1 .. 12) of the given year,
  # using the Gregorian leap-year rule.
  def self.days_in_month(year, mon)
    leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
    (mon == 2 && leap) ? 29 : DAYS_IN_MONTH[mon - 1]
  end
  private_class_method :days_in_month

  # Returns a new local Time that is +time+ advanced by the amounts in +args+.
  #
  # +args+ maps field names ("year", "mon", "day", "hour", "min", "sec", as
  # strings or symbols) to integer amounts; unknown keys are ignored.
  # Overflow is carried into the next larger field using the real length of
  # each month, so adding days across the end of February or of a 30-day month
  # yields the correct date. Sub-second precision of +time+ is dropped.
  def self.increment(time, args)
    year, mon, day, hour, min, sec = time_to_array(time)

    amounts = %w{ year mon day hour min sec }.collect do |field|
      args[field] || args[field.to_sym] || 0
    end

    year += amounts[0]
    mon  += amounts[1]
    day  += amounts[2]
    hour += amounts[3]
    min  += amounts[4]
    sec  += amounts[5]

    # carry the clock fields upward; div/% floor, so negative amounts borrow.
    min  += sec.div(60)
    sec  %= 60
    hour += min.div(60)
    min  %= 60
    day  += hour.div(24)
    hour %= 24

    # the month must be in 1 .. 12 before its length can be looked up.
    year += (mon - 1).div(12)
    mon   = (mon - 1) % 12 + 1

    # roll surplus days forward, one month at a time.
    while day > (length = days_in_month(year, mon))
      day -= length
      mon += 1
      if mon > 12
        mon   = 1
        year += 1
      end
    end

    # and borrow from preceding months when the day went below 1.
    while day < 1
      mon -= 1
      if mon < 1
        mon   = 12
        year -= 1
      end
      day += days_in_month(year, mon)
    end

    Time.local(year, mon, day, hour, min, sec)
  end
end


module ApacheLog

  # Parses Apache access-log lines according to a LogFormat string.
  #
  # The format string is turned into a regular expression by replacing each
  # known directive (see FIELDS) with a pattern. Only the fields registered as
  # captures are extracted; each parsed line becomes a Hash from field name to
  # captured text (the "time" field is converted to a Time), which is appended
  # to the writer, if one was given.
  class Parser

    # Each record is [ format directive, field name, default pattern ].
    FIELDS = [
      [ "%h",                 "remotehost", '\S+'        ],
      [ "%l",                 "identity",   '\S+'        ],
      [ "%u",                 "userid",     '\S+'        ],
      [ "%t",                 "time",       '\[[^\]]+\]' ],
      [ "%r",                 "request",    '[^\"]+'     ],
      [ "%>s",                "status",     '\d+'        ],
      [ "%b",                 "size",       '\S+'        ],
      [ "%{Referer}i",        "referrer",   '[^\"]*'     ],
      [ "%{User-agent}i",     "useragent",  '[^\"]*'     ],
    ]

    # Matches the day/month/year:hour:min:sec part of a %t field.
    TIMESTAMP_RE = %r{ (\d+) / (\w+) / (\d+) : (\d+) : (\d+) : (\d+) }x

    attr_reader :format
    attr_reader :conf_file
    attr_reader :writer

    # "%h %l %u %t \"%r\" %>s %b"

    # writer:: object receiving each parsed record via <<, or nil.
    # fmt::    the LogFormat string describing the lines to be parsed.
    def initialize(writer, fmt = nil)
      @format     = fmt
      @conf_file  = nil
      @regexp     = nil
      @order      = Array.new
      @captures   = Array.new
      @fields     = nil
      @match      = Hash.new
      @writer     = writer
    end

    # Restricts the given field to the pattern +value+ (a regexp source string
    # or a Regexp) instead of the field's default pattern.
    def match(field, value)
      @regexp = nil             # the cached regexp no longer reflects @match
      @match[field.to_s] = value
    end

    # NOTE(review): negative matching was never implemented; as in the original
    # code, this only clears any pattern set for the field and ignores +value+.
    def nomatch(field, value)
      @regexp = nil
      @match[field.to_s] = nil
    end

    # Replaces the list of field names to be captured.
    def captures=(cap)
      @regexp = nil
      @captures = cap
    end

    def captures
      @captures
    end

    # Adds one field name to the list of captured fields.
    def capture(what)
      @regexp = nil             # the capture groups have changed
      @captures << what
    end

    # Returns the names of the fields in the format, in order of appearance.
    def fields
      @fields ||= format_to_records.collect { |rec| rec[1] }
    end

    # Returns the (memoized) Regexp built from the format string, with a
    # capture group around each captured field.
    def regexp
      @regexp ||= begin
                    restr = '^' + @format.dup

                    FIELDS.each do |symbol, name, symre|
                      # block form, so backslashes in the pattern are not
                      # interpreted as replacement escapes.
                      restr.sub!(symbol) do
                        re = @match[name] || symre
                        re = re.source if re.kind_of?(Regexp)
                        @captures.include?(name) ? '(' + re + ')' : re
                      end
                    end

                    Regexp.new(restr)
                  end
    end

    # Returns the FIELDS records for the directives found in the format, in
    # the order in which they appear there. Unknown directives are skipped.
    def format_to_records
      recs = []
      @format.scan(%r{ [^\s\"]+ }msx).each do |token|
        rec = FIELDS.assoc(token)
        recs << rec if rec
      end
      recs
    end

    # Returns the directives of the format that are known fields.
    def format_to_fields
      format_to_records.collect { |rec| rec[0] }
    end

    # Converts a %t value such as "[10/Oct/2000:13:55:36 -0700]" to a Time.
    #
    # The wall-clock fields are taken as local time and the UTC offset is
    # ignored, so that results line up with the Time.local values used for
    # reporting intervals. Raises ArgumentError for an unparseable value.
    def convert_time(tmstr)
      md = TIMESTAMP_RE.match(tmstr)
      unless md
        raise ArgumentError, "unparseable timestamp: #{tmstr}"
      end

      day, mon, year, hour, min, sec = md.captures
      # Time.local accepts the three-letter English month name directly.
      Time.local(year.to_i, mon, day.to_i, hour.to_i, min.to_i, sec.to_i)
    end

    # Parses a single line. Returns the Hash of captured fields (also appended
    # to the writer), or nil if the line does not match the format/filters.
    # With args[:verbose], diagnostic output is written to standard output.
    def parse_line(line, args = Hash.new)
      verbose = args[:verbose]

      if verbose
        puts "line    : " + line
        puts "regexp  : #{regexp}"
        puts "captures: " + captures.inspect
        puts "fields  : " + fields.inspect
      end

      md = regexp.match(line)
      unless md
        if line.length > 0 && verbose
          puts "not a match: #{line}"
        end
        return nil
      end

      act = {}
      fidx = 1                  # index of the next capture group in md
      fields.each do |field|
        if captures.include?(field)
          value = md[fidx]
          act[field] = field == "time" ? convert_time(value) : value
          fidx += 1
        elsif verbose
          puts "not captured #{field}"
        end
      end

      if verbose
        puts "act: #{act.inspect}"
      end

      @writer << act if @writer

      act
    end

    # Parses every line yielded by +file+ (any object with #collect, such as
    # an IO) and returns the records of the matching lines.
    def parse_file(file, args = Hash.new)
      file.collect { |line| parse_line(line, args) }.compact
    end

  end

  # Collects parsed log records (Hashes keyed by field name) and prints
  # reports about them.
  #
  # All print methods take an options Hash; args[:out] is the stream written
  # to, defaulting to $stdout.
  class Writer

    attr_reader :activities

    def initialize
      @activities = Array.new
    end

    # Appends one parsed record.
    def <<(act)
      @activities << act
    end

    # Prints, for each remote host in order of first appearance, the user
    # agent, referrer and time of its first request, followed by one line per
    # request with the seconds elapsed since the first one, the status and the
    # requested resource. With args[:lookup], addresses are resolved to names.
    def print_users(args = Hash.new)
      out = args[:out] || $stdout

      users  = []               # hosts, in order of first appearance
      byuser = {}

      @activities.each do |act|
        user = act['remotehost']
        unless byuser.key?(user)
          users << user
          byuser[user] = []
        end
        byuser[user] << act
      end

      users.each do |user|
        host = args[:lookup] ? lookup_host(user) : user

        out.puts "    user: #{host}"

        acts     = byuser[user]
        first    = acts.first
        origtime = first['time']

        out.puts "        agent: #{first['useragent']}"
        out.puts "        ref  : #{first['referrer']}"
        out.puts "        start: #{origtime}"

        acts.each do |act|
          timediff = act['time'] - origtime
          out.printf "            [%5d] [%3s] %s\n", timediff, act['status'], act['request'].split(%r{\s+})[1]
        end
        out.puts
      end
    end

    def print_hit_counts(args = Hash.new)
      hwriter = HitCountWriter.new(@activities)
      hwriter.print(args)
    end

    def print_hit_averages(args = Hash.new)
      hwriter = HitAverageWriter.new(@activities)
      hwriter.print(args)
    end

    # Prints the number of requests per referrer.
    #
    # args[:skip]::  referrers containing this string are left out.
    # args[:sort]::  "name" sorts by referrer; otherwise by count.
    # args[:order]:: "ascending" sorts counts upward; the default is downward.
    def print_referrers(args = Hash.new)
      tally = Hash.new(0)

      @activities.each do |act|
        ref = act['referrer']
        tally[ref] += 1 unless skipped?(ref, args[:skip])
      end

      print_tally("referrers", tally, args)
    end

    # Compares a and b, inverting the result when factor is -1.
    def compare(a, b, factor)
      factor * (a <=> b)
    end

    # Prints the number of requests per resource (the second word of the
    # request line). Takes the same options as print_referrers; args[:skip]
    # is tested against the whole request line.
    def print_resources(args = Hash.new)
      tally = Hash.new(0)

      @activities.each do |act|
        req = act['request']
        unless skipped?(req, args[:skip])
          tally[req.split(%r{\s+})[1]] += 1
        end
      end

      print_tally("resources", tally, args)
    end

    private

    # Resolves a dotted-quad address to a host name, returning the address
    # itself when the lookup fails.
    def lookup_host(addr)
      packed = addr.split('.').collect { |x| x.to_i }.pack("CCCC")
      Socket::gethostbyaddr(packed)[0]
    rescue SocketError
      addr
    end

    # Whether +text+ is excluded by the +skip+ substring, if any.
    def skipped?(text, skip)
      skip && text && text.index(skip)
    end

    # Prints the title, then one "count name" line per entry of +tally+,
    # sorted according to args[:sort] and args[:order], then a blank line.
    def print_tally(title, tally, args)
      out = args[:out] || $stdout

      out.puts title

      sorted = if args[:sort] == "name"
                 tally.sort { |a, b| a[0] <=> b[0] }
               else
                 factor = args[:order] == "ascending" ? 1 : -1
                 tally.sort do |a, b|
                   cmp = compare(a[1], b[1], factor)
                   # equal counts are listed by name
                   cmp == 0 ? a[0] <=> b[0] : cmp
                 end
               end

      sorted.each do |name, count|
        out.printf "%8d %s\n", count, name
      end

      out.puts
    end

  end

  # Common base of the hit reports: holds the parsed records and knows how to
  # print the column banner and the individual count lines.
  class HitWriter

    UNIT_NAMES = %w{ year mon day hour min sec }

    def initialize(activities)
      @activities = activities
    end

    # Writes one line: the numbers right-aligned in 8-wide columns, followed
    # by the time fields as "yyyy-mm-dd", plus ": hh:mm:ss" for as many finer
    # fields as are given.
    def print_number_line(out, numbers, times)
      out.print numbers.collect { |num| sprintf "%8s", num }.join(" ")

      padded = times.collect { |val| sprintf "%02d", val }
      date   = padded[0 .. 2].join("-")
      clock  = padded[3 ... padded.length]

      stamp = padded.length > 3 ? date + ": " + clock.join(":") : date

      out.printf "    %s\n", stamp
    end

    # Writes one banner line: the column labels, then the label for the time.
    def print_banner_line(out, fields, time)
      columns = fields.collect { |label| sprintf "%8s", label }
      out.print columns.join(" ")
      out.printf "    %s\n", time
    end

    # Prints the title and the two banner lines for the unit in args[:unit]
    # (default "hour"), and returns the names of the fields from "year" down
    # to that unit. An unknown unit is reported on standard error and ends the
    # program with status -2.
    def print_banner(title, args = Hash.new)
      out = args[:out] || $stdout

      out.puts title

      unit  = args[:unit] || "hour"
      depth = UNIT_NAMES.index(unit)

      if depth.nil?
        $stderr.puts "no field found for unit #{unit}; must be one of #{UNIT_NAMES.join(', ')}"
        exit(-2)
      end

      shown  = UNIT_NAMES[0, depth + 1]
      dashes = shown.collect { |name| '-' * name.length }

      print_banner_line(out, shown, "time")
      print_banner_line(out, dashes, "----")

      shown
    end

  end

  # Prints the number of hits per time interval, with running totals for each
  # coarser interval (e.g. per year, month, day and hour when the unit is
  # "hour"). Intervals without hits are printed with a zero count.
  #
  # The records are expected to be in chronological order.
  class HitCountWriter < HitWriter

    def initialize(activities)
      super
      @lasttimes  = nil         # time fields of the last count line printed
      @prevcounts = nil         # counts of the last count line printed
    end

    # Prints the report.
    #
    # args[:out]::       stream to write to (default $stdout).
    # args[:unit]::      finest interval reported (default "hour").
    # args[:starttime]:: optional Time at which the reported period starts.
    # args[:endtime]::   optional Time at which the reported period ends;
    #                    empty intervals are printed up to it.
    def print(args = Hash.new)
      out = args[:out] || $stdout

      fields = print_banner("hit counts by time", args)

      unit = args[:unit] || "hour"

      # one running count per field, from "year" down to the unit.
      counts = Array.new(fields.length, 0)

      meths     = fields.collect { |f| f.to_sym }
      prevtime  = nil

      @activities.each do |act|
        tm = prevtime || args[:starttime]
        if tm
          meths.each_with_index do |meth, idx|
            if act['time'].send(meth) != tm.send(meth)
              # this record starts a new interval: flush the previous one, and
              # restart the counts of the changed field and all finer ones.
              print_count_lines(out, counts, tm)

              (idx ... fields.length).each do |i|
                counts[i] = 0
              end

              break
            end
          end
        end

        counts.collect! { |ct| ct + 1 }

        prevtime = act['time']
      end

      if prevtime
        print_count_lines(out, counts, prevtime)

        # use the defaulted unit, so that the step is one interval even when
        # args[:unit] was not given.
        tm = Timish::increment(prevtime, { unit => 1 })

        if args[:endtime] && tm < args[:endtime]
          print_interval_by_time(out, tm, args[:endtime], counts)
        end
      elsif args[:starttime] && args[:endtime]
        print_interval_by_time(out, args[:starttime], args[:endtime], counts)
      end

      out.puts
    end

    # Prints the count line for the interval containing +time+, preceded by
    # zero lines for every interval skipped since the previous count line.
    def print_count_lines(out, counts, time)
      # year .. sec, limited to the fields for which counts are kept.
      times = time.to_a[0 .. 5].reverse[0 ... counts.length]

      # dump zeros for every time from the previous one to this one.
      if @lasttimes
        print_interval_by_array(out, @lasttimes, times, @prevcounts, false)
      end

      @lasttimes  = times.dup
      @prevcounts = counts.dup

      print_number_line(out, counts, times)
    end

    # Prints zero lines for every interval from +fromtime+ through +totime+.
    def print_interval_by_time(out, fromtime, totime, counts)
      from, to = [ fromtime, totime ].collect do |x|
        Timish::time_to_array(x)[0 ... counts.length]
      end

      print_interval_by_array(out, from, to, counts, true)
    end

    # Prints zero lines for the intervals between the field arrays +from+ and
    # +to+ (year first). Unless +inclusive+, the interval of +from+ itself is
    # not printed.
    def print_interval_by_array(out, from, to, counts, inclusive)
      # a difference in any field but the last means the range crosses a
      # boundary of a coarser interval, so it has to be walked field by field.
      (0 ... from.length - 1).each do |idx|
        if from[idx] < to[idx]
          zeroed = counts.dup
          fr = from.dup
          unless inclusive
            fr[-1] += 1
          end

          while fr != to
            (fr.length - 1).downto(1) do |fi|
              begin
                # Time.local both validates and normalizes the fields.
                tm = Time.local(*fr)
                fr = tm.to_a.reverse[-6, fr.length]
                if fr != to
                  (fi ... zeroed.length).each { |zi| zeroed[zi] = 0 }
                  print_number_line(out, zeroed, fr)
                  fr[-1] += 1
                end
                break
              rescue ArgumentError
                # a field ran past its range: reset it and the finer ones,
                # and carry into the next coarser field.
                (fi ... fr.length).each { |i| fr[i] = 0 }

                case fi
                when 2
                  # day of the month
                  fr[1] += 1
                  fr[2] = 1
                else
                  fr[fi - 1] += 1
                end
                # and try again.
              end
            end
          end
          return
        end
      end

      # only the last field differs: count it up directly.
      idx = from.length - 1

      if from[idx] + 1 < to[idx] || (inclusive && from[idx] + 1 == to[idx])
        x = from.dup
        unless inclusive
          x[-1] += 1
        end
        y = to.dup
        while x[-1] < y[-1] || (inclusive && x[-1] == y[-1])
          print_number_line(out, counts[0 .. -2] + [0], x)
          x[-1] += 1
        end
      end
    end

  end

  # Intended to print hit averages per time interval; instantiated by
  # Writer#print_hit_averages.
  #
  # NOTE(review): this class looks unfinished and cannot run to completion as
  # written:
  # - print refers to +counts+, which is never assigned in this method;
  # - print calls print_count_lines and print_interval_by_time, which are
  #   defined in HitCountWriter, not in HitWriter or in this class;
  # - the "hits0"/"hits1" messages lack the '#' of an interpolation and so
  #   print the literal text "{hits.inspect}"; all three puts calls write
  #   debugging output to standard output rather than to +out+.
  # The intended meaning of the [1, 0] pairs is not evident from the code --
  # presumably [ number of intervals, number of hits ]; TODO confirm.
  class HitAverageWriter < HitWriter

    # Prints the banner for args[:unit] (default "hour") to args[:out]
    # (default $stdout), then walks the records, keeping one pair per field
    # from "year" down to the unit.
    def print(args = Hash.new)
      out = args[:out] || $stdout

      fields = print_banner("hit averages by time", args)

      # NOTE(review): assigned but not read; the code below uses args[:unit].
      unit = args[:unit] || "hour"
      
      hits = Array.new
      fields.each_with_index { |field, idx| hits[idx] = [1, 0] }

      puts "hits0: {hits.inspect}"

      meths     = fields.collect { |f| f.to_sym }
      prevtime  = nil

      @activities.each do |act|
        tm = prevtime || args[:starttime]
        if tm
          meths.each_with_index do |meth, idx|
            # the first field in which this record differs from the previous
            # time resets the pair of that field and of all finer fields.
            if act['time'].send(meth) != tm.send(meth)

              puts "hits1: {hits.inspect}"

              # print_count_lines(out, counts, tm)
              
              (idx ... fields.length).each do |i|
                hits[i] = [1, 0]
              end
              
              break
            end
          end
        end

        # count this record in the second element of every pair.
        hits.collect! { |x| [x[0], x[1] + 1] }
        
        puts "hits2: #{hits.inspect}"
        
        #counts.collect! { |ct| ct + 1 }

        prevtime = act['time']
      end
      
      if prevtime
        # puts "prevtime: #{prevtime}"
        # NOTE(review): +counts+ and print_count_lines are undefined here.
        print_count_lines(out, counts, prevtime)

        tm = Timish::increment(prevtime, { args[:unit] => 1 })

        # puts "args[:endtime]: #{args[:endtime]}; tm: #{tm}"

        if args[:endtime] && tm < args[:endtime] # && tm.to_i <= args[:endtime].to_i
          # puts "args[:endtime]: #{args[:endtime]}; tm: #{tm}"
          print_interval_by_time(out, tm, args[:endtime], counts)
        end
      elsif args[:starttime] && args[:endtime]
        print_interval_by_time(out, args[:starttime], args[:endtime], counts)
      end
      
      out.puts
    end

  end

end

# Returns the regexp source matching the date part ("dd/Mon/yyyy") of an
# Apache timestamp.
#
# year::  the year, or nil to match any year.
# month:: the month as a number (1 .. 12) or as the abbreviation used in the
#         log ("Oct"), or nil to match any month.
# day::   the day of the month as a number, or nil to match any day.
def make_time_pattern(year, month, day)
  # spelled out here rather than taken from the undocumented constants of the
  # time library.
  abbreviations = %w{ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec }

  mon = if month.nil?
          '\w+'
        elsif month.kind_of?(Integer)
          abbreviations[month - 1]
        else
          month
        end

  dd = day.nil?  ? '\d+' : sprintf("%02d", day)
  yy = year.nil? ? '\d+' : year.to_s

  [ dd, mon, yy ].join("/")
end

# Translates a time period into what is needed to restrict a report to it.
#
# tm:: "today", "yesterday", "this-week", "this-month", "this-year" (the
#      hyphen is optional), or a regexp source matching the start of the
#      timestamp inside its brackets, e.g. "10/Oct/2000". nil means that no
#      period was given.
#
# Returns [ unit, pattern, starttime, endtime ]:
# unit::      the reporting interval suited to the period, or nil;
# pattern::   regexp source matching the whole bracketed %t field, or nil
#             when tm is nil;
# starttime:: local Time at which the period starts, or nil;
# endtime::   local Time at which the period ends, or nil.
def make_time_match(tm)
  unit      = nil
  starttime = nil
  endtime   = nil

  # no period at all (e.g. --time without an argument): nothing to match.
  return [ nil, nil, nil, nil ] if tm.nil?

  now      = Time.new
  midnight = Time.local(now.year, now.mon, now.day)

  time = case tm
         when "today"
           unit      = "hour"
           starttime = midnight
           endtime   = now

           make_time_pattern(now.year, now.mon, now.day)

         when "yesterday"
           unit      = "hour"
           # derived from the last second of yesterday rather than from
           # "day - 1", which is not a valid date on the first of a month.
           endtime   = midnight - 1
           starttime = Time.local(endtime.year, endtime.mon, endtime.day)

           make_time_pattern(starttime.year, starttime.mon, starttime.day)

         when %r{this\-?week}
           unit = "day"

           # go back to the most recent Monday ...
           mon = now
           while mon.wday != 1
             mon -= SECONDS_IN_DAY
           end

           # ... and collect every day from then through today.
           days = [ mon ]

           day = mon
           while day.wday != now.wday
             day += SECONDS_IN_DAY
             days << day
           end

           starttime = Time.local(mon.year, mon.mon, mon.day)
           # NOTE(review): this is the last second of yesterday, although the
           # pattern includes today; kept as in the original -- confirm.
           endtime   = Time.local(day.year, day.mon, day.day) - 1

           "(?:" + days.collect { |d| d.strftime("%d/%b/%Y") }.join("|") + ")"

         when %r{this\-?month}
           unit      = "day"
           starttime = Time.local(now.year, now.mon, 1)
           endtime   = midnight

           make_time_pattern(now.year, now.mon, nil)

         when %r{this\-?year}
           unit      = "mon"
           starttime = Time.local(now.year, 1, 1)
           endtime   = midnight

           make_time_pattern(now.year, nil, nil)

         else
           tm
         end

  # the period must start the bracketed field; anything may follow it.
  time = "\\[" + time + "[^\\\]]*\\]"
  [ unit, time, starttime, endtime ]
end

# Length of a day in seconds: 24 hours of 60 minutes of 60 seconds.
SECONDS_IN_DAY = 24 * 60 * 60

# Reads the settings in the given run-control file.
#
# Each setting is a line of the form "name = value" or "name: value"; only
# the first separator splits the line, so values may themselves contain '='
# or ':'. Text from a '#' to the end of the line is a comment. Lines without
# both a name and a value are ignored.
#
# Returns an Array of [ name, value ] pairs in file order, which is empty if
# the file does not exist.
def read_rcfile(fname)
  config = Array.new

  return config unless File.exist?(fname)

  IO.readlines(fname).each do |line|
    setting = line.sub(/\s*#.*/, "").strip
    name, value = setting.split(/\s*[=:]\s*/, 2)
    next if name.nil? || name.empty? || value.nil? || value.empty?

    config << [ name, value ]
  end

  config
end

module Env

  # Returns the home directory: $HOME where set (Unix), otherwise the
  # combination of %HOMEDRIVE% and %HOMEPATH% (Windows), or nil if none of
  # these is set.
  def self.home_directory
    home = ENV["HOME"]
    return home if home

    drive = ENV["HOMEDRIVE"]
    path  = ENV["HOMEPATH"]
    return nil unless drive || path

    # a missing drive is left out; a missing path means the root.
    (drive || "") + (path || "\\")
  end

end

# Runs the program: reads the settings in ~/.apercu, processes the command
# line, parses the first input file (or standard input, when no file or "-"
# is given) and prints the requested reports in the order in which they were
# requested.
def main
  default_format = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""

  format    = default_format

  # possible matches/filters
  users     = nil
  referrers = nil
  resources = nil
  time      = nil

  order     = []

  args      = { :quiet => false, :verbose => false, :averages => false }

  if hd = Env.home_directory
    homerc = hd + "/.apercu"
    config = read_rcfile(homerc)

    config.each do |cfg|
      name, value = *cfg

      case name
      when "format"
        format = value
      when "sort-by-name"
        # the writers test for the value "name", as set by --sort-by-name.
        args[:sort] = "name" if value == "true"
      when "lookup-ips"
        args[:lookup] = value == "true"
      when "quiet"
        args[:quiet] = value == "true"
      else
        $stderr.puts "invalid option in RC file: #{name}"
      end
    end
  end

  # Block parameters are local to their block, so each value is assigned to
  # the variable of this method explicitly.
  opts = OptionParser.new do |op|
    prog = File.basename(__FILE__)
    op.banner = "Usage: #{prog} [options] input ..."

    op.on("--format=FORMAT",
          "Use the given log format") { |val| format = val }

    op.on("--users[=FILTER]",
          "Display user information") do |val|
      users = val
      order << :users
    end

    op.on("--hits[=INTERVAL]",
          "Display hit counts, using the given interval") do |x|
      args[:unit] = x if x
      order << :hits
    end

    if false
      op.on("--hit-averages[=INTERVAL]",
            "Display hit averages, using the given interval") do |x|
        args[:unit] = x if x
        order << :hits
      end
    end

    op.on("--visits[=INTERVAL]",
          "Display visit counts, using the given interval") do |x|
      args[:unit] = x if x
      order << :visits
    end

    op.on("--visit-averages[=INTERVAL]",
          "Display visit averages, using the given interval") do |x|
      args[:unit] = x if x
      order << :visits
    end

    op.on("--referrers[=FILTER]",
          "Display referrers, optionally filtering") do |val|
      referrers = val
      order << :referrers
    end

    op.on("--resources[=FILTER]",
          "Display resources, optionally filtering") do |val|
      resources = val
      order << :resources
    end

    op.on("--sort-by-name",
          "Sort resources and referrers by name instead of by count") do
      args[:sort] = "name"
    end

    op.on("--time[=PERIOD]",
          "Display results from only the given time period") do |tm|
      unit, time, starttime, endtime = make_time_match(tm)
      args[:unit]      ||= unit
      args[:starttime]   = starttime
      args[:endtime]     = endtime
    end

    if false
      op.on("--timeout[=PERIOD]",
            "Set the visit timeout period, in seconds.") do |tm|
        args[:timeout] = tm
      end
    end

    op.on("--lookup-ips",
          "Resolve IP addresses to host names") { |val| args[:lookup] = val }

    op.on("--averages",
          "Display hits and visits by their averages") { |val| args[:averages] = val }

    op.on("--quiet",
          "Run with minimal output") { |val| args[:quiet] = val }

    op.on("--verbose[=LEVEL]", "-V", "Run with verbose output") do |val|
      args[:verbose] = val ? val.to_i : (args[:verbose] || 0) + 1
    end

    op.on("-v", "Print the version and exit") do
      puts "#{prog} #{$VERSION}"
      puts "Written by Jeff Pace (jpace@incava.org)"
      puts "Released under the Apache License."
      exit
    end

    op.on_tail("--help", "-h", "Display this message") do
      puts op
      exit
    end
  end

  begin
    opts.parse!(ARGV)
  rescue OptionParser::ParseError => e
    puts e
    puts opts.banner
    exit(-2)
  end

  if order.length == 0
    order = [ :users, :hits, :referrers, :resources ]
    # order = [ :users, :visits, :hits, :referrers, :resources ]
  end

  # capture only the fields needed by the requested reports.
  captures = []
  order.each do |ord|
    capts = case ord
            when :users
              %w{ remotehost status time request size referrer useragent request }
            when :visits
              %w{ remotehost }
            when :hits
              %w{ time }
            when :referrers
              %w{ referrer }
            when :resources
              %w{ request }
            end
    captures += capts
  end

  captures.uniq!

  writer = ApacheLog::Writer.new
  parser = ApacheLog::Parser.new(writer, format)

  parser.captures = captures
  matches = [
    [ :remotehost, users     ],
    [ :referrer,   referrers ],
    [ :request,    resources ],
    [ :time,       time      ]
  ]

  matches.each do |match|
    if match[1]
      parser.match(match[0], match[1])
    end
  end

  if ARGV.length == 0 || ARGV[0] == "-"
    unless args[:quiet]
      $stderr.puts "reading standard input ..."
    end
    parser.parse_file($stdin, args)
  else
    # the block form closes the file once it has been parsed.
    File.open(ARGV[0]) { |io| parser.parse_file(io, args) }
  end

  order.each do |ord|
    case ord
    when :users
      writer.print_users(args)
    when :hits
      writer.print_hit_counts(args)
    when :visits
      # the writer has no visit report (yet); say so instead of failing.
      if writer.respond_to?(:print_visit_counts)
        writer.print_visit_counts(args)
      else
        $stderr.puts "visit reports are not implemented"
      end
    when :referrers
      writer.print_referrers(args)
    when :resources
      writer.print_resources(args)
    end
  end
end

# Run only when executed as a script, not when loaded as a library.
if $0 == __FILE__
  begin
    main
  rescue Interrupt
    # interrupted by the user (e.g. control-C): end quietly.
  end
end
