#!/usr/local/bin/perl

# Feedreaders.pl -- This script estimates the number of people who read your blog and RSS
# feeds.   Simply pass it your log file(s) on the command line, or feed them to stdin.  You don't
# need that much of your log; a couple of days' worth will do.
# The program looks for entries from blog aggregators which report how many people they are
# aggregating.  It also counts ordinary fetchers of your RSS files or home page who fetch
# 3 or more times.  That may slightly overestimate if done over a long period since people change
# IP address.


# Here are some standard feed addresses.   Add more lines to this hash to define others.  You
# can alias one feed to another to combine the results on them if you uncomment some lines
# below, but usually you don't want to do that as many feed aggregators treat your aliased
# feeds as independent.

# By Brad Templeton, http://www.templetons.com/brad and http://ideas.4brad.com 
# Released under the GPL, Version 2.0
# Send changes back to btm@templetons.com


# Feeds to count.  Each key is a feed URL path exactly as it appears in the request line.
# A value of 1 marks a feed in its own right; a string value names the feed this path is an
# alias of (only used if the alias substitution line in the read loop is uncommented).
# This has to be a parenthesised list: assigning a { ... } hash reference to %feeds stores a
# single stringified-reference key, and none of the paths below would ever be found.
%feeds = (
	    "/index.rdf" => 1,
	    "/index.xml" => "/index.rdf",
	    "/crss" => 1,
	    "/node/feed" => "/index.rdf",
	    "/battlestar/feed" => 1
);



# Log files are taken straight from the command line: the `<>` read loop below opens each
# name in @ARGV in turn, or reads stdin when @ARGV is empty.  @ARGV is deliberately left
# untouched here; overwriting it with an empty array would silently discard the file names.

# You can uncomment this line to give the program a default file, otherwise it reads stdin
# if given no arguments
# @ARGV = ("blog_log") unless @ARGV;

# Main read loop: take log lines from the files named in @ARGV (or from stdin when there are
# none) and hand each one to tally_log_line() to be classified.
while( <> ) {
	chomp;
	tally_log_line( $_ );
	}

# tally_log_line( $line ) -- classify one access-log line and update the global tallies.
#
# The line is split on single spaces, which assumes a layout of
#   host ident user [date zone] "METHOD url proto" status size "referer" "agent ..."
# so the URL is the 7th field and everything after the referer is taken as the user agent.
#
# Side effects (there is no meaningful return value):
#   $ips{"HomePage"}{$host}++   for a fetch of / or /node
#   $counts{$url}{$who} = N     when the agent reports "N subscriber(s)" or "N reader(s)"
#   $ips{$url}{$host}++         for any other fetch of a feed URL
# Lines with a referer, lines from crawler-like agents, and non-feed URLs are ignored.
sub tally_log_line {
	my( $line ) = @_;

	# Only host, url, referer and agent are used; the other fields are discarded.
	my( $host, undef, undef, undef, undef, undef, $url, undef, undef, undef,
		$refer, $agent ) = split( / /, $line, 12 );

	# A line too short to carry a URL can never match a feed or the home page.
	return unless defined $url;
	$refer = "" unless defined $refer;
	$agent = "" unless defined $agent;

	return unless length( $refer ) < 4;  # feeds have null referers (a quoted "-" is 3 chars)
	return if $agent =~ /slurp|desktop|spider|crawl|bot/i;

	if( $url =~ /^\/(node)?$/ ) {
	    $ips{"HomePage"}->{$host}++;
	    return;
	    }

	# The dots are escaped so that only genuine ".xml" / ".rdf" suffixes qualify.
	return unless $feeds{$url} || ($url =~ /(feed|rss|\.xml|\.rdf)$/);
	# Substitute in the alternate name if defined.  You probably don't want to do this as
	# some aggregators don't understand that two feeds are the same.
	#$url = $feeds{$url} || $url;

	$agent =~ s/^"|"$//g;

	# The aggregator name is whatever precedes the first " (" or ";" in the agent string.
	my @agtag = split( /\s\(|;\s*/, $agent );
	my $who = $agtag[0];
	$who = "" unless defined $who;

	$who =~ s:/.*:: if $who =~ /^activeweave/i;	# drop the version suffix
	$who = "Rojo" if $agent =~ /www\.rojo\.com/;

	if( $agent =~ /(\d+) (subscriber|reader)/i ) {
	    # Aggregator reporting its own audience: keep the latest figure seen.
	    $counts{$url}->{$who} = $1;
	    }
	 else{
	    # Ordinary fetcher: count hits per host so repeat readers can be found later.
	    $ips{$url}->{$host}++;
	 }

	return;
	}

# Now sum up for all: for every page tallied by host, record under "Singles" how many
# hosts fetched it at least three times (hosts with one or two fetches are not counted).
for my $page ( keys %ips ) {
    my $fetches_by_host = $ips{$page};
    my $regulars = 0;
    for my $host ( keys %{$fetches_by_host} ) {
	$regulars++ if $fetches_by_host->{$host} > 2;
    }
    $counts{$page}->{"Singles"} = $regulars;
}

# Total readership per URL: add together the figure from every source recorded for it
# (each aggregator's reported count plus the "Singles" count).
for my $page ( keys %counts ) {
    $totals{$page} += $_ for values %{ $counts{$page} };
}
# Print one section per URL, smallest total first.  Within a section the sources are listed
# in ascending order of their count, followed by a rule and the URL's total.
my @pages_by_total = sort { $totals{$a} <=> $totals{$b} } keys %counts;
for my $page ( @pages_by_total ) {
    next if $totals{$page} < 3;  	# don't show very low readership
    my $by_source = $counts{$page};
    my @sources = sort { $by_source->{$a} <=> $by_source->{$b} } keys %{$by_source};
    print "$by_source->{$_}\t\t$_\n" for @sources;
    print   "-----\t----------------\n";
    print   "$totals{$page}\t$page\n\n";
}

# Grand total for the main blog: add up every URL that counts as the main feed or the home
# page, remembering which URLs contributed.  Note that in the pattern the ^ anchor applies
# only to the "/index" alternative; the other alternatives match anywhere in the URL.
$maintotal = 0;
for my $page ( keys %totals ) {
    next unless $page =~ m:^/index|node/feed|blog/1/feed|HomePage:;
    $maintotal += $totals{$page};
    push( @main, $page );
}

printf "\n\nMain Feed total: $maintotal\t(From %s)\n", join( ' + ', @main );
