#!/usr/bin/perl
# -*- perl -*-

=head1 NAME

http_load_ - Munin multigraph plugin to monitor websites' HTTP responses and performance

=head1 DESCRIPTION

The purpose of this plugin is to monitor several properties of a web page.
All measurements are done for the complete web page, including images, css
and other content a standard browser would download automatically.

This version supports monitoring:
 - loadtime: total time to download a complete web page (using serial GET requests)
 - size: total size of a web page
 - response: different response codes (200, 404, 500, etc)
 - tags: HTML tags (img src, a href, etc)
 - type: content types (image/png, text/css, etc)
 - elements: source of elements loaded by the web page

=head1 REQUIREMENTS

 - The server running this plugin must be allowed to connect to the web
   server(s) you are going to monitor.
 - Some perl modules: 
   Time::HiRes, LWP::UserAgent, HTML::LinkExtor, LWP::ConnCache

=head1 CONFIGURATION

=head2 INITIAL SETUP

1. Copy this file to /usr/share/munin/plugins/

2. Create a file (/etc/munin/http_load_urls.txt) with one
   full url per line, as many as you want, e.g.:
    $ echo "http://www.dn.no/" >> /etc/munin/http_load_urls.txt
    $ echo "http://www.intrafish.no/" >> /etc/munin/http_load_urls.txt

3. Add a cron job running the plugin with cron as the argument:
   */15 * * * * <user> /usr/sbin/munin-run http_load_<site>_loadtime cron
   <user> should be the user that has write permission to the $cachedir
   directory set below. <site> should be any of the configured sites (all
   sites will get updated), likewise, you should replace loadtime by any
   metric that is enabled for that site (all metrics will get updated).
   Set the intervals to whatever you want.

   For verbose output (for debugging) you can do:
   sudo -u <user> /usr/share/munin/plugins/http_load_ cron verbose

4. Run munin-node-configure --suggest --shell and run the symlink
   commands manually to update the munin-node plugin list.

5. If you want to change the filter which the plugin uses to select which
   tags to follow in a web page, edit the subroutine called "filter" below.

=head2 SPECIFY URLS TO MONITOR

1. Add a new line in /etc/munin/http_load_urls.txt with the full URL, e.g.:
    $ echo "http://www.linpro.no/" >> /etc/munin/http_load_urls.txt

2. Run munin-node-configure --suggest --shell and manually
   add the new symlink(s)

3. /etc/init.d/munin-node restart

=head2 REMOVE A URL

1. Remove it from /etc/munin/http_load_urls.txt

2. Remove ${cachedir}/http_load_<url_id>*

3. Remove /etc/munin/plugins/http_load_<url_id>*

4. /etc/init.d/munin-node restart

=head2 SINGLE GRAPH SUPPORT

The default behaviour is the multigraph mode: only the loadtime will be shown
on the Munin summary page. The graphs there are linked to a second-level
summary page that list all other metrics. It is also possible to create
single graphs, that would show immediately on the summary page, by using
symlinks with a different name, postfixed with the name of the metric:

 - http_load_hostname:		multigraph (default)
 - http_load_hostname_loadtime:	loadtime only
 - http_load_hostname_size:	total page size
 - http_load_hostname_response:	response code
 - http_load_hostname_tags:	HTML tags summary
 - http_load_hostname_type:	Content-Types
 - http_load_hostname_elements:	source site of the loaded elements

Note that hostname is not the FQDN of the host, but rather the id suggested
when running munin-node-configure --suggest --shell, i.e. the configured URL
with all non-alphanumeric characters removed.

=head1 MAGIC MARKERS

  #%# family=auto
  #%# capabilities=autoconf suggest

=head1 TODO

 - Specify URLs from a standard Munin plugins configuration file (e.g., env.urls)
 - Add support for forking to simulate real browsers

=head1 AUTHORS

 - Espen Braastad / Linpro AS <espen@linpro.no>, initial implementation
 - Olivier Mehani <shtrom+munin@ssji.net>, multigraph support

=cut

use strict;
use Time::HiRes qw( gettimeofday tv_interval );
use LWP::UserAgent;
use HTML::LinkExtor;
use LWP::ConnCache;

# File listing the URLs to monitor, one full URL per line
my $url_file="/etc/munin/http_load_urls.txt";
# Directory holding the per-URL cache files, taken from the environment
# (NOTE(review): presumably exported by munin-node/munin-run; it is undefined
# when the variable is not set -- confirm)
my $cachedir=$ENV{MUNIN_PLUGSTATE};

# Any true value of MUNIN_DEBUG enables the debug messages printed below
my $debug=$ENV{MUNIN_DEBUG};
# Value handed to LWP::UserAgent->timeout() for every request
my $timeout=10;
# Value handed to LWP::UserAgent->max_redirect()
my $max_redirects=10;
# Plugin name prefix; also used as the prefix of the cache file names
my $scriptname="http_load_";
# User-Agent string set on the LWP::UserAgent used for all requests
my $useragent="Mozilla/5.0 (Munin; $scriptname)";

# Read the URL list and return it as a hash.
#
# Parameters:
#   $_[0] - path of a text file holding one full URL per line
# Returns:
#   a hash (as a flat list) mapping the id of each URL, as computed by
#   get_id(), to the URL itself. Lines whose id is empty (e.g. blank lines)
#   are skipped. A missing or unreadable file yields an empty hash.
sub read_urls{
	my $file=$_[0];
	my %urls=();

	return %urls unless defined($file) && -r $file;

	# Three-argument open with a lexical handle: the file name can never be
	# interpreted as an open mode and no global FILE handle is clobbered.
	my $fh;
	if(!open($fh, '<', $file)){
		warn "Cannot open URL file $file: $!\n";
		return %urls;
	}
	while (my $url = <$fh>) {
		# Drop the line ending (including a CR from DOS-style files) and
		# any surrounding blanks so they do not end up in the request URL
		$url =~ s/^\s+|\s+$//g;
		my $id=get_id($url);
		if(length($id)>0){
			$urls{$id}=$url;
		}
	}
	close($fh);

	return %urls;
}

# Read a cache file and return its contents as a hash.
#
# Every line is expected to look like "<key> <value>": the key is the first
# run of non-whitespace characters, the value is the rest of the line after
# the separating whitespace (it may be empty or contain blanks).
#
# Parameters:
#   $_[0] - path of the cache file
# Returns:
#   a hash (as a flat list) mapping keys to values. Lines that do not start
#   with a key followed by whitespace are skipped. A missing or unreadable
#   file yields an empty hash.
sub read_cache{
	my $file=$_[0];
	my %cache=();

	return %cache unless defined($file) && -r $file;

	# Three-argument open with a lexical handle, and a checked result
	my $fh;
	if(!open($fh, '<', $file)){
		warn "Cannot open cache file $file: $!\n";
		return %cache;
	}
	while (my $line = <$fh>) {
		# Only trust $1/$2 after a successful match; after a failed match
		# they do not describe the current line.
		if($line =~ m/^(\S+)\s+(.*)$/){
			$cache{ $1 } = $2;
		}
	}
	close($fh);

	return %cache;
}

# Decide whether the element referenced by an HTML tag should be downloaded.
#
# Parameters:
#   $_[0] - tag description of the form "<tag> <attribute>", e.g. "img src",
#           "script src" or "a href". For <link> tags the second word is
#           expected to name the relation, e.g. "link stylesheet".
# Returns:
#   1 when the element should be downloaded (the default), 0 otherwise.
#
# Edit this subroutine to change which tags the plugin follows.
sub filter{
	my ($tag) = @_;

	# For links the relation decides: only stylesheets are fetched
	if("$tag" =~ /^link/){
		return ("$tag" =~ /stylesheet$/) ? 1 : 0;
	}

	# Form targets, hyperlinks, image map areas and <meta> contents are
	# never fetched
	my %skipped = map { $_ => 1 } (
		"form action",
		"a href",
		"area href",
		"meta content",
	);

	return exists($skipped{"$tag"}) ? 0 : 1;
}

# Build the name (without directory) of the cache file of one URL.
#
# Parameters:
#   $_[0] - plugin name prefix
#   $_[1] - id of the URL
# Returns:
#   "<prefix><id>.cache". The name is also printed when debugging is enabled.
sub get_cache_file_name{
	my ($scriptname, $id) = @_;

	my $file = join("", $scriptname, $id, ".cache");
	if($debug){
		print "Cache file: " . $file . "\n";
	}

	return $file;
}

# Get fieldname (making sure it is munin-1.0 "compatible" as a fieldname)
# 1. Remove all non-word characters from the string
# 2. Make sure it has at most 19 characters
#    2.1 If it is longer and has the form <host>_<info> (split at the last
#        underscore, e.g. a host followed by an HTTP response status), the
#        host part is shortened and "_<info>" is kept intact
#    2.2 Otherwise, or when "_<info>" alone would exceed 19 characters, the
#        whole name is cut after 19 characters
#
# Parameters:
#   $_[0] - raw name (host name, content type, tag description, ...)
# Returns:
#   the sanitized name, never longer than 19 characters
sub get_fieldname{
	my $url=$_[0];
	$url =~ s/\W//g;
	if(length($url) > 19){
		# Match without modifying $url, and only use the captures when the
		# match succeeded: after a failed match $1/$2 may still hold values
		# from an earlier match (possibly one made by the caller).
		if($url =~ m/^(\S+)_(\S+)$/ && length($2) + 1 <= 19){
			my ($host, $info) = ($1, $2);
			my $suffixlength = length($info) + 1;
			$url = substr($host, 0, 19 - $suffixlength) . '_' . $info;
		} else {
			$url = substr($url, 0, 19);
		}
	}
	return $url;
}

# Derive the id of a URL: the URL with every non-word character and every
# underscore removed. Unlike get_fieldname() the result is not truncated.
sub get_id{
	my ($id) = @_;
	$id =~ s/\W|_//g;
	return $id;
}

# Print the graph attributes shared by all graphs of this plugin: title,
# rrdtool arguments and category.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - metric name, appended to the URL in the graph title
sub graph_title_config{
	my ($id, $urls_ref, $type) = @_;
	my %urls = %{$urls_ref};

	print "graph_title $urls{$id} ${type}\n"
	    . "graph_args -l 0 --base 1000\n"
	    . "graph_category webserver\n";
}

# Print the graph configuration of the "size" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "size_<host>" key
sub size_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "size");

	print "graph_vlabel Bytes\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to calculate the total size of $urls{$id}.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	my $fields_seen = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^size_(\S+)$/;
		my $host = $1;
		my $name = get_fieldname($host);

		# The first field is drawn as an area, the others are stacked on it
		my $draw = $fields_seen++ ? "STACK" : "AREA";

		print "$name.label from $host\n"
		    . "$name.min 0\n"
		    . "$name.max 20000000\n"
		    . "$name.draw $draw\n";
	}
}

# Print the graph configuration of the "loadtime" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "loadtime_<host>" key
sub loadtime_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "loadtime");

	print "graph_vlabel Seconds\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to calculate the total time to load $urls{$id}. "
	    . "Note that browsers usually fork() the GET requests, resulting in a shorter total loading time.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	my $fields_seen = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^loadtime_(\S+)$/;
		my $host = $1;
		my $name = get_fieldname($host);

		# The first field is drawn as an area, the others are stacked on it
		my $draw = $fields_seen++ ? "STACK" : "AREA";

		print "$name.label from $host\n"
		    . "$name.min 0\n"
		    . "$name.max 400\n"
		    . "$name.draw $draw\n";
	}
}

# Print the graph configuration of the "elements" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "elements_<host>" key
sub elements_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "elements");

	print "graph_vlabel Number of elements\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to count the number of elements (images, CSS files, etc) from $urls{$id}.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	my $fields_seen = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^elements_(\S+)$/;
		my $host = $1;
		my $name = get_fieldname($host);

		# The first field is drawn as an area, the others are stacked on it
		my $draw = $fields_seen++ ? "STACK" : "AREA";

		print "$name.label from $host\n"
		    . "$name.min 0\n"
		    . "$name.max 10000\n"
		    . "$name.draw $draw\n";
	}
}

# Print the graph configuration of the "response" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "response_<host>_<code>" key and labelled "<host> (<code>)"
sub response_config{
	my $id = $_[0];
	my %urls = %{$_[1]};
	my %cache = %{$_[2]};

	my $count = 0;

	graph_title_config($id, \%urls, "response");

	print "graph_vlabel Server response code count\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to visualize the server response codes received while loading $urls{$id}.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^response_(\S+)$/;
		my $field = $1;
		my $name = get_fieldname($field);

		# Split "<host>_<code>" at the last underscore. The captures are
		# only used after a successful match; without a numeric code the
		# whole field is shown as the host and the code stays empty.
		my ($host, $code) = ($field, "");
		if($field =~ m/^(.+)_(\d+)$/){
			($host, $code) = ($1, $2);
		}

		print "$name.label $host ($code)\n";
		print "$name.min 0\n";
		print "$name.max 10000\n";
		# The first field is drawn as an area, the others are stacked on it
		if($count == 0){
			print "$name.draw AREA\n";
		} else {
			print "$name.draw STACK\n";
		}
		$count+=1;
	}
}

# Print the graph configuration of the "type" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "type_<content type>" key and labelled with the content type
sub type_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "type");

	print "graph_vlabel Content type count\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to visualize the different content types $urls{$id} consists of.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	my $fields_seen = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^type_(\S+)$/;
		my $type = $1;
		my $name = get_fieldname($type);

		# The first field is drawn as an area, the others are stacked on it
		my $draw = $fields_seen++ ? "STACK" : "AREA";

		print "$name.label $type\n"
		    . "$name.min 0\n"
		    . "$name.max 100000\n"
		    . "$name.draw $draw\n";
	}
}

# Print the graph configuration of the "tags" graph of one URL.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash; one field is printed for every
#           "tags_<tag description>" key
sub tags_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "tags");

	print "graph_vlabel HTML tag count\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to visualize the different tags $urls{$id} consists of.\n";

	# NOTE(review): "sort reverse LIST" reverses the list before sorting it,
	# so the order is presumably plain ascending -- confirm that is intended
	my $fields_seen = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^tags_(\S+)$/;
		my $label = $1;
		my $name = get_fieldname($label);

		# The label shows the tag description with every non-word
		# character replaced by a blank
		$label =~ s/\W/ /g;

		# The first field is drawn as an area, the others are stacked on it
		my $draw = $fields_seen++ ? "STACK" : "AREA";

		print "$name.label $label\n"
		    . "$name.min 0\n"
		    . "$name.max 100000\n"
		    . "$name.draw $draw\n";
	}
}

# Print the "<field>.value <value>" lines of one metric.
#
# Parameters:
#   $_[0] - reference to the cache hash
#   $_[1] - metric to print ("loadtime", "size", "elements", ...)
#
# Keys of the form "<metric>_<name>" are printed when <metric> equals the
# requested one; the field name is <name> passed through get_fieldname().
# A key without such a prefix is printed under its own name when the key
# itself equals the requested metric. Keys are processed in sorted order.
sub cache_values{
	my %cache = %{$_[0]};
	my $type = $_[1];

	for my $key ( sort keys %cache ){
		my $value=$cache{$key};
		if($key =~ m/^([A-Za-z]+)\_(\S+)$/){
			# Copy the captures right away, before any other match runs
			my ($prefix, $name) = ($1, $2);
			if ($prefix eq $type){
				$name=get_fieldname($name);
				print $name . ".value " . $value . "\n";
			}
		} elsif($key eq $type){
			# Compare the key itself; never match against the global $_,
			# which is unrelated to the entry being processed
			print $key . ".value " . $value . "\n";
		}
	}
}

# Print the graph configuration of all graphs of one URL in multigraph form:
# the root graph "http_load_<id>" (configured like the loadtime graph)
# followed by one sub-graph per metric.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the hash mapping ids to URLs
#   $_[2] - reference to the cache hash
sub multi_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls  = %{$urls_ref};
	my %cache = %{$cache_ref};

	# Sub-graphs in output order, each with the sub printing its config
	my @subgraphs = (
		[ "loadtime", \&loadtime_config ],
		[ "size",     \&size_config     ],
		[ "elements", \&elements_config ],
		[ "response", \&response_config ],
		[ "type",     \&type_config     ],
		[ "tags",     \&tags_config     ],
	);

	print "multigraph http_load_$id\n";
	loadtime_config($id, \%urls, \%cache);

	for my $subgraph (@subgraphs){
		my ($metric, $print_config) = @{$subgraph};
		print "\nmultigraph http_load_$id.$metric\n";
		$print_config->($id, \%urls, \%cache);
	}
}

# Print the cached values of all graphs of one URL in multigraph form: the
# root graph "http_load_<id>" (showing the loadtime values) followed by one
# sub-graph per metric.
#
# Parameters:
#   $_[0] - id of the URL
#   $_[1] - reference to the cache hash
sub multi_values{
	my ($id, $cache_ref) = @_;
	my %cache = %{$cache_ref};

	print "multigraph http_load_$id\n";
	cache_values(\%cache, "loadtime");

	for my $metric (qw(loadtime size elements response type tags)){
		print "\nmultigraph http_load_$id.$metric\n";
		cache_values(\%cache, $metric);
	}
}
$debug && print "Scriptname: " . $scriptname . "\n";

# Get the url id and the type of the graph from the name this script was
# started as ($0).
#
# The filename format is http_load_X_Y where
# X: The id of the URL, i.e. the value get_id() computes for it and the
#    "suggest" command prints (it contains no underscore)
# Y: The type of graph (elements, size, loadtime, ..); optional

my ($id,$type);
# $1: text between "http_load_" and the next underscore (or the end)
# $3: everything after that underscore, if present
$0 =~ /http_load(?:_([^_]+)|)(_(.+))?\s*$/;
$id  = $1;
$type = $3;

# Without a type suffix the plugin runs in multigraph mode
if($type eq "") {
	$type = "multi";
}

$debug && print "Id: $id, Type: $type\n";

# Command dispatch on the first command line argument:
#   autoconf - report whether at least one URL is configured
#   suggest  - print the id of every configured URL, one per line
#   cron     - fetch every configured URL plus the elements it references
#              and store the measurements in one cache file per URL
#   config   - print the graph configuration of the graph(s) selected by
#              the plugin name, based on the cache file
#   (other)  - print the cached values of the selected graph(s)
if($ARGV[0] and $ARGV[0] eq "autoconf") {
	my %urls=read_urls($url_file);
	# keys() yields a count here, so compare numerically
	if(keys(%urls) > 0){
		print "yes\n";
		exit(0);
	} else {
		print "no\n";
		exit(1);
	}

} elsif($ARGV[0] and $ARGV[0] eq "suggest") {
	# get the url list, print suggestions for usage
	my %urls=read_urls($url_file);
	while ( my ($id, $url) = each(%urls) ) {
		$debug && print "id: $id => url: $url\n";
		print $id . "\n";
	}
	exit(0);

} elsif($ARGV[0] and $ARGV[0] eq "cron") {
	# This thing is run by cron and should write a cache file for munin-node to
	# read from

	my $verbose=0;
	if(
		$ENV{MUNIN_DEBUG} eq "1" or
		$ARGV[1] and $ARGV[1] eq "verbose"
	) {
		$verbose=1;
		print "Verbose output\n";
	}

	my %urls=read_urls($url_file);
	my %output;
	my $t0;
	my ($response,$contents,$page_parser,$cachefile);

	while ( my ($id, $url) = each(%urls) ) {
		$verbose && print "Fetching $url (id: $id)... \n";

		%output=();
		my $host="";
		if($url =~ m/\w+\:\/\/([^\/]+).*/){
			$host=$1;
			$verbose && print " Host: $host\n";
		}

		$output{"url"}=$url;
		$output{"timestamp"}=time();
		$verbose && print " Timestamp: " . $output{"timestamp"} . "\n";

		my $browser = LWP::UserAgent->new();

		$browser->agent($useragent);
		$browser->timeout(${timeout});
		$browser->max_redirect( $max_redirects );
		$browser->conn_cache(LWP::ConnCache->new());

		# Start the clock before the request is sent, so that the load
		# time of the page itself is measured the same way as the load
		# time of the elements below
		$t0 = [gettimeofday];
		$response = $browser->get($url);
		$output{"loadtime_" . $host} += sprintf("%.6f",tv_interval ( $t0, [gettimeofday]));

		if ($response->is_success()) {
			$output{"elements_" . $host}+=1;
		}

		$contents = $response->content();
		$output{"size_" . $host}+=length($contents);
		$output{"response_" . $host . "_" . $response->code}+=1;
		$output{"type_" . $response->content_type}+=1;

		# For <link />s, also capture the rel attribute
		$HTML::Tagset::linkElements{'link'} = [ qw( href rel ) ];
		$page_parser = HTML::LinkExtor->new(undef, $url);
		$page_parser->parse($contents)->eof;
		my @links = $page_parser->links;
		$verbose && print " Processing links:\n";

		foreach my $link (@links){
			# $$link[0] is the tag name, $$link[1] the name of its first
			# link attribute and $$link[2] the value of that attribute
			my $tag;
			my($t, %attrs) = @{$link};
			# When a rel value is present, filter() is given its last
			# path component instead of the attribute name
			if ($attrs{rel} =~ /.*\/([^\/]+)/) {
				$tag=$$link[0] . " " . $1;
			} else {
				$tag=$$link[0] . " " . $$link[1];
			}
			$output{"tags_" . $$link[0] . "-" . $$link[1]}+=1;

			if(filter($tag)){
				$verbose && print "  Processing: " . $$link[0] . " " . $$link[1] . " " . $$link[2] . "\n";

				# Extract the hostname and add it to the hash
				# NOTE(review): when the link is not an http(s) URL, $host
				# keeps its previous value and the measurements below are
				# attributed to that host
				if($$link[2] =~ m/https?\:\/\/([^\/]+).*/){
					$host=$1;
					$output{"elements_" . $host}+=1;
				}

				my $suburl=$$link[2];

				$t0 = [gettimeofday];
				$response = $browser->get($suburl);
				$output{"loadtime_" . $host} += sprintf("%.6f",tv_interval ( $t0, [gettimeofday]));

				$contents = $response->content();
				$output{"size_" . $host}+=length($contents);
				$output{"response_" . $host . "_" . $response->code}+=1;
				$output{"type_" . $response->content_type}+=1;

				$verbose && print "              Response: " . $response->code . " Size: " . length($contents) . "\n";
			} else {
				$verbose && print "  Skipping:   " . $$link[0] . " " . $$link[1] . " " . $$link[2] . "\n";
			}
		}

		$cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
		$debug && print "Reading cache file: " . $cachefile . "... ";

		my %input=read_cache($cachefile);

		$debug && print "done\n";

		# Marking every previously known key as unknown ("U") before
		# adding the new values; keys not seen in this run stay "U"
		while ( my ($key, $value) = each(%input) ) {
			$input{$key}="U";
		}

		# Adding new values
		while ( my ($key, $value) = each(%output) ) {
			$input{$key}=$value;
			$verbose && print " Result: " . $key . " -> " . $value . "\n";
		}

		# Writing the cache
		$verbose && print "Writing cache file: " . $cachefile . "... ";
		my $cache_fh;
		if(!open($cache_fh, '>', $cachefile)){
			# Report the problem and carry on with the remaining URLs
			warn "Cannot write cache file $cachefile: $!\n";
			next;
		}
		while ( my ($key, $value) = each(%input) ) {
			print {$cache_fh} $key . " " . $value . "\n";
		}
		# Buffered write errors only surface when the handle is closed
		close($cache_fh) or warn "Cannot close cache file $cachefile: $!\n";
		$verbose && print "done\n";
	}
	exit(0);
}elsif($ARGV[0] and $ARGV[0] eq "config") {
	my %urls=read_urls($url_file);

	$debug && print "Reading cache file\n";
	my $cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
	my %cache=read_cache($cachefile);

	$debug && print "The cache file contains " . keys(%cache) . " lines\n";

	if($type eq "size"){
		size_config($id, \%urls, \%cache)
	}elsif($type eq "loadtime"){
		loadtime_config($id, \%urls, \%cache)
	}elsif($type eq "elements"){
		elements_config($id, \%urls, \%cache)
	}elsif($type eq "response"){
		response_config($id, \%urls, \%cache)
	}elsif($type eq "type"){
		type_config($id, \%urls, \%cache)
	}elsif($type eq "tags"){
		tags_config($id, \%urls, \%cache)
	}elsif($type eq "multi"){
		multi_config($id, \%urls, \%cache)
	}
	exit(0);
} else {
	my $cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
	$debug && print "Reading cache file: " . $cachefile . "\n";
	my %cache=read_cache($cachefile);
	$debug && print "Number of lines in cache file: " . keys(%cache) . "\n";

	if($type eq "multi"){
		multi_values($id, \%cache);
	} else {
		cache_values(\%cache, $type);
	}
}

# vim:syntax=perl