2
0
Fork 0
mirror of https://github.com/munin-monitoring/contrib.git synced 2018-11-08 00:59:34 +01:00
contrib-munin/plugins/vmware/esx_

491 lines
22 KiB
Perl
Executable file

#!/usr/bin/perl -w
#
# -== Munin plugin for VMware ESXi/vSphere monitoring ==-
#
# Copyright (c) 2012 - Stefan Seidel <munin@stefanseidel.info>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# This plugin uses the vSphere SDK for Perl available at
# http://www.vmware.com/support/developer/viperltoolkit/
# or included in the vSphere CLI available at
# http://www.vmware.com/support/developer/vcli/
# The use of the SDK is subject to the terms and conditions
# of VMware, Inc., to which you must agree upon installation.
#
#
# -== Usage ==-
# Put this file in /usr/share/munin/plugins, `chmod +x` it and
# `ln -s` it to /etc/munin/plugins/esx_<hostname of server to monitor>
#
use strict;
use sort 'stable'; # guarantee stability
no warnings; # don't want warnings in output
use VMware::VIRuntime; # need to install VIM SDK (vSphere CLI/SDK 4.1 or newer)
use VMware::VILib;
use VMware::VIExt;
use Data::Dumper;
use DateTime::Format::ISO8601; # may need to install "libdatetime-format-iso8601-perl" on Debian-based systems
use List::Util qw(sum max);
use List::MoreUtils qw(all);
use Munin::Plugin;
# Derive the monitored host from the name this plugin was invoked under
# (the symlink esx_<hostname>) and announce it immediately, so that when
# something goes wrong later on, at least the plugin output is linked with
# the right host.
# The match is captured in list context and checked: $1 is only meaningful
# after a successful match, so an unexpected plugin name must not silently
# produce an empty (or stale) host name and a bogus connection URL.
my ($host_name) = $0 =~ /esx_(.+)$/;
die "Cannot determine host name from plugin name '$0' (expected esx_<hostname>)\n"
    unless defined $host_name;
print "host_name $host_name\n";
# Connection options for the vSphere SDK.
# env.user and env.password need to be set in plugin-conf/munin-node;
# when they are unset (or empty) the user falls back to 'root' and the
# password to the empty string.
Opts::set_option ('username', $ENV{user} || 'root');
Opts::set_option ('password', $ENV{password} || '');
# the SDK endpoint is built from the host name taken from the plugin name
Opts::set_option ('url',"https://$host_name/sdk/webService");
# plugin needs Munin 1.4 or later
need_multigraph();
# for datetime parsing later on (server time is parsed in get_perf_data)
my $iso8601 = DateTime::Format::ISO8601->new;
# connect to vSphere host
Util::connect();
# central object host_view holds all relevant items (VMs, network, etc.)
# NOTE(review): the list presumably restricts which properties are fetched
# for the view - confirm against the VIExt documentation
my $host_view = VIExt::get_host_view(1, ['summary', 'network', 'datastore', 'vm', 'runtime', 'configManager.networkSystem']);
Opts::assert_usage(defined($host_view), "Invalid host.");
# Performance Manager for getting the actual values
# NOTE(review): 'ha-perfmgr' is a hard-coded managed object ID, presumably
# the one used by a standalone ESX(i) host - confirm before pointing this
# plugin at anything else
my $perfMan = Vim::get_view (mo_ref => ManagedObjectReference->new(type => 'PerformanceManager', value => 'ha-perfmgr'));
Opts::assert_usage(defined($perfMan), "No PerformanceManager.");
# may be needed later
#my $netsys = Vim::get_view(mo_ref => ManagedObjectReference->new(type => 'HostNetworkSystem', value => 'networkSystem'));
#Opts::assert_usage(defined($netsys), "No NetworkSystem.");
# used for getting the current vSphere server time and then
# defining the (now - 5minutes) interval
# (the managed object ID 'dateTimeSystem' is hard-coded as well)
my $dtsys = Vim::get_view(mo_ref => ManagedObjectReference->new(type => 'HostDateTimeSystem', value => 'dateTimeSystem'));
Opts::assert_usage(defined($dtsys), "No DateTimeSystem.");
# enumerate all performance counters by their IDs, so that the counterId
# found in a query result can be resolved to its description
my %perfCounter = map { $_->key => $_ } @{$perfMan->perfCounter};
# holds all performance data: one hash per data point with the keys
# rollup, group, name, value, counter, vm, instance and unit
my @all_perf_data = ();
# store VM ids for iteration later on
my @all_vms = ();
# IDs/UUIDs to human readable names; a hash reference filled on first use,
# keyed by category (vm, net, datastore) and then by ID
my $resolveNames;
# The host's own performance counters open the data set.
push @all_perf_data, get_perf_data($host_view);
# Data points of the host carry an empty VM id; give that id a
# user-friendly name by hand.
$resolveNames->{vm}{""} = "Host System";
# Network managed objects have no performance counters; they are only
# visited to map every network id to a user-friendly "id (name)" label.
# A missing network list is treated as an empty one.
foreach my $net_ref (@{ $host_view->network || [] }) {
    my $net_view = Vim::get_view(mo_ref => $net_ref);
    my $net_id   = $net_ref->{value};
    $resolveNames->{net}{$net_id} = sprintf '%s (%s)', $net_id, $net_view->summary->name;
}
# Datastore managed objects have no performance counters either. They are
# visited to learn their user-friendly names and to record capacity, free
# and uncommitted space as "datastore" data points of the host (vm => "").
# A missing datastore list is treated as an empty one.
foreach my $ds_ref (@{ $host_view->datastore || [] }) {
    my $datastore = Vim::get_view(mo_ref => $ds_ref);
    # Best effort: ask the server to recompute the space figures (this does
    # not work on free ESXi, hence the eval) and then re-read the view. A
    # view only holds the property values from the time it was fetched, so
    # without the re-read the refreshed numbers would never be seen.
    eval {
        $datastore->RefreshDatastore();
        $datastore->update_view_data();
    };
    my $summary = $datastore->summary;
    # the last path component of the datastore URL serves as instance id
    my $uuid = $summary->url;
    $uuid =~ s!.+/!!;
    $resolveNames->{datastore}{$uuid} = $datastore->name;
    # data point name, label, description and value (in bytes)
    my @space_figures = (
        [ "capacity",    "Capacity",    "Maximum amount of storage space on this datastore", $summary->capacity ],
        [ "freeSpace",   "Free",        "Total amount of unused, available storage space on this datastore", $summary->freeSpace ],
        [ "uncommitted", "Uncommitted", "Total additional storage space, potentially used by all virtual machines on this datastore", $summary->uncommitted ],
    );
    foreach my $figure (@space_figures) {
        my ($name, $label, $info, $value) = @$figure;
        push @all_perf_data, {
            rollup   => "latest",
            group    => "datastore",
            name     => $name,
            value    => $value,
            counter  => PerfCounterInfo->new(nameInfo => ElementDescription->new(label => $label, summary => $info)),
            vm       => "",
            instance => $uuid,
            unit     => "Bytes",
        };
    }
}
# Visit every VM on the host: remember its id and display name, record its
# disk space usage per datastore and fetch its performance counters.
{
    # Instance id (last path component of the datastore URL) per datastore
    # reference. It does not depend on the VM, so each datastore is looked
    # up on the server once instead of once per VM/datastore pair.
    my %instance_of_datastore;
    foreach my $vm_ref (@{ $host_view->vm || [] }) {
        my $vm = Vim::get_view(mo_ref => $vm_ref);
        # store VM id for later iteration
        my $vmId = $vm_ref->{value};
        push @all_vms, $vmId;
        # ID to VM name
        $resolveNames->{vm}{$vmId} = "VM ".$vm->summary->config->name;
        # fetch disk space usage per datastore; a VM without storage
        # information simply contributes no usage data instead of aborting
        # the whole plugin run with a method call on undef
        my $storage = $vm->storage;
        my $usages  = ($storage && $storage->perDatastoreUsage) || [];
        foreach my $usage (@$usages) {
            my $ds_ref = $usage->datastore;
            my $ds_key = $ds_ref->{value};
            unless (exists $instance_of_datastore{$ds_key}) {
                my $url = Vim::get_view(mo_ref => $ds_ref)->summary->url;
                $url =~ s!.+/!!;
                $instance_of_datastore{$ds_key} = $url;
            }
            my $uuid = $instance_of_datastore{$ds_key};
            # data point name, label, description and value (in bytes)
            my @space_figures = (
                [ "committed", "Committed", "Storage space, in bytes, on this datastore that is actually being used by the virtual machine.\n\nIt includes space actually occupied by disks, logs, snapshots, configuration files etc. Files of the virtual machine which are present on a different datastore (e.g. a virtual disk on another datastore) are not included here.\n\n", $usage->committed ],
                [ "uncommitted", "Uncommitted", "Additional storage space, in bytes, potentially used by the virtual machine on this datastore.\n\nAdditional space may be needed for example when lazily allocated disks grow, or storage for swap is allocated when powering on the virtual machine.\n\nIf the virtual machine is running off delta disks (for example because a snapshot was taken), then only the potential growth of the currently used delta-disks is considered.\n\n", $usage->uncommitted ],
                [ "unshared", "Unshared", "Storage space, in bytes, occupied by the virtual machine on this datastore that is not shared with any other virtual machine.\n\n", $usage->unshared ],
            );
            foreach my $figure (@space_figures) {
                my ($name, $label, $info, $value) = @$figure;
                push @all_perf_data, {
                    rollup   => "latest",
                    group    => "datastore",
                    name     => $name,
                    value    => $value,
                    counter  => PerfCounterInfo->new(nameInfo => ElementDescription->new(label => $label, summary => $info)),
                    vm       => $vmId,
                    instance => $uuid,
                    unit     => "Bytes",
                };
            }
        }
        # retrieve performance counters for this VM
        push @all_perf_data, get_perf_data($vm_ref);
    }
}
# keep track of how many sensors are in which state
my %sensorCount = ( green => 0, red => 0, unknown => 0, yellow => 0 );
# Not every host reports hardware health data. Walk the chain one step at
# a time so that a host without it yields zero sensors, instead of dying on
# a method call on undef (which would also skip the disconnect below).
my $health_runtime = $host_view->runtime->healthSystemRuntime;
my $health_info    = $health_runtime && $health_runtime->systemHealthInfo;
my $sensor_list    = ($health_info && $health_info->numericSensorInfo) || [];
# iterate over all sensor data
my $index = 0;
foreach my $sensor (@$sensor_list) {
    my $state = $sensor->healthState;
    # update counters
    $sensorCount{$state->key}++;
    # do not create entries for unmonitorable things like software components
    # (those have an empty base unit)
    next unless ($sensor->baseUnits =~ /.+/);
    # create entry with sensor data
    push @all_perf_data, {
        rollup       => "latest",
        group        => "sensors",
        name         => "sensor_".($index++),
        value        => $sensor->currentReading,
        counter      => PerfCounterInfo->new(nameInfo => ElementDescription->new(label => $sensor->name, summary => "Sensor data for the ".$sensor->sensorType." sensor ".$sensor->name.". ".$state->summary." (".$state->label.")")),
        vm           => "",
        instance     => "",
        # the real value is value * 10^unitModifier; applied via CDEF when
        # the graph configuration is printed
        unitModifier => $sensor->unitModifier,
        unit         => $sensor->baseUnits,
    };
}
# we're finished querying the server, so we can disconnect now
Util::disconnect();
# create entries for the green/red/yellow/unknown counters; the states are
# sorted so that the fields appear in the same order on every run (plain
# hash key order is not stable between runs)
foreach my $state_key (sort keys %sensorCount) {
    push @all_perf_data, {
        rollup   => "latest",
        group    => "sensors",
        name     => $state_key."_sensors",
        value    => $sensorCount{$state_key},
        counter  => PerfCounterInfo->new(nameInfo => ElementDescription->new(label => ucfirst($state_key), summary => "Count of sensors in the $state_key state")),
        vm       => "",
        instance => "",
        unit     => "Numbers",
    };
}
# -> DEBUG
# Dump every collected data point as a tab-separated "#" comment line,
# ordered by group, instance, name, rollup and finally VM id.
foreach my $dp (
    sort {
           $a->{group}    cmp $b->{group}
        || $a->{instance} cmp $b->{instance}
        || $a->{name}     cmp $b->{name}
        || $a->{rollup}   cmp $b->{rollup}
        || $a->{vm}       cmp $b->{vm}
    } @all_perf_data
) {
    print "# ", join("\t", @{$dp}{qw(vm rollup group instance name value unit)}), "\n";
}
# <- DEBUG
# Which graphs to draw. Every entry pairs a selector (data point key =>
# regex the value has to match) with the configuration of the resulting
# graph(s): the key to group by, the multigraph name prefix, the title
# prefix and optional extra graph arguments.
# The list starts out with the graphs of the host system.
my @all_graphs = (
    {
        selector => { group => qr/^cpu$/i, name => qr/^usagemhz$/i, instance => qr/^$/ },
        config   => { groupBy => "group", graphName => "usage_", graphTitle => "CPU usage per " },
    },
    {
        selector => { group => qr/^disk$/i, name => qr/^(read|usage|write)$/i, instance => qr/.+/ },
        config   => { groupBy => "group", graphName => "transfer_", graphTitle => "Disk Transfer Rates per " },
    },
    {
        selector => { group => qr/^disk$/i, name => qr/^.+Averaged$/i, instance => qr/.+/ },
        config   => { groupBy => "group", graphName => "iops_", graphTitle => "Disk I/O operations per " },
    },
    {
        selector => { group => qr/^disk$/i, name => qr/^.+Latency$/i, instance => qr/.+/, vm => qr/^$/ },
        config   => { groupBy => "vm", graphName => "latency_disk", graphTitle => "Disk latency for " },
    },
    {
        selector => { group => qr/^mem$/i, unit => qr/^KB$/i, rollup => qr/^none$/, vm => qr/^$/ },
        config   => { groupBy => "vm", graphName => "mem_host", graphTitle => "Memory usage for " },
    },
    {
        selector => { group => qr/^datastore$/i, unit => qr/^Bytes$/i, vm => qr/^$/ },
        config   => { groupBy => "vm", graphName => "usage_datastore", graphTitle => "Disk space usage for ", graphArgs => "--lower-limit 10737418240 --logarithmic --alt-autoscale-min --units=si" },
    },
    {
        selector => { group => qr/^net$/i, unit => qr/^KBps$/i, vm => qr/^$/ },
        config   => { groupBy => "vm", graphName => "traffic_net", graphTitle => "Network traffic for " },
    },
    {
        selector => { group => qr/^net$/i, unit => qr/^Number$/i, vm => qr/^$/ },
        config   => { groupBy => "vm", graphName => "packets_net", graphTitle => "Network packets for " },
    },
    {
        selector => { group => qr/^sys$/i, name => qr/^diskUsage$/i },
        config   => { groupBy => "name", graphName => "host_", graphTitle => "Host System " },
    },
    {
        selector => { group => qr/^sys$/i, name => qr/^uptime$/i },
        config   => { groupBy => "name", graphName => "host_", graphTitle => "Host System and VM ", graphArgs => "--lower-limit 1000 --logarithmic --alt-autoscale-min" },
    },
);
# graphs per VM: the same set of graphs for every VM found on the host,
# each placed below the VM's own (cleaned) name in the multigraph hierarchy
foreach my $vm_id (@all_vms) {
    my $vmName = clean_fieldname($resolveNames->{vm}->{$vm_id});
    # Select the data points of exactly this VM. The id is data supplied by
    # the server, so it is quoted instead of being interpreted as a regex,
    # and the pattern is compiled once for all graphs of the VM.
    my $this_vm = qr/^\Q$vm_id\E$/;
    push @all_graphs, (
        {
            selector => { group => qr/^cpu$/i, name => qr/^usagemhz$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.cpu_", graphTitle => "CPU usage for " },
        },
        {
            selector => { group => qr/^mem$/i, unit => qr/^KB$/i, rollup => qr/^none$/, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.memory_", graphTitle => "Memory usage for " },
        },
        {
            selector => { group => qr/^datastore$/i, unit => qr/^Bytes$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.datastore_", graphTitle => "Disk space usage for ", graphArgs => "--lower-limit 10485760 --logarithmic --alt-autoscale-min --units=si" },
        },
        {
            selector => { group => qr/^virtualDisk$/i, unit => qr/^Millisecond$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.disklat_", graphTitle => "Disk latency for " },
        },
        {
            selector => { group => qr/^virtualDisk$/i, unit => qr/^Number$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.diskiops_", graphTitle => "Disk I/O operations for " },
        },
        {
            selector => { group => qr/^virtualDisk$/i, unit => qr/^KBps$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.disktrans_", graphTitle => "Disk transfer rates for " },
        },
        {
            selector => { group => qr/^net$/i, unit => qr/^KBps$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.traffic_net_", graphTitle => "Network traffic for " },
        },
        {
            selector => { group => qr/^net$/i, unit => qr/^Number$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.packets_net_", graphTitle => "Network packets for " },
        },
        {
            selector => { group => qr/^sys$/i, name => qr/^uptime$/i, vm => $this_vm },
            config   => { groupBy => "vm", graphName => "$vmName.uptime_", graphTitle => "VM uptime " },
        },
    );
}
# sensor graphs: everything in the "sensors" group, one graph per unit
push @all_graphs, {
    selector => { group => qr/^sensors$/i },
    config   => { groupBy => "unit", graphName => "sensor_", graphTitle => "Sensors " },
};
# Actual processing: "config" as first argument asks for the graph
# definitions, anything else (including no argument at all) for the values.
my $wants_config = (defined $ARGV[0]) && ($ARGV[0] eq "config");
foreach my $graph (@all_graphs) {
    unless ($wants_config) {
        munin_print("values", \@all_perf_data, $graph);
        next;
    }
    munin_print("config", \@all_perf_data, $graph);
    # with the dirtyconfig capability the values follow the configuration
    # right away; this doesn't seem to work even on Munin 1.4.6
    munin_print("values", \@all_perf_data, $graph) if $ENV{MUNIN_CAP_DIRTYCONFIG};
}
0;
####################################################################
# Reduce the sampled values of one performance counter to a single number.
#
# Parameters:
#   $arr - array reference holding the sampled values
#   $pd  - counter description; its unitInfo->key, rollupType->val and
#          nameInfo->key decide how the samples are combined
#
# Returns the sum for counters with rollup type "summation", the maximum
# for counters whose name contains "max" (case-insensitive) and the average
# otherwise. Samples of counters with unit "percent" are divided by 100
# first. Returns undef when there are no samples at all.
sub process_value_array {
    my ($arr, $pd) = @_;
    # Without samples there is nothing to combine, and the average below
    # would divide by zero. An explicit undef (not a bare return) is needed
    # because callers use the result as a value inside a hash constructor,
    # where an empty list would shift all following key/value pairs.
    return undef unless defined $arr && @$arr;
    my @vs = ($pd->unitInfo->key eq "percent")
        ? map { $_ / 100 } @$arr
        : @$arr;
    return sum(@vs) if $pd->rollupType->val eq "summation";
    return max(@vs) if $pd->nameInfo->key =~ /max/i;
    return sum(@vs) / @vs;
}
# Query the performance data of one managed object (host or VM).
#
# Parameters:
#   $entity - the object to query; handed to the query specification as is
#
# Returns a list with one hash per reported counter series, holding the
# keys rollup, group, name, value, counter, vm, instance and unit. The vm
# entry is the id of the entity when it is a virtual machine and the empty
# string otherwise. Series whose counter id is not known are skipped.
sub get_perf_data {
    my ($entity) = @_;
    my @points = ();
    # the sample window covers the last five minutes by the server's clock
    my $server_now   = $iso8601->parse_datetime($dtsys->QueryDateTime());
    my $window_start = $server_now->clone->add(minutes => -5);
    # actual query, intervalId is 20 because that's the default
    my $query_spec = PerfQuerySpec->new(entity => $entity, intervalId => 20, startTime => $window_start);
    my $metrics    = $perfMan->QueryPerf(querySpec => $query_spec);
    # one PerfEntityMetric per queried entity (none if the query gave nothing)
    for my $metric (defined $metrics ? @$metrics : ()) {
        my $vm_id = ($metric->entity->type eq 'VirtualMachine') ? $metric->entity->value : "";
        # one PerfMetricIntSeries per counter and instance
        for my $series (@{ $metric->{value} }) {
            my $description = $perfCounter{ $series->id->counterId };
            next unless defined $description;
            push @points, {
                rollup   => $description->rollupType->val,
                group    => $description->groupInfo->key,
                name     => $description->nameInfo->key,
                value    => process_value_array(\@{ $series->{value} }, $description),
                counter  => $description,
                vm       => $vm_id,
                instance => $series->id->instance,
                unit     => $description->unitInfo->label,
            };
        }
    }
    return @points;
}
# Build a munin-friendly and unique field name for a data point: counter
# name, VM id and instance are joined with the markers "v" and "i" and the
# result is passed through clean_fieldname.
sub gen_dp_name {
    my ($dp) = @_;
    my $raw_name = join '', $dp->{name}, 'v', $dp->{vm}, 'i', $dp->{instance};
    return clean_fieldname($raw_name);
}
# Return a copy of the given string without leading and trailing white
# space; the argument itself is left untouched.
sub trim {
    my ($text) = @_;
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
# Translate a source unit into what the graph should show.
# Returns the unit for the y-axis label, the correction factor to apply to
# the raw values (0 means none) and the base for the graph's unit prefixes.
sub _munin_unit_scaling {
    my ($unit) = @_;
    return ($unit, 0, 1024)           if $unit =~ /^Bytes$/i;
    return ("Bytes/s", 1024, 1024)    if $unit =~ /^KBps$/i;
    return ("Bytes", 1024, 1024)      if $unit =~ /^KB$/i;
    return ("Bytes", 1024*1024, 1024) if $unit =~ /^MB$/i;
    return ("Hz", 1000000, 1000)      if $unit =~ /^MHz$/i;
    return ("Second", 1/1000, 1000)   if $unit =~ /^Millisecond$/i;
    return ($unit, 0, 1000);
}
# Print the graph definitions or the current values of all data points
# selected by one graph specification, as Munin multigraph output.
#
# Parameters:
#   $act - "config" prints graph and field definitions; any other value
#          prints the "<field>.value <value>" lines
#   $arr - array reference of data point hashes (keys rollup, group, name,
#          value, counter, vm, instance, unit and optionally unitModifier)
#   $par - graph specification with the keys
#            selector: data point key => regex the value has to match
#            config:   groupBy, graphName, graphTitle and optional graphArgs
#
# A data point is selected when every selector key it carries matches; keys
# it does not carry are ignored. The selected points are ordered by the
# groupBy key (the file enables stable sorting, so the original order is
# kept within a group) and every new group value starts a new multigraph
# section named graphName followed by the group value.
sub munin_print {
    # action
    my $act = shift || "";
    # values; default to an empty list (the former "|| ()" yielded undef)
    my $arr = shift || [];
    # parameters
    my $par = shift || {};
    my $cfg = $par->{config};
    my $selector = $par->{selector} || {};
    my $groupCrit = $cfg->{groupBy};
    my $wantConfig = ("config" eq $act);
    my $oldGroup = "_-_";
    my $factor;
    # find values according to the selector criteria ...
    my @selected = grep {
        my $d = $_;
        all { (not exists $d->{$_}) || $d->{$_} =~ /$selector->{$_}/ } keys %$selector;
    } @$arr;
    # ... and walk them sorted by grouping parameter
    foreach my $dp (sort { $a->{$groupCrit} cmp $b->{$groupCrit} } @selected) {
        my $curGroup = $dp->{$groupCrit};
        if (!($curGroup eq $oldGroup)) {
            # we're in a new group, meaning a new graph starts
            $factor = 0;
            # clean up group name for multigraph name
            my $ccurGroup = $curGroup;
            $ccurGroup =~ s/ |\./_/g;
            print "multigraph ",$cfg->{graphName},$ccurGroup,"\n";
            if ($wantConfig) {
                # want configuration
                print "graph_title ",$cfg->{graphTitle},($resolveNames->{$groupCrit}->{$curGroup} || $curGroup),"\n";
                # since the y-axis markers are going to be wrong with source
                # units like KB, MB, MHz etc., a correction factor is applied
                # via cdef later; this way, if 1024 MB is reported, the graph
                # shows 1G and not 1k (although 1k MB is technically also
                # correct, but confusing)
                my ($unit, $scale, $base) = _munin_unit_scaling($dp->{unit});
                $factor = $scale;
                print "graph_vlabel $unit\n";
                print "graph_category $dp->{group}\n";
                my $extraArgs = (defined $cfg->{graphArgs}) ? $cfg->{graphArgs} : "";
                print "graph_args --base=$base --alt-autoscale-max $extraArgs\n";
            }
        }
        $oldGroup = $curGroup;
        my $dpName = gen_dp_name($dp);
        if (!$wantConfig) {
            # just print the value; Munin expects "U" for a value that is
            # not known, an empty value would not be valid plugin output
            my $value = $dp->{value};
            $value = "U" unless defined $value && length $value;
            print "$dpName.value $value\n";
            next;
        }
        # want configuration
        # get instance and VM names and UF names, if applicable
        my $iName = $resolveNames->{$dp->{group}}->{$dp->{instance}}
            || (("" eq $dp->{instance}) ? "" : $dp->{group}." ".$dp->{instance});
        $iName = " $iName" if $iName;
        my $vmName = $resolveNames->{vm}->{$dp->{vm}};
        $vmName = " $vmName" if $vmName;
        my $nameInfo = $dp->{counter}->nameInfo;
        # all values are drawn as lines for now
        print "$dpName.draw LINE2\n";
        # the VM name is left out of the label when the graph is per VM anyway
        print "$dpName.label ",$nameInfo->label,$iName,(("vm" eq $groupCrit) ? "" : ($vmName || "")),"\n";
        # keep the description on one line: real newlines become a literal \n
        my $summary = $nameInfo->summary;
        $summary =~ s!\n!\\n!g;
        print "$dpName.info ",$summary,($iName ? ", instance$iName ($dp->{instance})" : ""),($vmName ? ",$vmName" : ""),"\n";
        # declare CDEF if we want to apply a factor
        if ($factor > 1) {
            print "$dpName.cdef $dpName,$factor,*\n";
        } elsif ($factor <= 0) {
            if (defined $dp->{unitModifier}) {
                # sensor values have a unit modifier M attached to them so that REALVAL=VAL*10^M
                # y,x,LOG,*,EXP is x^y, just in case this is not obvious to the reader
                print "$dpName.cdef $dpName,",$dp->{unitModifier},",10,LOG,*,EXP,*\n";
            }
        } elsif ($factor < 1) {
            # factors below one are expressed as a division by their inverse
            print "$dpName.cdef $dpName,",1/$factor,",/\n";
        }
    }
}