#!/usr/bin/perl -w
#
# (c) 2007 Nathan Rutman nathan@clusterfs.com
#
# Plugin to monitor RAID status
#
# Results are % of healthy drives in a raid device
# and % rebuilt of devices that are resyncing.
#
#%# family=contrib
#%# capabilities=autoconf
# "autoconf": report whether this plugin is usable on this host, i.e. whether
# /proc/mdstat is readable and contains at least one line mentioning "md".
if ($ARGV[0] and $ARGV[0] eq "autoconf") {
    if (-r "/proc/mdstat" and `grep md /proc/mdstat`) {
        print "yes\n";
    } else {
        # Munin's autoconf convention is "no (reason)"; a non-zero exit
        # status is treated as a plugin error rather than as a "no".
        print "no (no RAID devices)\n";
    }
    exit 0;
}
# "config": print the graph-wide settings.  Execution then continues below,
# where the per-array field definitions are printed.
if ( $ARGV[0] and $ARGV[0] eq "config" ) {
    print "graph_title RAID status\n",
          "graph_category disk\n",
          "graph_info This graph monitors RAID disk health. Values are percentage of healthy drives in each raid group. Degraded devices are marked Critical.\n",
          "graph_args --base 1000 -l 0\n",
          "graph_vlabel % healthy/rebuilt\n",
          "graph_scale no\n";
}
# Parse /proc/mdstat and report every active md array it describes.  With the
# "config" argument the per-array field definitions are printed; otherwise
# the current values are printed.  Three fields are emitted per array:
#   <dev>          percentage of member drives that are active
#   <dev>_rebuild  rebuild progress taken from `mdadm -D` (100 when not degraded)
#   <dev>_check    check/resync progress taken from mdstat (100 when none is running)
# Exits 1 if /proc/mdstat cannot be opened, 0 otherwise.
{
    # Slurp mode: the array pattern below spans several lines of the file.
    local $/;
    open(my $mdstat, '<', '/proc/mdstat') or exit 1;
    #open(my $mdstat, '<', '/etc/munin/plugins/sample.failed') or exit 1;
    my $text = <$mdstat>;
    close $mdstat;
    $text = '' unless defined $text;

    my $want_config = ($ARGV[0] and $ARGV[0] eq "config");

    # Should look like "active raid1 sda1[0] sdc1[2] sdb1[1]"
    # Interestingly, swap is presented as "active (auto-read-only)"
    #
    # Captures used: $1 device, $3 raid type, $4 member list, $5 total
    # members, $6 active members, $10 check/resync percentage (undefined when
    # the line after the status line shows no check or resync).
    # /g makes each iteration resume right after the previous match.
    while ($text =~ /(md\d+)\s+:\s+active\s+(\(auto-read-only\)\s+|)(\w+)\s+(.*)\n.*\[(\d+)\/(\d+)]\s+\[(\w+)]\n(.*(check|resync)\s=\s+(\d+\.\d+)%|.*\n)/g ) {
        my ($dev, $type, $members, $nmem, $nact, $proc) = ($1, $3, $4, $5, $6, $10);
        # print "$text\nitem: $dev $type ($members) $proc\n";

        if ($want_config) {
            print "$dev.label $dev\n";
            print "$dev.info $type $members\n";
            # 100: means less than 100
            # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
            print "$dev.critical 98:\n";
            print $dev, "_rebuild.label $dev rebuilt\n";
            print $dev, "_rebuild.info $type\n";
            # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
            print $dev, "_rebuild.critical 98:\n";
            print $dev, "_check.label $dev check/resync \n";
            print $dev, "_check.info $type\n";
        } else {
            # Share of member drives that are active; guard against a
            # zero member count so we never divide by zero.
            my $pct = $nmem ? 100 * $nact / $nmem : 0;

            # Rebuild progress is only looked up for degraded arrays.
            my $rpct = 100;
            if ( $pct < 100 ) {
                my @output = `/sbin/mdadm -D /dev/$dev | grep Rebuild`;
                $rpct = ( $output[0] and $output[0] =~ /([0-9]+)% complete/ ) ? $1 : 0;
            }

            # Declared per array: a percentage captured for one array must
            # not leak into the next, and an array with no check/resync in
            # progress reports 100 instead of an empty value.
            my $cpct = defined $proc ? $proc : 100;

            print "$dev.value $pct\n";
            print $dev, "_rebuild.value $rpct\n";
            print $dev, "_check.value $cpct\n";
        }
    }
}
exit 0;