2007-03-07 20:19:58 +01:00
#!/usr/bin/perl -w
#
# (c) 2007 Nathan Rutman nathan@clusterfs.com
2011-08-17 21:24:28 +02:00
#
2007-03-07 20:19:58 +01:00
# Plugin to monitor RAID status
#
# Results are % of healthy drives in a raid device
2010-06-18 17:04:39 +02:00
# and % rebuilt of devices that are resyncing.
2007-03-07 20:19:58 +01:00
#
#%# family=contrib
#%# capabilities=autoconf
2011-08-17 21:24:28 +02:00
# "autoconf" mode: tell munin-node-configure whether this plugin is usable
# on the current host.  The answer is "yes" when /proc/mdstat can be opened
# and at least one of its lines mentions "md"; otherwise it is
# "no (<reason>)".  Following the Munin autoconf convention the exit status
# is 0 in both cases -- a non-zero status is reserved for real errors.
if (defined $ARGV[0] and $ARGV[0] eq "autoconf") {
    my $has_md = 0;

    # Scan the file directly instead of spawning a shell just to run grep.
    if (open(my $probe, '<', '/proc/mdstat')) {
        while (my $row = <$probe>) {
            if ($row =~ /md/) {
                $has_md = 1;
                last;
            }
        }
        close($probe);
    }

    print $has_md ? "yes\n" : "no (no RAID devices)\n";
    exit 0;
}
2011-08-17 21:24:28 +02:00
# "config" mode, part one: emit the graph-wide attributes.  Execution
# deliberately continues afterwards, because the per-array field
# definitions are printed later while /proc/mdstat is being parsed.
if (defined $ARGV[0] and $ARGV[0] eq "config") {
    my @graph_attrs = (
        'graph_title RAID status',
        'graph_category disk',
        'graph_info This graph monitors RAID disk health. Values are percentage of healthy drives in each raid group. Degraded devices are marked Critical.',
        'graph_args --base 1000 -l 0',
        'graph_vlabel % healthy/rebuilt',
        'graph_scale no',
    );
    for my $attr (@graph_attrs) {
        print "$attr\n";
    }
}
2015-05-04 07:56:30 +02:00
# Read /proc/mdstat completely in a single pass.  The kernel regenerates
# the file on every read, so taking one snapshot keeps the lines that are
# parsed below consistent with each other.
# Three-argument open is used so the path can never be interpreted as a
# mode, and a failure is reported instead of silently yielding no data.
open(my $mdstat, '<', '/proc/mdstat')
    or die "cannot open /proc/mdstat: $!\n";
my(@text) = <$mdstat>;
close($mdstat);
2007-03-07 20:19:58 +01:00
2015-05-04 08:54:59 +02:00
# Patterns for the (up to) three /proc/mdstat lines that describe an array.

# Header line, e.g. "md0 : active raid1 sda1[0] sdb1[1]".
# Captures: device name, optional read-only marker, RAID level, member list.
# The marker matters because swap arrays show up as "active (auto-read-only)"
# and "mdadm --readonly" turns an array into "active (read-only)".
my $devinfo_re = qr{(md\d+)\s+:\s+active\s+(\(read-only\)\s+|\(auto-read-only\)\s+|)(\w+)\s+(.*)};

# Status line, e.g. "123456 blocks super 1.2 [2/2] [UU]".
# Captures: the two numbers inside "[n/m]" and the per-disk status letters.
my $devstat_re = qr{.*\[(\d+)\/(\d+)]\s+\[(\w+)]};

# Progress line, present only while the array is busy, e.g.
# "[==>....]  check = 10.0% (12345/123456) finish=123min speed=12345/sec".
# Captures: action name, percentage or a word such as DELAYED, and (in the
# fourth group) the estimated time to finish when it is reported.
my $action_re = qr{.*(reshape|check|resync|recovery)\s*=\s*(\d+\.\d+%|\w+)(.*finish=(.*min))?};
2015-05-04 08:54:59 +02:00
2018-01-28 15:28:00 +01:00
# Walk through the captured /proc/mdstat lines one array at a time.
# For every active array that has a "[n/m] [UU]" status line this prints
# either the Munin field definitions (when invoked with "config") or the
# current values:
#   <dev>          percentage of healthy member disks
#   <dev>_rebuild  progress of a reshape/recovery (100 when none is running)
#   <dev>_check    progress of a check/resync     (100 when none is running)
#   <dev>_failed   number of members flagged "(F)"
# Arrays without such a status line are skipped.

# The mode cannot change while we run, so decide it once.
my $want_config = (defined $ARGV[0] and $ARGV[0] eq "config");

while (@text) {
    my $line = shift @text;

    # An array starts with a header such as
    # "md0 : active raid1 sda1[0] sdc1[2] sdb1[1]"; skip anything else.
    next unless defined $line and $line =~ /$devinfo_re/;
    my ($dev, $ro, $type, $members) = ($1, $2 || '', $3, $4);

    # Failed members are tagged "(F)" in the member list.  Count the tags
    # themselves so a capital F elsewhere in the list is not miscounted.
    my $failed = () = $members =~ /\(F\)/g;

    # The next line should look like "123456 blocks super 1.2 [2/2] [UU]".
    $line = shift @text;
    unless (defined $line and $line =~ /$devstat_re/) {
        # No status line for this array.  Hand the line back so it is
        # re-examined as a possible header instead of being lost; at end
        # of input there is nothing to hand back.
        unshift(@text, $line) if defined $line;
        next;
    }
    my ($nmem, $nact) = ($1, $2);

    # An optional third line reports a running or queued action, e.g.
    # " [==>....] check = 10.0% (12345/123456) finish=123min speed=12345/sec".
    my $action = 'idle';
    my $minute = '';
    my $proc;
    $line = shift @text;
    if (defined $line and $line =~ /$action_re/) {
        $action = $1;
        my $percent = $2;
        # Must be read before the next match overwrites the capture groups.
        $minute = $4 || '';
        if ($percent =~ /(\d+\.\d+)%/) {
            $proc = $1;
        }
        else {
            # 'resync=DELAYED' or 'resync=PENDING': no percentage available
            $action .= " ($percent)";
            $proc = -1;
        }
    }
    elsif (defined $line) {
        # Array is idle; the line belongs to whatever comes next.
        unshift(@text, $line);
    }

    if ($want_config) {
        print "$dev.label $dev\n";
        print "$dev.info $type $ro$members\n";
        # "98:" makes every value below 98 critical.  100 is not used as
        # the limit because, due to an unfound bug, the value is sometimes
        # reported as 99.XX even when the OS reports 100.
        print "$dev.critical 98:\n";
        print "${dev}_rebuild.label $dev reshape/recovery\n";
        print "${dev}_rebuild.info $action $minute\n";
        # Same 98 threshold, for the same reason as above.
        print "${dev}_rebuild.critical 98:\n";
        print "${dev}_check.label $dev check/resync \n";
        print "${dev}_check.info $action $minute\n";
        print "${dev}_failed.label $dev failed disks \n";
        print "${dev}_failed.info $action $minute\n";
        print "${dev}_failed.critical 0:0\n";
    }
    else {
        # Report "U" (Munin's unknown value) rather than dividing by zero
        # should the status line ever claim zero member slots.
        my $pct  = $nmem ? 100 * $nact / $nmem : 'U';
        my $rpct = 100;
        my $cpct = 100;
        if ($action =~ /reshape|recovery/) {
            $rpct = $proc;
            $cpct = 0;    # check/resync is not done
        }
        elsif ($action =~ /check|resync/) {
            if ($proc < 0) {
                # DELAYED or PENDING: nothing further is known
                $rpct = 0;
                $cpct = 0;
            }
            else {
                # reshape/recovery already finished, so $rpct stays at 100
                $cpct = $proc;
            }
        }

        print "$dev.value $pct\n";
        print "${dev}_rebuild.value $rpct\n";
        print "${dev}_check.value $cpct\n";
        print "${dev}_failed.value $failed\n";
    }
}
exit 0;
2011-08-17 21:24:28 +02:00