#!/usr/bin/perl -w
#
# (c) 2007 Nathan Rutman nathan@clusterfs.com
#
# Plugin to monitor RAID status
#
# Results are % of healthy drives in a raid device
# and % rebuilt of devices that are resyncing.
#
# Invocation:
#   <plugin> autoconf   print "yes" if /proc/mdstat lists md devices, else "no (...)"
#   <plugin> config     print the graph definition and the per-array field definitions
#   <plugin>            print the current values for every array found
#
# For each array "mdN" four fields are emitted:
#   mdN          percentage of active members
#   mdN_rebuild  progress of a reshape/recovery (100 when none is running)
#   mdN_check    progress of a check/resync     (100 when none is running)
#   mdN_failed   number of 'F' flags on the member list
#
#%# family=contrib
#%# capabilities=autoconf

use strict;
use warnings;

my $MDSTAT = '/proc/mdstat';
my $mode   = $ARGV[0] || '';

if ($mode eq 'autoconf') {
    # Scan the file in-process instead of shelling out to grep; an
    # unreadable file simply counts as "no devices".
    my $found = 0;
    if (open(my $fh, '<', $MDSTAT)) {
        $found = grep { /md/ } <$fh>;
        close($fh);
    }
    print $found ? "yes\n" : "no (no RAID devices found)\n";
    exit 0;
}

if ($mode eq 'config') {
    print "graph_title RAID status\n";
    print "graph_category disk\n";
    print "graph_info This graph monitors RAID disk health. Values are percentage of healthy drives in each raid group. Degraded devices are marked Critical.\n";
    print "graph_args --base 1000 -l 0\n";
    print "graph_vlabel % healthy/rebuilt\n";
    print "graph_scale no\n";
}

# The contents of /proc/mdstat may change between reads, so fetch the
# whole file at once and parse the snapshot.
open(my $mdstat, '<', $MDSTAT)
    or die "Cannot open $MDSTAT: $!\n";
my @text = <$mdstat>;
close($mdstat);

# Interestingly, swap is presented as "active (auto-read-only)"
# and mdadm has '--readonly' option to make the array 'active (read-only)'
my $devinfo_re = qr/(md\d+)\s+:\s+active\s+(\(read-only\)\s+|\(auto-read-only\)\s+|)(\w+)\s+(.*)/;
my $devstat_re = qr/.*\[(\d+)\/(\d+)]\s+\[(\w+)]/;
my $action_re  = qr/.*(reshape|check|resync|recovery)\s*=\s*(\d+\.\d+%|\w+)(.*finish=(.*min))?/;

while (@text) {
    my $line = shift @text;

    # First line should look like "md0 : active raid1 sda1[0] sdc1[2] sdb1[1]";
    # skip everything until such a line is found.
    next unless $line =~ $devinfo_re;
    my ($dev, $ro, $type, $members) = ($1, $2 || '', $3, $4);

    # Count the 'F' characters in the member list (failed members are
    # flagged with "(F)").
    my $failed = ($members =~ tr/F//);

    # Second line should look like "123456 blocks super 1.2 [2/2] [UU]".
    # If it is missing (or the file ended) this array is skipped.
    $line = shift @text;
    next unless defined $line and $line =~ $devstat_re;
    my ($nmem, $nact) = ($1, $2);

    # Third line should look like
    #   " [==>..................] check = 10.0% (12345/123456) finish=123min speed=12345/sec"
    # and appears only while the array is in action.
    my $action = 'idle';
    my $minute = '';
    my $proc;
    $line = shift @text;
    if (defined $line) {
        if ($line =~ $action_re) {
            $action = $1;
            my $percent = $2;
            $minute = $4 || '';
            if ($percent =~ /(\d+\.\d+)%/) {
                $proc = $1;
            }
            else {
                # 'resync=DELAYED' or 'resync=PENDING'
                $action .= " ($percent)";
                $proc = -1;
            }
        }
        else {
            # Array is not in action: the line belongs to something else,
            # so put it back for the next iteration.
            unshift(@text, $line);
        }
    }

    if ($mode eq 'config') {
        print "$dev.label $dev\n";
        print "$dev.info $type $ro$members\n";
        # 100: means less than 100
        # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
        print "$dev.critical 98:\n";
        print "${dev}_rebuild.label $dev reshape/recovery\n";
        print "${dev}_rebuild.info $action $minute\n";
        # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
        print "${dev}_rebuild.critical 98:\n";
        print "${dev}_check.label $dev check/resync \n";
        print "${dev}_check.info $action $minute\n";
        print "${dev}_failed.label $dev failed disks \n";
        print "${dev}_failed.info $action $minute\n";
        print "${dev}_failed.critical 0:0\n";
    }
    else {
        # Guard against a zero member count so a malformed status line
        # cannot abort the whole run with a division by zero.
        my $pct  = $nmem ? 100 * $nact / $nmem : 0;
        my $rpct = 100;
        my $cpct = 100;
        if ($action =~ /reshape|recovery/) {
            $rpct = $proc;
            $cpct = 0;    # check/resync is not done
        }
        elsif ($action =~ /check|resync/) {
            if ($proc < 0) {
                # array is on DELAYED or PENDING, further info is unknown
                $rpct = 0;
                $cpct = 0;
            }
            else {
                # reshape/recovery was done, $rpct => 100
                $cpct = $proc;
            }
        }
        print "$dev.value $pct\n";
        print "${dev}_rebuild.value $rpct\n";
        print "${dev}_check.value $cpct\n";
        print "${dev}_failed.value $failed\n";
    }
}

exit 0;