#!/usr/bin/env python
# -*- encoding: iso-8859-1 -*-
#
# Wildcard-plugin to monitor S.M.A.R.T attribute values through smartctl,
# which is part of smartmontools package:
#         http://smartmontools.sourceforge.net/
#
# To monitor a S.M.A.R.T device, link smart_<device> to this file.
# E.g.
#    ln -s /usr/share/munin/plugins/smart_ /etc/munin/plugins/smart_hda
# ...will monitor /dev/hda.
#
# Needs following minimal configuration in plugin-conf.d/munin-node:
#   [smart_*]
#   user root
#   group disk
#
# Parameters
#   smartpath     - Specify path to smartctl program (Default: /usr/sbin/smartctl)
#   smartargs     - Override '-a' argument passed to smartctl with '-A -i'+smartargs
#   ignorestandby - Ignore the standby state of the drive and perform SMART query. Default: False
#                   (an empty value, 0, false, no and off all mean False)
#
# Parameters can be specified on a per-drive basis, eg:
#   [smart_hda]
#   user root
#   group disk
#   env.smartargs -H -c -l error -l selftest -l selective -d ata
#   env.smartpath /usr/local/sbin/smartctl
#
#   [smart_twa0-1]
#   user root
#   group disk
#   env.smartargs -H -l error -d 3ware,1
#   env.ignorestandby True
#
#   [smart_twa0-2]
#   user root
#   group disk
#   env.smartargs -H -l error -d 3ware,2
#
# Author: Nicolas Stransky
#
# v1.0 22/08/2004 - First draft
# v1.2 28/08/2004 - Clean up the code, add a verbose option
# v1.3 14/11/2004 - Compatibility with python<2.2. See comments in the code
# v1.4 17/11/2004 - Deal with non zero exit codes of smartctl
#                 - config now prints the critical thresholds, as reported by smartctl
# v1.5 18/11/2004 - Plot smartctl_exit_code bitmask
# v1.6 21/11/2004 - Add autoconf and suggest capabilities
#                 - smartctl path can be passed through "smartpath" environment variable
#                 - Additional smartctl args can be passed through "smartargs" environment variable
# v1.7 29/11/2004 - Add suggest capabilities for NetBSD, OpenBSD, FreeBSD and SunOS.
#                 - Allow to override completely the smartctl arguments with "smartargs"
# v1.8 16/02/2005 - Exit status field now only triggers warnings, not criticals.
# v1.9 07/07/2005 - Allow to query several drives on the same 3ware card.
#                 - Correct a bug when '-i' was not listed in smartargs
#                 - Don't fail if no value was obtained for hard drive model
# v1.10 19/08/2005 - smartctl_exit_code is now a numerical value
# v2.0 08/05/2009 - Correct bug in the interpretation of smartctl_exit_code
#                 - New option to suppress SMART warnings in munin
#                 - Temporary lack of output for previously existing drive now reports U
#                 - The plugin now contains its own documentation for use with munindoc
#                 - Removed python<2.2 compatibility comments
#                 - Better autodetection of drives
#                 - Don't spin up devices in a low-power mode.
#
# Copyright (c) 2004-2009 Nicolas Stransky.
#
# Permission to use, copy, and modify this software with or without fee
# is hereby granted, provided that this entire notice is included in
# all source code copies of any software which is or includes a copy or
# modification of this software.
#
# THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
# IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
# REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
# MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
# PURPOSE.
#
#
# Magic markers
#%# capabilities=autoconf suggest
#%# family=auto

## You may edit the following 3 variables
# Increase verbosity (True/False)
verbose = False
# Suppress SMART warnings (True/False)
report_warnings = True
# Modify to your needs:
statefiledir = '/var/lib/munin/plugin-state/'
# You may not modify anything below this line

import os
import pickle
import string
import sys
from math import log  # no longer used by the plugin itself; import kept as-is

plugin_version = "2.0"

# Attribute table of the drive being queried; (re)filled by read_values().
smart_values = {}
# True when the most recent read_values() call found no attribute at all.
emptyoutput = False
# Name the plugin was invoked as; the part after the last '_' is the device.
plugin_name = os.path.split(sys.argv[0])[1]


def verboselog(s):
    """Write the diagnostic message `s` to stderr when `verbose` is enabled."""
    if verbose:
        sys.stderr.write(plugin_name + ': ' + s + '\n')


def env_flag(name, environ=None):
    """Interpret the environment variable `name` as a boolean.

    An unset or empty variable and the (case-insensitive) values
    "0", "false", "no" and "off" are False; anything else is True.
    `environ` defaults to os.environ and exists mainly for testing.
    """
    if environ is None:
        environ = os.environ
    value = environ.get(name, '')
    return value.strip().lower() not in ('', '0', 'false', 'no', 'off')


def build_smartctl_command(hard_drive, environ=None):
    """Return the smartctl shell command line used to query /dev/<hard_drive>.

    The 'smartpath', 'smartargs' and 'ignorestandby' settings are read from
    `environ` (os.environ by default).  Unless 'ignorestandby' is set,
    '-n standby' is passed so that a sleeping drive is not spun up.
    """
    if environ is None:
        environ = os.environ
    command = [environ.get('smartpath', '/usr/sbin/smartctl'),
               environ.get('smartargs', '-a')]
    if not env_flag('ignorestandby', environ):
        command.append('-n standby')
    command.append('-A -i /dev/' + hard_drive)
    return ' '.join(command)


def parse_smartctl_output(lines):
    """Extract the attribute table and the drive model from smartctl output.

    `lines` is any iterable of text lines.  Returns a tuple (values, model):
    `values` maps each attribute name ('-' replaced by '_') to a dict with
    the "value" and "threshold" columns as strings; `model` is the text of
    the 'Device Model:' / 'Device:' line, or None when there is none.
    """
    values = {}
    model = None
    in_table = False
    for line in lines:
        if line.strip() == '':
            # A blank line terminates the attribute table.
            in_table = False
            continue
        if line.startswith('Device Model:') or line.startswith('Device:'):
            words = line.split(':')[1].split()
            # 'Device: <vendor> <model> Version: x' leaves a trailing word
            # 'Version' after splitting on ':'.
            if 'Version' in words:
                words.remove('Version')
            model = ' '.join(words)
        if in_table:
            fields = line.split()
            if len(fields) < 6:
                verboselog('Skipping unparsable attribute line: ' + line.strip())
                continue
            # Columns: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH ...
            values[fields[1].replace('-', '_')] = {"value": fields[3],
                                                   "threshold": fields[5]}
        elif line.startswith('ID# ATTRIBUTE_NAME'):
            # Start reading the Attributes block
            in_table = True
    return values, model


def highest_exit_bit(code):
    """Return the index of the highest bit set in `code`, or -1 for 0.

    Pure integer arithmetic; replaces int(log(code, 2)), which raises on 0.
    """
    bit = -1
    while code > 0:
        code >>= 1
        bit += 1
    return bit


def smart_readable(exit_status):
    """Tell whether a smartctl wait status still allows reading attributes.

    `exit_status` is what the close() method of an os.popen() pipe returns:
    None on success, otherwise the exit code shifted left by 8 bits.
    The smartctl exit code is a bitmask (see its man page); the status is
    considered readable when the highest bit set is above bit 2, exactly as
    the plugin has always done.
    """
    if exit_status is None:
        return True
    return highest_exit_bit(exit_status >> 8) > 2


def read_values(hard_drive):
    """Run smartctl on /dev/<hard_drive> and fill the global `smart_values`.

    Sets the global `emptyoutput` to True when no attribute was found.
    Returns the raw status of the smartctl pipe (None on success).
    Calls sys.exit(1) when smartctl cannot be run or its output cannot be
    read at all.
    """
    global emptyoutput
    # Start from a clean table: 'suggest' calls this once per candidate
    # drive and must not see the attributes of the previous one.
    smart_values.clear()
    try:
        verboselog('Reading S.M.A.R.T values')
        os.putenv('LC_ALL', 'C')
        smart_output = os.popen(build_smartctl_command(hard_drive))
        try:
            values, model = parse_smartctl_output(smart_output)
        finally:
            exit_status = smart_output.close()
    except Exception:
        verboselog('Cannot access S.M.A.R.T values! Check user rights or propper smartmontools installation/arguments.')
        sys.exit(1)
    smart_values.update(values)

    if exit_status is not None:
        # smartctl exit code is a bitmask, check man page.
        num_exit_status = exit_status >> 8  # Python convention
        bit = highest_exit_bit(num_exit_status)
        if bit <= 2:
            verboselog('smartctl cannot access S.M.A.R.T values on drive ' + hard_drive + '. Command exited with code ' + str(num_exit_status) + ' (bit ' + str(bit) + ')')
        else:
            verboselog('smartctl exited with code ' + str(num_exit_status) + ' (bit ' + str(bit) + '). ' + hard_drive + ' may be FAILING RIGHT NOW!')
    else:
        num_exit_status = 0

    if not smart_values:
        verboselog('Can\'t find any S.M.A.R.T values in smartctl output!')
        emptyoutput = True
    else:
        emptyoutput = False
    smart_values["smartctl_exit_status"] = {"value": str(num_exit_status),
                                            "threshold": "1"}
    # For some reason we may have no value for "model"
    if model is None:
        smart_values["model"] = "unknown"
    else:
        smart_values["model"] = model
    return exit_status


def state_file_path(hard_drive):
    """Return the state file path for `hard_drive` (a list of name parts)."""
    return statefiledir + '/smart-' + '-'.join(hard_drive) + '.state'


def open_state_file(hard_drive, mode):
    """Open the state file of `hard_drive` with `mode` ("r" or "w").

    The file is always opened in binary mode, which pickle requires on
    Python 3 and which is harmless on Python 2.  The caller closes it.
    """
    if 'b' not in mode:
        mode = mode + 'b'
    return open(state_file_path(hard_drive), mode)


def load_state_file(hard_drive):
    """Return the attribute table pickled in the state file of `hard_drive`.

    NOTE: pickle must only be fed trusted data, so the state directory must
    not be writable by anybody but the user running the plugin.
    """
    state_file = open_state_file(hard_drive, "r")
    try:
        return pickle.load(state_file)
    finally:
        state_file.close()


def update_state_file(hard_drive):
    """Save the global `smart_values` to the state file of `hard_drive`.

    Failures are logged (in verbose mode) and otherwise ignored.
    """
    try:
        verboselog('Saving statefile')
        state_file = open_state_file(hard_drive, "w")
        try:
            # Protocol 0 keeps the file readable by Python 2 and Python 3.
            pickle.dump(smart_values, state_file, 0)
        finally:
            state_file.close()
    except (IOError, OSError, pickle.PickleError):
        verboselog('Error trying to save state file! Check access rights')


def print_plugin_values(hard_drive):
    """Print '<attribute>.value <n>' for every attribute in `smart_values`.

    When the last read produced nothing, 'U' (unknown) is printed for all
    attributes remembered in the state file instead.
    """
    if emptyoutput:
        print_unknown_from_statefile(hard_drive, smart_values)
        return
    verboselog('Printing S.M.A.R.T values')
    for key in smart_values:
        if key == "model":
            continue
        print(key + ".value " + smart_values[key]["value"])


def print_config(hard_drive):
    """Print the munin graph configuration for `hard_drive`.

    The attribute list comes from the state file when it exists; otherwise
    the drive is queried once and the state file is created.  Exits with
    status 1 when an existing state file cannot be loaded.
    """
    drive_label = ','.join(hard_drive)
    if os.path.exists(state_file_path(hard_drive)):
        try:
            verboselog('Try to recall previous S.M.A.R.T attributes for ' + drive_label)
            smart_values_state = load_state_file(hard_drive)
        except Exception:
            verboselog('Error opening existing state file!')
            sys.exit(1)
    else:
        verboselog('No state file, reading S.M.A.R.T values for the first time')
        read_values(hard_drive[0])
        update_state_file(hard_drive)
        smart_values_state = smart_values

    verboselog('Printing configuration')
    print('graph_title S.M.A.R.T values for drive ' + drive_label)
    print('graph_vlabel Attribute S.M.A.R.T value')
    print('graph_args --base 1000 --lower-limit 0')
    print('graph_category disk')
    print('graph_info This graph shows the value of all S.M.A.R.T attributes of drive ' + drive_label + ' (' + smart_values_state['model'] + '). smartctl_exit_status is the return value of smartctl. A non-zero return value indicates an error, a potential error, or a fault on the drive.')
    for key in sorted(smart_values_state.keys()):
        if key in ['smartctl_exit_status', 'model']:
            continue
        print(key + '.label ' + key)
        print(key + '.draw LINE2')
        if report_warnings:
            print(key + '.critical ' + smart_values_state[key]["threshold"] + ':')
    print('smartctl_exit_status.label smartctl_exit_status')
    print('smartctl_exit_status.draw LINE2')
    if report_warnings:
        print('smartctl_exit_status.warning ' + smart_values_state['smartctl_exit_status']["threshold"])


def print_unknown_from_statefile(hard_drive, smart_values):
    """Print '<attribute>.value U' for every attribute in the state file.

    Used when the drive gave no values this time.  `smart_values` is
    accepted for interface compatibility and is not used.  Exits with
    status 1 when there is no state file or it cannot be loaded.
    """
    if not os.path.exists(state_file_path(hard_drive)):
        verboselog('No state file and no S.M.A.R.T values available, nothing to report')
        sys.exit(1)
    try:
        verboselog('Failed to get S.M.A.R.T values from drive. Try to recall previous S.M.A.R.T attributes for ' + ','.join(hard_drive))
        smart_values_state = load_state_file(hard_drive)
    except Exception:
        verboselog('Error opening existing state file!')
        sys.exit(1)

    verboselog('Printing unknown values for all attributes in state file')
    for key in sorted(smart_values_state.keys()):
        if key == 'model':
            continue
        print(key + '.value U')


def get_hard_drive_name():
    """Derive the device to monitor from the name the plugin was called as.

    Returns a list: [device] or, for names such as 'twa0-1' (several
    drives behind one 3ware card), [device, number].  Exits with status 1
    when the name holds no device or /dev/<device> does not exist.
    """
    separator = plugin_name.rfind('_')
    device = ''
    if separator != -1:
        device = plugin_name[separator + 1:]
    if not device:
        verboselog('No S.M.A.R.T device name found in plugin\'s symlink!')
        sys.exit(1)

    name = [device]
    if os.uname()[0] == "SunOS":
        # if hard_drive name starts with "rdsk" or "rmt", try to reconstruct the path
        if device.startswith("rdsk"):
            name[0] = os.path.join("rdsk", device[4:])
        elif device.startswith("rmt"):
            name[0] = os.path.join("rmt", device[3:])
    # For 3ware cards, we have to set multiple plugins for the same hard drive name.
    # Let's see if we find a '-' in the drive name.
    if '-' in name[0]:
        # Put the drive name and its number in a list
        dash = name[0].rindex('-')
        name = [name[0][:dash], name[0][dash + 1:]]
    # Check that the drive exists in /dev
    if not os.path.exists('/dev/' + name[0]):
        verboselog('/dev/' + name[0] + ' not found!')
        sys.exit(1)
    return name


def drive_has_smart(device):
    """Return True when smartctl reports attribute values for /dev/<device>."""
    verboselog('Trying ' + device + '...')
    try:
        exit_status = read_values(device)
    except (Exception, SystemExit):
        # read_values() ends the plugin through sys.exit() when smartctl
        # cannot be run; while autodetecting, that only rules this
        # candidate out.
        return False
    return smart_readable(exit_status) and not emptyoutput


def read_sysctl(name):
    """Return the first line printed by 'sysctl <name>', stripped."""
    pipe = os.popen('sysctl ' + name)
    try:
        return pipe.readline().strip()
    finally:
        pipe.close()


def find_smart_drives():
    """Return the names of the drives that answer S.M.A.R.T queries.

    Tries to autodetect Linux, *BSD and SunOS drives.  Drives behind a
    3Ware card are not autodetected.
    """
    drives = []
    system = os.uname()[0]
    if system == "Linux":
        if os.path.exists('/sys/block/'):
            # Running 2.6
            try:
                candidates = os.listdir('/sys/block/')
            except OSError:
                verboselog('Failed to list devices in /sys/block')
                candidates = []
            for drive in candidates:
                # Ignore MD, Floppy, loop, RAM and LVM devices.
                if drive[:2] in ['md', 'fd', 'lo', 'ra', 'dm']:
                    continue
                if drive_has_smart(drive):
                    drives.append(drive)
        else:
            verboselog('Not running linux2.6, failing back to /proc/partitions')
            try:
                partitions = open('/proc/partitions', 'r')
                try:
                    lines = partitions.readlines()
                finally:
                    partitions.close()
            except (IOError, OSError):
                verboselog('Failed to list devices in /proc/partitions')
            else:
                for line in lines:
                    words = line.split()
                    # Skip the header and blank lines.
                    if len(words) == 0 or words[0][0] not in string.digits:
                        continue
                    # Ignore RAM, md, LVM and LVM2 devices (by major number)
                    if words[0] in ['1', '9', '58', '254']:
                        continue
                    # Names ending in a digit are partitions, not disks.
                    if words[-1][-1] not in string.digits:
                        if drive_has_smart(words[-1]):
                            drives.append(words[-1])
                verboselog('Found drives in /proc/partitions ! ' + str(drives))
    elif system == "OpenBSD":
        try:
            kerndisks = read_sysctl('hw.disknames')
            candidates = kerndisks[kerndisks.rindex('=') + 1:].split(',')
        except (IOError, OSError, ValueError):
            verboselog('Failed to list OpenBSD disks')
            candidates = []
        for entry in candidates:
            # Entries may look like 'sd0:<duid>'; keep the device name only.
            drive = entry.split(':')[0]
            # Ignore Memory Disks, CD-ROM drives and Floppy
            if drive == '' or drive[:2] in ['md', 'cd', 'fd']:
                continue
            if drive_has_smart(drive + 'c'):
                drives.append(drive + 'c')
    elif system == "FreeBSD":
        try:
            candidates = read_sysctl('kern.disks').split()[1:]
        except (IOError, OSError):
            verboselog('Failed to list FreeBSD disks')
            candidates = []
        for drive in candidates:
            # Ignore Memory Disks, CD-ROM drives and Floppy
            if drive[:2] in ['md', 'cd', 'fd']:
                continue
            if drive_has_smart(drive):
                drives.append(drive)
    elif system == "NetBSD":
        try:
            candidates = read_sysctl('hw.disknames').split()[2:]
        except (IOError, OSError):
            verboselog('Failed to list NetBSD disks')
            candidates = []
        for drive in candidates:
            # Ignore Memory Disks, CD-ROM drives and Floppy
            if drive[:2] in ['md', 'cd', 'fd']:
                continue
            if drive_has_smart(drive + 'c'):
                drives.append(drive + 'c')
    elif system == "SunOS":
        from glob import glob
        for directory, pattern in (('rdsk', '*s2'), ('rmt', '*')):
            for drivepath in glob('/dev/' + directory + '/' + pattern):
                drive = os.path.basename(drivepath)
                # Probe the real path /dev/<directory>/<drive>, but suggest
                # the flattened name that get_hard_drive_name() expands.
                if drive_has_smart(os.path.join(directory, drive)):
                    drives.append(directory + drive)
    return drives


def main():
    """Dispatch on the munin command given as first argument, if any."""
    verboselog('plugins\' UID: ' + str(os.geteuid()) + ' / plugins\' GID: ' + str(os.getegid()))

    # Parse arguments
    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == "config":
            print_config(get_hard_drive_name())
            sys.exit(0)
        elif command == "autoconf":
            if os.path.exists(os.getenv('smartpath', '/usr/sbin/smartctl')):
                print('yes')
                sys.exit(0)
            else:
                print('no (smartmontools not found)')
                sys.exit(1)
        elif command == "suggest":
            for drive in find_smart_drives():
                print(drive)
            sys.exit(0)
        elif command == "version":
            print('smart_ Munin plugin, version ' + plugin_version)
            sys.exit(0)
        elif command != "":
            verboselog('unknown argument "' + command + '"')
            sys.exit(1)

    # No argument given, doing the real job:
    hard_drive = get_hard_drive_name()
    read_values(hard_drive[0])
    if not emptyoutput:
        update_state_file(hard_drive)
    print_plugin_values(hard_drive)
    sys.exit(0)


if __name__ == "__main__":
    main()


### The following is the smart_ plugin documentation, intended to be used with munindoc
"""
=head1 NAME

smart_ - Munin wildcard-plugin to monitor S.M.A.R.T. attribute values through smartctl

=head1 APPLICABLE SYSTEMS

Node with B<Python> interpreter and B<smartmontools> (http://smartmontools.sourceforge.net/)
installed and in function.

=head1 CONFIGURATION

=head2 Create link in service directory

To monitor a S.M.A.R.T device, create a link in the service directory
of the munin-node named smart_<devicename>, which is pointing to this file.

E.g.

ln -s /usr/share/munin/plugins/smart_ /etc/munin/plugins/smart_hda

...will monitor /dev/hda.

=head2 Grant privileges in munin-node

The plugin must be run under high privileged user B<root>, to get access to the raw device.

So following minimal configuration in plugin-conf.d/munin-node is needed.

=over 2

  [smart_*]
  user root
  group disk

=back

=head2 Set Parameter if needed

  smartpath     - Specify path to smartctl program (Default: /usr/sbin/smartctl)
  smartargs     - Override '-a' argument passed to smartctl with '-A -i'+smartargs
  ignorestandby - Ignore the standby state of the drive and perform SMART query. Default: False

Parameters can be specified on a per-drive basis, eg:

=over 2

  [smart_hda]
  user root
  env.smartargs -H -c -l error -l selftest -l selective -d ata
  env.smartpath /usr/local/sbin/smartctl

=back

In particular, for SATA drives, with older versions of smartctl:

=over 2

  [smart_sda]
  user root
  env.smartargs -d ata -a

  [smart_twa0-1]
  user root
  env.smartargs -H -l error -d 3ware,1
  env.ignorestandby True

  [smart_twa0-2]
  user root
  env.smartargs -H -l error -d 3ware,2

=back

=head1 INTERPRETATION

If a device supports the B<Self-Monitoring, Analysis
and Reporting Technology (S.M.A.R.T.)> it offers readable
access to the attribute table. There you find the B<raw value>,
a B<normalised value> and a B<threshold> (set by the vendor)
for each attribute, that is supported by that device.

The meaning and handling of the raw value is a secret of the
vendors embedded S.M.A.R.T.-Software on the disk. The only
relevant info from our external view is the B<normalised value>
in comparison with the B<threshold>. If the attributes value is
equal or below the threshold, it signals its failure and
the B<health status> of the device will switch from B<passed> to B<failed>.

This plugin fetches the B<normalised values of all SMART-Attributes>
and draw a curve for each of them.
It takes the vendors threshold as critical limit for the munin datafield.
So you will see an alarm, if the value reaches the vendors threshold.

Looking at the graph: It is a bad sign, if the curve starts
to curl or to meander. The more horizontal it runs,
the better. Of course it is normal, that the temperatures
curve swings a bit. But the others should stay steady on
their level if everything is ok.

S.M.A.R.T. distinguishes between B<Pre-fail> and B<Old-age>
Attributes. An old disk will have more curling curves
because of degradation, especially for the B<Old-age> Attributes.
You should then backup more often, run more selftests[1] and prepare
the disks replacement.

B<Act directly>, if a Pre-fail Attribute goes below threshold.
Immediately back-up your data and replace your hard disk drive.
A failure may be imminent.

[1] Consult the smartmontools manpages to learn about
offline tests and automated selftests with smartd.
Only with both activated, the values of the SMART-Attributes
reflect the all over state of the device.

Tutorials and articles about S.M.A.R.T. and smartmontools:
http://smartmontools.sourceforge.net/doc.html#tutorials

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=head1 CALL OPTIONS

B<none>

=over 2

Fetches values if called without arguments:

E.g.: munin-run smart_hda

=back

B<config>

=over 2

Prints plugins configuration.

E.g.: munin-run smart_hda config

=back

B<autoconf>

=over 2

Tries to find smartctl and outputs value 'yes' for success, 'no' if not.

It's used by B<munin-node-configure> to see whether autoconfiguration is possible.

=back

B<suggest>

=over 2

Outputs the list of device names, that it found plugged to the system.

B<munin-node-configure> use this to build the service links for this wildcard-plugin.

=back

=head1 VERSION

Version 2.0

=head1 BUGS

None known

=head1 AUTHOR

(C) 2004-2009 Nicolas Stransky

(C) 2008 Gabriele Pohl
Reformatted existing documentation to POD-Style, added section Interpretation to the documentation.

=head1 LICENSE

GPLv2 (http://www.gnu.org/licenses/gpl-2.0.txt)

=cut


"""