#!/bin/bash
# -*- sh -*-

: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The suffix of the symlink name (the part after
"nvidia_gpu_") selects the value to monitor: temp, mem, fan, power or
utilization.

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of nvidia-smi executable.
  env.warning - Warning temperature
  env.critical - Critical temperature

=head2 DEFAULT CONFIGURATION

The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.

=head1 TODO

=over 4

=item *

Add support for specific professional GPU features such as number of compute processes, clocks and so on.

=item *

Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).

=back

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut
# Determine name of parameter to monitor: the file name this plugin was
# invoked as, with the "nvidia_gpu_" wildcard prefix removed
# (e.g. nvidia_gpu_temp -> temp).
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable (env.smiexec) or use default
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
  # Autoconf only returns yes if nvidia-smi exists and is executable.
  # The exit status is 0 in both cases; the answer is carried by the output.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
  exit 0
fi
# Check if suggest was requested: list every wildcard suffix this plugin
# understands, one per line.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp mem fan power utilization
  exit 0
fi
# Get number of GPUs.
# "nvidia-smi -L" is expected to print one "GPU <n>: <name> (UUID: ...)" line
# per device. Only those lines are kept, so that line N of $nGpusOutput always
# describes GPU N-1 and error messages are never mistaken for a device.
nGpusOutput=$("$nvSmiExec" -L | grep '^GPU ')

# grep -c prints 0 for empty input, whereas 'echo "" | wc -l' reports one line
# and would make the "no GPUs" check below unreachable.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^GPU ')
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found. The diagnostic goes to stderr so it is not
  # parsed as plugin output.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
# Check if config was requested
if [ "$1" = "config" ]; then

  # Print the description of the GPU with 0-based index $1: the matching
  # line of the "nvidia-smi -L" listing, cut at the first "(".
  gpu_name() {
    echo "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d '(' -f 1
  }

  # Print one "<name><index>.info <text> <GPU description>" line per GPU,
  # where <text> is $1.
  print_gpu_info() {
    local idx=0
    while [ "$idx" -lt "$nGpus" ]; do
      echo "${name}${idx}.info $1 $(gpu_name "$idx")"
      idx=$((idx + 1))
    done
  }

  # Get driver version
  driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure graph depending on which quantity will be plotted
  case $name in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel degrees Celsius'
      echo 'graph_category sensors'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      # Thresholds come from env.warning / env.critical, defaulting to 75 / 95.
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        echo "${name}${nGpusCounter}.warning ${warning:-75}"
        echo "${name}${nGpusCounter}.critical ${critical:-95}"
        echo "${name}${nGpusCounter}.info Temperature information for $(gpu_name "$nGpusCounter")"
        nGpusCounter=$((nGpusCounter + 1))
      done
      ;;
    mem)
      # First determine total memory of each GPU...
      gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        echo "${name}${nGpusCounter}.info Memory information for $(gpu_name "$nGpusCounter")"
        gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$((nGpusCounter + 1))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
        nGpusCounter=$((nGpusCounter + 1))
        # Separate entries with ", " except after the last GPU.
        if [ "$nGpusCounter" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category memory'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category sensors'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'Fan information for'
      ;;
    power)
      echo 'graph_title GPU power consumption'
      echo 'graph_vlabel Watt'
      echo 'graph_category sensors'
      echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'power consumption of'
      ;;
    utilization)
      echo 'graph_title GPU utilization'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category system'
      echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'GPU utilization information for'
      ;;
    *)
      # Diagnostics go to stderr so they are not parsed as config output.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac

  # Common stuff for all quantities: label each data series with its GPU
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "$nGpus" ]; do
    echo "${name}${nGpusCounter}.label $(gpu_name "$nGpusCounter")"
    nGpusCounter=$((nGpusCounter + 1))
  done

  exit 0
fi
# Get requested value: $valueGpus receives one reading per line, in the
# order in which nvidia-smi reports the GPUs.
case $name in
  temp)
    valueGpus=$(echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    nGpusCounter=0
    while [ "$nGpusCounter" -lt "$nGpus" ]; do
      totalMemGpu=$(echo "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
      usedMemGpu=$(echo "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
      # Only divide when both figures are plain integers and the total is
      # non-zero; otherwise the arithmetic expansion would fail. "U" is
      # Munin's marker for an unknown value.
      if [[ $usedMemGpu =~ ^[0-9]+$ && $totalMemGpu =~ ^[0-9]+$ ]] &&
        [ "$totalMemGpu" -gt 0 ]; then
        # 10# forces base 10 so a leading zero is not read as octal.
        percentMemUsed=$((10#$usedMemGpu * 100 / 10#$totalMemGpu))
      else
        percentMemUsed=U
      fi
      valueGpus="${valueGpus}${percentMemUsed}"$'\n'
      nGpusCounter=$((nGpusCounter + 1))
    done
    ;;
  fan)
    valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  power)
    valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  utilization)
    valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # Diagnostics go to stderr so they are not parsed as plugin values.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac
# Print requested value: one "<name><index>.value <reading>" line per GPU,
# taking line N+1 of $valueGpus as the reading for GPU N.
numberRe='^-?[0-9]+([.][0-9]+)?$'
nGpusCounter=0
while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
  value=$(echo "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
  # Munin expects a number or "U" (unknown). Anything else - e.g. "N/A"
  # for an unsupported sensor, or a missing line - is reported as unknown.
  if ! [[ $value =~ $numberRe ]]; then
    value=U
  fi
  echo "${name}${nGpusCounter}.value $value"
  nGpusCounter=$((nGpusCounter + 1))
done