2
0
mirror of https://github.com/munin-monitoring/contrib.git synced 2018-11-08 00:59:34 +01:00

Add plugins for monitoring NVIDIA and AMD GPUs

This commit is contained in:
Nuno Fachada 2012-12-11 19:57:06 +00:00
parent c2ecfcb772
commit 426bba4466
2 changed files with 443 additions and 0 deletions

235
plugins/gpu/amd_gpu_ Executable file
View File

@ -0,0 +1,235 @@
#!/bin/bash
# -*- bash -*-
: << =cut
=head1 NAME
amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility,
usually bundled with AMD GPU driver, to obtain information. To use this
plugin you have to make sure aticonfig will run without an active X
server (i.e. without anyone being logged in via the GUI). For more
information on this visit this link:
http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/
=head1 CONFIGURATION
This is a wildcard plugin. The suffix of the symlink name (the part after
"amd_gpu_") selects the value to monitor.
This plugin uses the following configuration variables:
[amd_gpu_*]
env.aticonfexec - Location of aticonfig executable.
user root
=head2 DEFAULT CONFIGURATION
The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig.
=head2 EXAMPLE WILDCARD USAGE
C<ln -s /usr/share/munin/plugins/amd_gpu_ /etc/munin/plugins/amd_gpu_temp>
...will monitor the temperature of available AMD GPUs.
=head1 AUTHOR
Nuno Fachada
faken@fakenmc.com
=head1 LICENSE
GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf suggest
=cut
# Determine which quantity to monitor from the name this plugin was invoked
# as: everything after the "amd_gpu_" prefix (e.g. "amd_gpu_temp" -> "temp").
name=${0##*/}
name=${name#amd_gpu_}

# Location of the aticonfig executable: $aticonfexec (set through
# "env.aticonfexec" in the plugin configuration) or the default path.
atiConfigExec=${aticonfexec:-/usr/bin/aticonfig}

# autoconf: answer "yes" only if aticonfig exists and is executable.
# The path is quoted so that a location containing spaces is tested as a
# single file name. The exit status is 0 for both answers.
if [ "$1" = "autoconf" ]; then
  if [ -x "$atiConfigExec" ]; then
    echo yes
  else
    echo "no (aticonfig executable not found)"
  fi
  exit 0
fi

# suggest: list the wildcard suffixes this plugin understands, one per line.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp clocks fan load vcore
  exit 0
fi
# Get number of GPUs.
# The adapter listing is kept in $nGpusOutput because the config code below
# also takes the adapter names from it.
nGpusOutput=$("$atiConfigExec" --list-adapters)
nGpus=$(printf '%s\n' "$nGpusOutput" | wc -l)
# The last two lines of the listing are not adapters (presumably a blank
# line and the default-adapter legend -- confirm against your driver).
nGpus=$((nGpus - 2))
# "-le" rather than "-eq": when aticonfig is missing or fails its output is
# empty, wc still counts one line, and the result is -1 instead of 0.
if [ "$nGpus" -le 0 ]; then
  # Exit if no GPUs found
  echo "No AMD GPUs detected. Exiting."
  exit 1
fi
# Print the "<index>. <name>" description of adapter number $1, taken from
# the "aticonfig --list-adapters" listing stored in $nGpusOutput.
# A listing line is expected to look like "* 0. 01:00.0 Some GPU name" (the
# leading "*" is optional); the marker and the second column are dropped.
# Falls back to "GPU <index>" when no line for that adapter is found.
gpu_name() {
  local label
  label=$(printf '%s\n' "$nGpusOutput" \
    | sed -n 's/^[*[:space:]]*\('"$1"'\.\)[[:space:]]*[^[:space:]]*[[:space:]]*/\1 /p' \
    | head -n 1)
  printf '%s\n' "${label:-GPU $1}"
}

# Check if config was requested
if [ "$1" = "config" ]; then
  # Configure the graph depending on which quantity will be plotted.
  # Every data series is named "<field><adapter index>".
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for AMD GPUs"
      for ((gpu = 0; gpu < nGpus; gpu++)); do
        gpuName=$(gpu_name "$gpu")
        echo "temp${gpu}.warning 75"
        echo "temp${gpu}.critical 95"
        echo "temp${gpu}.info Temperature information for $gpuName"
        echo "temp${gpu}.label Temperature ($gpuName)"
      done
      ;;
    clocks)
      # The upper graph limit is the largest number found on the
      # "Peak Range" line(s) of "aticonfig --odgc"; 0 if none is found.
      # DISPLAY is set for this call because the value-fetching code of
      # this plugin also sets DISPLAY=:0 before querying aticonfig.
      maxclock=$(DISPLAY=:0 "$atiConfigExec" --odgc \
        | grep "Peak Range" \
        | grep -o "[0-9][0-9]*" \
        | sort -n \
        | tail -n 1)
      maxclock=${maxclock:-0}
      echo 'graph_title GPU clock'
      echo "graph_args -l 0 -u $maxclock"
      echo 'graph_vlabel MHz'
      echo 'graph_category gpu'
      echo "graph_info Core and memory clock info for AMD GPUs"
      for ((gpu = 0; gpu < nGpus; gpu++)); do
        gpuName=$(gpu_name "$gpu")
        echo "memclock${gpu}.info Memory clock information for $gpuName"
        echo "memclock${gpu}.label Memory clock ($gpuName)"
        echo "coreclock${gpu}.info Core clock information for $gpuName"
        echo "coreclock${gpu}.label Core clock ($gpuName)"
      done
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of AMD GPUs"
      for ((gpu = 0; gpu < nGpus; gpu++)); do
        gpuName=$(gpu_name "$gpu")
        echo "fan${gpu}.info Fan speed information for $gpuName"
        echo "fan${gpu}.label Fan speed ($gpuName)"
      done
      ;;
    load)
      echo 'graph_title GPU load'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info GPU load"
      for ((gpu = 0; gpu < nGpus; gpu++)); do
        gpuName=$(gpu_name "$gpu")
        echo "load${gpu}.info Load information for $gpuName"
        echo "load${gpu}.label Load ($gpuName)"
      done
      ;;
    vcore)
      echo 'graph_title GPU core voltage'
      echo 'graph_vlabel mV'
      echo 'graph_category gpu'
      echo "graph_info GPU core voltage"
      for ((gpu = 0; gpu < nGpus; gpu++)); do
        gpuName=$(gpu_name "$gpu")
        echo "vcore${gpu}.info Vcore information for $gpuName"
        echo "vcore${gpu}.label Core voltage ($gpuName)"
      done
      ;;
    *)
      # The symlink suffix is not one of the supported quantities.
      echo "Can't run without a proper symlink. Exiting."
      echo "Try running munin-node-configure --suggest."
      exit 1
      ;;
  esac
  exit 0
fi
# Get and print the requested value for all available GPUs.
# One "<field><adapter index>.value <number>" line is printed per series.
# When aticonfig yields no number for a reading, "U" (Munin's marker for an
# unknown value) is printed instead of an empty value.
export DISPLAY=:0
for ((gpu = 0; gpu < nGpus; gpu++)); do
  case "$name" in
    temp)
      value=$("$atiConfigExec" --adapter="$gpu" --odgt \
        | grep "Sensor 0: Temperature" \
        | grep -o "[0-9]*\.[0-9]*")
      echo "temp${gpu}.value ${value:-U}"
      ;;
    clocks)
      # The "Current Clocks" line carries two numbers: the first is reported
      # as the core clock, the second as the memory clock.
      value=$("$atiConfigExec" --adapter="$gpu" --odgc \
        | grep "Current Clocks" \
        | grep -o "[0-9]*")
      coreClock=$(printf '%s\n' "$value" | sed -n 1p)
      echo "coreclock${gpu}.value ${coreClock:-U}"
      memClock=$(printf '%s\n' "$value" | sed -n 2p)
      echo "memclock${gpu}.value ${memClock:-U}"
      ;;
    fan)
      value=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get fanspeed 0" \
        | grep "Fan Speed" \
        | grep -o "[0-9]*")
      echo "fan${gpu}.value ${value:-U}"
      ;;
    load)
      value=$("$atiConfigExec" --adapter="$gpu" --odgc \
        | grep "GPU load" \
        | grep -o "[0-9]*")
      echo "load${gpu}.value ${value:-U}"
      ;;
    vcore)
      value=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get activity" \
        | grep "VDDC" \
        | grep -o "[0-9]*")
      echo "vcore${gpu}.value ${value:-U}"
      ;;
    *)
      # The symlink suffix is not one of the supported quantities.
      echo "Can't run without a proper symlink. Exiting."
      echo "Try running munin-node-configure --suggest."
      exit 1
      ;;
  esac
done

208
plugins/gpu/nvidia_gpu_ Executable file
View File

@ -0,0 +1,208 @@
#!/bin/sh
# -*- sh -*-
: << =cut
=head1 NAME
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.
=head1 CONFIGURATION
This is a wildcard plugin. The suffix of the symlink name (the part after
"nvidia_gpu_") selects the value to monitor.
This plugin uses the following configuration variables:
[nvidia_gpu_*]
env.smiexec - Location of nvidia-smi executable.
=head2 DEFAULT CONFIGURATION
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi.
=head2 EXAMPLE WILDCARD USAGE
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
...will monitor the temperature of available GPUs.
=head1 AUTHOR
Nuno Fachada
faken@fakenmc.com
=head1 LICENSE
GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf suggest
=cut
# Determine which quantity to monitor from the name this plugin was invoked
# as: everything after the "nvidia_gpu_" prefix (e.g. "nvidia_gpu_temp" ->
# "temp").
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable: $smiexec (set through "env.smiexec"
# in the plugin configuration) or the default path.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}

# autoconf: answer "yes" only if nvidia-smi exists and is executable.
# The path is quoted so that a location containing spaces is tested as a
# single file name. The exit status is 0 for both answers.
if [ "$1" = "autoconf" ]; then
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
  exit 0
fi

# suggest: list the wildcard suffixes this plugin understands, one per line.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp mem fan
  exit 0
fi
# Get number of GPUs. Every line printed by "nvidia-smi -L" is counted as
# one GPU; the listing is kept in $nGpusOutput because the config code below
# takes the GPU names from it.
# A failing nvidia-smi or an empty listing means "no GPUs": counting lines
# alone would report 1 in that case, because an empty string piped through
# "wc -l" still yields one line.
if nGpusOutput=$("$nvSmiExec" -L) && [ -n "$nGpusOutput" ]; then
  nGpus=$(printf '%s\n' "$nGpusOutput" | wc -l)
  # Normalise the count (some wc implementations pad it with blanks).
  nGpus=$((nGpus + 0))
else
  nGpus=0
fi
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found
  echo "No NVIDIA GPUs detected. Exiting."
  exit 1
fi
# Get full output from nvidia-smi; all values reported later are parsed
# from this single query.
smiOutput=$("$nvSmiExec" -q)
# Print the description of GPU number $1 (counted from 0): line $1 + 1 of
# the "nvidia-smi -L" listing in $nGpusOutput, cut at the first "(".
gpu_name() {
  printf '%s\n' "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d '(' -f 1
}

# Check if config was requested
if [ "$1" = "config" ]; then
  # Driver version, taken from the "nvidia-smi -q" output already held in
  # $smiOutput, so the executable configured through env.smiexec is the one
  # whose answer is used and nvidia-smi is not run a second time.
  driverVersion=$(printf '%s\n' "$smiOutput" \
    | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')
  # Configure the graph depending on which quantity will be plotted.
  # Every data series is named "<field><GPU index>".
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "temp${nGpusCounter}.warning 75"
        echo "temp${nGpusCounter}.critical 95"
        echo "temp${nGpusCounter}.info Temperature information for $gpuName"
        nGpusCounter=$((nGpusCounter + 1))
      done
      ;;
    mem)
      # Collect the total memory of each GPU ("<size> for GPU <n>", comma
      # separated) for the graph description while printing the .info lines.
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" \
        | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "mem${nGpusCounter}.info Memory information for $gpuName"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" \
          | sed -n "$((nGpusCounter + 1))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
        nGpusCounter=$((nGpusCounter + 1))
        if [ "$nGpusCounter" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "fan${nGpusCounter}.info Fan information for $gpuName"
        nGpusCounter=$((nGpusCounter + 1))
      done
      ;;
    *)
      # The symlink suffix is not one of the supported quantities.
      echo "Can't run without a proper symlink. Exiting."
      echo "Try running munin-node-configure --suggest."
      exit 1
      ;;
  esac
  # Common to all quantities: label every series with its GPU description.
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "$nGpus" ]; do
    gpuName=$(gpu_name "$nGpusCounter")
    echo "${name}${nGpusCounter}.label $gpuName"
    nGpusCounter=$((nGpusCounter + 1))
  done
  exit 0
fi
# Get requested value: $valueGpus receives one reading per line, in GPU
# order, parsed from the "nvidia-smi -q" output held in $smiOutput.
case "$name" in
  temp)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep -A 1 "Temperature" | grep "Gpu" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    # Memory is reported as the percentage of the total that is in use.
    totalMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -A 3 "Memory Usage" | grep "Total" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -A 3 "Memory Usage" | grep "Used" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    nGpusCounter=0
    while [ "$nGpusCounter" -lt "$nGpus" ]; do
      totalMemGpu=$(printf '%s\n' "$totalMemGpus" \
        | sed -n "$((nGpusCounter + 1))p")
      usedMemGpu=$(printf '%s\n' "$usedMemGpus" \
        | sed -n "$((nGpusCounter + 1))p")
      # Only divide when both figures are plain integers and the total is
      # not zero; otherwise report the value as unknown ("U").
      percentMemUsed=U
      case "$totalMemGpu" in
        '' | *[!0-9]* | 0) ;;
        *)
          case "$usedMemGpu" in
            '' | *[!0-9]*) ;;
            *) percentMemUsed=$((usedMemGpu * 100 / totalMemGpu)) ;;
          esac
          ;;
      esac
      # Join with a real newline: a literal "\n" is only turned into a line
      # break by echo implementations that interpret backslash escapes.
      valueGpus="${valueGpus}${percentMemUsed}
"
      nGpusCounter=$((nGpusCounter + 1))
    done
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # The symlink suffix is not one of the supported quantities.
    echo "Can't run without a proper symlink. Exiting."
    echo "Try running munin-node-configure --suggest."
    exit 1
    ;;
esac

# Print requested value: one "<name><GPU index>.value <reading>" line per
# GPU. Missing or non-numeric readings (for example "N/A") are printed as
# "U", Munin's marker for an unknown value.
nGpusCounter=0
while [ "$nGpusCounter" -lt "$nGpus" ]; do
  value=$(printf '%s\n' "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
  case "$value" in
    U) ;;
    '' | *[!0-9.-]*) value=U ;;
  esac
  echo "${name}${nGpusCounter}.value $value"
  nGpusCounter=$((nGpusCounter + 1))
done