mirror of
https://github.com/munin-monitoring/contrib.git
synced 2018-11-08 00:59:34 +01:00
Add plugins for monitoring NVIDIA and AMD GPUs
This commit is contained in:
parent
c2ecfcb772
commit
426bba4466
235
plugins/gpu/amd_gpu_
Executable file
235
plugins/gpu/amd_gpu_
Executable file
@ -0,0 +1,235 @@
|
||||
#!/bin/bash
|
||||
# -*- bash -*-
|
||||
|
||||
: << =cut
|
||||
|
||||
=head1 NAME
|
||||
|
||||
amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility,
|
||||
usually bundled with AMD GPU driver, to obtain information. To use this
|
||||
plugin you have to make sure aticonfig will run without an active X
|
||||
server (i.e. without anyone being logged in via the GUI). For more
|
||||
information on this visit this link:
|
||||
http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
This is a wildcard plugin. The part of the symlink name that follows the
"amd_gpu_" prefix selects the value to monitor (temp, clocks, fan, load or
vcore).
|
||||
|
||||
This plugin uses the following configuration variables:
|
||||
|
||||
[amd_gpu_*]
|
||||
env.aticonfexec - Location of aticonfig executable.
|
||||
user root
|
||||
|
||||
=head2 DEFAULT CONFIGURATION
|
||||
|
||||
The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig.
|
||||
|
||||
=head2 EXAMPLE WILDCARD USAGE
|
||||
|
||||
C<ln -s /usr/share/munin/plugins/amd_gpu_ /etc/munin/plugins/amd_gpu_temp>
|
||||
|
||||
...will monitor the temperature of available AMD GPUs.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Nuno Fachada
|
||||
faken@fakenmc.com
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
GNU General Public License, version 2
|
||||
http://www.gnu.org/licenses/gpl-2.0.html
|
||||
|
||||
=head1 MAGIC MARKERS
|
||||
|
||||
#%# family=auto
|
||||
#%# capabilities=autoconf suggest
|
||||
|
||||
=cut
|
||||
|
||||
# Determine the name of the parameter to monitor: the name this script was
# invoked as (normally a symlink such as amd_gpu_temp) with its directory and
# the "amd_gpu_" prefix removed. Parameter expansion is used instead of
# `basename $0` so that paths containing spaces are handled correctly.
name=${0##*/}
name=${name#amd_gpu_}

# Get location of aticonfig executable (env.aticonfexec) or use the default
atiConfigExec=${aticonfexec:-/usr/bin/aticonfig}
|
||||
|
||||
# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
  # Autoconf only answers "yes" if aticonfig exists and is executable.
  # The path is quoted so that an empty or space-containing value cannot
  # turn the test into a false positive. Both answers exit with status 0.
  if [ -x "$atiConfigExec" ]; then
    echo yes
  else
    echo "no (aticonfig executable not found)"
  fi
  exit 0
fi
|
||||
|
||||
# Check if suggest was requested: list, one per line, every symlink suffix
# this wildcard plugin can monitor.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp clocks fan load vcore
  exit 0
fi
|
||||
|
||||
# Get number of GPUs
nGpusOutput=$("$atiConfigExec" --list-adapters)

# Count the adapter lines themselves instead of subtracting a fixed number of
# trailer lines from the total: when aticonfig fails or prints nothing the
# subtraction produced -1, which slipped past the "no GPUs" check below.
# Adapter lines are expected to look like "* 0. 05:00.0 Name" or
# "  1. 06:00.0 Name" (optional default-adapter marker, index, dot).
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^[*[:space:]]*[0-9][0-9]*\. ')
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found
  echo "No AMD GPUs detected. Exiting." >&2
  exit 1
fi
|
||||
|
||||
# Print the display name of adapter $1 (0-based index), looked up in the
# "aticonfig --list-adapters" output cached in $nGpusOutput. Adapter lines are
# expected to look like "* 0. 05:00.0 Name" or "  1. 06:00.0 Name"; the
# default-adapter marker and the bus id are dropped so the result reads
# "0. Name". Prints "GPU <index>" if no line for that index is found.
amd_gpu_name() {
  local adapterName
  adapterName=$(printf '%s\n' "$nGpusOutput" \
    | sed -n "s/^[*[:space:]]*\($1\.\)[[:space:]][[:space:]]*[^[:space:]]*[[:space:]]*/\1 /p" \
    | head -n 1)
  printf '%s\n' "${adapterName:-GPU $1}"
}

# Check if config was requested
if [ "$1" = "config" ]; then

  # Configure the graph depending on which quantity will be plotted.
  # Each branch prints the graph-wide settings first, then one set of field
  # attributes per GPU, each labelled with that GPU's own adapter name.
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for AMD GPUs"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        gpuName=$(amd_gpu_name "$gpu")
        echo "temp${gpu}.warning 75"
        echo "temp${gpu}.critical 95"
        echo "temp${gpu}.info Temperature information for $gpuName"
        echo "temp${gpu}.label Temperature ($gpuName)"
      done
      ;;
    clocks)
      # First determine the highest clock listed on the "Peak Range" lines;
      # it becomes the upper limit of the graph. The command substitution is
      # deliberately left unquoted so that every number, on any line, becomes
      # its own loop item (reading them with `read -a` from a here-string
      # only sees the first line on recent bash versions).
      maxclock=0
      for peakClock in $("$atiConfigExec" --odgc | grep "Peak Range" | grep -o "[0-9][0-9]*"); do
        if [ "$peakClock" -gt "$maxclock" ]; then
          maxclock=$peakClock
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU clock'
      echo "graph_args -l 0 -u $maxclock"
      echo 'graph_vlabel MHz'
      echo 'graph_category gpu'
      echo "graph_info Core and memory clock info for AMD GPUs"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        gpuName=$(amd_gpu_name "$gpu")
        echo "memclock${gpu}.info Memory clock information for $gpuName"
        echo "memclock${gpu}.label Memory clock ($gpuName)"
        echo "coreclock${gpu}.info Core clock information for $gpuName"
        echo "coreclock${gpu}.label Core clock ($gpuName)"
      done
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of AMD GPUs"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        gpuName=$(amd_gpu_name "$gpu")
        echo "fan${gpu}.info Fan speed information for $gpuName"
        echo "fan${gpu}.label Fan speed ($gpuName)"
      done
      ;;
    load)
      echo 'graph_title GPU load'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info GPU load"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        gpuName=$(amd_gpu_name "$gpu")
        echo "load${gpu}.info Load information for $gpuName"
        echo "load${gpu}.label Load ($gpuName)"
      done
      ;;
    vcore)
      echo 'graph_title GPU core voltage'
      echo 'graph_vlabel mV'
      echo 'graph_category gpu'
      echo "graph_info GPU core voltage"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        gpuName=$(amd_gpu_name "$gpu")
        echo "vcore${gpu}.info Vcore information for $gpuName"
        echo "vcore${gpu}.label Core voltage ($gpuName)"
      done
      ;;
    *)
      # Unknown symlink suffix: report on stderr so the text is not mistaken
      # for graph configuration.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac

  exit 0
fi
|
||||
|
||||
# Get and print requested value for all available GPUs.
# aticonfig is queried once per adapter with DISPLAY pointing at display :0.
# A reading that cannot be extracted from the aticonfig output is reported as
# "U" (Munin's marker for an unknown value) instead of an empty value.
export DISPLAY=:0
for (( gpu = 0; gpu < ${nGpus:-0}; gpu++ )); do
  case "$name" in
    temp)
      value=$("$atiConfigExec" --adapter="$gpu" --odgt \
        | grep "Sensor 0: Temperature" | grep -o "[0-9]*\.[0-9]*")
      echo "temp${gpu}.value ${value:-U}"
      ;;
    clocks)
      # The "Current Clocks" line carries two numbers: core clock first,
      # memory clock second.
      value=$("$atiConfigExec" --adapter="$gpu" --odgc \
        | grep "Current Clocks" | grep -o "[0-9]*")
      coreClock=$(printf '%s\n' "$value" | sed -n 1p)
      memClock=$(printf '%s\n' "$value" | sed -n 2p)
      echo "coreclock${gpu}.value ${coreClock:-U}"
      echo "memclock${gpu}.value ${memClock:-U}"
      ;;
    fan)
      value=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get fanspeed 0" \
        | grep "Fan Speed" | grep -o "[0-9]*")
      echo "fan${gpu}.value ${value:-U}"
      ;;
    load)
      value=$("$atiConfigExec" --adapter="$gpu" --odgc \
        | grep "GPU load" | grep -o "[0-9]*")
      echo "load${gpu}.value ${value:-U}"
      ;;
    vcore)
      value=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get activity" \
        | grep "VDDC" | grep -o "[0-9]*")
      echo "vcore${gpu}.value ${value:-U}"
      ;;
    *)
      # Unknown symlink suffix: report on stderr and fail.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac
done
|
||||
|
||||
|
||||
|
||||
|
208
plugins/gpu/nvidia_gpu_
Executable file
208
plugins/gpu/nvidia_gpu_
Executable file
@ -0,0 +1,208 @@
|
||||
#!/bin/sh
|
||||
# -*- sh -*-
|
||||
|
||||
: << =cut
|
||||
|
||||
=head1 NAME
|
||||
|
||||
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
|
||||
usually bundled with NVIDIA GPU driver, to obtain information.
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
This is a wildcard plugin. The part of the symlink name that follows the
"nvidia_gpu_" prefix selects the value to monitor (temp, mem or fan).
|
||||
|
||||
This plugin uses the following configuration variables:
|
||||
|
||||
[nvidia_gpu_*]
|
||||
env.smiexec - Location of nvidia-smi executable.
|
||||
|
||||
=head2 DEFAULT CONFIGURATION
|
||||
|
||||
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi.
|
||||
|
||||
=head2 EXAMPLE WILDCARD USAGE
|
||||
|
||||
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
|
||||
|
||||
...will monitor the temperature of available GPUs.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Nuno Fachada
|
||||
faken@fakenmc.com
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
GNU General Public License, version 2
|
||||
http://www.gnu.org/licenses/gpl-2.0.html
|
||||
|
||||
=head1 MAGIC MARKERS
|
||||
|
||||
#%# family=auto
|
||||
#%# capabilities=autoconf suggest
|
||||
|
||||
=cut
|
||||
|
||||
# Determine the name of the parameter to monitor: the name this script was
# invoked as (normally a symlink such as nvidia_gpu_temp) with its directory
# and the "nvidia_gpu_" prefix removed. POSIX parameter expansion is used
# instead of `basename $0` so that paths containing spaces are handled
# correctly.
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable (env.smiexec) or use the default
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
|
||||
|
||||
# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
  # Autoconf only answers "yes" if nvidia-smi exists and is executable.
  # The path is quoted so that an empty or space-containing value cannot
  # turn the test into a false positive. Both answers exit with status 0.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
  exit 0
fi
|
||||
|
||||
# Check if suggest was requested: list, one per line, every symlink suffix
# this wildcard plugin can monitor.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp mem fan
  exit 0
fi
|
||||
|
||||
# Get number of GPUs.
# Only lines of the form "GPU <n>..." are kept from "nvidia-smi -L", so that
# line i+1 of $nGpusOutput always describes GPU i and error text or other
# extra lines are never counted as devices. Counting with `wc -l` reported 1
# even for empty output, which made the check below unreachable.
nGpusOutput=$("$nvSmiExec" -L | grep '^GPU [0-9]')
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^GPU [0-9]')
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
|
||||
|
||||
# Print the name of GPU number $1 (0-based): line $1+1 of the cached
# "nvidia-smi -L" output in $nGpusOutput, cut at the first "(".
nvidia_gpu_name() {
  printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
}

# Check if config was requested
if [ "$1" = "config" ]; then

  # Get driver version from the already captured "nvidia-smi -q" output, so
  # that the configured executable (env.smiexec) is honoured and the tool is
  # not run a second time.
  driverVersion=$(printf '%s\n' "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure the graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      gpu=0
      while [ "$gpu" -lt "$nGpus" ]; do
        gpuName=$(nvidia_gpu_name "$gpu")
        echo "temp${gpu}.warning 75"
        echo "temp${gpu}.critical 95"
        echo "temp${gpu}.info Temperature information for $gpuName"
        gpu=$(( gpu + 1 ))
      done
      ;;
    mem)
      # First determine total memory of each GPU (one "Total" figure per
      # line) and build a human-readable summary of them for graph_info...
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      gpu=0
      while [ "$gpu" -lt "$nGpus" ]; do
        gpuName=$(nvidia_gpu_name "$gpu")
        echo "mem${gpu}.info Memory information for $gpuName"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$(( gpu + 1 ))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${gpu}"
        gpu=$(( gpu + 1 ))
        # Separate entries with a comma, but not after the last one
        if [ "$gpu" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      gpu=0
      while [ "$gpu" -lt "$nGpus" ]; do
        gpuName=$(nvidia_gpu_name "$gpu")
        echo "fan${gpu}.info Fan information for $gpuName"
        gpu=$(( gpu + 1 ))
      done
      ;;
    *)
      # Unknown symlink suffix: report on stderr so the text is not mistaken
      # for graph configuration.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac

  # Common stuff for all quantities: one label per GPU
  gpu=0
  while [ "$gpu" -lt "$nGpus" ]; do
    gpuName=$(nvidia_gpu_name "$gpu")
    echo "${name}${gpu}.label $gpuName"
    gpu=$(( gpu + 1 ))
  done

  exit 0
fi
|
||||
|
||||
# Get requested value: fill $valueGpus with one reading per line, line i+1
# belonging to GPU i. All readings are parsed from the cached "nvidia-smi -q"
# output in $smiOutput.
case "$name" in
  temp)
    valueGpus=$(printf '%s\n' "$smiOutput" | grep -A 1 "Temperature" | grep "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    totalMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    gpu=0
    while [ "$gpu" -lt "$nGpus" ]; do
      totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$(( gpu + 1 ))p")
      usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$(( gpu + 1 ))p")
      # Only do the arithmetic on plain non-negative integers with a non-zero
      # total; anything else (missing line, "N/A", 0 total) would abort the
      # shell with an arithmetic error, so it is reported as unknown ("U").
      percentMemUsed=U
      case "${totalMemGpu}:${usedMemGpu}" in
        :* | *: | *[!0-9:]*) ;;
        *)
          if [ "$totalMemGpu" -gt 0 ]; then
            percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu ))
          fi
          ;;
      esac
      # Append with a real newline: a literal "\n" is only turned into a line
      # break by shells whose echo interprets backslash escapes.
      valueGpus="${valueGpus}${percentMemUsed}
"
      gpu=$(( gpu + 1 ))
    done
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # Unknown symlink suffix: report on stderr and fail.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac
|
||||
|
||||
|
||||
# Print requested value: line i+1 of $valueGpus is the reading for GPU i.
# printf '%b' is used instead of echo so that "\n" separators inside
# $valueGpus become line breaks regardless of which shell provides /bin/sh
# (echo only interprets them in some shells). A missing reading is printed as
# "U", Munin's marker for an unknown value.
gpu=0
while [ "$gpu" -lt "${nGpus:-0}" ]; do
  value=$(printf '%b\n' "$valueGpus" | sed -n "$(( gpu + 1 ))p")
  echo "${name}${gpu}.value ${value:-U}"
  gpu=$(( gpu + 1 ))
done
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user