2012-12-11 20:57:06 +01:00
|
|
|
#!/bin/bash
|
|
|
|
# -*- bash -*-
|
|
|
|
|
|
|
|
: << =cut
|
|
|
|
|
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility,
|
|
|
|
usually bundled with AMD GPU driver, to obtain information. To use this
|
|
|
|
plugin you have to make sure aticonfig will run without an active X
|
2018-08-02 02:03:42 +02:00
|
|
|
server (i.e. without anyone being logged in via the GUI). For more
|
|
|
|
information about this issue visit the link below:
|
2012-12-11 20:57:06 +01:00
|
|
|
http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/
|
|
|
|
|
|
|
|
=head1 CONFIGURATION
|
|
|
|
|
2018-08-02 02:03:42 +02:00
|
|
|
This is a wildcard plugin. The part of the symlink name that follows the
C<amd_gpu_> prefix selects the value to monitor (temp, clocks, fan, load or vcore).
|
|
|
|
|
|
|
|
This plugin uses the following configuration variables:
|
|
|
|
|
|
|
|
[amd_gpu_*]
|
|
|
|
user root
|
2013-11-12 12:29:12 +01:00
|
|
|
env.aticonfexec - Location of aticonfig executable.
|
|
|
|
env.warning - Warning temperature
|
|
|
|
env.critical - Critical temperature
|
2012-12-11 20:57:06 +01:00
|
|
|
|
|
|
|
=head2 DEFAULT CONFIGURATION
|
|
|
|
|
2013-11-12 14:47:45 +01:00
|
|
|
The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig and
|
|
|
|
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.
|
2012-12-11 20:57:06 +01:00
|
|
|
|
|
|
|
=head2 EXAMPLE WILDCARD USAGE
|
|
|
|
|
|
|
|
C<ln -s /usr/share/munin/plugins/amd_gpu_ /etc/munin/plugins/amd_gpu_temp>
|
|
|
|
|
|
|
|
...will monitor the temperature of available AMD GPUs.
|
|
|
|
|
2013-11-12 14:47:45 +01:00
|
|
|
=head1 TODO
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
|
|
|
|
=item *
|
|
|
|
|
|
|
|
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).
|
|
|
|
|
|
|
|
=back
|
|
|
|
|
2012-12-11 20:57:06 +01:00
|
|
|
=head1 AUTHOR
|
|
|
|
|
|
|
|
Nuno Fachada
|
|
|
|
faken@fakenmc.com
|
|
|
|
|
|
|
|
=head1 LICENSE
|
|
|
|
|
|
|
|
GNU General Public License, version 2
|
2018-08-02 02:03:42 +02:00
|
|
|
http://www.gnu.org/licenses/gpl-2.0.html
|
2012-12-11 20:57:06 +01:00
|
|
|
|
|
|
|
=head1 MAGIC MARKERS
|
|
|
|
|
|
|
|
#%# family=auto
|
|
|
|
#%# capabilities=autoconf suggest
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
# Determine name of parameter to monitor: the basename of the invoked
# (sym)link with the leading "amd_gpu_" removed. Parameter expansion is
# used instead of `basename $0 | sed` so paths containing whitespace are
# handled and no subprocesses are spawned.
name=${0##*/}
name=${name#amd_gpu_}

# Get location of aticonfig executable (env.aticonfexec) or use default
atiConfigExec=${aticonfexec:-/usr/bin/aticonfig}
|
|
|
|
|
|
|
|
# Print the autoconf verdict: "yes" only if the file named by
# $atiConfigExec exists and is executable, otherwise "no (<reason>)".
autoconf_reply() {
    if [ -x "$atiConfigExec" ]; then
        echo yes
    else
        echo "no (aticonfig executable not found)"
    fi
}

# Check if autoconf was requested; this always exits 0, whatever the verdict.
if [ "$1" = "autoconf" ]; then
    autoconf_reply
    exit 0
fi
|
|
|
|
|
|
|
|
# Check if suggest was requested: list every wildcard suffix this plugin
# understands, one per line, then stop.
if [ "$1" = "suggest" ]; then
    printf '%s\n' temp clocks fan load vcore
    exit 0
fi
|
|
|
|
|
|
|
|
# Get number of GPUs.
# The adapter listing is cached in nGpusOutput because the config section
# later parses GPU names out of it.
nGpusOutput=$("$atiConfigExec" --list-adapters)

nGpus=$(printf '%s\n' "$nGpusOutput" | wc -l)
nGpus=$((nGpus - 2)) # Last two lines don't matter
if [ "$nGpus" -le 0 ]; then
    # Exit if no GPUs found. "-le" rather than "-eq": when aticonfig fails or
    # prints nothing the count above becomes negative, which must also abort
    # instead of silently producing empty output.
    echo "No AMD GPUs detected. Exiting." >&2
    exit 1
fi
|
|
|
|
|
|
|
|
# Print the name of adapter number $1, parsed from the cached
# `aticonfig --list-adapters` output held in $nGpusOutput.
# The adapter's line is selected by its " <n>. " index; the third
# "."-separated field is kept and its leading "<digits> " is stripped.
gpu_name() {
    printf '%s\n' "$nGpusOutput" \
        | grep " $1\. " \
        | cut -f 3 -d "." \
        | sed -r 's/^[0-9]+ //'
}

# Print the largest number found on the "Peak Range" line(s) of
# `aticonfig --odgc` (0 if none). Lines are read one by one: the previous
# `read -a array <<< \`...\`` only sees the first line on bash >= 4.4,
# because here-strings are no longer word-split there.
# NOTE(review): DISPLAY=:0 is set for this call to match the fetch path,
# which exports it before running the same --odgc query - confirm.
max_peak_clock() {
    local peak=0 mhz
    while read -r mhz; do
        if [ "$mhz" -gt "$peak" ]; then
            peak=$mhz
        fi
    done < <(DISPLAY=:0 "$atiConfigExec" --odgc | grep "Peak Range" | grep -o "[0-9]*")
    echo "$peak"
}

# Check if config was requested
if [ "$1" = "config" ]; then

    # Configure the graph depending on which quantity will be plotted
    case "$name" in
        temp)
            echo 'graph_title GPU temperature'
            echo 'graph_args -l 20 -u 120'
            echo 'graph_vlabel Degrees (C)'
            echo 'graph_category sensors'
            echo "graph_info Temperature information for AMD GPUs"
            for ((gpu = 0; gpu < nGpus; gpu++)); do
                gpuName=$(gpu_name "$gpu")
                # Thresholds come from env.warning / env.critical, else 75 / 95
                echo "temp${gpu}.warning ${warning:-75}"
                echo "temp${gpu}.critical ${critical:-95}"
                echo "temp${gpu}.info Temperature information for $gpuName"
                echo "temp${gpu}.label Temperature ($gpuName)"
            done
            ;;
        clocks)
            # First determine max clock for each GPU...
            maxclock=$(max_peak_clock)
            # ...then output config data.
            echo 'graph_title GPU clock'
            echo "graph_args -l 0 -u $maxclock"
            echo 'graph_vlabel MHz'
            echo 'graph_category htc'
            echo "graph_info Core and memory clock info for AMD GPUs"
            for ((gpu = 0; gpu < nGpus; gpu++)); do
                gpuName=$(gpu_name "$gpu")
                echo "memclock${gpu}.info Memory clock information for $gpuName"
                echo "memclock${gpu}.label Memory clock ($gpuName)"
                echo "coreclock${gpu}.info Core clock information for $gpuName"
                echo "coreclock${gpu}.label Core clock ($gpuName)"
            done
            ;;
        fan)
            echo 'graph_title GPU fan speed'
            echo 'graph_args -l 0 -u 100'
            echo 'graph_vlabel Percentage'
            echo 'graph_category sensors'
            echo "graph_info Fan speed of AMD GPUs"
            for ((gpu = 0; gpu < nGpus; gpu++)); do
                gpuName=$(gpu_name "$gpu")
                echo "fan${gpu}.info Fan speed information for $gpuName"
                echo "fan${gpu}.label Fan speed ($gpuName)"
            done
            ;;
        load)
            echo 'graph_title GPU load'
            echo 'graph_args -l 0 -u 100'
            echo 'graph_vlabel Percentage'
            echo 'graph_category htc'
            echo "graph_info GPU load"
            for ((gpu = 0; gpu < nGpus; gpu++)); do
                gpuName=$(gpu_name "$gpu")
                echo "load${gpu}.info Load information for $gpuName"
                echo "load${gpu}.label Load ($gpuName)"
            done
            ;;
        vcore)
            echo 'graph_title GPU core voltage'
            echo 'graph_vlabel mV'
            echo 'graph_category sensors'
            echo "graph_info GPU core voltage"
            for ((gpu = 0; gpu < nGpus; gpu++)); do
                gpuName=$(gpu_name "$gpu")
                echo "vcore${gpu}.info Vcore information for $gpuName"
                echo "vcore${gpu}.label Core voltage ($gpuName)"
            done
            ;;
        *)
            # Unknown suffix: diagnostics go to stderr so they are not
            # mistaken for config output.
            echo "Can't run without a proper symlink. Exiting." >&2
            echo "Try running munin-node-configure --suggest." >&2
            exit 1
            ;;
    esac

    exit 0
fi
|
|
|
|
|
|
|
|
# Print "<field>.value <reading>" for field $1 and reading $2. An empty
# reading (aticonfig failed or its output did not match) is reported as
# "U", Munin's marker for an unknown value, instead of a blank value.
emit_value() {
    echo "$1.value ${2:-U}"
}

# Query aticonfig for adapter number $1 and print the series selected by
# $name. Exits with status 1 if $name is not a supported suffix.
print_gpu_value() {
    local gpu=$1 reading
    case "$name" in
        temp)
            reading=$("$atiConfigExec" --adapter="$gpu" --odgt | grep "Sensor 0: Temperature" | grep -o "[0-9]*\.[0-9]*")
            emit_value "temp${gpu}" "$reading"
            ;;
        clocks)
            # The "Current Clocks" line carries two numbers: core first, memory second.
            reading=$("$atiConfigExec" --adapter="$gpu" --odgc | grep "Current Clocks" | grep -o "[0-9]*")
            emit_value "coreclock${gpu}" "$(printf '%s\n' "$reading" | sed -n 1p)"
            emit_value "memclock${gpu}" "$(printf '%s\n' "$reading" | sed -n 2p)"
            ;;
        fan)
            reading=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get fanspeed 0" | grep "Fan Speed" | grep -o "[0-9]*")
            emit_value "fan${gpu}" "$reading"
            ;;
        load)
            reading=$("$atiConfigExec" --adapter="$gpu" --odgc | grep "GPU load" | grep -o "[0-9]*")
            emit_value "load${gpu}" "$reading"
            ;;
        vcore)
            reading=$("$atiConfigExec" --adapter="$gpu" --pplib-cmd "get activity" | grep "VDDC" | grep -o "[0-9]*")
            emit_value "vcore${gpu}" "$reading"
            ;;
        *)
            # Diagnostics go to stderr so they never end up in the fetched data.
            echo "Can't run without a proper symlink. Exiting." >&2
            echo "Try running munin-node-configure --suggest." >&2
            exit 1
            ;;
    esac
}

# Get and print requested value for all available GPUs
export DISPLAY=:0
for ((gpu = 0; gpu < nGpus; gpu++)); do
    print_gpu_value "$gpu"
done
|
|
|
|
|
|
|
|
|