#!/bin/bash # -*- sh -*- : << =cut =head1 NAME nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, usually bundled with NVIDIA GPU driver, to obtain information. =head1 CONFIGURATION This is a wildcard plugin. The wildcard prefix link name should be the value to monitor. This plugin uses the following configuration variables: [nvidia_gpu_*] env.smiexec - Location of nvidia-smi executable. env.warning - Warning temperature env.critical - Critical temperature =head2 DEFAULT CONFIGURATION The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and assume warning and critical temperatures of 75 and 95 degrees celsius, respectively. =head2 EXAMPLE WILDCARD USAGE C ...will monitor the temperature of available GPUs. =head1 TODO =over 4 =item * Add support for specific professional GPU features such as number of compute processes, clocks, power draw, utilization, and so on. =item * Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). =back =head1 AUTHOR Nuno Fachada faken@fakenmc.com =head1 LICENSE GNU General Public License, version 2 http://www.gnu.org/licenses/gpl-2.0.html =head1 MAGIC MARKERS #%# family=auto #%# capabilities=autoconf suggest =cut # Determine name of parameter to monitor name=`basename $0 | sed 's/^nvidia_gpu_//g'` # Get location of nvidia-smi executable or use default nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'} # Check if autoconf was requested if [ "$1" = "autoconf" ]; then # Autoconf only returns yes if nvidia-smi exists and is executable if [ -x $nvSmiExec ]; then echo yes exit 0 else echo "no (nvidia-smi executable not found)" exit 0 fi fi # Check if suggest was requested if [ "$1" = "suggest" ]; then echo "temp" echo "mem" echo "fan" exit 0 fi # Get number of GPUs nGpusOutput=`$nvSmiExec -L` nGpus=`echo "$nGpusOutput" | wc -l` if [ $nGpus -eq 0 ]; then # Exit if no GPUs found echo "No NVIDIA GPUs detected. Exiting." exit 1 fi # Get full output from nvidia-smi smiOutput=`$nvSmiExec -q` # Check if config was requested if [ "$1" = "config" ]; then # Get driver version driverVersion=`nvidia-smi -q | grep "Driver Version" | cut -d : -f 2 | tr -d ' '` # Configure graph depending on what which quantity will be plotted case $name in temp) echo 'graph_title GPU temperature' echo 'graph_args -l 0 -u 120' echo 'graph_vlabel Degrees (C)' echo 'graph_category sensors' echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion" nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` echo "temp${nGpusCounter}.warning ${warning:-75}" echo "temp${nGpusCounter}.critical ${critical:-95}" echo "temp${nGpusCounter}.info Temperature information for $gpuName" : $(( nGpusCounter = $nGpusCounter + 1 )) done ;; mem) # First determine total memory of each GPU... gpusTotalMemOutput=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' '` gpusTotalMem='' nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` echo "mem${nGpusCounter}.info Memory information for $gpuName" gpuMem=`echo "$gpusTotalMemOutput"| sed -n $(( $nGpusCounter + 1 ))p` gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}" : $(( nGpusCounter = $nGpusCounter + 1 )) if [ $nGpusCounter -lt $nGpus ]; then gpusTotalMem="${gpusTotalMem}, " fi done # ...then output config data. echo 'graph_title GPU memory usage' echo 'graph_args -l 0 -u 100' echo 'graph_vlabel Percentage' echo 'graph_category memory' echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)" ;; fan) echo 'graph_title GPU fan speed' echo 'graph_args -l 0 -u 100' echo 'graph_vlabel Percentage' echo 'graph_category sensors' echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion" nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` echo "fan${nGpusCounter}.info Fan information for $gpuName" : $(( nGpusCounter = $nGpusCounter + 1 )) done ;; *) echo "Can't run without a proper symlink. Exiting." echo "Try running munin-node-configure --suggest." exit 1 ;; esac # Common stuff for all quantities nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` echo "${name}${nGpusCounter}.label $gpuName" : $(( nGpusCounter = $nGpusCounter + 1 )) #print_warning $name #print_critical $name done exit 0 fi # Get requested value case $name in temp) valueGpus=`echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2` ;; mem) totalMemGpus=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2` usedMemGpus=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2` valueGpus='' nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do totalMemGpu=`echo "$totalMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` usedMemGpu=`echo "$usedMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu )) valueGpus="${valueGpus}${percentMemUsed}"$'\n' : $(( nGpusCounter = $nGpusCounter + 1 )) done ;; fan) valueGpus=`echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2` ;; *) echo "Can't run without a proper symlink. Exiting." echo "Try running munin-node-configure --suggest." exit 1 ;; esac # Print requested value nGpusCounter=0 while [ $nGpusCounter -lt $nGpus ] do value=`echo "$valueGpus" | sed -n $(( $nGpusCounter + 1 ))p` echo "${name}${nGpusCounter}.value $value" : $(( nGpusCounter = $nGpusCounter + 1 )) done