#!/bin/bash
# -*- sh -*-

: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The part of the symlink name that follows
"nvidia_gpu_" selects the value to monitor (temp, mem, fan, power or
utilization).

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of nvidia-smi executable.
  env.warning - Warning temperature
  env.critical - Critical temperature

=head2 DEFAULT CONFIGURATION

The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
assume warning and critical temperatures of 75 and 95 degrees celsius,
respectively.

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.

=head1 TODO

=over 4

=item *

Add support for specific professional GPU features such as number of compute
processes, clocks and so on.

=item *

Use multigraphs for multiple GPUs
(http://munin-monitoring.org/wiki/MultigraphSampleOutput).

=back

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut

# Name of the quantity to monitor: the symlink name minus the plugin prefix.
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable (env.smiexec), or the default.
nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}

# autoconf: answer "yes" only if nvidia-smi exists and is executable.
if [ "$1" = "autoconf" ]; then
    if [ -x "$nvSmiExec" ]; then
        echo yes
    else
        echo "no (nvidia-smi executable not found)"
    fi
    exit 0
fi

# suggest: list the wildcard suffixes this plugin understands.
if [ "$1" = "suggest" ]; then
    echo "temp"
    echo "mem"
    echo "fan"
    echo "power"
    echo "utilization"
    exit 0
fi

# Everything below needs a valid quantity, so reject a bad symlink name once,
# up front. Diagnostics go to stderr so that they are not mistaken for plugin
# output on stdout.
case $name in
    temp | mem | fan | power | utilization) ;;
    *)
        echo "Can't run without a proper symlink. Exiting." >&2
        echo "Try running munin-node-configure --suggest." >&2
        exit 1
        ;;
esac

# Print line number $2 (1-based) of the multi-line string $1.
nth_line() {
    printf '%s\n' "$1" | sed -n "${2}p"
}

# Succeed if $1 is a non-empty string consisting of decimal digits only.
is_uint() {
    case $1 in
        '' | *[!0-9]*) return 1 ;;
    esac
    return 0
}

# Print "<field>.info <text> <GPU name>" for every detected GPU.
#   $1 - descriptive text placed before the GPU name
print_gpu_info() {
    local i
    for ((i = 0; i < nGpus; i++)); do
        echo "${name}${i}.info $1 ${gpuNames[i]}"
    done
}

# Enumerate GPUs. Only lines of the form "GPU <n>: ..." are counted, so empty
# output or an error message from nvidia-smi yields zero GPUs (piping the
# output through "wc -l" would report at least one line even when empty).
# The GPU name is the text before the first "(".
nGpusOutput=$("$nvSmiExec" -L)
gpuNames=()
nGpus=0
while IFS= read -r gpuLine; do
    case $gpuLine in
        "GPU "[0-9]*)
            gpuNames[nGpus]=${gpuLine%%\(*}
            nGpus=$((nGpus + 1))
            ;;
    esac
done <<< "$nGpusOutput"

if [ "$nGpus" -eq 0 ]; then
    echo "No NVIDIA GPUs detected. Exiting." >&2
    exit 1
fi

# Full query output from nvidia-smi; all values below are parsed from it.
smiOutput=$("$nvSmiExec" -q)

# config: describe the graph and its data series.
if [ "$1" = "config" ]; then

    # Driver version, taken from the query output already fetched with the
    # configured executable.
    driverVersion=$(printf '%s\n' "$smiOutput" | grep "Driver Version" \
        | head -n 1 | cut -d : -f 2 | tr -d ' ')

    case $name in
        temp)
            echo 'graph_title GPU temperature'
            echo 'graph_args -l 0 -u 120'
            echo 'graph_vlabel degrees Celsius'
            echo 'graph_category sensors'
            echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
            for ((i = 0; i < nGpus; i++)); do
                echo "${name}${i}.warning ${warning:-75}"
                echo "${name}${i}.critical ${critical:-95}"
                echo "${name}${i}.info Temperature information for ${gpuNames[i]}"
            done
            ;;
        mem)
            # Collect the total memory of each GPU for the graph description.
            gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" | grep -v BAR1 \
                | grep -A 3 "Memory Usage" | grep "Total" \
                | cut -d : -f 2 | tr -d ' ')
            gpusTotalMem=''
            for ((i = 0; i < nGpus; i++)); do
                echo "${name}${i}.info Memory information for ${gpuNames[i]}"
                gpuMem=$(nth_line "$gpusTotalMemOutput" $((i + 1)))
                gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${i}"
                if [ $((i + 1)) -lt "$nGpus" ]; then
                    gpusTotalMem="${gpusTotalMem}, "
                fi
            done
            echo 'graph_title GPU memory usage'
            echo 'graph_args -l 0 -u 100'
            echo 'graph_vlabel %'
            echo 'graph_category memory'
            echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
            ;;
        fan)
            echo 'graph_title GPU fan speed'
            echo 'graph_args -l 0 -u 100'
            echo 'graph_vlabel %'
            echo 'graph_category sensors'
            echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
            print_gpu_info "Fan information for"
            ;;
        power)
            echo 'graph_title GPU power consumption'
            echo 'graph_vlabel Watt'
            echo 'graph_category sensors'
            echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
            print_gpu_info "power consumption of"
            ;;
        utilization)
            echo 'graph_title GPU utilization'
            echo 'graph_args -l 0 -u 100'
            echo 'graph_vlabel %'
            echo 'graph_category system'
            echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
            print_gpu_info "GPU utilization information for"
            ;;
    esac

    # Labels are common to all quantities.
    for ((i = 0; i < nGpus; i++)); do
        echo "${name}${i}.label ${gpuNames[i]}"
    done

    exit 0
fi

# fetch: extract one reading per GPU, one per line, in GPU order.
case $name in
    temp)
        valueGpus=$(printf '%s\n' "$smiOutput" | grep -A 1 "Temperature" \
            | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
        ;;
    mem)
        totalMemGpus=$(printf '%s\n' "$smiOutput" | grep -v BAR1 \
            | grep -A 3 "Memory Usage" | grep "Total" \
            | cut -d : -f 2 | cut -d ' ' -f 2)
        usedMemGpus=$(printf '%s\n' "$smiOutput" | grep -v BAR1 \
            | grep -A 3 "Memory Usage" | grep "Used" \
            | cut -d : -f 2 | cut -d ' ' -f 2)
        valueGpus=''
        for ((i = 0; i < nGpus; i++)); do
            totalMemGpu=$(nth_line "$totalMemGpus" $((i + 1)))
            usedMemGpu=$(nth_line "$usedMemGpus" $((i + 1)))
            # Only divide when both figures are plain integers and the total
            # is non-zero; otherwise the reading is unknown.
            if is_uint "$totalMemGpu" && is_uint "$usedMemGpu" \
                && [ "$totalMemGpu" -gt 0 ]; then
                percentMemUsed=$((usedMemGpu * 100 / totalMemGpu))
            else
                percentMemUsed=U
            fi
            valueGpus="${valueGpus}${percentMemUsed}"$'\n'
        done
        ;;
    fan)
        valueGpus=$(printf '%s\n' "$smiOutput" | grep "Fan Speed" \
            | cut -d ':' -f 2 | cut -d ' ' -f 2)
        ;;
    power)
        valueGpus=$(printf '%s\n' "$smiOutput" | grep "Power Draw" \
            | cut -d ':' -f 2 | cut -d ' ' -f 2)
        ;;
    utilization)
        valueGpus=$(printf '%s\n' "$smiOutput" | grep "Gpu" \
            | cut -d ':' -f 2 | cut -d ' ' -f 2)
        ;;
esac

# Print one value per GPU. Anything that is not a plain number (a missing
# line, "N/A", ...) is reported as "U", Munin's marker for an unknown value.
for ((i = 0; i < nGpus; i++)); do
    value=$(nth_line "$valueGpus" $((i + 1)))
    case $value in
        '' | *[!0-9.]*) value=U ;;
    esac
    echo "${name}${i}.value $value"
done

exit 0