#!/usr/bin/perl
#
# Copyright and BSD license
#
# Copyright (c) 2011 NVIDIA Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms are permitted
# provided that the above copyright notice and this paragraph are
# duplicated in all such forms and that any documentation,
# advertising materials, and other materials related to such
# distribution and use acknowledge that the software was developed
# by NVIDIA Corporation. The name of the NVIDIA Corporation may not be
# used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
# WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
#
# This script collects GPU information for use as a munin plugin
# Inspired by Matthew Ritchie and Vadim Bulakh's nvidia_smi_ plugin
#
#
# This requires the NVML bindings and NVIDIA driver >= R270
# $ sudo cpan install nvidia::ml
# http://search.cpan.org/~nvbinding/nvidia-ml-pl/lib/nvidia/ml.pm
#
# Usage:
#   <plugin>            print current values for every GPU ("normal" mode)
#   <plugin> normal     same as above
#   <plugin> config     print the munin graph and field definitions
#   <plugin> autoconf   print "yes" or "no (reason)" and exit 0
#
# For each GPU index N the fields GPU_TEMP_N, GPU_FANSPEED_N, GPU_UTIL_N
# and GPU_MEM_UTIL_N are reported on a single shared graph.
#

use strict;
use warnings;
use nvidia::ml qw(:all);

# Value munin accepts for a field whose reading is not available.
my $UNKNOWN = "U";

# Set once nvmlInit() has succeeded so the END block knows whether the
# library has to be shut down again, whichever exit path is taken.
my $nvmlReady = 0;

END {
    nvmlShutdown() if $nvmlReady;
}

# Report a fatal start-up problem and terminate.
# In autoconf mode munin expects "no (reason)" on stdout and a zero exit
# status; in every other mode the reason goes to stderr and the exit
# status is 1.
sub fail {
    my ($mode, $reason) = @_;

    if ($mode eq "autoconf") {
        print "no ($reason)\n";
        exit(0);
    }

    print STDERR "$reason\n";
    exit(1);
}

my $runType  = "normal";
my @runTypes = qw( normal config autoconf );

# A single optional argument selects the run type; anything else is rejected.
if (@ARGV == 1) {
    if (grep { $_ eq $ARGV[0] } @runTypes) {
        $runType = $ARGV[0];
    }
    else {
        print "Invalid argument: $ARGV[0].\n";
        print "Valid arguments: @runTypes.\n";
        exit(1);
    }
}

my $ret = nvmlInit();
fail($runType, "NVML initialization failed") unless $ret == $NVML_SUCCESS;
$nvmlReady = 1;

($ret, my $gpuCount) = nvmlDeviceGetCount();
fail($runType, "unable to query the number of GPUs")
    unless $ret == $NVML_SUCCESS;

# autoconf: answer "yes" as soon as one GPU handle can be obtained,
# otherwise give an explicit "no" instead of printing nothing.
if ($runType eq "autoconf") {
    for my $i (0 .. $gpuCount - 1) {
        my ($rc) = nvmlDeviceGetHandleByIndex($i);
        if ($rc == $NVML_SUCCESS) {
            print "yes\n";
            exit(0);
        }
    }
    print "no (no usable NVIDIA GPU found)\n";
    exit(0);
}

# config: the graph-level settings are printed exactly once, before the
# per-GPU loop, so they are not lost when the first GPU cannot be opened.
if ($runType eq "config") {
    ($ret, my $driverVersion) = nvmlSystemGetDriverVersion();
    $driverVersion = "Unknown" if $ret != $NVML_SUCCESS;

    print "graph_title GPU\n";
    print "graph_args --upper-limit 120 -l 0\n";
    print "graph_vlabel Percent or Degrees C\n";
    print "graph_category sensors\n";
    print "graph_info Information for NVIDIA GPUs using driver version $driverVersion\n";
}

for my $i (0 .. $gpuCount - 1) {
    ($ret, my $handle) = nvmlDeviceGetHandleByIndex($i);
    next if $ret != $NVML_SUCCESS;

    if ($runType eq "config") {
        # The PCI bus id is used as the human-readable GPU name in labels.
        ($ret, my $pciInfo) = nvmlDeviceGetPciInfo($handle);
        my $gpuName = "Unknown";
        if ($ret == $NVML_SUCCESS && defined $pciInfo->{'busId'}) {
            $gpuName = $pciInfo->{'busId'};
        }

        # metrics are collected for all the GPUs to a single graph
        print "GPU_UTIL_$i.label GPU$i - $gpuName : GPU utilization\n";
        print "GPU_FANSPEED_$i.label GPU$i - $gpuName : fan speed\n";
        print "GPU_MEM_UTIL_$i.label GPU$i - $gpuName : GPU memory utilization\n";
        print "GPU_TEMP_$i.label GPU$i - $gpuName : GPU temperature\n";
        next;
    }

    # normal mode: read each metric, falling back to the unknown marker
    # for anything the device or driver cannot report.
    ($ret, my $gpuTemp) = nvmlDeviceGetTemperature($handle, $NVML_TEMPERATURE_GPU);
    $gpuTemp = $UNKNOWN if $ret != $NVML_SUCCESS;

    ($ret, my $gpuFanSpeed) = nvmlDeviceGetFanSpeed($handle);
    $gpuFanSpeed = $UNKNOWN if $ret != $NVML_SUCCESS;

    my $gpuUtil = $UNKNOWN;
    my $memUtil = $UNKNOWN;

    ($ret, my $utilRates) = nvmlDeviceGetUtilizationRates($handle);
    if ($ret == $NVML_SUCCESS) {
        $gpuUtil = $utilRates->{'gpu'};
        $memUtil = $utilRates->{'memory'};
    }
    else {
        # Utilization rates are unavailable: derive a memory figure from
        # the used/total memory counters instead.
        ($ret, my $memory) = nvmlDeviceGetMemoryInfo($handle);
        if ($ret == $NVML_SUCCESS && $memory->{"total"}) {
            $memUtil = $memory->{"used"} / $memory->{"total"} * 100;
        }
    }

    print "GPU_TEMP_$i.value $gpuTemp\n";
    print "GPU_FANSPEED_$i.value $gpuFanSpeed\n";
    print "GPU_UTIL_$i.value $gpuUtil\n";
    print "GPU_MEM_UTIL_$i.value $memUtil\n";
}