#!/usr/bin/perl -w
# -*- perl -*-
#
# Script to monitor NVIDIA Graphics Card.
#
# This is a Munin wildcard plugin: the GPU to query is taken from the
# trailing digits of the name the script is invoked under, e.g.
# "nvidia_smi_gpu0" queries GPU 0.
#
# Parameters understood:
#
#     config   (required)
#     autoconf (optional - used by munin-config)
#     suggest  (optional - used by munin-config)
#
# Environment:
#
#     nvidia_smi  Path to the nvidia-smi binary
#                 (default: /usr/bin/nvidia-smi)
#
# Version 1.1
#     Now works with NVidia >=270.18 Driver
# Version 1.0
#     Initial Release. Nvidia 260.xx Driver
#
# Magic markers (optional - used by munin-config and installation
# scripts):
#%# family=auto
#%# capabilities=autoconf suggest

use strict;
use XML::Simple;

my $nvidia_smi = $ENV{nvidia_smi} || "/usr/bin/nvidia-smi";
my $mode       = defined $ARGV[0] ? $ARGV[0] : '';

# Walk a nested hash along @path and return the plain scalar found there.
# Returns undef if any step is missing, if an intermediate node is not a
# hash, or if the leaf is itself a reference. Unlike chained "exists"
# tests, this neither autovivifies nor dies under "strict refs".
sub _scalar_at {
    my ($node, @path) = @_;
    for my $key (@path) {
        return undef unless ref $node eq 'HASH' && exists $node->{$key};
        $node = $node->{$key};
    }
    return undef if !defined $node || ref $node;
    return $node;
}

# Extract the leading number from a sensor reading such as "45", "45%",
# "45 %" or "45 C". Returns undef for readings without a leading number
# (e.g. "N/A"), so that such sensors are left out of the output entirely.
sub _number {
    my ($text) = @_;
    return undef unless defined $text;
    # Use the match result directly: after a failed match $1 would still
    # hold the capture of an earlier successful match.
    if ($text =~ /^\s*(-?\d+(?:\.\d+)?)/) {
        return $1;
    }
    return undef;
}

## Munin autoconf method.
if ($mode eq 'autoconf') {
    if (! -e $nvidia_smi) {
        print "no (file $nvidia_smi does not exists)\n";
        exit 0;
    }

    # Now see if "nvidia-smi" can run
    if (! -x $nvidia_smi) {
        print "no (file $nvidia_smi exists, but not executable)\n";
        exit 0;
    }

    my @gpu_lines = grep { /GPU/ } `$nvidia_smi -L 2>/dev/null`;
    if (!@gpu_lines) {
        print "no (No GPUs found. Check '$nvidia_smi -L' output)\n";
        exit 0;
    }

    print "yes\n";
    exit 0;
}

## Munin suggest method.
# Prints one suggestion per listed GPU, e.g. the line "GPU 0: <name>"
# from "nvidia-smi -L" becomes "gpu0".
if ($mode eq 'suggest') {
    for my $line (`$nvidia_smi -L 2>/dev/null`) {
        next unless $line =~ /^GPU/;
        chomp $line;
        my ($name) = split /:/, $line, 2;   # text before the first colon
        $name =~ s/ //g;
        print lc($name), "\n";
    }
    exit 0;
}

# The GPU id comes from the name this plugin was invoked under. Only digits
# are accepted because the id is interpolated into a shell command below.
my ($gpu_id) = $0 =~ /nvidia_smi_gpu(\d+)$/;
exit 2 unless defined $gpu_id;

# Get XML with sensor values for the GPU with particular ID
# Need 2>/dev/null to filter out nvmlSystemGetPersistenceMode useless error message.
my $data = `$nvidia_smi -q -g $gpu_id -x 2>/dev/null`
    or die "Could not run $nvidia_smi: $!\n";

# Parse XML into an easily accessible hash-tree
my $ref = XMLin($data);

# Where each sensor lives below the <gpu> element of the XML report.
my %path_of = (
    temp => [qw(temperature gpu_temp)],
    fan  => [qw(fan_speed)],
    util => [qw(utilization gpu_util)],
    mem  => [qw(utilization memory_util)],
);

# Will contain values cleaned from percent and Celsius signs. A key is only
# present when the sensor reported a usable number.
my %gpu = ();
for my $sensor (keys %path_of) {
    my $raw   = _scalar_at($ref, 'gpu', @{ $path_of{$sensor} });
    my $value = _number($raw);
    $gpu{$sensor} = $value if defined $value;
}

my $card_model     = _scalar_at($ref, 'gpu', 'product_name')      || "";
my $driver_version = _scalar_at($ref, 'driver_version')           || "";
my $busid          = _scalar_at($ref, 'gpu', 'pci', 'pci_bus_id') || "";

## Munin config method.
if ($mode eq 'config') {
    print "graph_title $card_model sensors\n";
    # A single graph_args line: emitting the directive twice means only one
    # of the two lines can take effect.
    print "graph_args --base 1000 --upper-limit 100 -l 0\n";
    print "graph_category sensors\n";
    print "graph_vlabel % or C\n";
    print "graph_info This graph shows information about your $card_model graphics card running driver version $driver_version and sitting on busID $busid.\n";

    if (exists $gpu{temp}) {
        print "gpu_temp.label GPU Temperature (C)\n";
        print "gpu_temp.info GPU temperature sensor\n";
        print "gpu_temp.draw LINE2\n";
        print "gpu_temp.warning :80\n";
        print "gpu_temp.critical :100\n";
    }

    if (exists $gpu{mem}) {
        print "gpu_mem.label Memory consumption (%)\n";
        print "gpu_mem.info How much of on-board memory is used\n";
        print "gpu_mem.draw LINE2\n";
        print "gpu_mem.warning :85\n";
        print "gpu_mem.critical :95\n";
    }

    if (exists $gpu{util}) {
        print "gpu_util.label GPU Utilization (%)\n";
        print "gpu_util.info How much computational resourses are used\n";
        print "gpu_util.draw LINE2\n";
    }

    if (exists $gpu{fan}) {
        print "gpu_fan.label Fan Speed (%)\n";
        print "gpu_fan.info Fan RPM in precent of maximum\n";
        print "gpu_fan.draw LINE2\n";
        print "gpu_fan.warning :80\n";
        print "gpu_fan.critical :95\n";
    }

    exit 0;
}

## Munin fetch (default) method: one value line per available sensor.
print "gpu_temp.value ", $gpu{temp}, "\n" if exists $gpu{temp};
print "gpu_mem.value ",  $gpu{mem},  "\n" if exists $gpu{mem};
print "gpu_util.value ", $gpu{util}, "\n" if exists $gpu{util};
print "gpu_fan.value ",  $gpu{fan},  "\n" if exists $gpu{fan};