From 917d568a77b9e693a6d6a2c7fab57f957eff988d Mon Sep 17 00:00:00 2001
From: romner-set
Date: Mon, 15 May 2023 13:58:54 +0200
Subject: [PATCH] Add multi-GPU support for NVML data collection

---
 src/btop.cpp               |   4 +-
 src/btop_draw.cpp          |   6 +-
 src/btop_shared.hpp        |  18 +-
 src/linux/btop_collect.cpp | 374 +++++++++++++++++++------------------
 4 files changed, 204 insertions(+), 198 deletions(-)

diff --git a/src/btop.cpp b/src/btop.cpp
index ab5eafa..96e178d 100644
--- a/src/btop.cpp
+++ b/src/btop.cpp
@@ -532,12 +532,12 @@ namespace Runner {
 				if (Global::debug) debug_timer("gpu", collect_begin);

 				//? Start collect
-				auto gpu = Gpu::collect(conf.no_update);
+				auto gpus = Gpu::collect(conf.no_update);

 				if (Global::debug) debug_timer("gpu", draw_begin);

 				//? Draw box
-				if (not pause_output and Gpu::Nvml::initialized) output += Gpu::draw(gpu, conf.force_redraw, conf.no_update);
+				if (not pause_output and Gpu::Nvml::initialized) output += Gpu::draw(gpus, conf.force_redraw, conf.no_update);

 				if (Global::debug) debug_timer("gpu", draw_done);
 			}
diff --git a/src/btop_draw.cpp b/src/btop_draw.cpp
index 25224ac..ccbaec4 100644
--- a/src/btop_draw.cpp
+++ b/src/btop_draw.cpp
@@ -745,7 +745,7 @@ namespace Gpu {
 	Draw::Meter pwr_meter;
 	string box;

-	string draw(const gpu_info& gpu, bool force_redraw, bool data_same) {
+	string draw(const vector<gpu_info>& gpus, bool force_redraw, bool data_same) {
 		if (Runner::stopping) return "";
 		if (force_redraw) redraw = true;
 		bool show_temps = (Config::getB("check_temp"));
@@ -757,6 +757,7 @@
 		string out;
 		out.reserve(width * height);

+		const auto& gpu = gpus[0]; // TODO: multi-gpu support
 		//* Redraw elements not needed to be updated every cycle
 		if (redraw) {
@@ -1796,6 +1797,7 @@ namespace Draw {
 			using namespace Gpu;
 			width = Term::width;
 			height = max(Gpu::min_height, Cpu::shown ? Cpu::height : (int)ceil((double)Term::height * (trim(boxes) == "gpu" ? 100 : height_p) / 100));
+			height += height+Cpu::height == Term::height-1; //? Absorb the leftover row when the GPU and CPU boxes fall one line short of the terminal
 			x = 1;
 			y = 1 + Cpu::shown*Cpu::height;
 			box = createBox(x, y, width, height, Theme::c("cpu_box"), true, "gpu", "", 5); // TODO gpu_box
@@ -1809,7 +1811,7 @@
 			b_x = x + width - b_width - 1;
 			b_y = y + ceil((double)(height - 2) / 2) - ceil((double)(b_height/*+bproc_height*/) / 2) + 1;

-			box += createBox(b_x, b_y, b_width, b_height, "", false, gpu_name);
+			box += createBox(b_x, b_y, b_width, b_height, "", false, gpu_names[0]);
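+			//? NOTE: only the first device is labeled for now; draw() above likewise reads gpus[0]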

 			//? TODO: Processes box
 			/*bproc_x = x + width - bproc_width - 1;
diff --git a/src/btop_shared.hpp b/src/btop_shared.hpp
index 5aff032..b0f7dec 100644
--- a/src/btop_shared.hpp
+++ b/src/btop_shared.hpp
@@ -318,15 +318,15 @@ namespace Gpu {
 	extern string box;
 	extern int x, y, width, height, min_width, min_height;
 	extern bool shown, redraw;
-	extern string gpu_name;
+	extern vector<string> gpu_names;

 	const array mem_names { "used"s, "free"s };

-	//* Container for process information
-	struct proc_info {
+	//* Container for process information // TODO
+	/*struct proc_info {
 		unsigned int pid;
 		unsigned long long mem;
-	};
+	};*/

 	//* Per-device container for GPU info
 	struct gpu_info {
@@ -351,8 +351,8 @@
 		unsigned int pcie_tx = 0; // KB/s
 		unsigned int pcie_rx = 0;

-		vector<proc_info> graphics_processes = {};
-		vector<proc_info> compute_processes = {};
+		// vector<proc_info> graphics_processes = {}; // TODO
+		// vector<proc_info> compute_processes = {};
 	};

 	namespace Nvml {
@@ -361,8 +361,8 @@
 	}

 	//* Collect gpu stats and temperatures
-	auto collect(bool no_update = false) -> gpu_info&;
+	auto collect(bool no_update = false) -> vector<gpu_info>&;

-	//* Draw contents of gpu box using <gpu> as source
-	string draw(const gpu_info& gpu, bool force_redraw, bool data_same);
+	//* Draw contents of gpu box using <gpus> as source
+	string draw(const vector<gpu_info>& gpus, bool force_redraw, bool data_same);
 }
diff --git a/src/linux/btop_collect.cpp b/src/linux/btop_collect.cpp
index fcb1275..fb6e566 100644
--- a/src/linux/btop_collect.cpp
+++ b/src/linux/btop_collect.cpp
@@ -90,24 +90,24 @@ namespace Cpu {
 	unordered_flat_map<int, int> core_mapping;
 }

-namespace Mem {
-	double old_uptime;
-}
-
 namespace Gpu {
-	gpu_info current_gpu;
-	unsigned int device_count;
-	string gpu_name;
+	vector<gpu_info> gpus;
+	vector<string> gpu_names;

 	//? NVIDIA data collection
 	namespace Nvml {
 		bool initialized = false;
 		bool init();
 		bool shutdown();
-		nvmlDevice_t device;
+		vector<nvmlDevice_t> devices;
+		unsigned int device_count = 0;
 	}
 }

+namespace Mem {
+	double old_uptime;
+}
+
 namespace Shared {
 	fs::path procPath, passwd_path;
@@ -172,6 +172,7 @@
 	}

+
 }

 namespace Cpu {
@@ -825,6 +826,186 @@
 	}
 }

+namespace Gpu {
+	//? NVIDIA
+	namespace Nvml {
+		bool init() {
+			if (initialized) {return false;}
+
+			nvmlReturn_t result = nvmlInit();
+			if (result != NVML_SUCCESS) {
+				Logger::warning(std::string("Failed to initialize NVML, NVIDIA GPUs will not be detected: ") + nvmlErrorString(result));
+				return false;
+			}
+
+			//? Device count
+			result = nvmlDeviceGetCount(&device_count);
+			if (result != NVML_SUCCESS) {
+				Logger::error(std::string("NVML: Failed to get device count: ") + nvmlErrorString(result));
+				return false;
+			}
+
+			if (device_count > 0) {
+				devices.resize(device_count);
+				gpus.resize(device_count);
+				gpu_names.resize(device_count);
+
+				for (unsigned int i = 0; i < device_count; ++i) {
+					//? Device Handle
+					result = nvmlDeviceGetHandleByIndex(i, devices.data() + i);
+					if (result != NVML_SUCCESS) {
+						Logger::error(std::string("NVML: Failed to get device handle: ") + nvmlErrorString(result));
+						return false;
+					}
+
+					initialized = true;
+
+					//? Device name
+					char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+					result = nvmlDeviceGetName(devices[i], name, NVML_DEVICE_NAME_BUFFER_SIZE);
+					if (result != NVML_SUCCESS) {
+						Logger::error(std::string("NVML: Failed to get device name: ") + nvmlErrorString(result));
+					} else {gpu_names[i] = string(name);}
+
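+					//? The queries below are static per-device limits, fetched once here instead of on every collect() cycle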
+					//? Power usage
+					result = nvmlDeviceGetPowerManagementLimit(devices[i], &gpus[i].pwr_max_usage);
+					if (result != NVML_SUCCESS) {
+						Logger::error(std::string("NVML: Failed to get maximum GPU power draw, defaulting to 300W: ") + nvmlErrorString(result));
+					}
+
+					//? Get temp_max
+					unsigned int temp_max = 100;
+					result = nvmlDeviceGetTemperatureThreshold(devices[i], NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temp_max);
+					if (result != NVML_SUCCESS) {
+						Logger::error(std::string("NVML: Failed to get maximum GPU temperature, defaulting to 100: ") + nvmlErrorString(result));
+					}
+					gpus[i].temp_max = (long long)temp_max;
+				}
+				return true;
+			} else {initialized = true; shutdown(); return false;}
+		}
+
+		bool shutdown() {
+			if (!initialized) {return false;}
+
+			nvmlReturn_t result = nvmlShutdown();
+			if (NVML_SUCCESS == result) {
+				initialized = false;
+			} else Logger::warning(std::string("Failed to shutdown NVML: ") + nvmlErrorString(result));
+			return !initialized;
+		}
+
+		bool collect(gpu_info* gpus_slice) { // raw pointer to vector data, size == device_count, defined in init()
+			if (!initialized) return false;
+
+			for (unsigned int i = 0; i < device_count; ++i) {
+				//? GPU & memory utilization
+				nvmlUtilization_t utilization;
+				nvmlReturn_t result = nvmlDeviceGetUtilizationRates(devices[i], &utilization);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get GPU utilization: ") + nvmlErrorString(result));
+				} else {
+					gpus_slice[i].gpu_percent.push_back((long long)utilization.gpu);
+					gpus_slice[i].mem_utilization_percent.push_back((long long)utilization.memory);
+					//? Reduce size if there are more values than needed for graph
+					while (cmp_greater(gpus_slice[i].gpu_percent.size(), width * 2)) gpus_slice[i].gpu_percent.pop_front();
+					while (cmp_greater(gpus_slice[i].mem_utilization_percent.size(), width)) gpus_slice[i].mem_utilization_percent.pop_front();
+				}
+
+				//? Clock speeds
+				result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_GRAPHICS, &gpus_slice[i].gpu_clock_speed);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get GPU clock speed: ") + nvmlErrorString(result));
+				}
+				result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_MEM, &gpus_slice[i].mem_clock_speed);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get VRAM clock speed: ") + nvmlErrorString(result));
+				}
+
+				//? Power usage & state
+				result = nvmlDeviceGetPowerUsage(devices[i], &gpus_slice[i].pwr_usage);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get GPU power usage: ") + nvmlErrorString(result));
+				} else {
+					gpus_slice[i].pwr_percent.push_back(clamp((long long)round((double)gpus_slice[i].pwr_usage * 100.0 / (double)gpus_slice[i].pwr_max_usage), 0ll, 100ll));
+				}
+
+				nvmlPstates_t pState;
+				result = nvmlDeviceGetPowerState(devices[i], &pState);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get GPU power state: ") + nvmlErrorString(result));
+				} else {
+					gpus_slice[i].pwr_state = static_cast<int>(pState);
+				}
+
+				//? GPU temperature
+				if (Config::getB("check_temp")) {
+					unsigned int temp;
+					nvmlReturn_t result = nvmlDeviceGetTemperature(devices[i], NVML_TEMPERATURE_GPU, &temp);
+					if (result != NVML_SUCCESS) {
+						Logger::error(std::string("NVML: Failed to get GPU temperature: ") + nvmlErrorString(result));
+					} else {
+						gpus_slice[i].temp.push_back((long long)temp);
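+						//? (18 ~ assumed width of the small temperature graph, carried over from the single-GPU code)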
+						//? Reduce size if there are more values than needed for graph
+						while (cmp_greater(gpus_slice[i].temp.size(), 18)) gpus_slice[i].temp.pop_front();
+					}
+				}
+
+				//? Memory info
+				nvmlMemory_t memory;
+				result = nvmlDeviceGetMemoryInfo(devices[i], &memory);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get VRAM info: ") + nvmlErrorString(result));
+				} else {
+					gpus_slice[i].mem_total = memory.total;
+					gpus_slice[i].mem_used = memory.used;
+					//gpu.mem_free = memory.free;
+
+					auto used_percent = (long long)round((double)memory.used * 100.0 / (double)memory.total);
+					gpus_slice[i].mem_used_percent.push_back(used_percent);
+
+					//? Reduce size if there are more values than needed for graphs
+					while (cmp_greater(gpus_slice[i].mem_used_percent.size(), width/2)) gpus_slice[i].mem_used_percent.pop_front();
+				}
+
+				//? PCIe link speeds
+				result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_TX_BYTES, &gpus_slice[i].pcie_tx);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get PCIe TX throughput: ") + nvmlErrorString(result));
+				}
+				result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_RX_BYTES, &gpus_slice[i].pcie_rx);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get PCIe RX throughput: ") + nvmlErrorString(result));
+				}
+
+				//? TODO: Processes using GPU
+				/*unsigned int proc_info_len;
+				nvmlProcessInfo_t* proc_info = 0;
+				result = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_info_len, proc_info);
+				if (result != NVML_SUCCESS) {
+					Logger::error(std::string("NVML: Failed to get compute processes: ") + nvmlErrorString(result));
+				} else {
+					for (unsigned int i = 0; i < proc_info_len; ++i)
+						gpus_slice[i].graphics_processes.push_back({proc_info[i].pid, proc_info[i].usedGpuMemory});
+				}*/
+			}
+
+			return true;
+		}
+	}
+	// TODO: AMD
+	// TODO: Intel
+
+	//? Collect data from GPU-specific libraries
+	auto collect(bool no_update) -> vector<gpu_info>& {
+		if (Runner::stopping or (no_update and not gpus.empty())) return gpus;
+
+		Nvml::collect(gpus.data()); // raw pointer to vector data, size == Nvml::device_count, defined in Nvml::init()
+
+		return gpus;
+	}
+}
+
 namespace Mem {
 	bool has_swap{}; // defaults to false
 	vector<string> fstab;
@@ -2091,180 +2272,3 @@ namespace Tools {
 		throw std::runtime_error("Failed get uptime from " + string{Shared::procPath} + "/uptime");
 	}
 }
-
-namespace Gpu {
-	//? NVIDIA
-	namespace Nvml { // TODO: multi-GPU support
-		bool init() {
-			if (initialized) {return false;}
-
-			nvmlReturn_t result = nvmlInit();
-			if (result != NVML_SUCCESS) {
-				Logger::warning(std::string("Failed to initialize NVML, NVIDIA GPUs will not be detected: ") + nvmlErrorString(result));
-				return false;
-			}
-
-			//? Device count
-			unsigned int nvml_count;
-			result = nvmlDeviceGetCount(&nvml_count);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get device count: ") + nvmlErrorString(result));
-				return false;
-			}
-			device_count += nvml_count;
-
-			//? Device Handle
-			result = nvmlDeviceGetHandleByIndex(0, &device);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get device handle: ") + nvmlErrorString(result));
-				return false;
-			}
-
-			initialized = true;
-
-			//? Device name
-			char name[NVML_DEVICE_NAME_BUFFER_SIZE];
-			result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get device name: ") + nvmlErrorString(result));
-			} else {gpu_name = string(name);}
-			//? Power usage
-			result = nvmlDeviceGetPowerManagementLimit(device, &current_gpu.pwr_max_usage);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get maximum GPU power draw, defaulting to 300W: ") + nvmlErrorString(result));
-			}
-
-			//? Get temp_max
-			unsigned int temp_max = 100;
-			result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temp_max);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get maximum GPU temperature, defaulting to 100: ") + nvmlErrorString(result));
-				return false;
-			}
-			current_gpu.temp_max = (long long)temp_max;
-			return true;
-		}
-
-		bool shutdown() {
-			if (!initialized) {return false;}
-
-			nvmlReturn_t result = nvmlShutdown();
-			if (NVML_SUCCESS == result) {
-				initialized = true;
-			} else Logger::warning(std::string("Failed to shutdown NVML: ") + nvmlErrorString(result));
-			return !initialized;
-		}
-
-		bool collect(gpu_info& gpu) {
-			if (!initialized) return false;
-
-			//? GPU & memory utilization
-			nvmlUtilization_t utilization;
-			nvmlReturn_t result = nvmlDeviceGetUtilizationRates(device, &utilization);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get GPU utilization: ") + nvmlErrorString(result));
-			} else {
-				gpu.gpu_percent.push_back((long long)utilization.gpu);
-				gpu.mem_utilization_percent.push_back((long long)utilization.memory);
-				//? Reduce size if there are more values than needed for graph
-				while (cmp_greater(gpu.gpu_percent.size(), width * 2)) gpu.gpu_percent.pop_front();
-				while (cmp_greater(gpu.mem_utilization_percent.size(), width)) gpu.mem_utilization_percent.pop_front();
-			}
-
-			//? Clock speeds
-			result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_GRAPHICS, &current_gpu.gpu_clock_speed);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get GPU clock speed: ") + nvmlErrorString(result));
-			}
-			result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &current_gpu.mem_clock_speed);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get VRAM clock speed: ") + nvmlErrorString(result));
-			}
-
-			//? Power usage & state
-			result = nvmlDeviceGetPowerUsage(device, &current_gpu.pwr_usage);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get GPU power usage: ") + nvmlErrorString(result));
-			} else {
-				current_gpu.pwr_percent.push_back(clamp((long long)round((double)current_gpu.pwr_usage * 100.0 / (double)current_gpu.pwr_max_usage), 0ll, 100ll));
-			}
-
-			nvmlPstates_t pState;
-			result = nvmlDeviceGetPowerState(device, &pState);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get GPU power state: ") + nvmlErrorString(result));
-			} else {
-				current_gpu.pwr_state = static_cast<int>(pState);
-			}
-
-			//? GPU temperature
-			if (Config::getB("check_temp")) {
-				unsigned int temp;
-				nvmlReturn_t result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp);
-				if (result != NVML_SUCCESS) {
-					Logger::error(std::string("NVML: Failed to get GPU temperature: ") + nvmlErrorString(result));
-				} else {
-					gpu.temp.push_back((long long)temp);
-					//? Reduce size if there are more values than needed for graph
-					while (cmp_greater(gpu.temp.size(), 18)) gpu.temp.pop_front();
-				}
-			}
-			//? Memory info
-			nvmlMemory_t memory;
-			result = nvmlDeviceGetMemoryInfo(device, &memory);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get VRAM info: ") + nvmlErrorString(result));
-			} else {
-				gpu.mem_total = memory.total;
-				gpu.mem_used = memory.used;
-				//gpu.mem_free = memory.free;
-
-				auto used_percent = (long long)round((double)memory.used * 100.0 / (double)memory.total);
-				gpu.mem_used_percent.push_back(used_percent);
-
-				//? Reduce size if there are more values than needed for graphs
-				while (cmp_greater(gpu.mem_used_percent.size(), width/2)) gpu.mem_used_percent.pop_front();
-			}
-
-			//? PCIe link speeds
-			result = nvmlDeviceGetPcieThroughput(device, NVML_PCIE_UTIL_TX_BYTES, &current_gpu.pcie_tx);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get PCIe TX throughput: ") + nvmlErrorString(result));
-			}
-			result = nvmlDeviceGetPcieThroughput(device, NVML_PCIE_UTIL_RX_BYTES, &current_gpu.pcie_rx);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get PCIe RX throughput: ") + nvmlErrorString(result));
-			}
-
-			//? TODO: Processes using GPU
-			/*unsigned int proc_info_len;
-			nvmlProcessInfo_t* proc_info = 0;
-			result = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_info_len, proc_info);
-			if (result != NVML_SUCCESS) {
-				Logger::error(std::string("NVML: Failed to get compute processes: ") + nvmlErrorString(result));
-			} else {
-				for (unsigned int i = 0; i < proc_info_len; ++i)
-					current_gpu.graphics_processes.push_back({proc_info[i].pid, proc_info[i].usedGpuMemory});
-			}*/
-
-			return true;
-		}
-	}
-	// TODO: AMD
-	// TODO: Intel
-
-	//? Collect data from GPU-specific libraries
-	auto collect(bool no_update) -> gpu_info& {
-		if (Runner::stopping or (no_update and not current_gpu.gpu_percent.empty())) return current_gpu;
-		auto& gpu = current_gpu;
-
-		//if (Config::getB("show_gpu_freq"))
-		//	TODO gpuHz = get_gpuHz();
-
-		Nvml::collect(gpu);
-
-		return gpu;
-	}
-}