When running computation on NVIDIA GPUs, you usually need some way to check the GPUs' runtime status. In this article I describe the methods I have used in practice.
nvidia-smi is normally installed together with the NVIDIA driver, and it is the simplest and most direct way to check GPU status.
Running "nvidia-smi" with no arguments shows the current state: the working state of each GPU, its temperature, memory usage, the processes using it, and so on.
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.27                 Driver Version: 367.27                     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GRID K1             Off  | 0000:03:00.0     Off |                  N/A |
| N/A   63C    P0    18W / 31W  |   545MiB / 4036MiB   |     28%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GRID K1             Off  | 0000:04:00.0     Off |                  N/A |
| N/A   58C    P0    14W / 31W  |   195MiB / 4036MiB   |      6%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GRID K1             Off  | 0000:05:00.0     Off |                  N/A |
| N/A   40C    P0    16W / 31W  |    99MiB / 4036MiB   |     18%      Default |
+-------------------------------+----------------------+----------------------+
|   3  GRID K1             Off  | 0000:06:00.0     Off |                  N/A |
| N/A   60C    P0    16W / 31W  |   200MiB / 4036MiB   |     30%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|    0      9133    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    48MiB |
|    0      9136    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    48MiB |
|    1      9132    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    38MiB |
|    1      9133    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    38MiB |
|    1      9134    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    38MiB |
|    1      9135    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    38MiB |
|    1      9136    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    38MiB |
|    2      9132    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    48MiB |
|    2      9134    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    48MiB |
|    3      9135    C   ...e/ffmpeg_nvenc_decode/_release/bin/ffmpeg    48MiB |
+-----------------------------------------------------------------------------+
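The report above is a one-shot snapshot. To keep it refreshing I usually wrap the command in "watch", and "nvidia-smi -l 1" achieves much the same by reprinting the report every second; the exact behavior of these flags may vary slightly between driver versions, so check "nvidia-smi --help" on your machine.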
If you want more detail, run "nvidia-smi -a" or "nvidia-smi -q" to get the full per-GPU report. For example, I mainly care about how busy the GPU is while encoding and decoding video, and the "Utilization" section reports exactly those figures.
    Utilization
        Gpu                         : 5 %
        Memory                      : 6 %
        Encoder                     : 40 %
        Decoder                     : 0 %
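If that section is all you need, nvidia-smi can narrow the query output: on the drivers I have used, "nvidia-smi -q -d UTILIZATION" prints only the utilization block, and adding "-i 0" limits the report to the first GPU.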
使用"nvidia-smi"雖然簡單方便,但由於它是個應用程序,不是很方便在代碼中進行使用。因此就嘗試找找官方是否提供了獲取GPU信息的方法,果真"NVML API"就是所需。在以後的時間裏,經過參考API文檔,比較快速的整理出我但願獲取到的GPU運行狀態信息。ide
"NVML API"主要須要庫libnvidia-ml,我用它主要是獲取GPU的基本信息和各類使用率。具體使用了哪些API這裏不作贅述,其實直接看API文檔和個人測試源碼更容易理解。測試
API documentation: http://docs.nvidia.com/deploy/pdf/NVML_API_Reference_Guide.pdf
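To give a feel for the calls involved before the full program, here is a minimal sketch written against the official header instead of loading the library by hand. It is not taken from my test code: it assumes nvml.h (shipped with the CUDA toolkit / GPU Deployment Kit) is on the include path and that you link with -lnvidia-ml, and it only reads the utilization figures discussed above.

/* Minimal NVML sketch: compile with  gcc sketch.c -lnvidia-ml  (assumes nvml.h is available). */
#include <stdio.h>
#include <nvml.h>

int main(void)
{
    unsigned int count = 0, i;

    if (nvmlInit() != NVML_SUCCESS)      /* load and initialize NVML */
        return 1;

    nvmlDeviceGetCount(&count);
    for (i = 0; i < count; i++) {
        nvmlDevice_t dev;
        nvmlUtilization_t util;
        unsigned int enc = 0, dec = 0, period = 0;

        if (nvmlDeviceGetHandleByIndex(i, &dev) != NVML_SUCCESS)
            continue;
        nvmlDeviceGetUtilizationRates(dev, &util);            /* gpu + memory, in percent */
        nvmlDeviceGetEncoderUtilization(dev, &enc, &period);  /* NVENC usage, in percent */
        nvmlDeviceGetDecoderUtilization(dev, &dec, &period);  /* NVDEC usage, in percent */

        printf("GPU %u: gpu %u%%, mem %u%%, enc %u%%, dec %u%%\n",
               i, util.gpu, util.memory, enc, dec);
    }

    nvmlShutdown();
    return 0;
}

The full program below takes the other route: it dlopen()s libnvidia-ml.so and resolves each symbol by name, which avoids a link-time dependency on the NVIDIA driver at the cost of declaring the needed NVML types and function-pointer signatures itself.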
Finally, the quick-and-dirty code in full: https://github.com/shenhailuanma/selfTestCode/blob/master/getNvidiaInfo/vn_info.c
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>

#if defined(_WIN32)
#include <windows.h>
#else
#include <dlfcn.h>
#endif

#if defined(_WIN32)
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

#if defined(_WIN32)
#define LOAD_FUNC(l, s)  GetProcAddress(l, s)
#define DL_CLOSE_FUNC(l) FreeLibrary(l)
#else
#define LOAD_FUNC(l, s)  dlsym(l, s)
#define DL_CLOSE_FUNC(l) dlclose(l)
#endif

/*
 * Minimal subset of the NVML declarations, copied here so the program
 * does not need nvml.h at build time.
 */

/** Return values for NVML API calls. */
typedef enum nvmlReturn_enum
{
    NVML_SUCCESS = 0,                   //!< The operation was successful
    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
} nvmlReturn_t;

typedef void *nvmlDevice_t;

/* Memory allocation information for a device. */
typedef struct nvmlMemory_st
{
    unsigned long long total; //!< Total installed FB memory (in bytes)
    unsigned long long free;  //!< Unallocated FB memory (in bytes)
    unsigned long long used;  //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
} nvmlMemory_t;

/* Information about running compute processes on the GPU */
typedef struct nvmlProcessInfo_st
{
    unsigned int pid;                 //!< Process ID
    unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes.
                                      //!< Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
                                      //!< because Windows KMD manages all the memory and not the NVIDIA driver
} nvmlProcessInfo_t;

/* Utilization information for a device. */
typedef struct nvmlUtilization_st
{
    unsigned int gpu;    //!< Percent of time over the past second during which one or more kernels was executing on the GPU
    unsigned int memory; //!< Percent of time over the past second during which global (device) memory was being read or written
} nvmlUtilization_t;

/* Function-pointer types for the NVML entry points resolved at runtime. */
typedef nvmlReturn_t (CUDAAPI *NVMLINIT)(void);                                                       // nvmlInit
typedef nvmlReturn_t (CUDAAPI *NVMLSHUTDOWN)(void);                                                   // nvmlShutdown
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETCOUNT)(unsigned int *deviceCount);                        // nvmlDeviceGetCount
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETHANDLEBYINDEX)(unsigned int index, nvmlDevice_t *device); // nvmlDeviceGetHandleByIndex
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETDECODERUTILIZATION)(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); // nvmlDeviceGetDecoderUtilization
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETENCODERUTILIZATION)(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); // nvmlDeviceGetEncoderUtilization
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETMEMORYINFO)(nvmlDevice_t device, nvmlMemory_t *memory);   // nvmlDeviceGetMemoryInfo
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETRUNNINGPROCESSES)(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); // nvmlDeviceGetComputeRunningProcesses
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETPPROCESSNAME)(unsigned int pid, char *name, unsigned int length); // nvmlSystemGetProcessName
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETUTILIZATIONRATES)(nvmlDevice_t device, nvmlUtilization_t *utilization); // nvmlDeviceGetUtilizationRates
typedef nvmlReturn_t (CUDAAPI *NVMLDEVICEGETTEMPERATURE)(nvmlDevice_t device, int sensorType, unsigned int *temp); // nvmlDeviceGetTemperature

typedef struct nvGpuUnitInfo_st
{
    unsigned int decoder_utilization;
    unsigned int encoder_utilization;
    unsigned int gpu_utilization;
    unsigned int memory_utilization;
    unsigned int temperature;
    unsigned long long memory_total;
    unsigned long long memory_free;
    unsigned long long memory_used;
} nvGpuUnitInfo_t;

#define GPU_MAX_SIZE 64
typedef struct nvGpuInfo_st
{
    unsigned int device_count;
    nvGpuUnitInfo_t devices[GPU_MAX_SIZE];
} nvGpuInfo_t;

#define RETURN_SUCCESS             0
#define RETURN_ERROR_LOAD_LIB     (-1)
#define RETURN_ERROR_LOAD_FUNC    (-2)
#define RETURN_ERROR_LIB_FUNC     (-3)
#define RETURN_ERROR_NULL_POINTER (-4)

#define CHECK_LOAD_NVML_FUNC(t, f, s)                               \
    do {                                                            \
        (f) = (t)LOAD_FUNC(nvml_lib, s);                            \
        if (!(f)) {                                                 \
            printf("Failed loading %s from NVML library\n", s);     \
            retCode = RETURN_ERROR_LOAD_FUNC;                       \
            goto gpu_fail;                                          \
        }                                                           \
    } while (0)

static int check_nvml_error(int err, const char *func)
{
    if (err != NVML_SUCCESS) {
        printf(" %s - failed with error code:%d\n", func, err);
        return 0;
    }
    return 1;
}

#define check_nvml_errors(f)                                        \
    do {                                                            \
        if (!check_nvml_error(f, #f)) {                             \
            retCode = RETURN_ERROR_LIB_FUNC;                        \
            goto gpu_fail;                                          \
        }                                                           \
    } while (0)

static int get_gpu_info(nvGpuInfo_t *infos)
{
    if (infos == NULL) {
        return RETURN_ERROR_NULL_POINTER;
    }

    int retCode = RETURN_SUCCESS;
    void *nvml_lib;
    NVMLINIT nvml_init;
    NVMLSHUTDOWN nvml_shutdown = NULL; /* initialized so the cleanup path can test it */
    NVMLDEVICEGETCOUNT nvml_device_get_count;
    NVMLDEVICEGETHANDLEBYINDEX nvml_device_get_handle_by_index;
    NVMLDEVICEGETDECODERUTILIZATION nvml_device_get_decoder_utilization;
    NVMLDEVICEGETENCODERUTILIZATION nvml_device_get_encoder_utilization;
    NVMLDEVICEGETMEMORYINFO nvml_device_get_memory_info;
    NVMLDEVICEGETRUNNINGPROCESSES nvml_device_get_running_processes;
    NVMLDEVICEGETPPROCESSNAME nvml_device_get_process_name;
    NVMLDEVICEGETUTILIZATIONRATES nvml_device_get_utilization_rates;
    NVMLDEVICEGETTEMPERATURE nvml_device_get_temperature;

    nvmlDevice_t device_handel;
    unsigned int utilization_value = 0; /* not used below */
    unsigned int utilization_sample = 0;
    int best_gpu = 0;                   /* not used below */
    unsigned int decoder_used = 100;    /* not used below */

    // open the libnvidia-ml.so
    nvml_lib = NULL;
#if defined(_WIN32)
    if (sizeof(void *) == 8) {
        nvml_lib = LoadLibrary(TEXT("nvidia-ml.dll"));
    } else {
        nvml_lib = LoadLibrary(TEXT("nvidia-ml.dll"));
    }
#else
    nvml_lib = dlopen("libnvidia-ml.so", RTLD_LAZY);
#endif
    if (nvml_lib == NULL) {
        return RETURN_ERROR_LOAD_LIB;
    }

    // resolve the NVML entry points we need
    CHECK_LOAD_NVML_FUNC(NVMLINIT, nvml_init, "nvmlInit");
    CHECK_LOAD_NVML_FUNC(NVMLSHUTDOWN, nvml_shutdown, "nvmlShutdown");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETCOUNT, nvml_device_get_count, "nvmlDeviceGetCount");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETHANDLEBYINDEX, nvml_device_get_handle_by_index, "nvmlDeviceGetHandleByIndex");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETDECODERUTILIZATION, nvml_device_get_decoder_utilization, "nvmlDeviceGetDecoderUtilization");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETENCODERUTILIZATION, nvml_device_get_encoder_utilization, "nvmlDeviceGetEncoderUtilization");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETMEMORYINFO, nvml_device_get_memory_info, "nvmlDeviceGetMemoryInfo");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETRUNNINGPROCESSES, nvml_device_get_running_processes, "nvmlDeviceGetComputeRunningProcesses");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETPPROCESSNAME, nvml_device_get_process_name, "nvmlSystemGetProcessName");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETUTILIZATIONRATES, nvml_device_get_utilization_rates, "nvmlDeviceGetUtilizationRates");
    CHECK_LOAD_NVML_FUNC(NVMLDEVICEGETTEMPERATURE, nvml_device_get_temperature, "nvmlDeviceGetTemperature");

    // get gpu info
    check_nvml_errors(nvml_init());

    unsigned int device_count = 0;
    check_nvml_errors(nvml_device_get_count(&device_count));
    infos->device_count = device_count;

    nvmlMemory_t memory_info;
    nvmlUtilization_t gpu_utilization;
    int i = 0;
    for (i = 0; i < device_count; i++) {
        // query per-device utilization, memory and temperature
        check_nvml_errors(nvml_device_get_handle_by_index(i, &device_handel));
        check_nvml_errors(nvml_device_get_decoder_utilization(device_handel, &infos->devices[i].decoder_utilization, &utilization_sample));
        check_nvml_errors(nvml_device_get_encoder_utilization(device_handel, &infos->devices[i].encoder_utilization, &utilization_sample));

        check_nvml_errors(nvml_device_get_memory_info(device_handel, &memory_info));
        infos->devices[i].memory_total = memory_info.total;
        infos->devices[i].memory_free  = memory_info.free;
        infos->devices[i].memory_used  = memory_info.used;

        check_nvml_errors(nvml_device_get_utilization_rates(device_handel, &gpu_utilization));
        infos->devices[i].gpu_utilization    = gpu_utilization.gpu;
        infos->devices[i].memory_utilization = gpu_utilization.memory;

        check_nvml_errors(nvml_device_get_temperature(device_handel, 0, &infos->devices[i].temperature));
    }

gpu_fail:
    // only call into NVML if nvmlShutdown was actually resolved before a failure
    if (nvml_shutdown)
        nvml_shutdown();
    DL_CLOSE_FUNC(nvml_lib);

    return retCode;
}

static void print_gpu_info(nvGpuInfo_t *infos)
{
    printf("device count:%u\n", infos->device_count);
    int i = 0;
    for (i = 0; i < infos->device_count; i++) {
        printf("GPU:%d\t, Utilization:[decoder:%u, encoder:%u, gpu:%u, memory:%u], Temperature:%uC, Memory:[total:%llu, free:%llu, used:%llu]\n",
               i,
               infos->devices[i].decoder_utilization,
               infos->devices[i].encoder_utilization,
               infos->devices[i].gpu_utilization,
               infos->devices[i].memory_utilization,
               infos->devices[i].temperature,
               infos->devices[i].memory_total,
               infos->devices[i].memory_free,
               infos->devices[i].memory_used);
    }
}

int main(void)
{
    nvGpuInfo_t gpu_buf;
    int ret = get_gpu_info(&gpu_buf);
    if (!ret)
        print_gpu_info(&gpu_buf);

    return ret;
}
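Because the program loads libnvidia-ml.so itself at runtime, building it on Linux needs nothing beyond the dynamic loader; something like "gcc vn_info.c -o vn_info -ldl" should be enough (adjust to your toolchain). Running the resulting binary prints one line per GPU with the decoder/encoder/GPU/memory utilization, the temperature, and the memory totals.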