前段時間做英偉達硬解的時候,顯卡老是莫名掛掉,後來發現是因為顯卡溫度太高而當掉。這幾天發現CUDA中有NVML工具可以查看顯卡信息,nvidia-smi也是基於這個工具包。
使用的CUDA版本爲CUDA 8.0。
安裝CUDA以後可以找到如下內容:
圖1.NVML的例子
這裡面包含的是NVML的一個例子。我的系統是64位的,可以找到NVML的lib和頭文件如下:
圖2.NVML的lib文件
圖3.NVML頭文件
在工程中包含NVML。我是新建的CUDA 8.0 Runtime工程,因為NVML包含在CUDA中,建CUDA 8.0 Runtime工程可以省去CUDA的配置工作,工程建立方法參見《VS2013 VC++的.cpp文件調用CUDA的.cu文件中的函數》,CUDA 8.0爲默認安裝,系統爲win10 64位。
在程序中直接包含NVML的頭文件和lib文件即可:
#include "nvml.h" #pragma comment(lib,"nvml.lib")
注意64位系統應該建立x64工程,因為安裝的CUDA中沒有win32的nvml.lib。
常用函數:
·nvmlInit()函數初始化NVML;
·nvmlDeviceGetCount(unsigned int *deviceCount)函數能夠得到顯卡數;
·nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)獲取設備;
·nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length)查詢設備的名稱;
·nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci)獲取PCI信息,對這個函數的重要性,例子中是這麼說的
// pci.busId is very useful to know which device physically you're talking to
// Using PCI identifier you can also match nvmlDevice handle to CUDA device.
·nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode)取得顯卡當前所處的模式,模式有以下幾種:
// Compute modes reported by nvmlDeviceGetComputeMode() and accepted by nvmlDeviceSetComputeMode().
typedef enum nvmlComputeMode_enum
{
NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device
NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed
NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device
NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
// Keep this last
NVML_COMPUTEMODE_COUNT
} nvmlComputeMode_t;
·nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode)能夠修改顯卡的模式;
·nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)查詢溫度閾值,具體有兩種:
// Threshold kinds that can be passed as thresholdType to nvmlDeviceGetTemperatureThreshold().
typedef enum nvmlTemperatureThresholds_enum
{
NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down for HW protection
NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin slowdown
// Keep this last
NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;
當溫度達到NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 參數獲取的溫度時,顯卡將自動關閉以保護硬件;當溫度達到NVML_TEMPERATURE_THRESHOLD_SLOWDOWN參數獲取的溫度時,顯卡的性能將降低。
·nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp)獲取顯卡當前溫度;
·nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization)獲取設備的使用率(原註釋:Retrieves the current utilization rates for the device's major subsystems。不知道理解錯了沒有),使用率包括如下:
// Utilization rates filled in by nvmlDeviceGetUtilizationRates(); both fields are percentages.
typedef struct nvmlUtilization_st
{
unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written
} nvmlUtilization_t;
·nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory) Retrieves the amount of used, free and total memory available on the device, in bytes。
·nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory) Gets Total, Available and Used size of BAR1 memory.(不知道這種與上一種有什麼區別,有待後續學習)
·nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos) Get information about processes with a compute context on a device。應該是獲取當前在使用顯卡的程序信息。
·nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock) Retrieves the maximum clock speeds for the device。包括如下:
// Clock domains selectable through the type argument of nvmlDeviceGetMaxClockInfo() and nvmlDeviceGetClockInfo().
typedef enum nvmlClockType_enum
{
NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain
NVML_CLOCK_SM = 1, //!< SM clock domain
NVML_CLOCK_MEM = 2, //!< Memory clock domain
NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain
// Keep this last
NVML_CLOCK_COUNT //!< Count of clock types
} nvmlClockType_t;
·nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock) Retrieves the current clock speeds for the device.上面是獲取最大的,這個是獲取當前的。
代碼示例:
#include "cuda_kernels.h"
#include "nvml.h"
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <winbase.h>
#include <tlhelp32.h>
#include <psapi.h>
#pragma comment(lib,"kernel32.lib")
#pragma comment(lib,"advapi32.lib")
#pragma comment(lib,"psapi.lib")   // GetModuleFileNameExA lives here when PSAPI_VERSION is 1
#pragma comment(lib,"nvml.lib")

// Capacity of the process table handed to nvmlDeviceGetComputeRunningProcesses().
enum { MAX_COMPUTE_PROCESSES = 999 };

// Maps an NVML compute mode to the label this program prints for it.
// Any value without its own case yields "Unknown".
const char * convertToComputeModeString(nvmlComputeMode_t mode)
{
    switch (mode)
    {
    case NVML_COMPUTEMODE_DEFAULT:
        return "Default";
    case NVML_COMPUTEMODE_EXCLUSIVE_THREAD:
        return "Exclusive_Thread";
    case NVML_COMPUTEMODE_PROHIBITED:
        return "Prohibited";
    case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS:
        return "Exclusive Process";
    default:
        return "Unknown";
    }
}

// Reads the device's compute mode, tries to switch it to PROHIBITED and then
// restores the previous mode. Lack of support or of permission is reported and
// tolerated.
// Returns 0 when the caller should carry on, -1 when the program should stop.
// NOTE(review): this runs on every polling pass, so whenever the call is
// permitted the mode is flipped once per second -- confirm that is intended
// for a monitoring tool.
static int toggleComputeMode(unsigned int index, nvmlDevice_t device)
{
    nvmlComputeMode_t compute_mode;
    nvmlReturn_t result = nvmlDeviceGetComputeMode(device, &compute_mode);

    if (NVML_ERROR_NOT_SUPPORTED == result)
    {
        printf("\t This is not CUDA capable device\n");
        return 0;
    }
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get compute mode for device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }

    // try to change compute mode
    printf("\t Changing device's compute mode from '%s' to '%s'\n",
           convertToComputeModeString(compute_mode),
           convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

    result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
    if (NVML_ERROR_NO_PERMISSION == result)
    {
        printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
        return 0;
    }
    if (NVML_ERROR_NOT_SUPPORTED == result)
    {
        printf("\t\t Compute mode prohibited not supported. You might be running on\n"
               "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
        return 0;
    }
    if (NVML_SUCCESS != result)
    {
        printf("\t\t Failed to set compute mode for device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }

    printf("\t Restoring device's compute mode back to '%s'\n", convertToComputeModeString(compute_mode));
    result = nvmlDeviceSetComputeMode(device, compute_mode);
    if (NVML_SUCCESS != result)
    {
        printf("\t\t Failed to restore compute mode for device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }
    return 0;
}

// Prints the shutdown threshold, the slowdown threshold and the current GPU
// temperature. Each query is independent; a failed one is reported and skipped.
static void printTemperatures(unsigned int index, nvmlDevice_t device)
{
    unsigned int temperature_threshold = 100;
    unsigned int temperature = 0;
    nvmlReturn_t result;

    printf("----- 溫度 ----- \n");

    result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temperature_threshold);
    if (NVML_SUCCESS != result)
        printf("device %u Failed to get NVML_TEMPERATURE_THRESHOLD_SHUTDOWN: %s\n", index, nvmlErrorString(result));
    else
        printf("截止溫度: %u 攝氏度 (Temperature at which the GPU will shut down for HW protection)\n", temperature_threshold);

    result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &temperature_threshold);
    if (NVML_SUCCESS != result)
        printf("device %u Failed NVML_TEMPERATURE_THRESHOLD_SLOWDOWN: %s\n", index, nvmlErrorString(result));
    else
        printf("上限溫度: %u 攝氏度 (Temperature at which the GPU will begin slowdown)\n", temperature_threshold);

    result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
    if (NVML_SUCCESS != result)
        printf("device %u NVML_TEMPERATURE_GPU Failed: %s\n", index, nvmlErrorString(result));
    else
        printf("當前溫度: %u 攝氏度 \n", temperature);
}

// Prints GPU and memory utilization percentages.
static void printUtilization(unsigned int index, nvmlDevice_t device)
{
    nvmlUtilization_t utilization;
    nvmlReturn_t result = nvmlDeviceGetUtilizationRates(device, &utilization);

    if (NVML_SUCCESS != result)
    {
        printf(" device %u nvmlDeviceGetUtilizationRates Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    // Both fields are unsigned int, so they must be printed with %u (the
    // earlier %lld read past the argument).
    printf("----- 使用率 ----- \n");
    printf("GPU 使用率: %u %% \n", utilization.gpu);
    printf("顯存使用率: %u %% \n", utilization.memory);
}

// Prints total/free/used frame-buffer memory in bytes.
static void printFbMemory(unsigned int index, nvmlDevice_t device)
{
    nvmlMemory_t memory;
    nvmlReturn_t result = nvmlDeviceGetMemoryInfo(device, &memory);

    if (NVML_SUCCESS != result)
    {
        printf("device %u nvmlDeviceGetMemoryInfo Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    printf("------ FB memory ------- \n");
    printf("Total installed FB memory: %llu bytes \n", memory.total);
    printf("Unallocated FB memory: %llu bytes \n", memory.free);
    printf("Allocated FB memory: %llu bytes \n", memory.used);
}

// Prints total/free/used BAR1 memory in bytes.
static void printBar1Memory(unsigned int index, nvmlDevice_t device)
{
    nvmlBAR1Memory_t bar1Memory;
    nvmlReturn_t result = nvmlDeviceGetBAR1MemoryInfo(device, &bar1Memory);

    if (NVML_SUCCESS != result)
    {
        printf("device %u nvmlDeviceGetBAR1MemoryInfo Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    printf("------ BAR1 memory ------- \n");
    printf("Total BAR1 memory: %llu bytes \n", bar1Memory.bar1Total);
    printf("Unallocated BAR1 memory: %llu bytes \n", bar1Memory.bar1Free);
    printf("Allocated BAR1 memory: %llu bytes \n", bar1Memory.bar1Used);
}

// Lists the processes holding a compute context on the device: PID, GPU memory
// in use and, when the process can be opened, the path of its executable.
static void printComputeProcesses(unsigned int index, nvmlDevice_t device)
{
    nvmlProcessInfo_t infos[MAX_COMPUTE_PROCESSES];
    // infoCount is an in/out parameter: NVML reads it as the capacity of
    // `infos` and overwrites it with the number of entries written, so it
    // must hold the array size before the call.
    unsigned int infoCount = MAX_COMPUTE_PROCESSES;
    nvmlReturn_t result = nvmlDeviceGetComputeRunningProcesses(device, &infoCount, infos);

    if (NVML_SUCCESS != result)
    {
        printf("Failed to get ComputeRunningProcesses for device %u: %s\n", index, nvmlErrorString(result));
        return;
    }

    printf("------ Information about running compute processes on the GPU ------- \n");
    for (unsigned int p = 0; p < infoCount; p++)
    {
        char strFilePath[MAX_PATH];
        HANDLE hProcess;

        printf("PID: %u 顯存佔用:%llu bytes ", infos[p].pid, infos[p].usedGpuMemory);

        // Open the process by PID directly. Walking a Toolhelp snapshot first
        // added nothing, and because that walk was never rewound between PIDs
        // it could miss entries.
        hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, (DWORD)infos[p].pid);
        if (hProcess == NULL)
        {
            printf("\nOpen Process fAiled:%lu\n", (unsigned long)GetLastError());
            continue;
        }

        // The buffer is char, so call the ANSI variant explicitly instead of
        // the TCHAR macro; the buffer is only valid when the call succeeds.
        if (GetModuleFileNameExA(hProcess, NULL, strFilePath, MAX_PATH) == 0)
            printf("\nGetModuleFileNameEx failed:%lu\n", (unsigned long)GetLastError());
        else
            printf(" %s\n", strFilePath);

        CloseHandle(hProcess);
    }
}

// Prints current and maximum clocks for the graphics, SM, memory and video
// domains. If a maximum cannot be read it is shown as 0.
static void printClocks(unsigned int index, nvmlDevice_t device)
{
    unsigned int max_clock;
    unsigned int clock;
    nvmlReturn_t result;

    printf("------ Clocks ------- \n");

    max_clock = 0;
    result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_GRAPHICS, &max_clock);
    if (NVML_SUCCESS != result)
        printf("device %u nvmlDeviceGetMaxClockInfo Failed : %s\n", index, nvmlErrorString(result));
    result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_GRAPHICS, &clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get NVML_CLOCK_GRAPHICS info for device %u: %s\n", index, nvmlErrorString(result));
    else
        printf("GRAPHICS: %6u Mhz max clock :%u \n", clock, max_clock);

    max_clock = 0;
    result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &max_clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get max NVML_CLOCK_SM for device %u: %s\n", index, nvmlErrorString(result));
    result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get current NVML_CLOCK_SM for device %u: %s\n", index, nvmlErrorString(result));
    else
        printf(" SM: %6u Mhz max clock :%u \n", clock, max_clock);

    max_clock = 0;
    result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_MEM, &max_clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get max NVML_CLOCK_MEM for device %u: %s\n", index, nvmlErrorString(result));
    result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get current NVML_CLOCK_MEM for device %u: %s\n", index, nvmlErrorString(result));
    else
        printf(" MEM: %6u Mhz max clock :%u \n", clock, max_clock);

    max_clock = 0;
    result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_VIDEO, &max_clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get max NVML_CLOCK_VIDEO for device %u: %s\n", index, nvmlErrorString(result));
    result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_VIDEO, &clock);
    if (NVML_SUCCESS != result)
        printf("Failed to get current NVML_CLOCK_VIDEO for device %u: %s\n", index, nvmlErrorString(result));
    else
        printf(" VIDEO: %6u Mhz max clock :%u \n", clock, max_clock);
}

// Prints one full report for the device at `index`.
// Returns 0 on success, -1 when the handle, name, PCI info or compute-mode
// step fails in a way that should stop the program. Failures in the later
// sections (temperature, utilization, memory, processes, clocks) are only
// reported.
static int reportDevice(unsigned int index)
{
    nvmlDevice_t device;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlPciInfo_t pci;
    nvmlReturn_t result;

    // Query for device handle to perform operations on a device
    // You can also query device handle by other features like:
    // nvmlDeviceGetHandleBySerial
    // nvmlDeviceGetHandleByPciBusId
    result = nvmlDeviceGetHandleByIndex(index, &device);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get handle for device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get name of device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }

    // pci.busId is very useful to know which device physically you're talking to
    // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
    result = nvmlDeviceGetPciInfo(device, &pci);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get pci info for device %u: %s\n", index, nvmlErrorString(result));
        return -1;
    }
    printf("%u. %s [%s]\n", index, name, pci.busId);

    // This is a simple example on how you can modify GPU's state
    if (toggleComputeMode(index, device) != 0)
        return -1;

    printf("\n");
    printTemperatures(index, device);

    printf("\n");
    printUtilization(index, device);

    printf("\n");
    printFbMemory(index, device);

    printf("\n");
    printBar1Memory(index, device);

    printf("\n");
    printComputeProcesses(index, device);

    printf("\n");
    printClocks(index, device);
    return 0;
}

// Entry point: initializes NVML, then prints a report for every device once
// per second. The polling loop has no normal exit; it ends only when a device
// report fails fatally, after which NVML is shut down and a non-zero status
// is returned.
int main()
{
    nvmlReturn_t result;
    unsigned int device_count = 0;

    cuAdd();

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
    }
    else
    {
        int failed = 0;

        printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");
        printf("Listing devices:\n");

        while (!failed)
        {
            for (unsigned int i = 0; i < device_count; i++)
            {
                if (reportDevice(i) != 0)
                {
                    failed = 1;
                    break;
                }
            }
            if (!failed)
            {
                printf("-------------------------------------------------------------------- \n");
                Sleep(1000);
            }
        }
    }

    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    system("pause");
    // This point is reached only after a failure, so report it to the caller.
    return 1;
}
我已經把nvml.dll拷貝到運行目錄,程序應該可以正常運行了。不過還是順便做一下nvidia-smi的環境配置,參考《NVIDIA 顯卡信息(CUDA信息的查看)》,我把其中的內容複製到下面:
1. nvidia-smi 查看顯卡信息
nvidia-smi 指的是 NVIDIA System Management Interface;
在安裝完成 NVIDIA 顯卡驅動之後,對於 windows 用戶而言,cmd 命令行界面還無法識別 nvidia-smi 命令,需要將相關環境變量添加進去。如將 NVIDIA 顯卡驅動安裝在默認位置,nvidia-smi 命令所在的完整路徑應當爲:
C:\Program Files\NVIDIA Corporation\NVSMI
也即將上述路徑添加進 Path 系統環境變量中。
2. 查看 CUDA 信息
- CUDA 的版本:
- 進入命令行:
nvcc -V
圖4.GeForce 940M查詢結果
圖5.Tesla P4查詢結果
NVML對GeForce 940M的支持不怎麼好,對Tesla P4支持得比較好。