NVML查詢顯卡信息

        前段時間做英偉達硬解的時候,顯卡老是莫名掛掉,後來發現是由於顯卡溫度太高掉了。這幾天找到CUDA中有NVML工具可以查看顯卡信息,nvidia-smi也是基於這個工具包。

        使用的CUDA版本爲CUDA 8.0。

1.給程序添加NVML

        安裝CUDA之後可以找到如下:

image

圖1.NVML的例子

這裏面包含的是NVML的一個例子。我的系統是64位的,可以找到NVML的lib和頭文件如下:

image

圖2.NVML的lib文件

image

圖3.NVML頭文件

在工程中包含NVML。我是新建的CUDA 8.0 Runtime工程,由於NVML包含在CUDA中,建CUDA 8.0 Runtime工程可以省去CUDA的配置工作,工程建立方法參見《VS2013 VC++的.cpp文件調用CUDA的.cu文件中的函數》,CUDA 8.0爲默認安裝,系統爲win10 64位。

        在程序中直接包含NVML的頭文件和lib文件即可:

#include "nvml.h"

#pragma  comment(lib,"nvml.lib")

注意64位系統應該建立x64工程,由於在安裝的CUDA中沒有win32的nvml.lib。

2.NVML查詢顯卡信息

        常用函數:

        ·nvmlInit()函數初始化NVML;

        ·nvmlDeviceGetCount(unsigned int *deviceCount)函數可以獲得顯卡數;

        ·nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)獲取設備;

        ·nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length)查詢設備的名稱;

        ·nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci)獲取PCI信息,對這個函數的重要性,例子中是這麼說的

            // pci.busId is very useful to know which device physically you're talking to
            // Using PCI identifier you can also match nvmlDevice handle to CUDA device.

        ·nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode)得到顯卡當前所處的模式,模式有以下幾種:

// Compute modes of a device: the value type read through nvmlDeviceGetComputeMode
// and written through nvmlDeviceSetComputeMode (excerpt quoted from nvml.h).
typedef enum nvmlComputeMode_enum
{
    NVML_COMPUTEMODE_DEFAULT           = 0,  //!< Default compute mode -- multiple contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_THREAD  = 1,  //!< Support Removed
    NVML_COMPUTEMODE_PROHIBITED        = 2,  //!< Compute-prohibited mode -- no contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,  //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
   
    // Keep this last: with the explicit values above it evaluates to 4,
    // i.e. the number of compute modes listed.
    NVML_COMPUTEMODE_COUNT
} nvmlComputeMode_t;

        ·nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode)可以修改顯卡的模式;

        ·nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)查詢溫度閾值,具體有兩種:

// Selects which temperature threshold nvmlDeviceGetTemperatureThreshold reports
// (excerpt quoted from nvml.h).
typedef enum nvmlTemperatureThresholds_enum
{
    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,    // Temperature at which the GPU will shut down for HW protection
    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,    // Temperature at which the GPU will begin slowdown
    // Keep this last: evaluates to 2, the number of threshold kinds listed above.
    NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;

當溫度達到NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 參數獲取的溫度時,顯卡將自動關閉以保護硬件;當溫度達到NVML_TEMPERATURE_THRESHOLD_SLOWDOWN參數獲取的溫度時,顯卡的性能將降低。

        ·nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp)獲取顯卡當前溫度;

        ·nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization)獲取設備的使用率(原註釋:Retrieves the current utilization rates for the device's major subsystems。不知道理解錯了沒有),使用率包括如下:

// Utilization figures filled in by nvmlDeviceGetUtilizationRates
// (excerpt quoted from nvml.h). Both members are percentages of time,
// not amounts of memory.
typedef struct nvmlUtilization_st
{
    unsigned int gpu;                //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
    unsigned int memory;             //!< Percent of time over the past sample period during which global (device) memory was being read or written
} nvmlUtilization_t;

        ·nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)    Retrieves the amount of used, free and total memory available on the device, in bytes。

        ·nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory)   Gets Total, Available and Used size of BAR1 memory.(不知道這種與上一種有什麼區別,有待後續學習)

       ·nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos)    Get information about processes with a compute context on a device。應該是獲取當前在使用顯卡的程序信息。

        ·nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock)   Retrieves the maximum clock speeds for the device。包括如下:

// Clock domains that can be passed as the `type` argument of
// nvmlDeviceGetClockInfo and nvmlDeviceGetMaxClockInfo (excerpt quoted from nvml.h).
typedef enum nvmlClockType_enum
{
    NVML_CLOCK_GRAPHICS  = 0,        //!< Graphics clock domain
    NVML_CLOCK_SM        = 1,        //!< SM clock domain
    NVML_CLOCK_MEM       = 2,        //!< Memory clock domain
    NVML_CLOCK_VIDEO     = 3,        //!< Video encoder/decoder clock domain
   
    // Keep this last
    NVML_CLOCK_COUNT //!< Count of clock types
} nvmlClockType_t;

        ·nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock)   Retrieves the current clock speeds for the device.上面是獲取最大的,這個是獲取當前的。

代碼示例:

#include "cuda_kernels.h"

#include "nvml.h"

#include <stdio.h>  
#include <windows.h>  
#include <winbase.h>  
#include <tlhelp32.h>  
#include <psapi.h>
#pragma comment(lib,"kernel32.lib")  
#pragma comment(lib,"advapi32.lib")  

#pragma  comment(lib,"nvml.lib")

/*
 * Translate an NVML compute mode into a short printable label.
 *
 * mode: the compute mode to describe.
 * Returns a pointer to a string literal (never NULL, must not be freed);
 * any value that is not one of the four named modes yields "Unknown".
 */
const char * convertToComputeModeString(nvmlComputeMode_t mode)
{
    const char *label = "Unknown";

    if (mode == NVML_COMPUTEMODE_DEFAULT)
    {
        label = "Default";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
    {
        label = "Exclusive_Thread";
    }
    else if (mode == NVML_COMPUTEMODE_PROHIBITED)
    {
        label = "Prohibited";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
    {
        label = "Exclusive Process";
    }

    return label;
}

int main()
{
    cuAdd();

    nvmlReturn_t result;
    unsigned int device_count, i;

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));

        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
        goto Error;
    }
    printf("Found %d device%s\n\n", device_count, device_count != 1 ? "s" : "");

    printf("Listing devices:\n");
    while (true)
    {
        for (i = 0; i < device_count; i++)
        {
            nvmlDevice_t device;
            char name[NVML_DEVICE_NAME_BUFFER_SIZE];
            nvmlPciInfo_t pci;
            nvmlComputeMode_t compute_mode;

            // Query for device handle to perform operations on a device
            // You can also query device handle by other features like:
            // nvmlDeviceGetHandleBySerial
            // nvmlDeviceGetHandleByPciBusId
            result = nvmlDeviceGetHandleByIndex(i, &device);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result));
                goto Error;
            }

            result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result));
                goto Error;
            }

            // pci.busId is very useful to know which device physically you're talking to
            // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
            result = nvmlDeviceGetPciInfo(device, &pci);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get pci info for device %i: %s\n", i, nvmlErrorString(result));
                goto Error;
            }

            printf("%d. %s [%s]\n", i, name, pci.busId);

            // This is a simple example on how you can modify GPU's state
            result = nvmlDeviceGetComputeMode(device, &compute_mode);
            if (NVML_ERROR_NOT_SUPPORTED == result)
                printf("\t This is not CUDA capable device\n");
            else if (NVML_SUCCESS != result)
            {
                printf("Failed to get compute mode for device %i: %s\n", i, nvmlErrorString(result));
                goto Error;
            }
            else
            {
                // try to change compute mode
                printf("\t Changing device's compute mode from '%s' to '%s'\n",
                    convertToComputeModeString(compute_mode),
                    convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

                result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
                if (NVML_ERROR_NO_PERMISSION == result)
                    printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
                else if (NVML_ERROR_NOT_SUPPORTED == result)
                    printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                    "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
                else if (NVML_SUCCESS != result)
                {
                    printf("\t\t Failed to set compute mode for device %i: %s\n", i, nvmlErrorString(result));
                    goto Error;
                }
                else
                {
                    printf("\t Restoring device's compute mode back to '%s'\n",
                        convertToComputeModeString(compute_mode));
                    result = nvmlDeviceSetComputeMode(device, compute_mode);
                    if (NVML_SUCCESS != result)
                    {
                        printf("\t\t Failed to restore compute mode for device %i: %s\n", i, nvmlErrorString(result));
                        goto Error;
                    }
                }
            }

            printf("\n");
            printf("----- 溫度 ----- \n");
            unsigned int temperature_threshold = 100;
            result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temperature_threshold);
            if (NVML_SUCCESS != result)
            {
                printf("device %i Failed to get NVML_TEMPERATURE_THRESHOLD_SHUTDOWN: %s\n", i, nvmlErrorString(result));
            }
            else
                printf("截止溫度: %d 攝氏度  (Temperature at which the GPU will shut down for HW protection)\n", temperature_threshold);

            result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &temperature_threshold);
            if (NVML_SUCCESS != result)
            {
                printf("device %i Failed NVML_TEMPERATURE_THRESHOLD_SLOWDOWN: %s\n", i, nvmlErrorString(result));
            }
            else
                printf("上限溫度: %d 攝氏度  (Temperature at which the GPU will begin slowdown)\n", temperature_threshold);

            unsigned int temperature = 0;
            result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
            if (NVML_SUCCESS != result)
            {
                printf("device %i NVML_TEMPERATURE_GPU Failed: %s\n", i, nvmlErrorString(result));
            }
            else
                printf("當前溫度: %d 攝氏度 \n", temperature);

            //使用率
            printf("\n");
            nvmlUtilization_t utilization;
            result = nvmlDeviceGetUtilizationRates(device, &utilization);
            if (NVML_SUCCESS != result)
            {
                printf(" device %i nvmlDeviceGetUtilizationRates Failed : %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("----- 使用率 ----- \n");
                printf("GPU 使用率: %lld %% \n", utilization.gpu);
                printf("顯存使用率: %lld %% \n", utilization.memory);
            }

            //FB memory
            printf("\n");
            nvmlMemory_t memory;
            result = nvmlDeviceGetMemoryInfo(device, &memory);
            if (NVML_SUCCESS != result)
            {
                printf("device %i nvmlDeviceGetMemoryInfo Failed : %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("------ FB memory ------- \n");
                printf("Total installed FB memory: %lld bytes \n", memory.total);
                printf("Unallocated FB memory: %lld bytes \n", memory.free);
                printf("Allocated FB memory: %lld bytes \n", memory.used);
            }

            //BAR1 memory
            printf("\n");
            nvmlBAR1Memory_t bar1Memory;
            result = nvmlDeviceGetBAR1MemoryInfo(device, &bar1Memory);
            if (NVML_SUCCESS != result)
            {
                printf("device %i  nvmlDeviceGetBAR1MemoryInfo Failed : %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("------ BAR1 memory ------- \n");
                printf("Total BAR1 memory: %lld bytes \n", bar1Memory.bar1Total);
                printf("Unallocated BAR1 memory: %lld bytes \n", bar1Memory.bar1Free);
                printf("Allocated BAR1 memory: %lld bytes \n", bar1Memory.bar1Used);
            }

            //Information about running compute processes on the GPU
            printf("\n");
            unsigned int infoCount;
            nvmlProcessInfo_t infos[999];
            result = nvmlDeviceGetComputeRunningProcesses(device, &infoCount, infos);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get ComputeRunningProcesses for device %i: %s\n", i, nvmlErrorString(result));
            }
            else
            {
                HANDLE handle; //定義CreateToolhelp32Snapshot系統快照句柄       
                handle = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0);//得到系統快照句柄     
                PROCESSENTRY32 *info; //定義PROCESSENTRY32結構字指     
                //PROCESSENTRY32 結構的 dwSize 成員設置成 sizeof(PROCESSENTRY32)      
                info = new PROCESSENTRY32;
                info->dwSize = sizeof(PROCESSENTRY32);
                //調用一次     Process32First 函數,從快照中獲取進程列表     
                Process32First(handle, info);
                //重複調用 Process32Next,直到函數返回 FALSE 爲止    

                printf("------ Information about running compute processes on the GPU ------- \n");
                for (int i = 0; i < infoCount; i++)
                {
                    printf("PID: %d  顯存佔用:%lld bytes   ", infos[i].pid, infos[i].usedGpuMemory);

                    while (Process32Next(handle, info) != FALSE)
                    {
                        if (info->th32ProcessID == infos[i].pid)
                        {
                            //printf("  %s\n", info->szExeFile);

                            HANDLE hProcess = NULL;
                            //打開目標進程  
                            hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, info->th32ProcessID);
                            if (hProcess == NULL) {
                                printf("\nOpen Process fAiled:%d\n", GetLastError());
                                break;
                            }

                            char strFilePath[MAX_PATH];
                            GetModuleFileNameEx(hProcess, NULL, strFilePath, MAX_PATH);
                            printf(" %s\n", strFilePath);

                            CloseHandle(hProcess);

                            break;
                        }
                    }
                }
                
                delete info;
                CloseHandle(handle);
            }

            //BAR1 memory
            printf("\n");
            printf("------ Clocks ------- \n"); 
            unsigned int max_clock;
            result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_GRAPHICS, &max_clock);
            if (NVML_SUCCESS != result)
            {
                printf("device %i   nvmlDeviceGetMaxClockInfo Failed : %s\n", i, nvmlErrorString(result));
            }

            unsigned int clock;
            result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_GRAPHICS, &clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get NVML_CLOCK_GRAPHICS info for device %i: %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("GRAPHICS: %6d Mhz   max clock :%d  \n", clock, max_clock);
            }

            result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &max_clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get max NVML_CLOCK_SM for device %i: %s\n", i, nvmlErrorString(result));
            }

            result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get current NVML_CLOCK_SM for device %i: %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("      SM: %6d Mhz   max clock :%d   \n", clock, max_clock);
            }

            result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_MEM, &max_clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get max NVML_CLOCK_MEM for device %i: %s\n", i, nvmlErrorString(result));
            }

            result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get current NVML_CLOCK_MEM for device %i: %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("     MEM: %6d Mhz   max clock :%d   \n", clock, max_clock);
            }

            result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_VIDEO, &max_clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get max NVML_CLOCK_VIDEO for device %i: %s\n", i, nvmlErrorString(result));
            }

            result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_VIDEO, &clock);
            if (NVML_SUCCESS != result)
            {
                printf("Failed to get current NVML_CLOCK_VIDEO for device %i: %s\n", i, nvmlErrorString(result));
            }
            else
            {
                printf("   VIDEO: %6d Mhz   max clock :%d   \n", clock, max_clock);
            }
        }

        printf("-------------------------------------------------------------------- \n");

        Sleep(1000);
    }

Error:
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    system("pause");

    return 0;
}

雖然我已經把nvml.dll拷貝到運行目錄,程序應該是可以正常運行了。也做一下nvidia-smi的環境配置,參考《NVIDIA 顯卡信息(CUDA信息的查看)》,我把他的複製到下面來:

1. nvidia-smi 查看顯卡信息

nvidia-smi 指的是 NVIDIA System Management Interface;

在安裝完成 NVIDIA 顯卡驅動之後,對於 windows 用戶而言,cmd 命令行界面還無法識別 nvidia-smi 命令,需要將相關環境變量添加進去。如將 NVIDIA 顯卡驅動安裝在默認位置,nvidia-smi 命令所在的完整路徑應當爲:

C:\Program Files\NVIDIA Corporation\NVSMI

也即將上述路徑添加進 Path 系統環境變量中。

2. 查看 CUDA 信息

  • CUDA 的版本:
    • 進入命令行:nvcc -V

3.運行結果

image

圖4.GeForce 940M查詢結果

P4測試

圖5.Tesla P4查詢結果

        NVML對GeForce 940M的支持不怎麼好,對Tesla P4支持得比較好。


工程源碼:http://download.csdn.net/download/qq_33892166/9841800

相關文章
相關標籤/搜索