nvidia-smi 命令可以查看 GPU 的使用狀況,但只能看到佔用每個 GPU 的進程 ID。根據進程 ID 可以取得進程詳情,進程詳情中包含用戶 ID,再根據用戶 ID 取得用戶名稱,從而知道是哪個用戶在使用 GPU。以下是 Python 實現:
"""Find out who is using the GPUs.

``nvidia-smi`` only reports the PID of each process that holds a GPU.  This
script resolves every PID to its owning user with ``ps`` and ``id`` so the
report names the users occupying each card.  It can also wait for a free GPU
(``grep``) or pin a command to a given GPU index.
"""
import json
import os
import re
import subprocess
import sys
import time
import typing

try:
    import bidict
except ImportError:
    # bidict only provides the reversible uid <-> name container; the script
    # itself only looks up uid -> name, so a plain dict is a usable fallback.
    bidict = None

USAGE = """
    GPU utility
    gpu show
    gpu grep your command here
    gpu 1 python haha.py
    """

# Matches the leading "uid=1000(alice)" part of the output of ``id``.
_ID_PATTERN = re.compile(r"uid=(\d+)\(([^)]+)\)")
# Matches the first row of a GPU entry, e.g. "|   0  Tesla K80 ...".  The
# following rows start with "N/A", a percentage or blanks and do not match.
_GPU_ROW_PATTERN = re.compile(r"^\|\s+(\d+)\s")
# Seconds to wait between two polls while every GPU is busy.
_POLL_SECONDS = 2


def _run_command(args: typing.List[str], quiet: bool = False) -> str:
    """Run *args* without a shell and return its standard output.

    :param args: program name followed by its arguments.
    :param quiet: discard the program's stderr instead of inheriting it.
    :return: decoded stdout, or ``''`` when the program cannot be started.
    """
    try:
        done = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL if quiet else None,
            universal_newlines=True,
        )
    except OSError:
        # Missing executable etc.: behave like a command that printed nothing.
        return ''
    return done.stdout or ''


def parse_id_output(info: str) -> typing.Optional[typing.Tuple[str, str]]:
    """Extract ``(uid, user_name)`` from the output of ``id <name>``.

    :return: the pair as strings, or ``None`` when *info* has no ``uid=`` part
        (for example an error message for an unknown user).
    """
    match = _ID_PATTERN.search(info)
    if match is None:
        return None
    return match.group(1), match.group(2)


def get_user_id_map() -> typing.Dict[str, str]:
    """Return the mapping ``user id -> user name`` for local home directories.

    Every entry next to the current user's home directory is treated as a
    candidate user name and resolved with ``id``; entries that are not users
    are skipped.  The result is a ``bidict`` when that package is installed
    and a plain ``dict`` otherwise.
    """
    home = os.path.expanduser('~')
    users = bidict.bidict() if bidict is not None else {}
    try:
        candidates = os.listdir(os.path.join(home, '..'))
    except OSError:
        # Parent of the home directory is not listable: no names to resolve.
        return users
    for user_name in candidates:
        # "--" keeps a directory name starting with "-" from being an option.
        parsed = parse_id_output(_run_command(['id', '--', user_name], quiet=True))
        if parsed is None:
            continue
        uid, name = parsed
        try:
            users[uid] = name  # userid ==> username
        except Exception as e:
            # Best effort: a bidict rejects conflicting pairs; report and go on.
            print(e)
    return users


def parse_nvidia_smi(
        output: str) -> typing.Tuple[int, typing.Dict[int, typing.List[str]]]:
    """Parse the text printed by ``nvidia-smi``.

    The report has two parts: an upper table with one entry per GPU and a
    lower "Processes:" table listing the PIDs running on each GPU.

    :param output: complete stdout of ``nvidia-smi`` (may be empty).
    :return: ``(gpu_count, usage)`` where *usage* maps a GPU index to the list
        of PIDs (as strings) running on it.  GPUs without processes are absent.
    """
    lines = output.split('\n')

    # Count GPUs by their index rows between the first "|====" separator and
    # the blank line that ends the upper table.  Counting rows instead of
    # dividing the table height keeps this independent of lines-per-GPU.
    gpu_count = 0
    in_table = False
    for line in lines:
        if not in_table:
            in_table = line.startswith('|====')
        elif not line.strip():
            break
        elif _GPU_ROW_PATTERN.match(line):
            gpu_count += 1

    # Locate the banner that opens the process table.
    processes_at = None
    for index, line in enumerate(lines):
        tokens = line.split()
        if len(tokens) > 1 and tokens[1] == 'Processes:':
            processes_at = index
            break

    gpu_usage = {}  # type: typing.Dict[int, typing.List[str]]
    if processes_at is None:
        return gpu_count, gpu_usage

    # The column header follows the banner.  The PID column is looked up by
    # name because some layouts put extra columns between GPU and PID.
    header = lines[processes_at + 1].split() if processes_at + 1 < len(lines) else []
    pid_column = header.index('PID') if 'PID' in header else 2

    for line in lines[processes_at + 2:]:
        tokens = line.split()
        if len(tokens) <= pid_column:
            continue
        gpu_token, pid_token = tokens[1], tokens[pid_column]
        if not (gpu_token.isdecimal() and pid_token.isdecimal()):
            # Header continuation, separator or "No running processes found".
            continue
        gpu_usage.setdefault(int(gpu_token), []).append(pid_token)
    return gpu_count, gpu_usage


def nvidia_smi() -> typing.Tuple[int, typing.Dict[int, typing.List[str]]]:
    """Run ``nvidia-smi`` and return the GPU count and the PIDs on each GPU.

    See :func:`parse_nvidia_smi` for the shape of the result.  When the
    command cannot be run the result is ``(0, {})``.
    """
    return parse_nvidia_smi(_run_command(['nvidia-smi']))


def parse_ps_output(output: str, thread_id: str,
                    id2user: typing.Mapping[str, str]) -> dict:
    """Turn the output of ``ps -l <pid>`` into a description of the process.

    :param output: stdout of ``ps -l`` (a header row plus one process row).
    :param thread_id: the PID that was queried; echoed in the result.
    :param id2user: mapping from user id (string) to user name.
    :return: dict with keys ``user``, ``use_time``, ``thread_id`` and ``cmd``.
        ``user``, ``use_time`` and ``cmd`` are ``None`` when the process row
        is missing, e.g. because the process has already exited.
    """
    rows = output.split('\n')
    fields = rows[1].split() if len(rows) > 1 else []
    if len(fields) < 13:
        return dict(user=None, use_time=None, thread_id=thread_id, cmd=None)

    # Column layout relied on here: index 2 = UID, 12 = TIME, 13+ = command.
    minutes = re.search(r'\d+', fields[12])
    # NOTE(review): assumes TIME starts with a minute count (MMM:SS) - confirm
    # for the local ps before trusting the "hours" figure.
    use_time = None if minutes is None else "{} hours".format(
        float(minutes.group()) / 60)
    return dict(user=id2user.get(fields[2]),
                use_time=use_time,
                thread_id=thread_id,
                cmd=' '.join(fields[13:]))


def get_thread_info(thread_id: str,
                    id2user: typing.Optional[typing.Mapping[str, str]] = None):
    """Return details (user, CPU time, command) of the process *thread_id*.

    :param thread_id: PID of the process, as a string.
    :param id2user: optional pre-built ``uid -> name`` mapping; built with
        :func:`get_user_id_map` when omitted.  Passing it avoids re-resolving
        every user for each process.
    :return: see :func:`parse_ps_output`.
    """
    if id2user is None:
        id2user = get_user_id_map()
    return parse_ps_output(_run_command(['ps', '-l', str(thread_id)]),
                           thread_id, id2user)


def grep_gpu(task):
    """Wait until some GPU has no process on it, then run *task* on that GPU.

    :param task: shell command line to execute once a GPU is free.
    """
    while True:
        gpu_count, usage = nvidia_smi()
        free_gpu = next((i for i in range(gpu_count) if i not in usage), None)
        if free_gpu is not None:
            break
        # Only wait when nothing is free, so a found GPU is used immediately.
        time.sleep(_POLL_SECONDS)
    print('free gpu found ! ', free_gpu)
    # The index comes from nvidia-smi, which enumerates in PCI bus order; ask
    # CUDA for the same order unless the user already chose one.
    os.environ.setdefault('CUDA_DEVICE_ORDER', 'PCI_BUS_ID')
    run(free_gpu, task)


def show():
    """Print the GPU count and, per GPU, details of the processes using it."""
    gpu_count, usage = nvidia_smi()
    # Resolve the user names once for all processes.
    id2user = get_user_id_map() if usage else {}
    details = {
        gpu_id: [get_thread_info(thread_id, id2user) for thread_id in pids]
        for gpu_id, pids in usage.items()
    }
    print('gpu count', gpu_count)
    print(json.dumps(details, ensure_ascii=False, indent=2))


def run(gpu_id, task):
    """Run the shell command *task* with only GPU *gpu_id* visible to CUDA."""
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    os.system('echo CUDA_VISIBLE_DEVICES:$CUDA_VISIBLE_DEVICES')
    os.system(task)


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``."""
    argv = sys.argv if argv is None else argv
    print(argv)
    if len(argv) == 1:
        print(USAGE)
        return
    action = argv[1]
    cmd = ' '.join(argv[2:])
    if action == 'show':
        # Show the current GPU usage.
        show()
    elif action == 'grep':
        # Compete for a GPU and run the command once one is obtained.
        print('grep gpu and run', cmd)
        grep_gpu(cmd)
    elif re.fullmatch(r"\d+", action):
        # Run the command on the GPU with the given index.
        gpu_id = int(action)
        print('run on gpu', gpu_id, 'cmd', cmd)
        run(gpu_id, cmd)
    else:
        print("unkown command")


if __name__ == '__main__':
    main()