多GPU計算

多GPU計算可以說是只要是個成熟的模型,都會使用到的技術。

例如:

gluoncv:https://github.com/dmlc/gluon-cv/blob/master/scripts/detection/faster_rcnn/train_faster_rcnn.py#L218

多GPU計算最常用的方法是:數據並行

流程如下圖:

  • 模型參數複製多份
  • 批量數據,分成多份子集,在各自顯卡的顯存上計算梯度
  • 再累加到一塊顯卡的顯存上
  • 最後廣播到各個顯存上
import mxnet as mx
from mxnet import autograd, nd
from mxnet.gluon import nn,loss as gloss
import d2lzh as d2l

# Standard deviation of the normal distribution the weights are drawn from.
scale = 0.01

# Biases all start at zero: one entry per output channel / unit of each layer.
b1, b2, b3, b4 = (nd.zeros(shape=width) for width in (20, 50, 128, 10))

# Weights are drawn in layer order (W1..W4), so the sequence of random draws
# is the same as when weights and biases were interleaved.
W1 = nd.random.normal(shape=(20, 1, 3, 3), scale=scale)   # conv 1: 20 filters, 3x3
W2 = nd.random.normal(shape=(50, 20, 5, 5), scale=scale)  # conv 2: 50 filters, 5x5
W3 = nd.random.normal(shape=(800, 128), scale=scale)      # dense 1: 800 -> 128
W4 = nd.random.normal(shape=(128, 10), scale=scale)       # dense 2: 128 -> 10

# Flat parameter list; lenet() reads the entries by position.
params = [W1, b1, W2, b2, W3, b3, W4, b4]

def lenet(X, params):
    """Run the LeNet-style forward pass on ``X`` with explicit parameters.

    Args:
        X: Input batch passed to the first convolution.
        params: Sequence read by position as
            ``[conv1 weight, conv1 bias, conv2 weight, conv2 bias,
            dense1 weight, dense1 bias, dense2 weight, dense2 bias]``.

    Returns:
        The output of the second dense layer. No softmax is applied here.
    """
    conv1_w, conv1_b = params[0], params[1]
    conv2_w, conv2_b = params[2], params[3]
    dense1_w, dense1_b = params[4], params[5]
    dense2_w, dense2_b = params[6], params[7]

    def conv_relu_pool(inputs, weight, bias, kernel, num_filter):
        # One feature-extraction stage: convolution, ReLU, 2x2 average pooling.
        convolved = nd.Convolution(data=inputs, weight=weight, bias=bias,
                                   kernel=kernel, num_filter=num_filter)
        return nd.Pooling(data=nd.relu(convolved), pool_type='avg',
                          kernel=(2, 2), stride=(2, 2))

    features = conv_relu_pool(X, conv1_w, conv1_b, (3, 3), 20)
    features = conv_relu_pool(features, conv2_w, conv2_b, (5, 5), 50)

    # Collapse each sample to a vector before the dense layers.
    flat = nd.flatten(features)
    hidden = nd.relu(nd.dot(flat, dense1_w) + dense1_b)
    return nd.dot(hidden, dense2_w) + dense2_b

# Gluon softmax cross-entropy loss; train_batch applies it to lenet's outputs.
loss = gloss.SoftmaxCrossEntropyLoss()

# Synchronisation between multiple GPUs.
# get_params is the helper used to copy the model parameters onto a device
# such as gpu(0).
def get_params(params, ctx):
    """Copy every parameter to ``ctx`` and enable gradients on the copies.

    Args:
        params: Iterable of arrays exposing ``copyto``.
        ctx: Target device context handed to ``copyto``.

    Returns:
        A new list holding the copies, in the same order as ``params``.
        ``attach_grad()`` has been called on each copy; the source arrays
        are not modified by this function.
    """
    replicas = []
    for source in params:
        replicas.append(source.copyto(ctx))
    # Gradients are attached only after all copies exist, as before.
    for replica in replicas:
        replica.attach_grad()
    return replicas

# Demo: replicate the module-level parameters on gpu(0).
# This runs at import time and therefore needs that GPU to be available.
new_params = get_params(params,mx.gpu(0))

# Given data spread across the memory of several GPUs, add everything up
# and then broadcast the total back to every GPU.
def allreduce(data):
    """Sum a list of arrays in place and give every entry the total.

    Args:
        data: List of arrays, typically one per device. Each array must
            expose ``context`` and ``copyto``. After the call every entry
            holds the element-wise sum of all the original entries.

    Returns:
        None. The arrays in ``data`` are modified in place. An empty or
        single-element list is left untouched.
    """
    other_slots = range(1, len(data))

    # Reduce: accumulate every other array into data[0] on data[0]'s device.
    for slot in other_slots:
        accumulator = data[0]
        accumulator[:] += data[slot].copyto(accumulator.context)

    # Broadcast: overwrite every other array with the accumulated total.
    for slot in other_slots:
        data[0].copyto(data[slot])

# data = [nd.ones((1,2), ctx=mx.gpu(i)) * (i+1) for i in range(2)]
# print(data)

# Split a mini-batch along its first axis and copy one shard to each device.
def split_and_load(data, ctx):
    """Partition ``data`` into ``len(ctx)`` contiguous shards, one per device.

    Previously the shard size was ``n // k`` for every device, so the last
    ``n % k`` samples were silently dropped whenever the batch size was not
    a multiple of the device count. The remainder is now spread over the
    leading shards (their sizes differ by at most one), so every sample is
    used. For evenly divisible batches the result is unchanged.

    Args:
        data: Array-like with ``shape``, slicing along the first axis and
            ``as_in_context``.
        ctx: List of device contexts; shard ``i`` is placed on ``ctx[i]``.

    Returns:
        A list of ``len(ctx)`` shards, in order, each moved to its device.

    Raises:
        ZeroDivisionError: If ``ctx`` is empty (same as before).

    Note:
        If there are fewer samples than devices, the trailing shards are
        empty slices; how MXNet treats those is not verified here.
    """
    n, k = data.shape[0], len(ctx)
    base, extra = divmod(n, k)

    shards = []
    start = 0
    for position, device in enumerate(ctx):
        # The first `extra` shards take one additional sample each.
        stop = start + base + (1 if position < extra else 0)
        shards.append(data[start:stop].as_in_context(device))
        start = stop
    return shards

# Demo: build a 6x4 batch and split it across two GPUs.
# Runs at import time and needs both gpu(0) and gpu(1).
batch = nd.arange(24).reshape((6,4))
ctx = [mx.gpu(0),mx.gpu(1)]
# With 6 rows and 2 devices, each device receives 3 rows.
splitted = split_and_load(batch,ctx)

# Multi-GPU training on a single mini-batch.
def train_batch(X, y, gpu_params, ctx, lr):
    """Run one data-parallel training step on a single mini-batch.

    Args:
        X: Mini-batch of inputs; split across ``ctx`` along the first axis.
        y: Labels matching ``X``; split the same way.
        gpu_params: One parameter list per device, in the order of ``ctx``.
            Entries must have gradients attached (see ``get_params``).
        ctx: List of device contexts.
        lr: Learning rate forwarded to ``d2l.sgd``.

    Returns:
        None. The parameters in ``gpu_params`` are updated by ``d2l.sgd``.
    """
    x_shards = split_and_load(X, ctx)
    y_shards = split_and_load(y, ctx)

    # Forward pass: each device evaluates the loss on its own shard using
    # its own copy of the parameters.
    with autograd.record():
        shard_losses = []
        for x_part, y_part, device_params in zip(x_shards, y_shards,
                                                 gpu_params):
            shard_losses.append(loss(lenet(x_part, device_params), y_part))

    # Backward pass, run separately for every device's loss.
    for shard_loss in shard_losses:
        shard_loss.backward()

    # Sum the gradient of each parameter over all devices and broadcast the
    # total back, so every device ends up holding the same gradient.
    device_count = len(ctx)
    for slot in range(len(gpu_params[0])):
        allreduce([gpu_params[dev][slot].grad for dev in range(device_count)])

    # Update each device's parameter copy independently. The size of the
    # whole mini-batch is passed as the third argument
    # (presumably d2l.sgd divides the gradient by it -- TODO confirm).
    for device_params in gpu_params:
        d2l.sgd(device_params, lr, X.shape[0])

import time
# Define the training routine.
def train(num_gpus, batch_size, lr):
    """Train the model on ``num_gpus`` GPUs and print per-epoch results.

    Data comes from ``d2l.load_data_fashion_mnist``. The module-level
    ``params`` are replicated on every GPU, then four epochs of
    data-parallel training are run via ``train_batch``. After each epoch
    the elapsed training time and the test accuracy are printed; the
    accuracy is evaluated on the first GPU.

    Args:
        num_gpus: Number of GPUs to use; devices ``gpu(0)`` to
            ``gpu(num_gpus - 1)`` are selected.
        batch_size: Mini-batch size passed to the data loader.
        lr: Learning rate passed to ``train_batch``.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = list(map(mx.gpu, range(num_gpus)))
    print('running on:', ctx)

    # Copy the model parameters into the memory of every GPU.
    gpu_params = []
    for device in ctx:
        gpu_params.append(get_params(params, device))

    def net(x):
        # Evaluation uses the parameter copy held by the first GPU.
        return lenet(x, gpu_params[0])

    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            # Multi-GPU training step on one mini-batch.
            train_batch(X, y, gpu_params, ctx, lr)
            # Wait for pending MXNet work so the timing includes it.
            nd.waitall()
        train_time = time.time() - start

        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test acc %.2f'%(epoch+1, train_time, test_acc))

# Run the experiment on 2 GPUs with mini-batches of 256 and learning rate 0.2.
# This executes whenever the file is run or imported.
train(num_gpus=2, batch_size=256, lr=0.2)
相關文章
相關標籤/搜索