cuda學習2-block與thread數量的選取

時間 2019-12-18

標籤 cuda 學習 block thread 數量選取简体版

原文原文鏈接

　　由上一節可知，在main函數中，cuda程序的並行能力是在add<<<N,1>>>( dev_a, dev_b, dev_c )函數中體現的，這裏面設置的是由N個block構成的計算網絡即grid，每個block裏面有1個thread存在。那麼這種選取有什麼用意呢，如何針對自己的計算問題設置計算網絡呢？

　　首先要說明這兩個數的選取沒有固定的方法，完全是根據自身需求。其實它的完整形式是Kernel<<<Dg, Db, Ns, S>>>(param list); <<<>>>運算符內是核函數的執行參數，告訴編譯器運行時如何啓動核函數，用於說明內核函數中的線程數量，以及線程是如何組織的。

　　參數Dg用於定義整個grid的維度和尺寸，即一個grid有多少個block。爲dim3類型。dim3 Dg(Dg.x, Dg.y, 1)表示grid中每行有Dg.x個block，每列有Dg.y個block，第三維恆爲1。整個grid中共有Dg.x*Dg.y個block，其中Dg.x和Dg.y最大值爲65535。

　　參數Db用於定義一個block的維度和尺寸，即一個block有多少個thread。爲dim3類型。dim3 Db(Db.x, Db.y, Db.z)表示整個block中每行有Db.x個thread，每列有Db.y個thread，高度爲Db.z。Db.x和Db.y最大值爲512，Db.z最大值爲64。一個block中共有Db.x*Db.y*Db.z個thread，在計算能力爲1.x的硬件上該乘積的最大值爲512（計算能力2.0及以上爲1024）。另外，每個多處理器（SM）上可同時駐留的線程總數，在計算能力爲1.0、1.1的硬件上最大爲768，在計算能力爲1.2、1.3的硬件上最大爲1024。

　　參數Ns是一個可選參數，用於設置每一個block除了靜態分配的shared memory（之後會學習到）之外，最多能動態分配的shared memory大小，單位爲byte。不需要動態分配時該值爲0或省略不寫。

　　參數S是一個cudaStream_t類型的可選參數，初始值爲零，表示該核函數處在哪一個流（之後會學習到）之中。

　　在這個例子中，因爲計算很簡單，就選了<<<N,1>>>這種搭配。現在我們看一個複雜一點的例子。

　　這個例子是說要計算兩個任意長的向量的加法，可能會比65535長，超過了block數的最大範圍，甚至於比65535×512（thread上限）還長，應該怎麼辦呢？下面就用<<<128,128>>>的計算網絡來搞定。

　　核函數改爲如下：

1 __global__ void add( int *a, int *b, int *c ) {       // kernel: c[i] = a[i] + b[i] for every i < N
2     int tid = threadIdx.x + blockIdx.x * blockDim.x;  // this thread's global index in the 1-D grid
3     while (tid < N) {                                 // loop while this thread still has an element to process
4         c[tid] = a[tid] + b[tid];
5         tid += blockDim.x * gridDim.x;                // advance by the total number of launched threads
6     }
7 }

　　這段代碼的精髓就在於它是一個循環，當編號爲tid = threadIdx.x + blockIdx.x * blockDim.x的線程進行加法運算以後，tid += blockDim.x * gridDim.x；如果tid<N，則這個線程再做一次加法，依次循環下去。由於計算網絡只有blockDim.x * gridDim.x這麼大（此例爲128×128），那麼那些下標大於等於blockDim.x * gridDim.x並且小於N的數組分量的相加任務就需要繼續分配給各個線程，如上就是用循環來分配的。

　　任意長度向量相加完整代碼：

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */


#include "../common/book.h"

#define N   (33 * 1024)

// Element-wise vector sum kernel: writes c[i] = a[i] + b[i] for every
// index i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and then steps forward by the total number of launched threads
// (blockDim.x * gridDim.x), so the launch does not need to supply one
// thread per element.
// Uses no shared memory. a, b and c are indexed up to N - 1.
__global__ void add( int *a, int *b, int *c ) {
    // Total threads in the grid; kept unsigned, matching the built-in dims.
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}

// Host driver: fills two N-element integer vectors on the CPU, adds them
// on the GPU with the add kernel, copies the result back and compares it
// element by element with the sum computed on the host.
//
// Returns 0 when every element matches; returns 1 when a host allocation
// fails or when at least one element differs.
int main( void ) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "Host memory allocation failed\n" );
        // free(NULL) is a no-op, so releasing all three is safe here
        free( a );
        free( b );
        free( c );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );

    // 128 blocks of 128 threads; the kernel loops to cover all N elements
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status itself, so query it explicitly
    HANDLE_ERROR( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost ) );

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error:  %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
    }
    if (success)    printf( "We did it!\n" );

    // free the memory we allocated on the GPU
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    // report a verification failure through the exit status as well
    return success ? 0 : 1;
}