1.0 4KB 2.0 16KB
1.0 16KB 2.0 48KB
float shared=data[base+tid]; // base: index of the first element accessed; tid: thread index
// Access pattern in which thread `tid` reads data[base + 4*tid], i.e.
// consecutive thread indices read elements that are 4 apart.
// NOTE(review): presumably shown as a contrast to the contiguous
// data[base+tid] access pattern — confirm the intended point.
float shared = data[base+4*tid];
// Two alternative indexing snippets written back to back (`number` is declared
// twice, so they are not meant to be compiled together as one scope).
//   1) Each thread copies global[tid] to shared[tid], then reads
//      shared[tid*16], so consecutive threads read elements 16 apart.
//   2) tid is split into nRow = tid/16 and nColumn = tid%16 and the value is
//      stored at shared[nColumn*17 + nRow] (row pitch of 17 instead of 16);
//      each thread then reads shared[17*tid].
// NOTE(review): presumably this demonstrates padding the pitch from 16 to 17
// to avoid shared-memory bank conflicts — confirm. `shared` and `global` are
// declared outside this snippet.
shared[tid]=global[tid]; int number = shared[tid*16]; int nRow = tid/16; int nColumn = tid%16; shared[nColumn*17+nRow] = global[tid]; int number = shared[17*tid];
/*
 * Per-thread partial sum of squares, indexed by threadIdx.x only.
 *
 * Thread t accumulates pnNumber[i] * pnNumber[i] for
 * i = t, t + THREAD_NUM, t + 2*THREAD_NUM, ... while i < DATA_SIZE, and
 * stores its partial sum in pnResult[t]. The partial sums are not combined
 * here.
 *
 * Thread 0 additionally samples clock() before its loop and writes the
 * difference between a second sample and the first to *pclock_tTime.
 *
 * pnNumber      input values; read at indices below DATA_SIZE
 * pnResult      output; one partial sum per thread index
 * pclock_tTime  output; elapsed clock() ticks as observed by thread 0
 *
 * NOTE(review): block indices are ignored, so this presumably expects a
 * single-block launch with THREAD_NUM threads — confirm at the call site.
 */
__global__ static void sumof(int *pnNumber,int* pnResult,clock_t* pclock_tTime){
    const int lane = threadIdx.x;
    const bool isTimingThread = (lane == 0);

    // Only the timing thread ever assigns or reads this value.
    clock_t ticksAtStart;
    if (isTimingThread) {
        ticksAtStart = clock();
    }

    int partial = 0;
    int idx = lane;
    while (idx < DATA_SIZE) {
        const int value = pnNumber[idx];
        partial += value * value;
        idx += THREAD_NUM;
    }

    pnResult[lane] = partial;

    if (isTimingThread) {
        *pclock_tTime = clock() - ticksAtStart;
    }
}
/*
 * Per-thread partial sum of squares, indexed by block and thread.
 *
 * The thread with block index `bid` and thread index `tid` starts at element
 * bid*THREAD_NUM + tid and advances by BLOCK_NUM*THREAD_NUM while the index
 * is below DATA_SIZE, accumulating pnNumber[i] * pnNumber[i]. Its partial sum
 * is stored in pnResult[bid*THREAD_NUM + tid]; the partial sums are not
 * combined here.
 *
 * Timing: thread 0 of each block stores a clock() sample taken before its
 * loop in pclock_tTime[bid] and one taken after its result store in
 * pclock_tTime[bid + BLOCK_NUM].
 *
 * pnNumber      input values; read at indices below DATA_SIZE
 * pnResult      output; needs room for BLOCK_NUM*THREAD_NUM partial sums
 * pclock_tTime  output; needs room for 2*BLOCK_NUM clock_t samples
 *               (start samples first, then end samples)
 *
 * NOTE(review): the indexing presumably expects a launch of BLOCK_NUM blocks
 * of THREAD_NUM threads — confirm at the call site.
 */
__global__ static void sumof(int *pnNumber,int* pnResult,clock_t* pclock_tTime){
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int nSum = 0;

    // Start sample for this block.
    if(tid == 0) pclock_tTime[bid] = clock();

    for(int i = bid*THREAD_NUM+tid; i<DATA_SIZE; i+=BLOCK_NUM*THREAD_NUM){
        nSum += pnNumber[i]*pnNumber[i];
    }
    pnResult[bid*THREAD_NUM+tid] = nSum;

    // End sample for this block. The array element is assigned directly:
    // pclock_tTime[...] is already a clock_t, so it must not be dereferenced.
    if(tid == 0) pclock_tTime[bid+BLOCK_NUM] = clock();
}
// Pairwise reduction over nshared: on each pass, every thread with
// tid < noffset adds nshared[tid + noffset] into nshared[tid], then the
// offset is halved until it reaches 0.
// The halving and the barrier must be inside the loop: otherwise noffset never
// changes and the loop cannot terminate. __syncthreads() is placed outside the
// `if` so that every thread of the block reaches it on every pass.
// NOTE(review): assumes nshared was fully written and synchronized before this
// point, and that THREAD_NUM is a power of two so that no element is skipped
// when the offset is halved — confirm against the surrounding kernel.
noffset = THREAD_NUM/2;
while(noffset > 0){
    if(tid < noffset) nshared[tid] += nshared[tid+noffset];
    noffset >>= 1;
    __syncthreads();
}