當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

CUDA性能优化----线程配置

發布時間：2025/3/15 编程问答 33 豆豆

生活随笔收集整理的這篇文章主要介紹了 CUDA性能优化----线程配置小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

CUDA性能優化----線程配置

2017-01-12 14:19:29 | 分類：HPC&CUDA優化 | 標簽：cuda、gpu、hpc | 舉報 | 字號 訂閱

下載LOFTER 我的照片書 | 前言：CUDA線程的組織形式（block的維度配置）對程序的性能影響是至關重要的。
線程索引：矩陣在memory中是row-major線性存儲的。在kernel里，線程的唯一索引非常有用，為了確定一個線程的索引，需要（以2D為例）：

線程和block索引
矩陣中元素坐標
線性global memory 的偏移

首先可以將thread和block索引映射到矩陣坐標：

    ix = threadIdx.x + blockIdx.x * blockDim.x
    iy = threadIdx.y + blockIdx.y * blockDim.y

之后可以利用上述變量計算線性地址：

    idx = iy * nx + ix

上圖展示了block和thread索引，矩陣坐標以及線性地址之間的關系。謹記，相鄰的thread擁有連續的threadIdx.x，也就是索引為(0,0)(1,0)(2,0)(3,0)...的thread連續，而不是(0,0)(0,1)(0,2)(0,3)...連續，跟我們線性代數里玩矩陣的時候不一樣。
下面我們以2D矩陣相加為例，來測試CUDA線程配置（block的大小和數量）對程序性能的影響，這里以2D grid和2D block為例。測試環境：Tesla M2070一塊，CUDA 6.0，操作系統：Red Hat 4.1.2-50，gcc version 4.1.2 20080704。測試代碼：

// Threads assign test: element-wise addition of two 2D matrices, used to
// compare how different block/grid configurations affect kernel run time.
//
// Usage: ./sumMatrix2D [dimx [dimy]]
//   dimx, dimy - optional block dimensions (default 32 x 32).
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define PRECISION 1e-5
#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))

// Prints the CUDA error string with its source location and terminates the
// process when `err` is anything other than cudaSuccess.
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

// Computes d_MatC = d_MatA + d_MatB element-wise for an nx-by-ny row-major
// matrix stored linearly.
//
// Launch layout: 2D grid of 2D blocks; each thread handles one element at
// column idx (x direction) and row idy (y direction). The grid must cover at
// least nx threads in x and ny threads in y; surplus threads do nothing.
// No shared memory is used.
__global__ void sumMatrix2DKernel(const float *d_MatA, const float *d_MatB, float *d_MatC, int nx, int ny)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;

    if (idx < nx && idy < ny)
    {
        // Widen before multiplying so the linear offset cannot overflow int
        // for large matrices.
        size_t tid = (size_t)nx * idy + idx;
        d_MatC[tid] = d_MatA[tid] + d_MatB[tid];
    }
}

// CPU reference: hostRef[i] = h_A[i] + h_B[i] for all nx*ny elements.
// Does nothing when either dimension is not positive.
void sumMatrix2DOnHost(const float *h_A, const float *h_B, float *hostRef, int nx, int ny)
{
    if (nx <= 0 || ny <= 0)
        return;

    size_t count = (size_t)nx * (size_t)ny;
    for (size_t i = 0; i < count; i++)
        hostRef[i] = h_A[i] + h_B[i];
}

int main(int argc, char **argv)
{
    printf("%s Program Starting...\n", argv[0]);

    // set up device
    int devID = 0;
    cudaDeviceProp deviceProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, devID));
    printf("Using Device %d: %s\n", devID, deviceProp.name);
    HANDLE_ERROR(cudaSetDevice(devID));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    size_t nxy = (size_t)nx * (size_t)ny;
    size_t nBytes = nxy * sizeof(float);
    printf("Matrix size: nx= %d, ny= %d\n", nx, ny);

    // malloc host memory (each buffer is nBytes, so failure is a real possibility)
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // initialize data at host side
    for (size_t i = 0; i < nxy; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    clock_t iStart = clock();
    sumMatrix2DOnHost(h_A, h_B, hostRef, nx, ny);
    clock_t iEnd = clock();
    double iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnHost() elapsed %f sec..\n", iElaps);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    HANDLE_ERROR(cudaMalloc((void **)&d_MatA, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatB, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    HANDLE_ERROR(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // block dimensions: default 32 x 32, optionally overridden from the
    // command line so configurations can be compared without recompiling
    int dimx = 32;
    int dimy = 32;
    if (argc > 1) dimx = atoi(argv[1]);
    if (argc > 2) dimy = atoi(argv[2]);
    if (dimx <= 0 || dimy <= 0 ||
        dimx > deviceProp.maxThreadsDim[0] ||
        dimy > deviceProp.maxThreadsDim[1] ||
        dimx * dimy > deviceProp.maxThreadsPerBlock)
    {
        fprintf(stderr, "Invalid block size (%d,%d) for this device\n", dimx, dimy);
        exit(EXIT_FAILURE);
    }

    // invoke kernel at host side; ceil-div so the grid covers the whole matrix
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // time the kernel with CUDA events, which measure on the device timeline
    // and have much finer resolution than clock()
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    HANDLE_ERROR(cudaEventRecord(start, 0));
    sumMatrix2DKernel<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    HANDLE_ERROR(cudaGetLastError());       // catches invalid launch configurations
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));

    float kernelMs = 0.0f;
    HANDLE_ERROR(cudaEventElapsedTime(&kernelMs, start, stop));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    iElaps = kernelMs / 1000.0;
    printf("--sumMatrix2DOnGPU<<<(%d,%d),(%d,%d)>>> elapsed %f sec..\n",
           (int)grid.x, (int)grid.y, (int)block.x, (int)block.y, iElaps);

    // copy kernel result back to host side
    HANDLE_ERROR(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results against the host reference
    for (size_t i = 0; i < nxy; i++)
    {
        if (fabs(gpuRef[i] - hostRef[i]) > PRECISION)
        {
            fprintf(stderr, "Result verification failed at elemnt %d\n", (int)i);
            exit(EXIT_FAILURE);
        }
    }

    // free device global memory
    HANDLE_ERROR(cudaFree(d_MatA));
    HANDLE_ERROR(cudaFree(d_MatB));
    HANDLE_ERROR(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    HANDLE_ERROR(cudaDeviceReset());

    printf("Test Passed..\n");
    return 0;
}

編譯運行：

    $ nvcc -arch=sm_20 sumMatrix2D.cu -o sumMatrix2D
    $ ./sumMatrix2D

程序輸出：

    ./sumMatrix2D Program Starting...
    Using Device 0: Tesla M2070
    Matrix size: nx= 16384, ny= 16384
    --sumMatrix2DOnHost() elapsed 1.410000 sec..
    --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 0.070000 sec..
    Test Passed..

現在我們將block的大小改成(32, 16)，此時block數量為512*1024，再次編譯運行，會發現：

可以看到，程序性能提升了將近1倍，直觀來看是第二次線程配置比第一次配置block的數量增加了1倍，實際上也正是由于block數量增加了的緣故。但是如果繼續增加block的數量，性能反而又會下降。現在我們將block的大小改為(16,16)，此時block數量為1024*1024，再次編譯運行，會發現：

    ./sumMatrix2D Program Starting...
    Using Device 0: Tesla M2070
    Matrix size: nx= 16384, ny= 16384
    --sumMatrix2DOnHost() elapsed 1.400000 sec..
    --sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 0.050000 sec..
    Test Passed..

關于線程塊配置的性能分析參考后續章節。

總結

以上是生活随笔為你收集整理的CUDA性能优化----线程配置的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： CUDA学习----sp, sm, th
下一篇：北理工在线作业计算机的主要特点是( ),