CUDA之单thread单block多thread单block多thread多block
生活随笔
收集整理的這篇文章主要介紹了
CUDA之单thread单block多thread单block多thread多block
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
用簡單的立方和歸約來舉例:
//單thread單block
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define DATA_SIZE 1048576
int data[DATA_SIZE];
// Fill `number` with `size` pseudo-random integers in the range [0, 9].
// Uses rand(), so the sequence depends on the current srand() seed.
void GenerateNumbers(int *number, int size)
{
    int idx = 0;
    while (idx < size) {
        number[idx] = rand() % 10;
        ++idx;
    }
}
// CUDA initialization: scan the available devices, pick the first one
// whose compute capability is at least 1.0, and make it the current
// device. Returns false (with a message on stderr) when no usable
// CUDA device exists.
bool InitCUDA()
{
    int count = 0;
    // Query how many CUDA-capable devices are present.
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    for (; dev < count; dev++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
            continue;
        if (prop.major >= 1)
            break;            // found a device supporting CUDA 1.x or newer
    }
    if (dev == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ kernel (runs on the GPU): computes the sum of cubes of all
// DATA_SIZE elements of `num` with a SINGLE thread and stores it in
// *result. Launch as <<<1, 1>>>; a wider launch would race on *result.
// (Max possible sum is 1048576 * 9^3 ≈ 7.6e8, which fits in a 32-bit int.)
__global__ static void sumOfcubes(int *num, int *result)
{
    int sum = 0;   // BUG FIX: original read "intsum = 0;" — a compile error
    int i;         // BUG FIX: original read "inti;"
    for (i = 0; i < DATA_SIZE; i++) {
        sum += num[i] * num[i] * num[i];
    }
    *result = sum;
}
// Entry point (single-thread / single-block version): generate random
// data, compute the cube sum on the GPU with one thread, then verify
// against a plain CPU loop.
int main()
{
    // CUDA init
    if (!InitCUDA()) {
        return 0;
    }
    // Generate the random input
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    // Copy the random numbers into device memory
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    sumOfcubes<<<1, 1, 0>>>(gpudata, result);
    // BUG FIX: in the original, `sum` was used before its declaration and
    // was passed to cudaMemcpy by value; it must be declared first and
    // passed by address.
    int sum = 0;
    cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    printf("GPUsum: %d \n", sum);
    // CPU reference computation for verification.
    sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    getchar();
    return 0;
}
// Single block, multiple threads
// BUG FIX: the original fused "//單block多thread" with "#include <stdio.h>"
// on one line (commenting the include out) and fused another #include with
// a #define — preprocessor directives must each sit on their own line.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576   // number of input elements
#define THREAD_NUM 1024     // threads per block (raised from 256 to 1024)

int data[DATA_SIZE];

// Fill `number` with `size` pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}
// __global__ kernel: each of the THREAD_NUM threads of a single block
// accumulates a partial CUBE sum (despite the name) over an interleaved
// slice of `num` — thread t handles indices t, t + THREAD_NUM, ... — so
// adjacent threads touch adjacent addresses (coalesced global loads).
// Partial sums land in result[tid]; *time receives the elapsed device
// clock() ticks as measured by thread 0.
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    const int tid = threadIdx.x;
    // NOTE: the original computed "size = DATA_SIZE / THREAD_NUM" for the
    // commented-out chunked loop below; the variable was unused — removed.
    int sum = 0;
    int i;
    // Record the start time only on thread 0 (threadIdx.x == 0).
    clock_t start;
    if (tid == 0) start = clock();
    for (i = tid; i < DATA_SIZE; i += THREAD_NUM)
    // Chunked alternative (uncoalesced, slower):
    // for (i = tid * size; i < (tid + 1) * size; i++)
    {
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    // Publish the elapsed time; again only thread 0 does this.
    if (tid == 0)
        *time = clock() - start;
}
// Entry point (single-block / multi-thread version): offload the cube sum
// to one block of THREAD_NUM threads, reduce the per-thread partial sums
// on the CPU, and verify against a pure-CPU loop.
int main()
{
    // Generate the random input
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    // Device buffers: input array, one partial sum per thread, and one
    // clock_t slot for the kernel's elapsed-time measurement.
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t));
    // Copy the random numbers into device memory.
    // BUG FIX: the original issued this identical copy twice — once is enough.
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch: 1 block, THREAD_NUM threads, no dynamic shared memory.
    sumOfSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);
    int sum[THREAD_NUM];
    clock_t time_use;
    // Copy the results back to host memory.
    // BUG FIX: time_use is a scalar and must be passed by address; the
    // original passed it by value, which does not compile.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    // Final host-side reduction of the per-thread partial sums.
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM; i++) {
        final_sum += sum[i];
    }
    printf("GPUsum: %d\n time:%d\n", final_sum, (int)time_use);
    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
// Multiple blocks, multiple threads
// BUG FIX: the original fused the section comment with "#include <stdio.h>"
// (commenting it out) and fused an #include with a #define on one line;
// each preprocessor directive needs its own line.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576   // number of input elements
#define THREAD_NUM 256      // threads per block
#define BLOCK_NUM 32        // blocks in the grid

int data[DATA_SIZE];

// Fill `number` with `size` pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}
// __global__ kernel: BLOCK_NUM x THREAD_NUM threads each accumulate a
// partial CUBE sum (despite the name) over a grid-interleaved slice of
// `num`; the stride of BLOCK_NUM * THREAD_NUM keeps neighboring threads
// on neighboring addresses (coalesced global loads). Each thread writes
// its partial sum to result[bid * THREAD_NUM + tid]. Thread 0 of every
// block records its start clock() in time[bid] and its end clock() in
// time[bid + BLOCK_NUM].
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int sum = 0;
    int i;
    // Per-block start timestamp, recorded once by thread 0.
    // (Removed the original's unused local "clock_t start;".)
    if (tid == 0)
        time[bid] = clock();
    // The global index combines bid and tid; the grid-wide stride
    // preserves memory contiguity across each warp.
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        sum += num[i] * num[i] * num[i];
    }
    // One result slot per thread in the whole grid.
    result[bid * THREAD_NUM + tid] = sum;
    // Per-block end timestamp, thread 0 only.
    if (tid == 0)
        time[bid + BLOCK_NUM] = clock();
}
// Host driver for the multi-block version: launches BLOCK_NUM blocks of
// THREAD_NUM threads, reduces the THREAD_NUM * BLOCK_NUM partial sums on
// the CPU, and reports GPU time as (latest block end - earliest block
// start), then checks the result against a pure-CPU loop.
int main()
{
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch syntax: kernel<<<block count, thread count, shared-memory bytes>>>(args...)
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, 0>>>(gpudata, result, time);
    int sum[THREAD_NUM * BLOCK_NUM];
    clock_t time_use[BLOCK_NUM * 2];
    // Copy the partial sums and the per-block timestamps back to the host.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++)
        final_sum += sum[i];
    // New timing strategy: subtract the earliest per-block start time from
    // the latest per-block end time to get the total run time.
    clock_t min_start = time_use[0];
    clock_t max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %d\n", final_sum, max_end - min_start);
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++)
        final_sum += data[i] * data[i] * data[i];
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
ShareMemory
是一個block 中所有thread 都能使用的共享內存,存取的速度相當快,存取shared memory 的速度和存取寄存器相同,不需要擔心latency 的問題。
可以直接利用__shared__聲明一個shared memory變量
__shared__ float temp[THREAD_NUM * 3];
Shared memory 有時會出現(xiàn)存儲體沖突(bank conflict)的問題:
例如:每個SM有16KB 的shared memory,分成16 個bank
?如果同時每個thread 是存取不同的bank,就不會有問題
?如果同時有兩個(或更多)threads 存取同一個bank 的數(shù)據(jù),就會發(fā)生bank conflict,這些threads 就必須照順序去存取,而無法同時存取shared memory 了。
// Multiple blocks, multiple threads — shared-memory version
// BUG FIX: the original fused the section comment with "#include <stdio.h>"
// (commenting it out) and fused an #include with a #define on one line;
// each preprocessor directive needs its own line.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define DATA_SIZE 1048576   // number of input elements
#define THREAD_NUM 256      // threads per block
#define BLOCK_NUM 32        // blocks in the grid

int data[DATA_SIZE];

// Fill `number` with `size` pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}
// __global__ kernel: like the plain multi-block version, but each thread
// accumulates its partial CUBE sum (despite the name) into dynamic shared
// memory (shared[tid]); after a barrier, thread 0 serially folds the
// THREAD_NUM partials into result[bid] — one value per block. Must be
// launched with THREAD_NUM * sizeof(int) bytes of dynamic shared memory.
// Thread 0 records per-block start/end clock() in time[bid] and
// time[bid + BLOCK_NUM].
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    extern __shared__ int shared[];   // THREAD_NUM ints, sized at launch
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    shared[tid] = 0;
    int i;
    // Per-block start timestamp, thread 0 only.
    // (Removed the original's unused local "clock_t start;".)
    if (tid == 0) time[bid] = clock();
    // Grid-interleaved traversal keeps the warp's global loads coalesced.
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i] * num[i];
    }
    // Barrier: every thread must have finished writing shared[tid]
    // before the reduction below reads it.
    __syncthreads();
    // Thread 0 performs the in-block reduction serially (O(THREAD_NUM)).
    if (tid == 0) {
        for (i = 1; i < THREAD_NUM; i++) shared[0] += shared[i];
        result[bid] = shared[0];
    }
    // Per-block end timestamp, thread 0 only.
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}
// Host driver for the shared-memory version: launches BLOCK_NUM blocks of
// THREAD_NUM threads with THREAD_NUM * sizeof(int) bytes of dynamic shared
// memory, copies back one partial sum per block, reduces them on the CPU,
// and reports GPU time as (latest block end - earliest block start).
int main()
{
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * BLOCK_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch syntax: kernel<<<block count, thread count, shared-memory bytes>>>(args...)
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time);
    int sum[BLOCK_NUM];
    clock_t time_use[BLOCK_NUM * 2];
    // Copy the per-block sums and timestamps back to the host.
    cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++)
        final_sum += sum[i];
    // Timing: earliest start among all blocks vs. latest end among all blocks.
    clock_t min_start = time_use[0];
    clock_t max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %d\n", final_sum, max_end - min_start);
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++)
        final_sum += data[i] * data[i] * data[i];
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
Block內(nèi)完成部分加和工作,所以gputime增加了
// Multiple blocks, multiple threads — tree-reduction version
// BUG FIX: the original fused the section comment with "#include <stdio.h>"
// (commenting it out) and fused an #include with a #define on one line;
// each preprocessor directive needs its own line.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define DATA_SIZE 1048576   // number of input elements
#define THREAD_NUM 256      // threads per block
#define BLOCK_NUM 32        // blocks in the grid

int data[DATA_SIZE];

// Fill `number` with `size` pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}
// __global__ kernel: per-thread CUBE-sum partials (despite the name) go
// into dynamic shared memory, then a pairwise tree reduction inside the
// block leaves one sum per block in result[bid]. Must be launched with
// THREAD_NUM * sizeof(int) bytes of dynamic shared memory. Thread 0 of
// each block records its start clock() in time[bid] and its end clock()
// in time[bid + BLOCK_NUM].
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    shared[tid] = 0;
    // Per-block start timestamp, thread 0 only.
    if (tid == 0) time[bid] = clock();
    // Grid-interleaved traversal keeps the warp's global loads coalesced.
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i] * num[i];
    }
    // All partials must be written before the reduction reads them.
    __syncthreads();
    // Tree reduction: each pass, threads whose low `mask` bits are all
    // zero add in the partner `offset` slots away; offset doubles and
    // mask widens every pass.
    int offset = 1;
    int mask = 1;
    while (offset < THREAD_NUM) {
        if ((tid & mask) == 0) {
            shared[tid] += shared[tid + offset];
        }
        offset += offset;
        mask = offset + mask;
        // The loop bounds are uniform across the block, so every thread
        // reaches this barrier each iteration.
        __syncthreads();
    }
    if (tid == 0) {
        result[bid] = shared[0];
        time[bid + BLOCK_NUM] = clock();   // per-block end timestamp
    }
}

// Host driver: launch the tree-reduction kernel, copy back one partial
// sum per block, finish the reduction on the CPU, and report GPU time as
// (latest block end - earliest block start).
int main()
{
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * BLOCK_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch syntax: kernel<<<block count, thread count, shared-memory bytes>>>(args...)
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time);
    int sum[BLOCK_NUM];
    clock_t time_use[BLOCK_NUM * 2];
    // Copy the per-block sums and timestamps back to the host.
    cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++)
        final_sum += sum[i];
    // Timing: earliest start among all blocks vs. latest end among all blocks.
    clock_t min_start = time_use[0];
    clock_t max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %d\n", final_sum, max_end - min_start);
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++)
        final_sum += data[i] * data[i] * data[i];
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
?
總結(jié)
以上是生活随笔為你收集整理的CUDA之单thread单block多thread单block多thread多block的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: C++中结构体与类的区别(struct与
- 下一篇: CUDA存储器模型