CUDA+OpenCV实现光线追踪(有无constant)
常量內存是NVIDIA提供的一個64KB大小的內存空間,它的處理方式和普通的全局內存和共享內存都不一樣,是有cuda專門提供的。
線程束的概念:線程束是指一個包含32個線程的集合,在程序中的每一行,線程束中的每個線程都將在不同的數據上執行相同的指令。
因此,常量內存的作用是,能夠將單次內存的讀取操作廣播到每個半線程束(即16個線程),所以如果在半線程束中的每個線程都從常量內存的相同地址上讀取數據,那么GPU只會產生一次讀取請求,并將其廣播,顯而易見,這種方式的內存流量只是使用全局內存流量的1/16。這是常量內存的第一個好處,第二個好處則是由于這塊內存的內容是不會發生變化的,因此硬件將主動把這個常量內存數據緩存到GPU上,這樣第一次從敞亮內存的某個地址上讀取后,其他半線程束請求同一個地址時,將直接在GPU上命中緩存,因此也減少了額外的內存流量。
使用常量內存只需加上:__constant__修飾符,當從主機內存復制內存到GPU上常量內存時,不用cudaMemcpy()而用cudaMemcpyToSymbol(),這樣就復制到常量內存里了。
無constant:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "H:\cuda_by_example\common\book.h"
#include "H:\cuda_by_example\common\cpu_bitmap.h"
#include "device_functions.h"
#include <stdio.h>#define DIM 1024 #define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f //數據結構對球面建模
struct Sphere { float r,b,g; float radius; float x,y,z; //hit方法,計算光線是否與球面相交,若相交則返回光線到命中球面處的距離__device__ float hit( float ox, float oy, float *n ) { float dx = ox - x; float dy = oy - y; if (dx*dx + dy*dy < radius*radius) { float dz = sqrtf( radius*radius - dx*dx - dy*dy ); *n = dz / sqrtf( radius * radius ); return dz + z; } return -INF; }
};
#define SPHERES 30 //核函數內容
__global__ void kernel( Sphere *s, unsigned char *ptr ) { //將threadIdx/BlockIdx映射到像素位置int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; //讓圖像坐標偏移DIM/2,使z軸穿過圖像中心float ox = (x - DIM/2); float oy = (y - DIM/2); //初始化背景顏色為黑色float r=0, g=0, b=0; float maxz = -INF; //對每一個球面數組進行迭代for(int i=0; i<SPHERES; i++) { float n; float t = s[i].hit( ox, oy, &n ); //如果比上一次命中距離更接近,我將這個距離保存為最接近距離,并且保存球面顏色值if (t > maxz) { float fscale = n; r = s[i].r * fscale; g = s[i].g * fscale; b = s[i].b * fscale; //更新距離maxz = t; } } //判斷球面相交情況后,將當前顏色保存到輸出圖像中ptr[offset*4 + 0] = (int)(r * 255); ptr[offset*4 + 1] = (int)(g * 255); ptr[offset*4 + 2] = (int)(b * 255); ptr[offset*4 + 3] = 255;
} // globals needed by the update routine
struct DataBlock { unsigned char *dev_bitmap; Sphere *s;
}; int main( void ) { DataBlock data; //記錄起始時間cudaEvent_t start, stop; HANDLE_ERROR( cudaEventCreate( &start ) ); HANDLE_ERROR( cudaEventCreate( &stop ) ); HANDLE_ERROR( cudaEventRecord( start, 0 ) ); CPUBitmap bitmap( DIM, DIM, &data ); unsigned char *dev_bitmap; Sphere *s; // allocate memory on the GPU for the output bitmap HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ) ); // allocate memory for the Sphere dataset HANDLE_ERROR( cudaMalloc( (void**)&s, sizeof(Sphere) * SPHERES ) ); // allocate temp memory, initialize it, copy to // memory on the GPU, then free our temp memory //生成球面的中心坐標顏色和半徑Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES ); for (int i=0; i<SPHERES; i++) { temp_s[i].r = rnd( 1.0f );temp_s[i].g = rnd( 1.0f ); temp_s[i].b = rnd( 1.0f ); temp_s[i].x = rnd( 1000.0f ) - 500; temp_s[i].y = rnd( 1000.0f ) - 500; temp_s[i].z = rnd( 1000.0f ) - 500; temp_s[i].radius = rnd( 100.0f ) + 20; } HANDLE_ERROR( cudaMemcpy( s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice ) ); free( temp_s ); // generate a bitmap from our sphere data dim3 grids(DIM/16,DIM/16); dim3 threads(16,16); kernel<<<grids,threads>>>( s, dev_bitmap ); // copy our bitmap back from the GPU for display HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost ) ); // get stop time, and display the timing results HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) ); printf( "Time to generate: %3.1f ms\n", elapsedTime ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); HANDLE_ERROR( cudaFree( dev_bitmap ) ); HANDLE_ERROR( cudaFree( s ) ); // display bitmap.display_and_exit();
}
__device__是在設備內調用,global調用device,將在所有核中使用hit,如果去掉__device__將出錯,因為hit是在主機內的
有constant:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "H:\cuda_by_example\common\book.h"
#include "H:\cuda_by_example\common\cpu_bitmap.h"
#include "device_functions.h"
#include <stdio.h>
#define DIM 1024 #define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f struct Sphere { float r,b,g; float radius; float x,y,z; __device__ float hit( float ox, float oy, float *n ) { float dx = ox - x; float dy = oy - y; if (dx*dx + dy*dy < radius*radius) { float dz = sqrtf( radius*radius - dx*dx - dy*dy ); *n = dz / sqrtf( radius * radius ); return dz + z; } return -INF; }
};
#define SPHERES 30
__constant__ Sphere s[SPHERES]; __global__ void kernel( unsigned char *ptr ) { // map from threadIdx/BlockIdx to pixel posiytion int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float ox = (x - DIM/2); float oy = (y - DIM/2); float r=0, g=0, b=0; float maxz = -INF; for(int i=0; i<SPHERES; i++) { float n; float t = s[i].hit( ox, oy, &n ); if (t > maxz) { float fscale = n; r = s[i].r * fscale; g = s[i].g * fscale; b = s[i].b * fscale; maxz = t; } } ptr[offset*4 + 0] = (int)(r * 255); ptr[offset*4 + 1] = (int)(g * 255); ptr[offset*4 + 2] = (int)(b * 255); ptr[offset*4 + 3] = 255;
} struct DataBlock{unsigned char *dev_bitmap;
};int main(){DataBlock data;// capture the start time and start to record itcudaEvent_t start,stop;HANDLE_ERROR(cudaEventCreate(&start));HANDLE_ERROR(cudaEventCreate(&stop));HANDLE_ERROR(cudaEventRecord(start,0));CPUBitmap bitmap(DIM,DIM,&data);unsigned char *dev_bitmap;//allocate the memory on the GPU for the output bitmapHANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,bitmap.image_size()));Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere)*SPHERES); for (int i=0; i<SPHERES; i++) { temp_s[i].r = rnd( 1.0f ); temp_s[i].g = rnd( 1.0f ); temp_s[i].b = rnd( 1.0f ); temp_s[i].x = rnd( 1000.0f ) - 500; temp_s[i].y = rnd( 1000.0f ) - 500; temp_s[i].z = rnd( 1000.0f ) - 500; temp_s[i].radius = rnd( 100.0f ) + 20; } HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,sizeof(Sphere) * SPHERES) ); free(temp_s);//generate a bitmap from our sphere datadim3 grids(DIM/16,DIM/16);dim3 threads(16,16);kernel<<<grids,threads>>>(dev_bitmap);//copy the bitmap back from GPU to CPU for displayHANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));HANDLE_ERROR(cudaEventRecord(stop,0));//stop the time recordHANDLE_ERROR(cudaEventSynchronize(stop));float elapsedTime;HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));printf( "Time to generate: %3.1f ms\n", elapsedTime ); HANDLE_ERROR(cudaEventDestroy(start));HANDLE_ERROR(cudaEventDestroy(stop));HANDLE_ERROR(cudaFree(dev_bitmap));bitmap.display_and_exit();
}
運行效果圖,spherenumber=100時
?
總結
以上是生活随笔為你收集整理的CUDA+OpenCV实现光线追踪(有无constant)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 如何解决VS2015编译C4996错误
- 下一篇: There's no Qt versio