日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當(dāng)前位置: 首頁 > 编程资源 > 编程问答 >内容正文

编程问答

gpu排序

發(fā)布時(shí)間:2024/9/30 编程问答 25 豆豆
生活随笔 收集整理的這篇文章主要介紹了 gpu排序 小編覺得挺不錯(cuò)的,現(xiàn)在分享給大家,幫大家做個(gè)參考.

單機(jī)版的雙調(diào)排序可以參考?http://blog.csdn.net/sunmenggmail/article/details/42869235

還是這張圖片



基于cuda的雙調(diào)排序的思路是:

為每一個元素提供一個線程;如果大于1024個元素,還是只提供1024個線程。這是因為 __syncthreads 只能作為 block 內的線程同步,而一個 block 最多有1024個線程;如果元素個數大于1024,則每個線程可能就要負責一個以上元素的比較。


就上圖而言,一個(gè)矩形代表一次多線程的比較,那么此圖僅需要6次比較,就可以有右邊的輸出。


#include <vector>
#include <algorithm>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

using namespace std;

// Abort with file/line context when two values differ.  `a` is expected to
// be a cudaError_t so that cudaGetErrorString(a) yields a meaningful message.
#define CHECK_EQ1(a,b) do { \
    if ((a) != (b)) { \
        cout << __FILE__ << " : " << __LINE__ << " : check failed because " << a << "!=" << b << endl; \
        cout << cudaGetErrorString(a) << endl; \
        exit(1); \
    } \
} while(0)

// Wrap every CUDA runtime call so failures are reported at the call site.
#define CUDA_CHECK(condition) \
do { \
    cudaError_t error = condition; \
    CHECK_EQ1(error, cudaSuccess); \
} while(0)

// Index of the highest set bit of `word` (e.g. 0b0110 -> 2), via PTX bfind.
static __device__ __forceinline__ unsigned int __btflo(unsigned int word) {
    unsigned int ret;
    asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
    return ret;
}

// Bitonic sort for arrays larger than one block of threads.
// Launch with ONE block (e.g. <<<1, 1024>>>): __syncthreads() only
// synchronizes within a block, so each thread strides over several elements.
// `buf` backs the virtual padding slots [len, len2) where len2 is len rounded
// up to a power of two; padding is filled with UINT_MAX so it sorts last.
// `buf` must hold at least len2-len elements (len elements always suffices).
__global__ void bigBinoticSort(unsigned int *arr, int len, unsigned int *buf) {
    unsigned len2 = 1 << (__btflo(len - 1u) + 1);  // next power of two >= len
    unsigned int MAX = 0xffffffffu;                // sentinel for padding
    unsigned id = threadIdx.x;
    unsigned iter = blockDim.x;                    // stride between my elements

    // Fill the padded tail [len, len2) with MAX sentinels.
    // FIX: the original had `if (id >= len2) return;` here — an early exit
    // before __syncthreads() is a divergent barrier.  The strided loops below
    // are self-guarding, so every thread can simply fall through to the
    // barriers.  (The dead local `int count` was also removed.)
    for (unsigned i = id; i < len2; i += iter) {
        if (i >= len) buf[i - len] = MAX;
    }
    __syncthreads();

    for (unsigned k = 2; k <= len2; k *= 2) {        // bitonic run length
        for (unsigned j = k >> 1; j > 0; j >>= 1) {  // compare-exchange distance
            for (unsigned i = id; i < len2; i += iter) {
                unsigned swapIdx = i ^ j;
                if (swapIdx > i) {  // each pair handled once, by its lower index
                    unsigned myelem, other;
                    if (i < len) myelem = arr[i];
                    else         myelem = buf[i - len];
                    if (swapIdx < len) other = arr[swapIdx];
                    else               other = buf[swapIdx - len];

                    // Bit k of i selects the sort direction of this run.
                    bool swap = false;
                    if ((i & k) == 0 && myelem > other) swap = true;
                    if ((i & k) == k && myelem < other) swap = true;
                    if (swap) {
                        if (swapIdx < len) arr[swapIdx] = myelem;
                        else               buf[swapIdx - len] = myelem;
                        if (i < len) arr[i] = other;
                        else         buf[i - len] = other;
                    }
                }
            }
            __syncthreads();  // finish this compare distance before the next
        }
    }
}

// Bitonic sort for <= 1024 elements, staged entirely in shared memory.
// Launch with one block whose size is a power of two >= len.
__global__ void binoticSort(unsigned int *arr, int len) {
    __shared__ unsigned int buf[1024];
    // Pad out-of-range slots with UINT_MAX so they sort to the end.
    buf[threadIdx.x] = (threadIdx.x < len ? arr[threadIdx.x] : 0xffffffffu);
    __syncthreads();
    for (unsigned k = 2; k <= blockDim.x; k *= 2) {    // build runs of length k
        for (unsigned j = k >> 1; j > 0; j >>= 1) {    // merge step
            unsigned swapIdx = threadIdx.x ^ j;
            unsigned myelem = buf[threadIdx.x];
            unsigned other = buf[swapIdx];
            __syncthreads();  // all reads complete before anyone writes
            // If I am the front of my pair: ascend = 0, descend = k.
            // If I am the back of my pair:  ascend = k, descend = 0.
            unsigned ascend = k * (swapIdx < threadIdx.x);
            unsigned descend = k * (swapIdx > threadIdx.x);
            bool swap = false;
            if ((threadIdx.x & k) == ascend) {
                if (myelem > other) swap = true;
            }
            if ((threadIdx.x & k) == descend) {
                if (myelem < other) swap = true;
            }
            // Both partners of a pair see symmetric conditions, so either
            // both write (exchanging values) or neither does.
            if (swap) buf[swapIdx] = myelem;
            __syncthreads();
        }
    }
    if (threadIdx.x < len) arr[threadIdx.x] = buf[threadIdx.x];
}

// Print a host-side array, tab-separated.
template<class T>
inline void printVec(T *vec, int len) {
    for (int i = 0; i < len; ++i) cout << vec[i] << "\t";
    cout << endl;
}

// Copy a device array to the host and print it (debug helper).
template<class T>
inline void printVecg(T *gvec, int len) {
    T *vec = (T*)malloc(sizeof(T) * len);
    CUDA_CHECK(cudaMemcpy(vec, gvec, sizeof(T) * len, cudaMemcpyDeviceToHost));
    printVec(vec, len);
    free(vec);
}

// Choose a one-dimensional launch configuration for N elements.
// FIX: the original assigned only one of the two out-parameters in each
// branch, leaving the other uninitialized for the caller.
void lineSize(int N, int &nblocks, int &nthreads) {
    if (N <= 1024) {
        nblocks = 1;
        nthreads = (N + 32 - 1) / 32 * 32;  // round up to a warp multiple
    } else {
        nblocks = (N + 1024 - 1) / 1024;
        nthreads = 1024;
    }
}

// True when the device array is strictly increasing.  Strict order is the
// right check here because the test data is a permutation of 0..len-1.
bool validate(unsigned *gvec, int len) {
    unsigned *vec = (unsigned*)malloc(sizeof(unsigned) * len);
    CUDA_CHECK(cudaMemcpy(vec, gvec, sizeof(unsigned) * len, cudaMemcpyDeviceToHost));
    bool ok = true;
    for (int i = 1; i < len; ++i) {
        if (vec[i] <= vec[i-1]) { ok = false; break; }
    }
    free(vec);  // FIX: the original leaked this buffer on every path
    return ok;
}

// Round v up to the next power of two (returns v if already a power of two).
inline int roundUpPower2(int v) {
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}

int main(int argc, char *argv[]) {
    if (argc != 2) {
        cout << "usage: <program> len\n";
        return 1;  // FIX: bare `return;` is ill-formed in a function returning int
    }
    int len = atoi(argv[1]);
    if (len <= 0) {
        cout << "len must be a positive integer\n";
        return 1;
    }

    // Build 0..len-1 and shuffle it (Fisher-Yates).
    unsigned int *arr = (unsigned int*)malloc(sizeof(unsigned int) * len);
    for (int i = 0; i < len; ++i) arr[i] = i;
    srand((unsigned int)time(NULL));
    for (int i = len; i >= 2; --i) {
        int j = rand() % i;
        swap(arr[i-1], arr[j]);
    }
    // FIX: the original also cudaMalloc'd an unused `debug` buffer of 1000
    // unsigneds and leaked it; removed.

    unsigned int *darr, *buf;
    CUDA_CHECK(cudaMalloc((void**)&darr, sizeof(unsigned int) * len));
    CUDA_CHECK(cudaMalloc((void**)&buf, sizeof(unsigned int) * len));
    CUDA_CHECK(cudaMemcpy(darr, arr, sizeof(unsigned int) * len, cudaMemcpyHostToDevice));

    bigBinoticSort<<<1, 1024>>>(darr, len, buf);
    CUDA_CHECK(cudaPeekAtLastError());      // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());    // execution errors

    int ret;
    if (validate(darr, len)) {
        cout << "yes\n";
        ret = 0;  // FIX: success used to exit with 1
    } else {
        cout << "no\n";
        ret = 1;
    }

    // FIX: release everything the original leaked.
    CUDA_CHECK(cudaFree(darr));
    CUDA_CHECK(cudaFree(buf));
    free(arr);
    return ret;
}


算法有兩個(gè)雙調(diào)排序?qū)崿F(xiàn),一個(gè)用于小于1024個(gè)元素,用到了共享內(nèi)存加快訪問速度,但是如果真要排序1024以下的元素,建議還是用cpu版本的快排吧,gpu的在速度上并沒有明顯的優(yōu)勢(shì),甚至還比cpu慢


如果大于1024元素,就采用另一種方法。這種方法的缺點(diǎn)也是很明顯的,就是不管再多的元素,只能用一個(gè)block進(jìn)行計(jì)算,而一個(gè)block最多只能用1024個(gè)線程,估計(jì)在一萬個(gè)元素以內(nèi)的話,這個(gè)方法是gpu上最快的。


經(jīng)過本人測試,包括thrust的sort(基數(shù)排序), 只有元素?cái)?shù)量超過5000個(gè),gpu上的排序算法才有明顯的優(yōu)勢(shì)。10萬左右的元素,gpu上的排序算法比cpu有一百倍的提速。


下面會(huì)介紹在gpu上進(jìn)行快速排序。gpu快速排序可以處理非常大的數(shù)據(jù),但是會(huì)有遞歸深度的限制,當(dāng)超過遞歸深度時(shí),就可以調(diào)用上面所講的雙調(diào)排序進(jìn)行處理。測試表明,速度比thrust還是快一點(diǎn)


gpu上的快排主要參考樣例?NVIDIA_CUDA-6.5_Samples/6_Advanced/cdpAdvancedQuicksort

快排只處理大于1024個(gè)元素的數(shù)組,然后將其分隔為左右兩個(gè)子數(shù)組,如果子數(shù)組長度大于1024則繼續(xù)動(dòng)態(tài)遞歸調(diào)用快排,如果小于1024則動(dòng)態(tài)調(diào)用雙調(diào)排序。如果快排的遞歸深度已經(jīng)超過最大遞歸深度(cuda最大嵌套深度64,但是還受限于每一級(jí)所使用的內(nèi)存大小),則直接調(diào)用雙調(diào)排序。


這段程序的最精彩的地方在于分隔函數(shù)

將數(shù)組按照warp大小進(jìn)行分隔,每個(gè)warp處理32個(gè)元素,通過全局的atomicAdd函數(shù),分別獲得warp內(nèi)的小于和大于pivot數(shù)在數(shù)組的偏移地址,注意在同一個(gè)warp內(nèi),這個(gè)偏移地址是一樣的,然后每個(gè)線程將自己的元素放到偏移地址,這樣就完成了分割


需要注意的是,這個(gè)快排不是in-place的,又涉及到遞歸調(diào)用,所以還得處理原數(shù)組和緩沖區(qū)的調(diào)換


由于cuda沒有顯式的鎖,此方法采用了一種特殊的循環(huán)隊(duì)列,本人認(rèn)為在極端情況下,可能會(huì)出現(xiàn)問題

(這里的代碼有錯(cuò),沒有處理原數(shù)組和緩沖區(qū)的調(diào)換,只是幫助理解。正確的代碼請(qǐng)參考Samples里的)

#define QSORT_BLOCKSIZE_SHIFT 9
#define QSORT_BLOCKSIZE (1 << QSORT_BLOCKSIZE_SHIFT)
#define BITONICSORT_LEN 1024 // Must be power of 2!
#define QSORT_MAXDEPTH 16 // Will force final bitonic stage at depth QSORT_MAXDEPTH+1
#define QSORT_STACK_ELEMS 1*1024*1024 // One million stack elements is a HUGE number.

// Per-launch bookkeeping for one partition step, 128-byte aligned so each
// entry sits in its own cache-line-sized slot.
typedef struct __align__(128) qsortAtomicData_t {
    volatile unsigned int lt_offset;    // Current output offset for <pivot
    volatile unsigned int gt_offset;    // Current output offset for >pivot
    volatile unsigned int sorted_count; // Total count sorted, for deciding when to launch next wave
    volatile unsigned int index;        // Ringbuf tracking index. Can be ignored if not using ringbuf.
} qsortAtomicData;

// A ring-buffer for rapid stack allocation (lock-free; see ringbufFree).
typedef struct qsortRingbuf_t {
    volatile unsigned int head;  // starts at 1: we allocate from here
    volatile unsigned int tail;  // starts at 0: last still-in-use element
    volatile unsigned int count; // total count of chunks freed so far
    volatile unsigned int max;   // highest index freed so far
    unsigned int stacksize;      // wrap-around size of buffer (must be power of 2)
    volatile void *stackbase;    // pointer to the stack we're allocating from
} qsortRingbuf;

/*
 * CUDA has no explicit lock, so allocation is tracked monotonically:
 *   alloc -> ++head,  free -> ++tail
 * [tail, head) are the chunks currently allocated; head is the next free slot.
 * `count` is how many chunks have ever been freed and `max` the highest freed
 * index; only when every chunk below `max` is free (max == count) may `tail`
 * be advanced.  NOTE(review): as the surrounding article itself warns, this
 * scheme can mis-track under extreme interleavings — it mirrors the
 * NVIDIA cdpAdvancedQuicksort sample for illustration.
 */
template<class T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) {
    unsigned index = data->index;
    unsigned count = atomicAdd((unsigned*)&(ringbuf->count), 1) + 1;
    unsigned max = atomicMax((unsigned*)&(ringbuf->max), index + 1);
    // atomicMax returns the OLD value; fold our own index back in.
    if (max < (index + 1)) max = index + 1;
    if (max == count) {
        atomicMax((unsigned*)&(ringbuf->tail), count);
    }
}

// Allocate one chunk from the ring buffer, spinning (bounded) while full.
// Returns NULL on timeout.
template<class T>
static __device__ T* ringbufAlloc(qsortRingbuf *ringbuf) {
    // FIX: the original spun with `loop-- > 0` and then tested
    // `if (loop == 0)`.  On exhaustion the post-decrement wraps loop to
    // 0xffffffff, so the timeout branch never fired and a full ring was
    // silently overwritten.  Pre-decrement leaves loop == 0 exactly when
    // the spin budget is used up.
    unsigned int loop = 10000;
    while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (--loop > 0));
    if (loop == 0) return NULL;

    unsigned index = atomicAdd((unsigned*)&ringbuf->head, 1);
    T *ret = (T*)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
    ret->index = index;
    return ret;
}

// One wave of warp-level quicksort partitioning (CUDA dynamic parallelism).
// Each warp claims output slots for its <pivot / >pivot elements with one
// atomicAdd per side, then every lane scatters its own element.  The last
// warp to finish launches the child sorts.  Not in-place: indata is
// partitioned into outdata, and the roles swap on each recursion level.
// NOTE(review): uses pre-Volta mask-less __ballot/__shfl (CUDA 6.5-era
// sample); on CC 7.0+ these must become __ballot_sync/__shfl_sync.
// NOTE(review): threads with thread_id >= len exit before the
// __syncthreads() calls below — a divergent barrier kept as in the
// original sample; verify against the target architecture.
__global__ void qsort_warp(unsigned *indata,
                           unsigned *outdata,
                           unsigned int offset,           // 0 at the top level
                           unsigned int len,
                           qsortAtomicData *atomicData,   // this launch's stack entry
                           qsortRingbuf *atomicDataStack, // ring buffer of stack entries
                           unsigned int source_is_indata, // true at the top level
                           unsigned int depth) {
    // Find my data offset, based on warp ID.
    unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
    unsigned int lane_id = threadIdx.x & (warpSize - 1);  // lane within warp

    // Exit if I'm outside the range of sort to be done.
    if (thread_id >= len)
        return;

    // Each warp counts elements greater/less than the (arbitrary) pivot.
    unsigned pivot = indata[offset + len/2];
    unsigned data = indata[offset + thread_id];

    // If ALL elements are <= pivot, tighten the comparison to >=, otherwise
    // the partition would move nothing and we would recurse forever.
    unsigned int greater = (data > pivot);
    unsigned int gt_mask = __ballot(greater);  // bitmask of lanes with greater!=0
    if (gt_mask == 0) {
        greater = (data >= pivot);
        gt_mask = __ballot(greater);
    }
    unsigned lt_mask = __ballot(!greater);
    unsigned gt_count = __popc(gt_mask);  // set-bit count
    unsigned lt_count = __popc(lt_mask);

    // Lane 0 claims this warp's two output ranges; atomicAdd returns the
    // OLD offset.  The >pivot range grows down from the end of the segment.
    unsigned lt_oft, gt_oft;
    if (lane_id == 0) {
        if (lt_count > 0)
            lt_oft = atomicAdd((unsigned*)&atomicData->lt_offset, lt_count);
        if (gt_count > 0)
            gt_oft = len - (atomicAdd((unsigned*)&atomicData->gt_offset, gt_count) + gt_count);
    }
    // Everyone pulls the offsets from lane 0.
    lt_oft = __shfl((int)lt_oft, 0);
    gt_oft = __shfl((int)gt_oft, 0);
    __syncthreads();

    // My slot within the warp's range = number of lower-numbered lanes
    // writing to the same side (single-operation warp scan via popc).
    unsigned lane_mask_lt;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
    unsigned my_mask = greater ? gt_mask : lt_mask;
    unsigned my_oft = __popc(my_mask & lane_mask_lt);

    // Scatter my element to its partitioned position.
    my_oft += greater ? gt_oft : lt_oft;
    outdata[offset + my_oft] = data;
    __syncthreads();

    if (lane_id == 0) {
        unsigned mycount = lt_count + gt_count;
        // The warp that pushes sorted_count to len is the last one standing
        // and is responsible for launching the child sorts.
        if (atomicAdd((unsigned*)&atomicData->sorted_count, mycount) + mycount == len) {
            unsigned lt_len = atomicData->lt_offset;
            unsigned gt_len = atomicData->gt_offset;

            cudaStream_t lstream, rstream;
            cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
            cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);

            ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);

            if (lt_len == 0) return;

            // left side
            // FIX: the original text had a stray token `left` here (a lost
            // `// left` comment marker) which made the code ill-formed.
            if (lt_len > BITONICSORT_LEN) {
                if (depth >= QSORT_MAXDEPTH) {
                    // Recursion budget exhausted: fall back to bitonic sort.
                    bigBinoticSort<<<1, BITONICSORT_LEN, 0, rstream>>>(outdata + offset, lt_len, indata + offset);
                } else {
                    if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
                        printf("Stack-allocation error. Failing left child launch.\n");
                    else {
                        atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                        unsigned int numblocks = (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
                        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, lstream >>>(outdata, indata, offset, lt_len, atomicData, atomicDataStack, true, depth + 1);
                    }
                }
            } else if (lt_len > 1) {
                // Small partition: single-block bitonic sort, rounded up to a power of two.
                unsigned int bitonic_len = 1 << (__btflo(lt_len - 1U) + 1);
                binoticSort<<< 1, bitonic_len, 0, lstream >>>(outdata + offset, lt_len);
            }

            // right side
            if (gt_len > BITONICSORT_LEN) {
                if (depth >= QSORT_MAXDEPTH)
                    bigBinoticSort<<<1, BITONICSORT_LEN, 0, rstream>>>(outdata + offset + lt_len, gt_len, indata + offset + lt_len);
                else {
                    if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
                        printf("Stack allocation error! Failing right-side launch.\n");
                    else {
                        atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                        unsigned int numblocks = (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
                        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, rstream >>>(outdata, indata, offset + lt_len, gt_len, atomicData, atomicDataStack, true, depth + 1);
                    }
                }
            } else if (gt_len > 1) {
                unsigned int bitonic_len = 1 << (__btflo(gt_len - 1U) + 1);
                binoticSort<<< 1, bitonic_len, 0, rstream >>>(outdata + offset + lt_len, gt_len);
            }
        }
    }
}

// Host entry point: sort `count` unsigneds in gpudata, using scratchdata as
// the partition buffer.  Falls back to a single bitonic-sort launch for
// small inputs.  NOTE(review): as the article says, the buffer-swap between
// indata/outdata across recursion levels is NOT handled here — the result
// may land in scratchdata; see the NVIDIA sample for the complete version.
void runqsort(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream) {
    unsigned int stacksize = QSORT_STACK_ELEMS;  // 1*1024*1024

    // This is the stack, for atomic tracking of each sort's status.
    qsortAtomicData *gpustack;
    CUDA_CHECK(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
    CUDA_CHECK(cudaMemset(gpustack, 0, sizeof(qsortAtomicData))); // only the first entry needs zeroing

    // Create and initialise the ring buffer that manages the stack.
    qsortRingbuf buf;
    qsortRingbuf *ringbuf;
    CUDA_CHECK(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
    buf.head = 1; // We start with one allocation
    buf.tail = 0;
    buf.count = 0;
    buf.max = 0;
    buf.stacksize = stacksize;
    buf.stackbase = gpustack;
    CUDA_CHECK(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));

    if (count > BITONICSORT_LEN) {
        unsigned int numblocks = (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, stream >>>(gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
    } else {
        binoticSort<<< 1, BITONICSORT_LEN >>>(gpudata, count);
        CUDA_CHECK(cudaMemcpy(scratchdata, gpudata, sizeof(unsigned) * count, cudaMemcpyDeviceToDevice));
    }

    // Waits for the whole dynamic-parallelism launch tree to drain.
    // FIX: the original ignored this call's result and leaked both
    // device allocations.
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(gpustack));
    CUDA_CHECK(cudaFree(ringbuf));
}


總結(jié)

以上是生活随笔為你收集整理的gpu排序的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網(wǎng)站內(nèi)容還不錯(cuò),歡迎將生活随笔推薦給好友。