#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>

// Number of elements sorted by one thread block. The kernel's shared buffer
// and its merge loops are sized by this constant, so the kernel must be
// launched with exactly this many threads per block. Must be a power of two.
#define BITONIC_BLOCK_SIZE 512

// Exchanges the two floats referenced by a and b.
__device__ inline void swap(float & a, float & b)
{
    float tmp = a;
    a = b;
    b = tmp;
}

// Sorts each BITONIC_BLOCK_SIZE-element chunk of `values` into ascending
// order, in place. Every block sorts only its own chunk; chunks are NOT merged
// with each other, so the array as a whole is only sorted when it fits in a
// single chunk.
//
// Launch layout: 1D grid, blockDim.x == BITONIC_BLOCK_SIZE, one thread per
// element. Uses BITONIC_BLOCK_SIZE floats of static shared memory.
//
// values: device pointer to at least nSize floats.
// nSize:  number of valid elements. Threads whose global index is >= nSize
//         neither read nor write global memory.
//
// Note: the ordering relies on plain < and > comparisons, which are always
// false for NaN, so inputs containing NaN are not guaranteed to come out
// ordered (and, in a partially filled chunk, may be displaced by padding).
__global__ static void bitonicSort(float * values, int nSize)
{
    __shared__ float shared[BITONIC_BLOCK_SIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool inRange = (nSize > 0) && (i < (unsigned int)nSize);

    // Slots past the end of the data are padded with +infinity so that the
    // ascending sort pushes them behind every real element of the chunk.
    shared[tid] = inRange ? values[i] : INFINITY;
    __syncthreads();

    // Parallel bitonic sort: k is the length of the bitonic sequences being
    // merged in the current stage.
    for (unsigned int k = 2; k <= BITONIC_BLOCK_SIZE; k <<= 1)
    {
        // Bitonic merge: j is the distance between compared partners.
        for (unsigned int j = k / 2; j > 0; j >>= 1)
        {
            unsigned int ixj = tid ^ j;

            // Only the lower-indexed thread of each pair does the
            // compare-and-exchange, so every pair is handled exactly once.
            if (ixj > tid)
            {
                if ((tid & k) == 0)
                {
                    // This sub-sequence is being sorted ascending.
                    if (shared[tid] > shared[ixj])
                    {
                        swap(shared[tid], shared[ixj]);
                    }
                }
                else
                {
                    // This sub-sequence is being sorted descending.
                    if (shared[tid] < shared[ixj])
                    {
                        swap(shared[tid], shared[ixj]);
                    }
                }
            }

            // Reached by every thread of the block (it is outside the
            // divergent branch); makes this step's exchanges visible before
            // the next step reads them.
            __syncthreads();
        }
    }

    // Write result; padding slots are never written back.
    if (inRange)
    {
        values[i] = shared[tid];
    }
}

// Reports a failed CUDA call on stderr. Returns true when `status` is
// cudaSuccess, false otherwise.
static bool bitonicCheck(cudaError_t status, const char * what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "bitonic: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point: copies `values` to the device, sorts every
// BITONIC_BLOCK_SIZE-element chunk ascending with bitonicSort, and copies the
// result back into `values`.
//
// values: host pointer to nSize floats, overwritten with the result.
// nSize:  number of elements. Any positive size is accepted; the last chunk
//         may be partial. Chunks are sorted independently of each other.
//
// Does nothing when values is NULL or nSize <= 0. If a CUDA call fails, the
// error is printed to stderr, the device buffer is released, and `values` is
// left as it was at the point of failure.
extern "C" void bitonic(float * values, int nSize)
{
    if (values == NULL || nSize <= 0)
    {
        return;
    }

    size_t bytes = sizeof(float) * (size_t)nSize;
    float * valuesd = NULL;

    if (!bitonicCheck(cudaMalloc((void**)&valuesd, bytes), "cudaMalloc"))
    {
        return;
    }

    bool ok = bitonicCheck(
        cudaMemcpy(valuesd, values, bytes, cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    if (ok)
    {
        // Ceil-div so a trailing partial chunk still gets a block.
        unsigned int blocks =
            ((unsigned int)nSize + BITONIC_BLOCK_SIZE - 1) / BITONIC_BLOCK_SIZE;

        bitonicSort<<< blocks, BITONIC_BLOCK_SIZE >>>(valuesd, nSize);

        // A kernel launch returns no status; configuration errors are only
        // visible through cudaGetLastError().
        ok = bitonicCheck(cudaGetLastError(), "bitonicSort launch");
    }

    if (ok)
    {
        // The blocking copy also waits for the kernel to finish and reports
        // any error raised while it was executing.
        ok = bitonicCheck(
            cudaMemcpy(values, valuesd, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    // Always release the device buffer, including on the failure paths.
    bitonicCheck(cudaFree(valuesd), "cudaFree");
}