#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

// texture to make random reads from
texture<float4, 1, cudaReadModeElementType> tex;

// safe call macros
#define CUDA_SAFE_CALL( call) do {                                              \
    cudaError err = call;                                                       \
    if( cudaSuccess != err) {                                                   \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",           \
                __FILE__, __LINE__, cudaGetErrorString( err) );                 \
        exit(EXIT_FAILURE);                                                     \
    } } while (0)

#define CUT_CHECK_ERROR(errorMessage) do {                                      \
    cudaThreadSynchronize();                                                    \
    cudaError_t err = cudaGetLastError();                                       \
    if( cudaSuccess != err) {                                                   \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",       \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );   \
        exit(EXIT_FAILURE);                                                     \
    } } while (0)

struct gpu_nlist_array
{
    unsigned int *n_neigh;
    unsigned int *list;
    unsigned int height;
    unsigned int pitch;
};

struct gpu_pdata_arrays
{
    float4 *pos;
    unsigned int N;
};

__global__ void kernel1(float *d_out, gpu_pdata_arrays pdata, gpu_nlist_array nlist, float r_cutsq)
{
    // start by identifying which particle we are to handle
    int pidx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pidx >= pdata.N)
        return;

    // read in data
    int n_neigh = nlist.n_neigh[pidx];
    float4 pos = pdata.pos[pidx];

    // set an accumulator to 0
    float sum = 0.0f;

    // loop over neighbors
    for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
    {
        int cur_neigh = nlist.list[nlist.pitch*neigh_idx + pidx];

        // get the neighbor's position
        float4 neigh_pos = tex1Dfetch(tex, cur_neigh);
        float nx = neigh_pos.x;
        float ny = neigh_pos.y;
        float nz = neigh_pos.z;

        // calculate dr
        float dx = pos.x - nx;
        float dy = pos.y - ny;
        float dz = pos.z - nz;

        float rsq = dx*dx + dy*dy + dz*dz;

        // add up the sum of the distances to make sure the dead code optimizer doesn't wipe out the kernel
        sum += rsq;
    }

    d_out[pidx] = sum;
}

__global__ void kernel2(float *d_out, gpu_pdata_arrays pdata, gpu_nlist_array nlist, float r_cutsq)
{
    // start by identifying which particle we are to handle
    int pidx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pidx >= pdata.N)
        return;

    // read in data
    int n_neigh = nlist.n_neigh[pidx];
    float4 pos = pdata.pos[pidx];

    // set an accumulator to 0
    float sum = 0.0f;

    // loop over neighbors
    // this kernel differs from kernel1 in that the loop is over the same number of elements for the entire grid
    for (int neigh_idx = 0; neigh_idx < nlist.height; neigh_idx++)
    {
        // we can't execute the body of the loop if neigh_idx >= n_neigh for this thread, though
        if (neigh_idx < n_neigh)
        {
            int cur_neigh = nlist.list[nlist.pitch*neigh_idx + pidx];

            // get the neighbor's position
            float4 neigh_pos = tex1Dfetch(tex, cur_neigh);
            float nx = neigh_pos.x;
            float ny = neigh_pos.y;
            float nz = neigh_pos.z;

            // calculate dr
            float dx = pos.x - nx;
            float dy = pos.y - ny;
            float dz = pos.z - nz;

            float rsq = dx*dx + dy*dy + dz*dz;

            // add up the sum of the distances to make sure the dead code optimizer doesn't wipe out the kernel
            sum += rsq;
        }
    }

    d_out[pidx] = sum;
}

// kernel3 is commented out so that this file can be compiled without -arch sm_13 and tested on non-G200 cards
// uncomment kernel3 and compile with -arch sm_13 to test G200 cards: the kernel launch in main() must be changed
// to match the kernel you want to test
/*__global__ void kernel3(float *d_out, gpu_pdata_arrays pdata, gpu_nlist_array nlist, float r_cutsq)
{
    // start by identifying which particle we are to handle
    int pidx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pidx >= pdata.N)
        return;

    // read in data
    int n_neigh = nlist.n_neigh[pidx];
    float4 pos = pdata.pos[pidx];

    // set an accumulator to 0
    float sum = 0.0f;

    // loop over neighbors
    // this kernel differs from kernel2 in that the loop is over the maximum length list in this warp (using the sm13 warp vote)
    for (int neigh_idx = 0; __any(neigh_idx < n_neigh); neigh_idx++)
    {
        // we can't execute the body of the loop if neigh_idx >= n_neigh for this thread, though
        if (neigh_idx < n_neigh)
        {
            int cur_neigh = nlist.list[nlist.pitch*neigh_idx + pidx];

            // get the neighbor's position
            float4 neigh_pos = tex1Dfetch(tex, cur_neigh);
            float nx = neigh_pos.x;
            float ny = neigh_pos.y;
            float nz = neigh_pos.z;

            // calculate dr
            float dx = pos.x - nx;
            float dy = pos.y - ny;
            float dz = pos.z - nz;

            float rsq = dx*dx + dy*dy + dz*dz;

            // add up the sum of the distances to make sure the dead code optimizer doesn't wipe out the kernel
            sum += rsq;
        }
    }

    d_out[pidx] = sum;
}*/

int main()
{
    // allocate GPU memory for N particles
    const int N = 96 * 1000;

    float *d_out;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_out, N * sizeof(float)) );

    gpu_pdata_arrays d_pdata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_pdata.pos, N * sizeof(float4)) );
    d_pdata.N = N;

    gpu_nlist_array d_nlist;
    d_nlist.height = 128;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_nlist.list, N * sizeof(unsigned int) * d_nlist.height) );
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_nlist.n_neigh, N * sizeof(unsigned int)) );
    d_nlist.pitch = N;

    // allocate host memory for N particles
    gpu_pdata_arrays h_pdata;
    h_pdata.pos = (float4*)malloc(sizeof(float4) * N);
    assert(h_pdata.pos);
    h_pdata.N = N;

    gpu_nlist_array h_nlist;
    h_nlist.height = 128;
    h_nlist.list = (unsigned int*)malloc(sizeof(unsigned int) * N * h_nlist.height);
    assert(h_nlist.list);
    h_nlist.n_neigh = (unsigned int*)malloc(sizeof(unsigned int) * N);
    assert(h_nlist.n_neigh);
    h_nlist.pitch = N;

    // assign host data to values somewhat representing a real dataset
    float L = 10.0f;
    for (int i = 0; i < N; i++)
    {
        h_pdata.pos[i].x = L * (rand() % 1000) / 1000.0f;
        h_pdata.pos[i].y = L * (rand() % 1000) / 1000.0f;
        h_pdata.pos[i].z = L * (rand() % 1000) / 1000.0f;

        h_nlist.n_neigh[i] = 40 + rand() % 40;
        for (int j = 0; j < h_nlist.n_neigh[i]; j++)
        {
            h_nlist.list[j * h_nlist.pitch + i] = (i + j + 1) % N;
        }
    }

    // copy host data to device
    CUDA_SAFE_CALL( cudaMemcpy(d_pdata.pos, h_pdata.pos, sizeof(float4) * N, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy(d_nlist.list, h_nlist.list, sizeof(unsigned int) * N * h_nlist.height, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy(d_nlist.n_neigh, h_nlist.n_neigh, sizeof(unsigned int) * N, cudaMemcpyHostToDevice) );

    // bind texture
    cudaBindTexture(0, tex, d_pdata.pos, sizeof(float4) * N);

    // setup the grid to run the kernel
    int M = 96;
    dim3 grid( N/M, 1, 1);
    dim3 threads(M, 1, 1);

    // run the kernel
    for (int i = 0; i < 1000000; i++)
    {
        printf("%d\n", i);
        kernel1<<< grid, threads >>>(d_out, d_pdata, d_nlist, 3.0f*3.0f);
        CUT_CHECK_ERROR("Kernel execution failed");
    }

    // free memory
    free(h_nlist.list);
    free(h_nlist.n_neigh);
    free(h_pdata.pos);
    CUDA_SAFE_CALL(cudaFree(d_pdata.pos));
    CUDA_SAFE_CALL(cudaFree(d_nlist.list));
    CUDA_SAFE_CALL(cudaFree(d_nlist.n_neigh));
    CUDA_SAFE_CALL(cudaFree(d_out));
}

// vim:syntax=cpp
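// ----------------------------------------------------------------------------
// Build/run sketch (the file name nlist_bmark.cu is an assumption, not from the
// original source):
//   nvcc -O3 -o nlist_bmark nlist_bmark.cu               (kernel1 or kernel2, any CUDA card)
//   nvcc -O3 -arch sm_13 -o nlist_bmark nlist_bmark.cu   (with kernel3 uncommented, G200)
//   ./nlist_bmark
//
// The launch loop in main() only prints the iteration count, so timings would
// come from an external profiler. If self-contained timing is wanted, one
// possible sketch using the standard CUDA event API (in place of the launch
// loop, with a smaller iteration count and no per-iteration printf) would be:
//
//   cudaEvent_t start, stop;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start, 0);
//   for (int i = 0; i < 1000; i++)
//       kernel1<<< grid, threads >>>(d_out, d_pdata, d_nlist, 3.0f*3.0f);
//   cudaEventRecord(stop, 0);
//   cudaEventSynchronize(stop);
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);
//   printf("average time per launch: %f ms\n", ms / 1000.0f);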