#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 256
// Parenthesised so the macro expands safely inside larger expressions.
#define NUM_ELEMENTS (BLOCK_SIZE * 15)

// Abort with a readable message if a CUDA runtime call fails. Without this a
// failed launch or copy would silently yield a bogus bandwidth figure.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Dynamically sized shared memory; its size is the 3rd launch parameter.
extern __shared__ int smem[];

/*
 * Shared-memory copy micro-benchmark.
 *
 * Each thread repeatedly copies its own word smem[tid] into 15 destination
 * slots (tid + 256 is written twice, then tid + 512 ... tid + 3584) and the
 * elapsed clock() ticks for all iterations are stored in output[0].
 *
 * Launch requirements:
 *   - one block of BLOCK_SIZE (256) threads; the hard-coded offsets assume it
 *   - at least NUM_ELEMENTS * sizeof(int) bytes of dynamic shared memory
 *     (highest index touched is 255 + 3584 = 3839)
 *
 * output          : device pointer, at least one int; receives the tick count
 * test_iterations : number of times the 15-store group is repeated
 */
__global__ void A1_kernel(int *output, int test_iterations)
{
    int start, stop, i;
    int tid = threadIdx.x;
    // volatile keeps the compiler from collapsing the repeated loads/stores.
    volatile int *smem1 = (int *)smem;

    // Shared memory starts uninitialised; give the source word a defined
    // value before it is read inside the timed loop.
    smem1[tid] = tid;
    __syncthreads();

    start = (int)clock();
    __syncthreads();

    #pragma unroll 130
    for (i = 0; i < test_iterations; i++) {
        // shuffle data around (manually unrolled)
        smem1[tid + 256]  = smem1[tid];
        smem1[tid + 256]  = smem1[tid];  // repeated twice to get 15 group transfers
        smem1[tid + 512]  = smem1[tid];
        smem1[tid + 768]  = smem1[tid];
        smem1[tid + 1024] = smem1[tid];
        smem1[tid + 1280] = smem1[tid];
        smem1[tid + 1536] = smem1[tid];
        smem1[tid + 1792] = smem1[tid];
        smem1[tid + 2048] = smem1[tid];
        smem1[tid + 2304] = smem1[tid];
        smem1[tid + 2560] = smem1[tid];
        smem1[tid + 2816] = smem1[tid];
        smem1[tid + 3072] = smem1[tid];
        smem1[tid + 3328] = smem1[tid];
        smem1[tid + 3584] = smem1[tid];
        __syncthreads();
    }

    stop = (int)clock();
    __syncthreads();

    // A single writer avoids every thread racing on output[0].
    if (tid == 0) {
        output[0] = stop - start;
    }
}

/*
 * Host driver: launches A1_kernel once with a single block, reads back the
 * tick count and prints the derived time and bandwidth figures.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error or if the
 * measured tick count is not positive.
 */
int main()
{
    const int grid_size = 1;
    const int block_size = BLOCK_SIZE;
    const int test_iterations = 100000;
    const int num_elements = NUM_ELEMENTS;
    // Bytes of shared memory exercised per loop iteration (15 * 256 ints).
    const int data_size = num_elements * (int)sizeof(int);
    // Reference bytes/clock the result is compared against (hard-coded by the
    // benchmark; adjust for the GPU under test).
    const double theoretical_bytes_per_clk = 16.0;

    int total_clk_cycles = 0;
    int ave_clk_cycles;
    int clock_khz = 0;
    int *d_output = NULL;
    double gpu_clk_rate, ave_clk_exact;
    double total_time, ave_time;
    double bandwidth, bandwidth2;

    // get GPU clk rate (attribute is reported in kHz -> convert to GHz)
    CUDA_CHECK(cudaDeviceGetAttribute(&clock_khz, cudaDevAttrClockRate, 0));
    gpu_clk_rate = clock_khz * 1e-6;
    printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

    // The kernel only ever writes output[0], so one int is enough.
    CUDA_CHECK(cudaMalloc((void **)&d_output, sizeof(int)));

    // kernel invocation; data_size is the dynamic shared-memory size
    A1_kernel<<<grid_size, block_size, data_size>>>(d_output, test_iterations);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while executing

    // transfer output from gpu to cpu
    CUDA_CHECK(cudaMemcpy(&total_clk_cycles, d_output, sizeof(int),
                          cudaMemcpyDeviceToHost));

    if (total_clk_cycles <= 0) {
        fprintf(stderr, "Invalid cycle count %d (clock wrapped or kernel did not run)\n",
                total_clk_cycles);
        CUDA_CHECK(cudaFree(d_output));
        return EXIT_FAILURE;
    }

    // calculate bandwidth
    // Truncated integer average is kept only for the "%d clk cycles" display;
    // the math below uses the exact average to avoid truncation error.
    ave_clk_cycles = total_clk_cycles / test_iterations;
    ave_clk_exact = (double)total_clk_cycles / test_iterations;
    total_time = total_clk_cycles / gpu_clk_rate / 1e9;  // (seconds)
    ave_time = ave_clk_exact / gpu_clk_rate / 1e9;       // (seconds)
    bandwidth = data_size / ave_time;                    // (byte/second)
    bandwidth2 = (double)data_size / ave_clk_exact;      // (byte/clock)

    // display results
    printf("\nSmem test used %d test iterations\n", test_iterations);
    printf("Total time of %f ms (%d clk cycles)\n", total_time * 1e3, total_clk_cycles);
    printf("Average time of %f us (%d clk cycles)\n", ave_time * 1e6, ave_clk_cycles);
    printf("Transfered data = %d bytes\n\n", data_size);
    printf("%f Bytes/clock (%f%% of theoretical)\n", bandwidth2,
           bandwidth2 * 100 / theoretical_bytes_per_clk);
    printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth * 1e-9);

    // free memory
    CUDA_CHECK(cudaFree(d_output));
    return 0;
}