#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 256
// Parenthesized so the macro expands safely inside larger expressions.
#define NUM_ELEMENTS (256 * 7)

// Abort with file/line context when a CUDA runtime call fails, instead of
// letting a sticky error silently corrupt the reported measurements.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

extern __shared__ char smem[];

/*
 * Shared-memory copy timing kernel.
 *
 * Each thread repeatedly copies seven ints from the lower half of the dynamic
 * shared-memory buffer (offsets tid + k*BLOCK_SIZE, k = 0..6) into the upper
 * half (offsets tid + NUM_ELEMENTS + k*BLOCK_SIZE), with a block barrier after
 * every iteration, and measures the elapsed clock() ticks around the loop.
 *
 * Launch requirements (as used by main):
 *   - blockDim.x == BLOCK_SIZE threads (indices are strided by BLOCK_SIZE)
 *   - dynamic shared memory >= 2 * NUM_ELEMENTS * sizeof(int) bytes
 *
 * output          device buffer; only output[0] is written (elapsed ticks
 *                 observed by thread 0).
 * test_iterations number of timed copy iterations.
 */
__global__ void A3_kernel(int *output, int test_iterations)
{
    unsigned int start, stop;
    int i;
    int tid = threadIdx.x;
    volatile int* smem1 = (int*)smem;

    // Give the source half defined contents before timing starts; shared
    // memory is uninitialized at kernel entry. Each thread only ever reads
    // back the slots it wrote itself.
    for (i = 0; i < NUM_ELEMENTS; i += BLOCK_SIZE) {
        smem1[tid + i] = tid;
    }

    // Unsigned arithmetic keeps (stop - start) well-defined if the counter
    // wraps during the measurement.
    start = (unsigned int)clock();
    __syncthreads();

    #pragma unroll 75
    for (i = 0; i < test_iterations; i++) {
        smem1[tid + NUM_ELEMENTS + 0 * BLOCK_SIZE] = smem1[tid + 0 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 1 * BLOCK_SIZE] = smem1[tid + 1 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 2 * BLOCK_SIZE] = smem1[tid + 2 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 3 * BLOCK_SIZE] = smem1[tid + 3 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 4 * BLOCK_SIZE] = smem1[tid + 4 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 5 * BLOCK_SIZE] = smem1[tid + 5 * BLOCK_SIZE];
        smem1[tid + NUM_ELEMENTS + 6 * BLOCK_SIZE] = smem1[tid + 6 * BLOCK_SIZE];
        __syncthreads();
    }

    stop = (unsigned int)clock();
    __syncthreads();

    // A single writer avoids every thread in the block racing on output[0].
    if (tid == 0) {
        output[0] = (int)(stop - start);
    }
}

/*
 * Host driver: launches A3_kernel once with a single block, reads back the
 * elapsed cycle count and prints the derived timing / bandwidth figures.
 */
int main()
{
    int num_elements, data_size;
    int grid_size, block_size;
    int test_iterations;
    int total_clk_cycles, ave_clk_cycles;
    int *h_output, *d_output;
    float bandwidth, bandwidth2, gpu_clk_rate;
    float total_time, ave_time;

    // get GPU clk rate (scaled by 1e-6 and reported as GHz below)
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, 0));
    gpu_clk_rate = deviceProp.clockRate * 1e-6;
    printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

    grid_size = 1;
    block_size = BLOCK_SIZE;
    test_iterations = 100000;
    num_elements = NUM_ELEMENTS;
    data_size = num_elements * sizeof(int);

    // allocate arrays
    h_output = (int*)malloc(data_size);
    if (h_output == NULL) {
        fprintf(stderr, "Host allocation of %d bytes failed\n", data_size);
        return EXIT_FAILURE;
    }

    // allocate CUDA arrays
    CUDA_CHECK(cudaMalloc((void **) &d_output, data_size));
    // The kernel writes only output[0]; zero the rest so the h_output[...]
    // values printed below are defined rather than stale device memory.
    CUDA_CHECK(cudaMemset(d_output, 0, data_size));

    // kernel invocation: dynamic shared memory holds source + destination
    // halves, hence twice data_size.
    A3_kernel <<< grid_size, block_size, data_size*2 >>> ( d_output, test_iterations );
    CUDA_CHECK(cudaGetLastError());   // catches launch-configuration errors

    // transfer output from gpu to cpu (blocking copy; also surfaces any
    // asynchronous kernel execution error)
    CUDA_CHECK(cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost));

    // calculate bandwidth
    total_clk_cycles = h_output[0];
    ave_clk_cycles = total_clk_cycles / test_iterations;
    total_time = total_clk_cycles / gpu_clk_rate / 1e9;   // (seconds)
    ave_time = ave_clk_cycles / gpu_clk_rate / 1e9;       // (seconds)
    bandwidth = data_size / ave_time;                     // (byte/second)
    bandwidth2 = float(data_size)/ave_clk_cycles;

    // display results
    printf("\nSmem test used %d test iterations\n", test_iterations);
    printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);
    printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);
    printf("Transfered data = %d bytes\n\n", data_size);
    printf("%f Bytes/clock (%f%% of theoretical)\n", bandwidth2, bandwidth2*100/16);
    printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

    printf("h_output[%d] = %d\n", 1225, h_output[1225]);
    printf("h_output[%d] = %d\n", 515, h_output[ 515]);

    // free memory
    free(h_output);
    CUDA_CHECK(cudaFree(d_output));