// includes, system
// NOTE(review): the header names on every #include line were lost from this
// copy of the file. Only headers that the calls in main() demonstrably need
// are restored below; confirm all of them against the original project.
#include <stdlib.h>   // malloc, calloc, free, EXIT_FAILURE
#include <stdio.h>    // printf, fprintf
// TODO(review): three further system headers were listed here; names unknown.

// includes, project
#include <cutil_inline.h>   // presumably provides the cutil*/cut* helpers used below - TODO confirm
// TODO(review): a second project header was listed here; name unknown.

// includes, kernels
// TODO(review): restore the #include of the file that defines CKernel.

////////////////////////////////////////////////////////////////////////////////
// declarations, forward
// NOTE(review): in this copy the linkage specifier below is followed directly
// by main(); a declaration that originally followed it may have been lost.
extern "C"

////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Selects a CUDA device, allocates two f_rows x f_cols float inputs and one
// f_cols x f_cols float output on host and device, uploads the inputs, runs
// CKernel once, downloads the output, and prints the elapsed wall time of
// "kernel launch + device-to-host copy".
//
// Command line: if a "device" flag is present the device is chosen by
// cutilDeviceInit(argc, argv); otherwise the device reported by
// cutGetMaxGflopsDeviceId() is used.
//
// Returns 0 on success, EXIT_FAILURE if a host allocation fails. CUDA and
// timer failures are handled by the cutilSafeCall / cutilCheckError wrappers.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice(cutGetMaxGflopsDeviceId());

    // Problem dimensions.
    const unsigned int f_rows = 1024;
    const unsigned int f_cols = 1000;

    // Launch geometry: grid_x * block_x == f_cols, so the 2D launch below
    // creates f_cols x f_cols threads, the same count as output elements.
    // NOTE(review): presumably CKernel maps one thread to one output element
    // and has these sizes baked in (it receives no dimensions) - confirm
    // against the kernel before changing any of these constants.
    const unsigned int grid_x  = 50;
    const unsigned int block_x = 20;

    // Byte counts are computed in size_t, the type the allocation and copy
    // APIs take, so larger dimensions cannot wrap a 32-bit intermediate.
    const size_t in_elems     = (size_t)f_rows * f_cols;
    const size_t mem_size     = sizeof(float) * in_elems;
    const size_t out_mem_size = sizeof(float) * f_cols * f_cols;

    // Host buffers. The inputs are zero-filled so that the run is
    // deterministic until real data is loaded at the placeholder below.
    float* h_idata1 = (float*) calloc(in_elems, sizeof(float));
    float* h_idata2 = (float*) calloc(in_elems, sizeof(float));
    // allocate mem for the result on host side
    float* h_odata  = (float*) malloc(out_mem_size);

    if (h_idata1 == NULL || h_idata2 == NULL || h_odata == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_idata1);   // free(NULL) is a no-op
        free(h_idata2);
        free(h_odata);
        return EXIT_FAILURE;
    }

    //pull in the data here

    // Device buffers and input upload.
    float* d_idata1 = NULL;
    float* d_idata2 = NULL;
    float* d_odata  = NULL;
    cutilSafeCall(cudaMalloc((void**) &d_idata1, mem_size));
    cutilSafeCall(cudaMalloc((void**) &d_idata2, mem_size));
    cutilSafeCall(cudaMalloc((void**) &d_odata, out_mem_size));

    cutilSafeCall(cudaMemcpy(d_idata1, h_idata1, mem_size, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_idata2, h_idata2, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(grid_x, grid_x, 1);
    dim3 threads(block_x, block_x, 1);

    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));

    CKernel<<< grid, threads >>>(d_idata1, d_idata2, d_odata);
    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError(), so check it right away instead
    // of letting it surface as a confusing failure in the copy below.
    cutilSafeCall(cudaGetLastError());

    // The blocking copy waits for the kernel to finish, so the timer covers
    // kernel execution plus the device-to-host transfer.
    cutilSafeCall(cudaMemcpy(h_odata, d_odata, out_mem_size, cudaMemcpyDeviceToHost));

    cutilCheckError(cutStopTimer(timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError(cutDeleteTimer(timer));

    // Release host and device memory.
    free(h_idata1);
    free(h_idata2);
    free(h_odata);
    cutilSafeCall(cudaFree(d_idata1));
    cutilSafeCall(cudaFree(d_idata2));
    cutilSafeCall(cudaFree(d_odata));

    return 0;
}