/******************************************************************** This CUDA program deals with only the convolutions part of CSDD *********************************************************************/ #include #include #include "math.h" #include "matrix.h" #include "mex.h" #include "cuda_runtime.h" #include "cuda_runtime_api.h" #include "cuda.h" #include "math_functions.h" #include "device_functions.h" #define BIN_QTY 128 /************************************************************************/ /* Init CUDA */ /************************************************************************/ #if __DEVICE_EMULATION__ bool InitCUDA(void){return true;} #else bool InitCUDA(void) { int count = 0; int i = 0; cudaGetDeviceCount(&count); if(count == 0) { fprintf(stderr, "There is no device.\n"); return false; } for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } } if(i == count) { fprintf(stderr, "There is no device supporting CUDA.\n"); return false; } cudaSetDevice(i); printf("CUDA initialized.\n"); return true; } #endif /******************************************************************************/ /* The Code Begins - Equivalent of dericheCSDEMD(inimage,gaucoeffs,gau2coeffs)*/ /******************************************************************************/ /* ---------------------- Kernel Functions and Variables -------------------- */ __device__ double *inimage_d, *outimage_d, *rowcoeffs_d, *colcoeffs_d; __device__ double *tmpimage_d, *filtimage_d; __device__ int numRows_d, numCols_d, tilesize_d, numTiles_d; __device__ unsigned int check=0; __device__ unsigned int counter=0; extern "C" __global__ static void ProcessFrame_DeviceKernel(double *inimage_d,double *outimage_d,double *rowcoeffs_d,double *colcoeffs_d,double *tmpimage_d,double *filtimage_d,int numRows_d,int numCols_d,int tilesize_d,int numTiles_d,unsigned int check,unsigned int counter) { int Bx,Tx; //1-D Bx = blockIdx.x; Tx = threadIdx.x; //Bx_Tx = Bx*BIN_QTY*numTiles_d + Tx; //===================================================================== //STEP THROUGH 128 GREY(double)SCALE CHANNELS 0-1, 2-3, ..., 254-255 AND DO //LAPLACIAN OF GAUSSIAN CSD FILTERING, ACCUMULATING RESULT INTO SCOREIM int i; int grey = 2*(Bx/numTiles_d)+1; double dgrey = (double)grey + 0.01; double asum,M0,M1,M2,M3,M4; //===================================================================== //Rounds 1 and 2: Calculations for the first half of FILT1 // from here onwards Tx = rows = r //__syncthreads(); //FORWARD FILTER PASS ALONG ROWS asum = 1.0 + rowcoeffs_d[0] + rowcoeffs_d[1] + rowcoeffs_d[2] + rowcoeffs_d[3]; M1 = M2 = M3 = M4 = ((double)(inimage_d[((Bx%numTiles_d)*tilesize_d+Tx)]<=dgrey)/asum); for (i=0; i < numCols_d; i++) { M0 = (double)(inimage_d[((Bx%numTiles_d)*tilesize_d+Tx)+(i*numRows_d)]<=dgrey) - rowcoeffs_d[0]*M1 - rowcoeffs_d[1]*M2 - rowcoeffs_d[2]*M3 - rowcoeffs_d[3]*M4; filtimage_d[((Bx/numTiles_d)*numRows_d*numCols_d)+((Bx%numTiles_d)*tilesize_d+Tx)+(i*numRows_d)] += rowcoeffs_d[4]*M0 + rowcoeffs_d[5]*M1 + rowcoeffs_d[6]*M2 + rowcoeffs_d[7]*M3; M4 = M3; M3 = M2; M2 = M1; M1 = M0; } __syncthreads(); /*emulation mode hangs here*/ /* mutex+global_synchronization_barrier */ while(atomicCAS(&check,0,0x1F4)==0x1F4); counter++; check = 0; while(counter < (BIN_QTY*numTiles_d*tilesize_d)); //BACKWARD FILTER PASS ALONG ROWS asum = 1.0 + rowcoeffs_d[8] + rowcoeffs_d[9] + rowcoeffs_d[10] + rowcoeffs_d[11]; M1 = M2 = M3 = M4 = ((double)(inimage_d[numRows_d*numCols_d-((Bx%numTiles_d)*tilesize_d+Tx)-1]<=dgrey) / asum); for (i=0; i < numCols_d; i++) { M0 = (double)(inimage_d[numRows_d*numCols_d-((Bx%numTiles_d)*tilesize_d+Tx)-(i*numRows_d)-1]<=dgrey) - rowcoeffs_d[8]*M1 - rowcoeffs_d[9]*M2 - rowcoeffs_d[10]*M3 - rowcoeffs_d[11]*M4; filtimage_d[(((Bx/numTiles_d)+1)*numRows_d*numCols_d)-((Bx%numTiles_d)*tilesize_d+Tx)-(i*numRows_d)-1] += rowcoeffs_d[12]*M1 + rowcoeffs_d[13]*M2 + rowcoeffs_d[14]*M3 + rowcoeffs_d[15]*M4; M4 = M3; M3 = M2; M2 = M1; M1 = M0; } //__syncthreads(); return; } /* --------------------------------------------- Host's (CPU) Main( ) Code ----------------------------------- */ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { //All code and internal function calls go in here! //Declarations double *inimage_h, *outimage_h, *rowcoeffs_h, *colcoeffs_h, *filtimage_h; double dtmp; int numRows_h, numCols_h, i, j, count, tilesize_h, numTiles_h; const int *dims; //Get input image and coefficient arrays if (!mxIsDouble(prhs[0])) { mexErrMsgTxt("Input image should be double.\n"); } inimage_h = (double*)mxGetPr(prhs[0]); dims = mxGetDimensions(prhs[0]); numRows_h = dims[0]; numCols_h = dims[1]; rowcoeffs_h = (double*)mxGetPr(prhs[1]); dims = mxGetDimensions(prhs[1]); if (dims[0]*dims[1] != 16) mexErrMsgTxt("dericheIIR2d: rowcoeffs must be a vector of length 16\n"); colcoeffs_h = (double*)mxGetPr(prhs[2]); dims = mxGetDimensions(prhs[2]); if (dims[0]*dims[1] != 16) mexErrMsgTxt("dericheIIR2d: colcoeffs must be a vector of length 16\n"); //create output image, same size as input image plhs[0] = mxCreateNumericMatrix(numRows_h, numCols_h, mxDOUBLE_CLASS, mxREAL); mxSetM(plhs[0],numRows_h); mxSetN(plhs[0],numCols_h); outimage_h = (double*)mxGetPr(plhs[0]); tilesize_h = 300; numTiles_h = numRows_h/tilesize_h; // allocate memory on the CPU filtimage_h = (double *)malloc(sizeof(double)*numRows_h*numCols_h*BIN_QTY); memset(filtimage_h,0,sizeof(double)*numRows_h*numCols_h*BIN_QTY); // allocate memory on the gpu device cudaMalloc((void**)&inimage_d,sizeof(double)*numRows_h*numCols_h); cudaMalloc((void**)&outimage_d,sizeof(double)*numRows_h*numCols_h); cudaMalloc((void**)&rowcoeffs_d,sizeof(double)*16); cudaMalloc((void**)&colcoeffs_d,sizeof(double)*16); cudaMalloc((void**)&tmpimage_d,sizeof(double)*numRows_h*numCols_h*BIN_QTY); cudaMalloc((void**)&filtimage_d,sizeof(double)*numRows_h*numCols_h*BIN_QTY); cudaMemset(inimage_d,0,sizeof(double)*numRows_h*numCols_h); cudaMemset(outimage_d,0,sizeof(double)*numRows_h*numCols_h); cudaMemset(rowcoeffs_d,0,sizeof(double)*16); cudaMemset(colcoeffs_d,0,sizeof(double)*16); cudaMemset(tmpimage_d,0,sizeof(double)*numRows_h*numCols_h*BIN_QTY); cudaMemset(filtimage_d,0,sizeof(double)*numRows_h*numCols_h*BIN_QTY); cudaMemcpy(inimage_d, inimage_h, sizeof(double)*numRows_h*numCols_h, cudaMemcpyHostToDevice); cudaMemcpy(rowcoeffs_d, rowcoeffs_h, sizeof(double)*16, cudaMemcpyHostToDevice); cudaMemcpy(colcoeffs_d, colcoeffs_h, sizeof(double)*16, cudaMemcpyHostToDevice); numRows_d = numRows_h; numCols_d = numCols_h; tilesize_d = tilesize_h; numTiles_d = numTiles_h; // Set-up the execution configuration dim3 dimGrid(BIN_QTY*numTiles_h,1); /* the grid dimensions in number of blocks */ dim3 dimBlock(tilesize_h,1); /* the block dimensions in number of threads */ // Launch a kernel of threads to perform some function on the Device ProcessFrame_DeviceKernel<<>>(inimage_d,outimage_d,rowcoeffs_d,colcoeffs_d,tmpimage_d,filtimage_d,numRows_d,numCols_d,tilesize_d,numTiles_d,check,counter); cudaMemcpy(filtimage_h, filtimage_d, sizeof(double)*numRows_h*numCols_h*BIN_QTY, cudaMemcpyDeviceToHost); for(i=0;i