// -----------------------------------------------------------------------------
// NOTE(review): this file is stored as ONE physical line that begins with "//",
// so as written a C/CUDA compiler treats the entire program text below as a
// single comment. The original line breaks must be restored before it builds.
//
// NOTE(review): some text appears to have been lost wherever the source
// contained a '<' ... '>' pair -- TODO recover from the original file:
//   * "#include" is followed by no header name;
//   * in main, "for(j=0;j>>(dr, dg, dd, di);" fuses the start of a loop with
//     the tail of a kernel launch; whatever stood between them (presumably
//     host data initialisation, device allocation/copies and the launch
//     configuration -- confirm) is not visible here;
//   * in kernel, "for(l=0;la) m=a;" fuses a loop header with a later
//     comparison against `a`;
//   * the kernel body stops mid-statement at "for(i=indx;i".
//
// What the visible text does:
//   * Matrix / Matrix2: width, height, stride plus an `elements` pointer
//     (float* and float2* respectively).
//   * creatematrix(mat,row,column): sets height=row and width=stride=column,
//     then malloc()s height*width*sizeof(float) bytes into mat->elements.
//     creatematrix2 does the same with sizeof(float2). Neither checks the
//     malloc() result, and the byte count is computed in an `int`.
//   * main: builds host matrices IMG (NDIR x NDIC) and RAW (NRAWR x NRAWC),
//     later copies di.elements into IMG.elements with cudaMemcpyDeviceToHost
//     (di.height*di.width*sizeof(float) bytes), then cudaFree()s the element
//     buffers of dr, dd, dg and di and returns 0.
//   * kernel(dr,dg,dd,di): derives indx/indy/jmprow/jmpcol from blockIdx,
//     threadIdx, gridDim and dd.width/dd.height, all held in (unsigned) short;
//     thnum = BLOCK_SIZE*threadIdx.y + threadIdx.x. It declares shared
//     sdg[2][BLOCK_SIZE*BLOCK_SIZE] and shared ints widthg/widthd/widthr,
//     which every thread assigns from dg.width/dd.width/dr.width.
//
// NOTE(review): in the visible text the return values of cudaMemcpy/cudaFree
// are discarded, and no free() of IMG.elements / RAW.elements appears before
// main returns -- verify against the complete file.
// -----------------------------------------------------------------------------
// Includes. #include #define NRAWR 500 #define NRAWC 3000 #define NGPSR 2 #define NGPSC 3000 #define NDIR 201 #define NDIC 2021 #define NIMGR 201 #define NIMGC 2021 #define BLOCK_SIZE 24 typedef struct { int width; int height; int stride; float* elements; } Matrix; typedef struct { int width; int height; int stride; float2* elements; } Matrix2; /* Headers */ void creatematrix (Matrix*,int,int); void creatematrix2(Matrix2*,int,int); __global__ void kernel (Matrix, Matrix, Matrix2, Matrix); Matrix RAW, IMG, GPS; Matrix2 DI; int main() { int i, j; Matrix dr, di, dg; Matrix2 dd; creatematrix(&IMG,NDIR,NDIC); creatematrix(&RAW,NRAWR,NRAWC); for(j=0;j>>(dr, dg, dd, di); i=di.height*di.width*sizeof(float); cudaMemcpy(IMG.elements, di.elements, i, cudaMemcpyDeviceToHost); cudaFree(dr.elements); cudaFree(dd.elements); cudaFree(dg.elements); cudaFree(di.elements); return 0; } void creatematrix(Matrix* mat,int row,int column) { int size; mat->height=row; mat->width=mat->stride=column; size = mat->height*mat->width*sizeof(float); mat->elements = (float*)malloc(size); } void creatematrix2(Matrix2* mat,int row,int column) { int size; mat->height=row; mat->width=mat->stride=column; size = mat->height*mat->width*sizeof(float2); mat->elements = (float2*)malloc(size); } __global__ void kernel (Matrix dr, Matrix dg, Matrix2 dd, Matrix di) { float temp, minx, sigma=998, resolution=2; short i, j, k, l, m, n, a, basis; unsigned short jmpcol =(dd.width/gridDim.x) * (1+blockIdx.x); unsigned short indy= blockIdx.x * (dd.width/gridDim.x) + threadIdx.x; unsigned short jmprow = (dd.height/gridDim.y) * (1+blockIdx.y); unsigned short indx= blockIdx.y * (dd.height/gridDim.y) + threadIdx.y; unsigned short thnum = (BLOCK_SIZE*threadIdx.y)+threadIdx.x; float2 var; __shared__ float sdg[2][BLOCK_SIZE*BLOCK_SIZE]; __shared__ int widthg; __shared__ int widthd; __shared__ int widthr; widthg=dg.width; widthd=dd.width; widthr=dr.width; a=BLOCK_SIZE*BLOCK_SIZE; n= widthg/a; for(l=0;la) m=a; for(i=indx;i