CUDA numerical refocus
This project does the same numerical refocusing as the Python version, but uses CUDA to accelerate the calculation. The CUDA implementation runs roughly 4 times faster than the Python version. Check out my GitHub for more details about the source code.
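For reference, my reading of the kernel below is that it applies the angular-spectrum propagation factor in Fourier space, with km the wave number in the medium, nm the refractive index, res the wavelength in pixels, and d the refocusing distance:

$$H(k_x, k_y) = \exp\!\left(i\,d\left(\sqrt{k_m^2 - k_x^2 - k_y^2} - k_m\right)\right), \qquad k_m = \frac{2\pi\, n_m}{\mathrm{res}}$$

Evanescent components (where $k_x^2 + k_y^2 > k_m^2$) are set to zero.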
Kernel
#include <cufft.h>   // cufftComplex

#ifndef PI
#define PI 3.14159265358979323846
#endif

__global__ void fft_propogate_cu(cufftComplex *p_in, cufftComplex *p_out,
                                 double d, float nm, float res,
                                 int sizex, int sizey)
{
    // One thread per pixel of the Fourier-space field.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;

    if (i >= sizex || j >= sizey)
        return;

    // Wave number in the medium: km = 2*pi*nm / res (res = wavelength in pixels).
    double km = (2 * PI * nm) / res;

    // Map the thread index to the signed FFT frequency index
    // (same ordering as numpy.fft.fftfreq).
    double kx_o, ky_o;
    if (sizex % 2 != 0)
        kx_o = (i < (sizex + 1) / 2) ? i : -(sizex - i);
    else
        kx_o = (i < sizex / 2) ? i : -(sizex - i);

    if (sizey % 2 != 0)
        ky_o = (j < (sizey + 1) / 2) ? j : -(sizey - j);
    else
        ky_o = (j < sizey / 2) ? j : -(sizey - j);

    // Angular spatial frequencies.
    double kx = (kx_o / sizex) * 2 * PI;
    double ky = (ky_o / sizey) * 2 * PI;

    // Argument of the square root in the propagation factor.
    double root_km = km * km - kx * kx - ky * ky;

    // Linear index into the field; note that this indexing assumes
    // a square image (sizex == sizey).
    int idx = i * sizex + j;

    if (root_km > 0)
    {
        // Propagating component: multiply by exp(i*(sqrt(km^2-kx^2-ky^2) - km)*d).
        // com_mul / com_exp are complex multiply / complex exponential device
        // helpers (see the sketch after the kernel).
        cufftComplex I;
        I.x = 0.0f;
        I.y = (float)((sqrt(root_km) - km) * d);
        p_out[idx] = com_mul(p_in[idx], com_exp(I));
    }
    else
    {
        // Evanescent component: set to zero.
        p_out[idx].x = 0.0f;
        p_out[idx].y = 0.0f;
    }
}
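The kernel relies on two small complex-arithmetic device helpers, com_mul and com_exp, which are not shown above. A minimal sketch of what they are assumed to look like (complex multiplication and complex exponential on single-precision cufftComplex values):

__device__ cufftComplex com_mul(cufftComplex a, cufftComplex b)
{
    // (a.x + i a.y) * (b.x + i b.y)
    cufftComplex r;
    r.x = a.x * b.x - a.y * b.y;
    r.y = a.x * b.y + a.y * b.x;
    return r;
}

__device__ cufftComplex com_exp(cufftComplex z)
{
    // exp(z.x + i z.y) = e^{z.x} * (cos z.y + i sin z.y)
    cufftComplex r;
    float e = expf(z.x);
    r.x = e * cosf(z.y);
    r.y = e * sinf(z.y);
    return r;
}

On the host side, the refocusing step is a forward FFT, the kernel launch, and an inverse FFT. The actual host code is on GitHub; the sketch below is only illustrative (the buffer name d_field and the 16x16 block size are assumptions), using cuFFT's cufftPlan2d / cufftExecC2C:

// d_field: device buffer of sizex*sizey cufftComplex values (hypothetical name)
cufftHandle plan;
cufftPlan2d(&plan, sizex, sizey, CUFFT_C2C);

cufftExecC2C(plan, d_field, d_field, CUFFT_FORWARD);   // to Fourier space

dim3 block(16, 16);
dim3 grid((sizex + block.x - 1) / block.x,
          (sizey + block.y - 1) / block.y);
fft_propogate_cu<<<grid, block>>>(d_field, d_field, d, nm, res, sizex, sizey);

cufftExecC2C(plan, d_field, d_field, CUFFT_INVERSE);   // back to real space
// cuFFT's inverse transform is unnormalized: divide by sizex*sizey afterwards.

cufftDestroy(plan);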