Getting NaN when evaluating PTX kernel file
Show older comments
Hello,
I have a Matlab version and MEX C version of a function and it works fine. I tried implementing a CUDA C version and compiling it using nvcc into PTX (no problems here) and using the GPU kernel Matlab function in the Parallel Computing Toolbox, but I get NaN values as outputs for my inputs (for the Matlab native and MEX C version I get the correct outputs).
I have tested some example CUDA codes adding vectors etc. and they seem to work OK.
Here is my procedure:
kernel = parallel.gpu.CUDAKernel('StressDueToSeg.ptx','StressDueToSeg.cu');
kernel.ThreadBlockSize = 1024;
kernel.GridSize = ceil(N/1024);
[~,~,~,~,~,~,~,~,~,~,~,~,...
s11, s22, s33, s12, s13, s23] = feval(kern,N,S,px,py,pz,p1x,p1y,p1z,p2x,p2y,p2z,bx,by,bz,a,NU,MU,...
sxx,syy,szz,sxy,syz,sxz);
The inputs are scalars 'N,S,a,MU,NU', 1D vectors of length N 'px,py,pz', and 1D vectors of length S 'p1x,p1y,p1z,p2x,p2y,p2z,bx,by,bz'.
My .cu code is as follows:
#include <math.h>
__global__ void StressDueToSeg(int N, int S, double *px, double *py, double *pz,
double *p1x, double *p1y, double *p1z,
double *p2x, double *p2y, double *p2z,
double *bx, double *by, double *bz,
const double a, const double MU, const double NU,
double *sxx, double *syy, double *szz,
double *sxy, double *syz, double *sxz)
{
double oneoverLp, common;
double vec1x, vec1y, vec1z;
double tpx, tpy, tpz;
double Rx, Ry, Rz, Rdt;
double ndx, ndy, ndz;
double d2, s1, s2, a2, a2_d2, a2d2inv;
double Ra, Rainv, Ra3inv, sRa3inv;
double s_03a, s_13a, s_05a, s_15a, s_25a;
double s_03b, s_13b, s_05b, s_15b, s_25b;
double s_03, s_13, s_05, s_15, s_25;
double m4p, m8p, m4pn, mn4pn, a2m8p;
double txbx, txby, txbz;
double dxbx, dxby, dxbz;
double dxbdt, dmdxx, dmdyy, dmdzz, dmdxy, dmdyz, dmdxz;
double tmtxx, tmtyy, tmtzz, tmtxy, tmtyz, tmtxz;
double tmdxx, tmdyy, tmdzz, tmdxy, tmdyz, tmdxz;
double tmtxbxx, tmtxbyy, tmtxbzz, tmtxbxy, tmtxbyz, tmtxbxz;
double dmtxbxx, dmtxbyy, dmtxbzz, dmtxbxy, dmtxbyz, dmtxbxz;
double tmdxbxx, tmdxbyy, tmdxbzz, tmdxbxy, tmdxbyz, tmdxbxz;
double I_03xx, I_03yy, I_03zz, I_03xy, I_03yz, I_03xz;
double I_13xx, I_13yy, I_13zz, I_13xy, I_13yz, I_13xz;
double I_05xx, I_05yy, I_05zz, I_05xy, I_05yz, I_05xz;
double I_15xx, I_15yy, I_15zz, I_15xy, I_15yz, I_15xz;
double I_25xx, I_25yy, I_25zz, I_25xy, I_25yz, I_25xz;
int seg;
int coord = threadIdx.x + blockIdx.x * blockDim.x ;
if (coord<N) {
//precompute some constants
m4p = 0.25 * MU / M_PI;
m8p = 0.5 * m4p;
m4pn = m4p / (1 - NU);
mn4pn = m4pn * NU;
a2 = a * a;
a2m8p = a2 * m8p;
for (seg=0; seg<S; seg++) { //loop over the segments
vec1x = p2x[seg] - p1x[seg];
vec1y = p2y[seg] - p1y[seg];
vec1z = p2z[seg] - p1z[seg];
oneoverLp = 1 / sqrt(vec1x*vec1x + vec1y*vec1y + vec1z*vec1z);
tpx = vec1x * oneoverLp;
tpy = vec1y * oneoverLp;
tpz = vec1z * oneoverLp;
Rx = px[coord] - p1x[seg];
Ry = py[coord] - p1y[seg];
Rz = pz[coord] - p1z[seg];
Rdt = Rx*tpx + Ry*tpy + Rz*tpz;
ndx = Rx - Rdt*tpx;
ndy = Ry - Rdt*tpy;
ndz = Rz - Rdt*tpz;
d2 = ndx*ndx + ndy*ndy + ndz*ndz;
s1 = -Rdt;
s2 = -((px[coord]-p2x[seg])*tpx + (py[coord]-p2y[seg])*tpy + (pz[coord]-p2z[seg])*tpz);
a2_d2 = a2 + d2;
a2d2inv = 1 / a2_d2;
Ra = sqrt(a2_d2 + s1*s1);
Rainv = 1 / Ra;
Ra3inv = Rainv * Rainv * Rainv;
sRa3inv = s1 * Ra3inv;
s_03a = s1 * Rainv * a2d2inv;
s_13a = -Rainv;
s_05a = (2*s_03a + sRa3inv) * a2d2inv;
s_15a = -Ra3inv;
s_25a = s_03a - sRa3inv;
Ra = sqrt(a2_d2 + s2*s2);
Rainv = 1 / Ra;
Ra3inv = Rainv * Rainv * Rainv;
sRa3inv = s2 * Ra3inv;
s_03b = s2 * Rainv * a2d2inv;
s_13b = -Rainv;
s_05b = (2*s_03b + sRa3inv) * a2d2inv;
s_15b = -Ra3inv;
s_25b = s_03b - sRa3inv;
s_03 = s_03b - s_03a;
s_13 = s_13b - s_13a;
s_05 = s_05b - s_05a;
s_15 = s_15b - s_15a;
s_25 = s_25b - s_25a;
txbx = tpy*bz[seg] - tpz*by[seg];
txby = tpz*bx[seg] - tpx*bz[seg];
txbz = tpx*by[seg] - tpy*bx[seg];
dxbx = ndy*bz[seg] - ndz*by[seg];
dxby = ndz*bx[seg] - ndx*bz[seg];
dxbz = ndx*by[seg] - ndy*bx[seg];
dxbdt = dxbx*tpx + dxby*tpy + dxbz*tpz;
dmdxx = ndx * ndx;
dmdyy = ndy * ndy;
dmdzz = ndz * ndz;
dmdxy = ndx * ndy;
dmdyz = ndy * ndz;
dmdxz = ndx * ndz;
tmtxx = tpx * tpx;
tmtyy = tpy * tpy;
tmtzz = tpz * tpz;
tmtxy = tpx * tpy;
tmtyz = tpy * tpz;
tmtxz = tpx * tpz;
tmdxx = 2 * tpx * ndx;
tmdyy = 2 * tpy * ndy;
tmdzz = 2 * tpz * ndz;
tmdxy = tpx*ndy + tpy*ndx;
tmdyz = tpy*ndz + tpz*ndy;
tmdxz = tpx*ndz + tpz*ndx;
tmtxbxx = 2 * tpx * txbx;
tmtxbyy = 2 * tpy * txby;
tmtxbzz = 2 * tpz * txbz;
tmtxbxy = tpx*txby + tpy*txbx;
tmtxbyz = tpy*txbz + tpz*txby;
tmtxbxz = tpx*txbz + tpz*txbx;
dmtxbxx = 2 * ndx * txbx;
dmtxbyy = 2 * ndy * txby;
dmtxbzz = 2 * ndz * txbz;
dmtxbxy = ndx*txby + ndy*txbx;
dmtxbyz = ndy*txbz + ndz*txby;
dmtxbxz = ndx*txbz + ndz*txbx;
tmdxbxx = 2 * tpx * dxbx;
tmdxbyy = 2 * tpy * dxby;
tmdxbzz = 2 * tpz * dxbz;
tmdxbxy = tpx*dxby + tpy*dxbx;
tmdxbyz = tpy*dxbz + tpz*dxby;
tmdxbxz = tpx*dxbz + tpz*dxbx;
common = m4pn * dxbdt;
I_03xx = common + m4pn*dmtxbxx - m4p*tmdxbxx;
I_03yy = common + m4pn*dmtxbyy - m4p*tmdxbyy;
I_03zz = common + m4pn*dmtxbzz - m4p*tmdxbzz;
I_03xy = m4pn*dmtxbxy - m4p*tmdxbxy;
I_03yz = m4pn*dmtxbyz - m4p*tmdxbyz;
I_03xz = m4pn*dmtxbxz - m4p*tmdxbxz;
I_13xx = -mn4pn * tmtxbxx;
I_13yy = -mn4pn * tmtxbyy;
I_13zz = -mn4pn * tmtxbzz;
I_13xy = -mn4pn * tmtxbxy;
I_13yz = -mn4pn * tmtxbyz;
I_13xz = -mn4pn * tmtxbxz;
I_05xx = common*(a2+dmdxx) - a2m8p*tmdxbxx;
I_05yy = common*(a2+dmdyy) - a2m8p*tmdxbyy;
I_05zz = common*(a2+dmdzz) - a2m8p*tmdxbzz;
I_05xy = common*dmdxy - a2m8p*tmdxbxy;
I_05yz = common*dmdyz - a2m8p*tmdxbyz;
I_05xz = common*dmdxz - a2m8p*tmdxbxz;
I_15xx = a2m8p*tmtxbxx - common*tmdxx;
I_15yy = a2m8p*tmtxbyy - common*tmdyy;
I_15zz = a2m8p*tmtxbzz - common*tmdzz;
I_15xy = a2m8p*tmtxbxy - common*tmdxy;
I_15yz = a2m8p*tmtxbyz - common*tmdyz;
I_15xz = a2m8p*tmtxbxz - common*tmdxz;
I_25xx = common * tmtxx;
I_25yy = common * tmtyy;
I_25zz = common * tmtzz;
I_25xy = common * tmtxy;
I_25yz = common * tmtyz;
I_25xz = common * tmtxz;
sxx[coord] += I_03xx*s_03 + I_13xx*s_13 + I_05xx*s_05 +
I_15xx*s_15 + I_25xx*s_25;
syy[coord] += I_03yy*s_03 + I_13yy*s_13 + I_05yy*s_05 +
I_15yy*s_15 + I_25yy*s_25;
szz[coord] += I_03zz*s_03 + I_13zz*s_13 + I_05zz*s_05 +
I_15zz*s_15 + I_25zz*s_25;
sxy[coord] += I_03xy*s_03 + I_13xy*s_13 + I_05xy*s_05 +
I_15xy*s_15 + I_25xy*s_25;
syz[coord] += I_03yz*s_03 + I_13yz*s_13 + I_05yz*s_05 +
I_15yz*s_15 + I_25yz*s_25;
sxz[coord] += I_03xz*s_03 + I_13xz*s_13 + I_05xz*s_05 +
I_15xz*s_15 + I_25xz*s_25;
}
}
return;
}
I am new to CUDA so perhaps it's an implementation problem; however, the MEX C code is essentially very similar (it has a 2nd loop with counter 'coord', rather than coord = threadIdx.x + blockIdx.x * blockDim.x) and that works OK.
Please enlighten me.
Thank you
F
Accepted Answer
More Answers (0)
Categories
Find more on Get Started with GPU Coder in Help Center and File Exchange
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!