diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt
new file mode 100644
index 0000000..90ed1a4
--- /dev/null
+++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt
@@ -0,0 +1,2285 @@
+
+#include 
+#define dcmplx2 Complex
+#define cmplx2 Complex
+typedef int __indexTypeInt;
+typedef long long __indexTypeLLong;
+
+
+
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+    __global__ void loop_cg_229_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _i;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_i <= end_1)
+        {
+
+// Loop body
+            _x[_i] = 1.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+    __global__ void loop_cg_229_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+    {
+
+// Local needs
+        __indexTypeLLong _i;
+        __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_i <= end_1)
+        {
+
+// Loop body
+            _x[_i] = 1.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+    __global__ void loop_cg_233_cuda_kernel_int(double _p[], double _r[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _j;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+    __global__ void loop_cg_233_cuda_kernel_llong(double _p[], double _r[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+    {
+
+// Local needs
+        __indexTypeLLong _j;
+        __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 272 ---------------------
+
+    __global__ void loop_cg_272_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _j;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j]
* _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 272 --------------------- + + __global__ void loop_cg_272_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _i; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on 
line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _i; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + 
} + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_int(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_llong(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_int(double _r[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_llong(double _r[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + 
threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+ cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_int(double _q[], double _p[], double _d, double d_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_llong(double _q[], double _p[], double _d, double d_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_int(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_llong(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + 
+// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_int(double _p[], double _r[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _beta) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_llong(double _p[], double _r[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _beta) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+ cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_int(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_llong(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + + +#ifdef _MS_F_ +#define loop_cg_229_cuda_ loop_cg_229_cuda +#define loop_cg_233_cuda_ loop_cg_233_cuda +#define loop_cg_272_cuda_ loop_cg_272_cuda 
+#define loop_cg_285_cuda_ loop_cg_285_cuda +#define loop_cg_301_cuda_ loop_cg_301_cuda +#define loop_cg_347_cuda_ loop_cg_347_cuda +#define loop_cg_367_cuda_ loop_cg_367_cuda +#define loop_cg_522_cuda_ loop_cg_522_cuda +#define loop_cg_537_cuda_ loop_cg_537_cuda +#define loop_cg_558_cuda_ loop_cg_558_cuda +#define loop_cg_567_cuda_ loop_cg_567_cuda +#define loop_cg_577_cuda_ loop_cg_577_cuda +#define loop_cg_588_cuda_ loop_cg_588_cuda +#define loop_cg_605_cuda_ loop_cg_605_cuda +#define loop_cg_618_cuda_ loop_cg_618_cuda +#endif + +extern "C" { + extern DvmType loop_cg_618_cuda_kernel_llong_regs, loop_cg_618_cuda_kernel_int_regs, loop_cg_605_cuda_kernel_llong_regs, loop_cg_605_cuda_kernel_int_regs, loop_cg_588_cuda_kernel_llong_regs, loop_cg_588_cuda_kernel_int_regs, loop_cg_577_cuda_kernel_llong_regs, loop_cg_577_cuda_kernel_int_regs, loop_cg_567_cuda_kernel_llong_regs, loop_cg_567_cuda_kernel_int_regs, loop_cg_558_cuda_kernel_llong_regs, loop_cg_558_cuda_kernel_int_regs, loop_cg_537_cuda_kernel_llong_regs, loop_cg_537_cuda_kernel_int_regs, loop_cg_522_cuda_kernel_llong_regs, loop_cg_522_cuda_kernel_int_regs, loop_cg_367_cuda_kernel_llong_regs, loop_cg_367_cuda_kernel_int_regs, loop_cg_347_cuda_kernel_llong_regs, loop_cg_347_cuda_kernel_int_regs, loop_cg_301_cuda_kernel_llong_regs, loop_cg_301_cuda_kernel_int_regs, loop_cg_285_cuda_kernel_llong_regs, loop_cg_285_cuda_kernel_int_regs, loop_cg_272_cuda_kernel_llong_regs, loop_cg_272_cuda_kernel_int_regs, loop_cg_233_cuda_kernel_llong_regs, loop_cg_233_cuda_kernel_int_regs, loop_cg_229_cuda_kernel_llong_regs, loop_cg_229_cuda_kernel_int_regs; + + +// CUDA handler for loop on line 229 + + void loop_cg_229_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_229_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_229_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on 
line 233 + + void loop_cg_233_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_233_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_233_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 272 + + void loop_cg_272_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, 
x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_272_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_272_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 285 + + void loop_cg_285_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / 
threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_285_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_285_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 301 + + void loop_cg_301_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_301_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_301_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 347 + + void loop_cg_347_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = 
dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_347_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_347_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 367 + + void loop_cg_367_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { 
+ loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_367_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_367_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 522 + + void loop_cg_522_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _x[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *x_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_x[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_522_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_522_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 537 + + void loop_cg_537_cuda_(DvmType *loop_ref, DvmType 
_r[]) + { + void *r_base; + DvmType d_r[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + r_base = dvmh_get_natural_base(&device_num, _r); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, r_base, _r, d_r); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_537_cuda_kernel_int<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_537_cuda_kernel_llong<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 558 + + void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[]) + { + void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base; + DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_rma_base = dvmh_get_natural_base(&device_num, _p_rma); + q_base = dvmh_get_natural_base(&device_num, _q); + colidx_base = dvmh_get_natural_base(&device_num, _colidx); + a_base = dvmh_get_natural_base(&device_num, _a); + rowstr_base = dvmh_get_natural_base(&device_num, _rowstr); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma); + 
dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx); + dvmh_fill_header_(&device_num, a_base, _a, d_a); + dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref); + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_558_cuda_kernel_int<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_558_cuda_kernel_llong<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 567 + + void loop_cg_567_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[]) + { + void *q_base, *p_base; + DvmType d_q[4], d_p[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *d_grid; + double _d; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &d_grid, 0); + loop_red_init_(loop_ref, &red_num, &_d, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + p_base = dvmh_get_natural_base(&device_num, _p); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + 
loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_567_cuda_kernel_int<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_567_cuda_kernel_llong<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 577 + + void loop_cg_577_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _r[], DvmType _p[], DvmType _z[], double *_alpha) + { + void *q_base, *r_base, *p_base, *z_base; + DvmType d_q[4], d_r[4], d_p[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + r_base = dvmh_get_natural_base(&device_num, _r); + p_base = dvmh_get_natural_base(&device_num, _p); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + 
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_577_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            else
+            {
+                loop_cg_577_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+
+// Finish reduction
+        red_num = 1;
+        loop_red_finish(loop_ref, red_num);
+    }
+
+
+// CUDA handler for loop on line 588
+
+    void loop_cg_588_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], double *_beta)
+    {
+        void *p_base, *r_base;
+        DvmType d_p[4], d_r[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+        p_base = dvmh_get_natural_base(&device_num, _p);
+        r_base = dvmh_get_natural_base(&device_num, _r);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, p_base, _p, d_p);
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_int_regs, &threads, &stream, 0);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_llong_regs, &threads, &stream, 0);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks;
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_588_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            else
+            {
+                loop_cg_588_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+    }
+
+
+// CUDA handler for loop on line 605
+
+    void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+    {
+        void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
+        DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+        z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
+        r_base = dvmh_get_natural_base(&device_num, _r);
+        colidx_base = dvmh_get_natural_base(&device_num, _colidx);
+        a_base = dvmh_get_natural_base(&device_num, _a);
+        rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+        dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
+        dvmh_fill_header_(&device_num, a_base, _a, d_a);
+        dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_int_regs, &threads, &stream, 0);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_llong_regs, &threads, &stream, 0);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+        maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            else
+            {
+                loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+    }
+
+
+// CUDA handler for loop on line 618
+
+    void loop_cg_618_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _x[])
+    {
+        void *r_base, *x_base;
+        DvmType d_r[4], d_x[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        void *sum_grid;
+        double _sum;
+        DvmType red_num, num_of_red_blocks, fill_flag;
+        DvmType shared_mem;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+        red_num = 1;
+        loop_cuda_register_red(loop_ref, red_num, &sum_grid, 0);
+        loop_red_init_(loop_ref, &red_num, &_sum, 0);
+
+// Get 'natural' bases
+        r_base = dvmh_get_natural_base(&device_num, _r);
+        x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+        dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+        shared_mem = 8;
+#else
+        shared_mem = 0;
+#endif
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks;
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+        num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+        fill_flag = 0;
+        red_num = 1;
+        loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_618_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else
+            {
+                loop_cg_618_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+
+// Finish reduction
+        red_num = 1;
+        loop_red_finish(loop_ref, red_num);
+    }
+
+}