diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt
new file mode 100644
index 0000000..90ed1a4
--- /dev/null
+++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cg.DVMH_cuda.cu_opt
@@ -0,0 +1,2285 @@
+
+#include 
+#define dcmplx2 Complex
+#define cmplx2 Complex
+typedef int __indexTypeInt;
+typedef long long __indexTypeLLong;
+
+
+
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+    __global__ void loop_cg_229_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _i;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_i <= end_1)
+        {
+
+// Loop body
+            _x[_i] = 1.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+    __global__ void loop_cg_229_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+    {
+
+// Local needs
+        __indexTypeLLong _i;
+        __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_i <= end_1)
+        {
+
+// Loop body
+            _x[_i] = 1.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+    __global__ void loop_cg_233_cuda_kernel_int(double _p[], double _r[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _j;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+    __global__ void loop_cg_233_cuda_kernel_llong(double _p[], double _r[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+    {
+
+// Local needs
+        __indexTypeLLong _j;
+        __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+        }
+    }
+
+
+//--------------------- Kernel for loop on line 272 ---------------------
+
+    __global__ void loop_cg_272_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+    {
+
+// Local needs
+        __indexTypeInt _j;
+        __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+        rest_blocks = add_blocks + blockIdx.x;
+        cur_blocks = rest_blocks;
+        _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+        if (_j <= end_1)
+        {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j]
* _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 272 --------------------- + + __global__ void loop_cg_272_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _i; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on 
line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _i; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + 
} + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_int(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_llong(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_int(double _r[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_llong(double _r[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + 
threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+ cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_int(double _q[], double _p[], double _d, double d_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_llong(double _q[], double _p[], double _d, double d_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_int(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_llong(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + 
+// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_int(double _p[], double _r[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _beta) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_llong(double _p[], double _r[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _beta) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+ cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_int(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_llong(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + + +#ifdef _MS_F_ +#define loop_cg_229_cuda_ loop_cg_229_cuda +#define loop_cg_233_cuda_ loop_cg_233_cuda +#define loop_cg_272_cuda_ loop_cg_272_cuda 
+#define loop_cg_285_cuda_ loop_cg_285_cuda +#define loop_cg_301_cuda_ loop_cg_301_cuda +#define loop_cg_347_cuda_ loop_cg_347_cuda +#define loop_cg_367_cuda_ loop_cg_367_cuda +#define loop_cg_522_cuda_ loop_cg_522_cuda +#define loop_cg_537_cuda_ loop_cg_537_cuda +#define loop_cg_558_cuda_ loop_cg_558_cuda +#define loop_cg_567_cuda_ loop_cg_567_cuda +#define loop_cg_577_cuda_ loop_cg_577_cuda +#define loop_cg_588_cuda_ loop_cg_588_cuda +#define loop_cg_605_cuda_ loop_cg_605_cuda +#define loop_cg_618_cuda_ loop_cg_618_cuda +#endif + +extern "C" { + extern DvmType loop_cg_618_cuda_kernel_llong_regs, loop_cg_618_cuda_kernel_int_regs, loop_cg_605_cuda_kernel_llong_regs, loop_cg_605_cuda_kernel_int_regs, loop_cg_588_cuda_kernel_llong_regs, loop_cg_588_cuda_kernel_int_regs, loop_cg_577_cuda_kernel_llong_regs, loop_cg_577_cuda_kernel_int_regs, loop_cg_567_cuda_kernel_llong_regs, loop_cg_567_cuda_kernel_int_regs, loop_cg_558_cuda_kernel_llong_regs, loop_cg_558_cuda_kernel_int_regs, loop_cg_537_cuda_kernel_llong_regs, loop_cg_537_cuda_kernel_int_regs, loop_cg_522_cuda_kernel_llong_regs, loop_cg_522_cuda_kernel_int_regs, loop_cg_367_cuda_kernel_llong_regs, loop_cg_367_cuda_kernel_int_regs, loop_cg_347_cuda_kernel_llong_regs, loop_cg_347_cuda_kernel_int_regs, loop_cg_301_cuda_kernel_llong_regs, loop_cg_301_cuda_kernel_int_regs, loop_cg_285_cuda_kernel_llong_regs, loop_cg_285_cuda_kernel_int_regs, loop_cg_272_cuda_kernel_llong_regs, loop_cg_272_cuda_kernel_int_regs, loop_cg_233_cuda_kernel_llong_regs, loop_cg_233_cuda_kernel_int_regs, loop_cg_229_cuda_kernel_llong_regs, loop_cg_229_cuda_kernel_int_regs; + + +// CUDA handler for loop on line 229 + + void loop_cg_229_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_229_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_229_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on 
line 233 + + void loop_cg_233_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_233_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_233_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 272 + + void loop_cg_272_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, 
x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_272_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_272_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 285 + + void loop_cg_285_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / 
threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_285_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_285_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 301 + + void loop_cg_301_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_301_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_301_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 347 + + void loop_cg_347_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = 
dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_347_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_347_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 367 + + void loop_cg_367_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { 
+ loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_367_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_367_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 522 + + void loop_cg_522_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _x[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *x_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_x[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_522_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_522_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 537 + + void loop_cg_537_cuda_(DvmType *loop_ref, DvmType 
_r[]) + { + void *r_base; + DvmType d_r[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + r_base = dvmh_get_natural_base(&device_num, _r); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, r_base, _r, d_r); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_537_cuda_kernel_int<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_537_cuda_kernel_llong<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 558 + + void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[]) + { + void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base; + DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_rma_base = dvmh_get_natural_base(&device_num, _p_rma); + q_base = dvmh_get_natural_base(&device_num, _q); + colidx_base = dvmh_get_natural_base(&device_num, _colidx); + a_base = dvmh_get_natural_base(&device_num, _a); + rowstr_base = dvmh_get_natural_base(&device_num, _rowstr); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma); + 
dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx); + dvmh_fill_header_(&device_num, a_base, _a, d_a); + dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref); + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_558_cuda_kernel_int<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_558_cuda_kernel_llong<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 567 + + void loop_cg_567_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[]) + { + void *q_base, *p_base; + DvmType d_q[4], d_p[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *d_grid; + double _d; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &d_grid, 0); + loop_red_init_(loop_ref, &red_num, &_d, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + p_base = dvmh_get_natural_base(&device_num, _p); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + 
loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_567_cuda_kernel_int<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_567_cuda_kernel_llong<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 577 + + void loop_cg_577_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _r[], DvmType _p[], DvmType _z[], double *_alpha) + { + void *q_base, *r_base, *p_base, *z_base; + DvmType d_q[4], d_r[4], d_p[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + r_base = dvmh_get_natural_base(&device_num, _r); + p_base = dvmh_get_natural_base(&device_num, _p); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + 
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_577_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            else
+            {
+                loop_cg_577_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+
+// Finish reduction
+        red_num = 1;
+        loop_red_finish(loop_ref, red_num);
+    }
+
+
+// CUDA handler for loop on line 588
+
+    void loop_cg_588_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], double *_beta)
+    {
+        void *p_base, *r_base;
+        DvmType d_p[4], d_r[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+        p_base = dvmh_get_natural_base(&device_num, _p);
+        r_base = dvmh_get_natural_base(&device_num, _r);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, p_base, _p, d_p);
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_int_regs, &threads, &stream, 0);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_llong_regs, &threads, &stream, 0);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks;
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_588_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            else
+            {
+                loop_cg_588_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+    }
+
+
+// CUDA handler for loop on line 605
+
+    void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+    {
+        void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
+        DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+        z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
+        r_base = dvmh_get_natural_base(&device_num, _r);
+        colidx_base = dvmh_get_natural_base(&device_num, _colidx);
+        a_base = dvmh_get_natural_base(&device_num, _a);
+        rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+        dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
+        dvmh_fill_header_(&device_num, a_base, _a, d_a);
+        dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_int_regs, &threads, &stream, 0);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_llong_regs, &threads, &stream, 0);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+        maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            else
+            {
+                loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+    }
+
+
+// CUDA handler for loop on line 618
+
+    void loop_cg_618_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _x[])
+    {
+        void *r_base, *x_base;
+        DvmType d_r[4], d_x[4];
+        DvmType idxTypeInKernel;
+        dim3 blocks, threads;
+        cudaStream_t stream;
+        DvmType idxL[1], idxH[1], loopSteps[1];
+        DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+        void *sum_grid;
+        double _sum;
+        DvmType red_num, num_of_red_blocks, fill_flag;
+        DvmType shared_mem;
+        DvmType device_num;
+
+// Get device number
+        device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+        red_num = 1;
+        loop_cuda_register_red(loop_ref, red_num, &sum_grid, 0);
+        loop_red_init_(loop_ref, &red_num, &_sum, 0);
+
+// Get 'natural' bases
+        r_base = dvmh_get_natural_base(&device_num, _r);
+        x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+        dvmh_fill_header_(&device_num, r_base, _r, d_r);
+        dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+        idxTypeInKernel = loop_guess_index_type_(loop_ref);
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int))
+        {
+            idxTypeInKernel = rt_INT;
+        }
+        if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long))
+        {
+            idxTypeInKernel = rt_LLONG;
+        }
+
+// Get CUDA configuration parameters
+        threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+        shared_mem = 8;
+#else
+        shared_mem = 0;
+#endif
+        if (idxTypeInKernel == rt_INT)
+        {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+        }
+        else
+        {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+        }
+        loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+        blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+        overallBlocks = blocksS[0];
+        restBlocks = overallBlocks;
+        addBlocks = 0;
+        blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+        num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+        fill_flag = 0;
+        red_num = 1;
+        loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+        maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+        while (restBlocks > 0)
+        {
+            if (restBlocks <= maxBlocks)
+            {
+                blocks = restBlocks;
+            }
+            else
+            {
+                blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT)
+            {
+                loop_cg_618_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else
+            {
+                loop_cg_618_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+        }
+
+// Finish reduction
+        red_num = 1;
+        loop_red_finish(loop_ref, red_num);
+    }
+
+}