fixed dead flag for functions
This commit is contained in:
@@ -487,7 +487,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 558 ---------------------
|
||||
|
||||
__global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
__global__ void loop_cg_558_cuda_kernel_int(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -496,8 +496,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
|
||||
// Local needs
|
||||
__indexTypeInt _j;
|
||||
@@ -520,7 +520,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
|
||||
_sum = _p[_colidx[_k]] * _a[_k] + _sum;
|
||||
}
|
||||
_sum = __dvmh_warpReduceSum(_sum);
|
||||
if (lid == 0) {
|
||||
@@ -532,7 +532,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 558 ---------------------
|
||||
|
||||
__global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
__global__ void loop_cg_558_cuda_kernel_llong(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -541,8 +541,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeLLong _j;
|
||||
__indexTypeLLong rest_blocks, cur_blocks;
|
||||
@@ -564,7 +564,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
|
||||
_sum = _p[_colidx[_k]] * _a[_k] + _sum;
|
||||
}
|
||||
_sum = __dvmh_warpReduceSum(_sum);
|
||||
if (lid == 0) {
|
||||
@@ -752,7 +752,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 605 ---------------------
|
||||
|
||||
__global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
__global__ void loop_cg_605_cuda_kernel_int(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -761,8 +761,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeInt _j;
|
||||
__indexTypeInt rest_blocks, cur_blocks;
|
||||
@@ -784,7 +784,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
|
||||
_d = _z[_colidx[_k]] * _a[_k] + _d;
|
||||
}
|
||||
_d = __dvmh_warpReduceSum(_d);
|
||||
if (lid == 0) {
|
||||
@@ -796,7 +796,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 605 ---------------------
|
||||
|
||||
__global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
__global__ void loop_cg_605_cuda_kernel_llong(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -805,8 +805,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeLLong _j;
|
||||
__indexTypeLLong rest_blocks, cur_blocks;
|
||||
@@ -828,7 +828,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
|
||||
_d = _z[_colidx[_k]] * _a[_k] + _d;
|
||||
}
|
||||
_d = __dvmh_warpReduceSum(_d);
|
||||
if (lid == 0) {
|
||||
@@ -1727,10 +1727,11 @@ extern "C" {
|
||||
|
||||
// CUDA handler for loop on line 558
|
||||
|
||||
void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
{
|
||||
void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
|
||||
void *q_base, *p_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_q[4], d_p[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
DvmType idxTypeInKernel;
|
||||
dim3 blocks, threads;
|
||||
cudaStream_t stream;
|
||||
@@ -1742,15 +1743,15 @@ extern "C" {
|
||||
device_num = loop_get_device_num_(loop_ref);
|
||||
|
||||
// Get 'natural' bases
|
||||
p_rma_base = dvmh_get_natural_base(&device_num, _p_rma);
|
||||
q_base = dvmh_get_natural_base(&device_num, _q);
|
||||
p_base = dvmh_get_natural_base(&device_num, _p);
|
||||
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
|
||||
a_base = dvmh_get_natural_base(&device_num, _a);
|
||||
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
|
||||
|
||||
// Fill 'device' headers
|
||||
dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma);
|
||||
dvmh_fill_header_(&device_num, q_base, _q, d_q);
|
||||
dvmh_fill_header_(&device_num, p_base, _p, d_p);
|
||||
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
|
||||
dvmh_fill_header_(&device_num, a_base, _a, d_a);
|
||||
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
|
||||
@@ -1778,8 +1779,8 @@ extern "C" {
|
||||
}
|
||||
loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
|
||||
blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
|
||||
overallBlocks = blocksS[0];
|
||||
restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
|
||||
overallBlocks = blocksS[0]* dvmh_get_warp_size(loop_ref);;
|
||||
restBlocks = overallBlocks;
|
||||
addBlocks = 0;
|
||||
blocks = dim3(1, 1, 1);
|
||||
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
|
||||
@@ -1798,11 +1799,11 @@ extern "C" {
|
||||
}
|
||||
if (idxTypeInKernel == rt_INT)
|
||||
{
|
||||
loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
addBlocks += blocks.x;
|
||||
restBlocks -= blocks.x;
|
||||
@@ -2098,10 +2099,10 @@ extern "C" {
|
||||
|
||||
// CUDA handler for loop on line 605
|
||||
|
||||
void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _z[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
{
|
||||
void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
void *r_base, *z_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_r[4], d_z[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
DvmType idxTypeInKernel;
|
||||
dim3 blocks, threads;
|
||||
cudaStream_t stream;
|
||||
@@ -2113,15 +2114,15 @@ extern "C" {
|
||||
device_num = loop_get_device_num_(loop_ref);
|
||||
|
||||
// Get 'natural' bases
|
||||
z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
|
||||
r_base = dvmh_get_natural_base(&device_num, _r);
|
||||
z_base = dvmh_get_natural_base(&device_num, _z);
|
||||
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
|
||||
a_base = dvmh_get_natural_base(&device_num, _a);
|
||||
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
|
||||
|
||||
// Fill 'device' headers
|
||||
dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
|
||||
dvmh_fill_header_(&device_num, r_base, _r, d_r);
|
||||
dvmh_fill_header_(&device_num, z_base, _z, d_z);
|
||||
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
|
||||
dvmh_fill_header_(&device_num, a_base, _a, d_a);
|
||||
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
|
||||
@@ -2155,7 +2156,7 @@ extern "C" {
|
||||
blocks = dim3(1, 1, 1);
|
||||
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
|
||||
maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
|
||||
|
||||
|
||||
// GPU execution
|
||||
while (restBlocks > 0)
|
||||
{
|
||||
@@ -2169,15 +2170,15 @@ extern "C" {
|
||||
}
|
||||
if (idxTypeInKernel == rt_INT)
|
||||
{
|
||||
loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
addBlocks += blocks.x;
|
||||
restBlocks -= blocks.x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -550,11 +550,11 @@ c The conj grad iteration loop
|
||||
c---->
|
||||
c---------------------------------------------------------------------
|
||||
do cgit = 1, cgitmax
|
||||
|
||||
d = 0.0d0
|
||||
! DVM$ interval 11
|
||||
CDVM$ region
|
||||
|
||||
CDVM$ parallel (j) on p(j), private(sum,k), remote_access(p(:))
|
||||
!WARNING: for many processes, remote_access(p(:)) is needed
|
||||
CDVM$ parallel (j) on p(j), private(sum,k)
|
||||
do j=1,lastrow-firstrow+1
|
||||
sum = 0.d0
|
||||
do k=rowstr(j),rowstr(j+1)-1
|
||||
@@ -570,7 +570,7 @@ CDVM$ parallel (j) on q(j), reduction(SUM(d))
|
||||
CDVM$ end region
|
||||
alpha = rho / d
|
||||
rho0 = rho
|
||||
|
||||
! DVM$ end interval
|
||||
rho = 0.0d0
|
||||
CDVM$ region
|
||||
CDVM$ parallel (j) on r(j), private(d), reduction(SUM(rho))
|
||||
@@ -598,10 +598,10 @@ c Compute residual norm explicitly: ||r|| = ||x - A.z||
|
||||
c First, form A.z
|
||||
c The partition submatrix-vector multiply
|
||||
c---------------------------------------------------------------------
|
||||
|
||||
!WARNING: for many processes, remote_access(z(:)) is needed
|
||||
sum = 0.0d0
|
||||
CDVM$ region
|
||||
CDVM$ parallel (j) on r(j), private(d,k),remote_access(z(:))
|
||||
CDVM$ parallel (j) on r(j), private(d,k)
|
||||
do j=1,lastrow-firstrow+1
|
||||
d = 0.d0
|
||||
do k=rowstr(j),rowstr(j+1)-1
|
||||
|
||||
Reference in New Issue
Block a user