fixed dead flag for functions
This commit is contained in:
@@ -487,7 +487,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 558 ---------------------
|
||||
|
||||
__global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
__global__ void loop_cg_558_cuda_kernel_int(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -496,8 +496,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
|
||||
// Local needs
|
||||
__indexTypeInt _j;
|
||||
@@ -520,7 +520,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
|
||||
_sum = _p[_colidx[_k]] * _a[_k] + _sum;
|
||||
}
|
||||
_sum = __dvmh_warpReduceSum(_sum);
|
||||
if (lid == 0) {
|
||||
@@ -532,7 +532,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 558 ---------------------
|
||||
|
||||
__global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
__global__ void loop_cg_558_cuda_kernel_llong(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -541,8 +541,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeLLong _j;
|
||||
__indexTypeLLong rest_blocks, cur_blocks;
|
||||
@@ -564,7 +564,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
|
||||
_sum = _p[_colidx[_k]] * _a[_k] + _sum;
|
||||
}
|
||||
_sum = __dvmh_warpReduceSum(_sum);
|
||||
if (lid == 0) {
|
||||
@@ -752,7 +752,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 605 ---------------------
|
||||
|
||||
__global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
__global__ void loop_cg_605_cuda_kernel_int(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -761,8 +761,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeInt _j;
|
||||
__indexTypeInt rest_blocks, cur_blocks;
|
||||
@@ -784,7 +784,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
|
||||
_d = _z[_colidx[_k]] * _a[_k] + _d;
|
||||
}
|
||||
_d = __dvmh_warpReduceSum(_d);
|
||||
if (lid == 0) {
|
||||
@@ -796,7 +796,7 @@ typedef long long __indexTypeLLong;
|
||||
|
||||
//--------------------- Kernel for loop on line 605 ---------------------
|
||||
|
||||
__global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
__global__ void loop_cg_605_cuda_kernel_llong(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
|
||||
{
|
||||
|
||||
// Private variables
|
||||
@@ -805,8 +805,8 @@ typedef long long __indexTypeLLong;
|
||||
int cond_0;
|
||||
int __k;
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int tid = gid / warpSize;
|
||||
int lid = gid % warpSize;
|
||||
int tid = gid / 32;
|
||||
int lid = gid % 32;
|
||||
// Local needs
|
||||
__indexTypeLLong _j;
|
||||
__indexTypeLLong rest_blocks, cur_blocks;
|
||||
@@ -828,7 +828,7 @@ typedef long long __indexTypeLLong;
|
||||
__k < cond_0 ;
|
||||
_k = _k + warpSize, __k = __k + warpSize)
|
||||
{
|
||||
_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
|
||||
_d = _z[_colidx[_k]] * _a[_k] + _d;
|
||||
}
|
||||
_d = __dvmh_warpReduceSum(_d);
|
||||
if (lid == 0) {
|
||||
@@ -1727,10 +1727,11 @@ extern "C" {
|
||||
|
||||
// CUDA handler for loop on line 558
|
||||
|
||||
void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
{
|
||||
void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
|
||||
void *q_base, *p_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_q[4], d_p[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
DvmType idxTypeInKernel;
|
||||
dim3 blocks, threads;
|
||||
cudaStream_t stream;
|
||||
@@ -1742,15 +1743,15 @@ extern "C" {
|
||||
device_num = loop_get_device_num_(loop_ref);
|
||||
|
||||
// Get 'natural' bases
|
||||
p_rma_base = dvmh_get_natural_base(&device_num, _p_rma);
|
||||
q_base = dvmh_get_natural_base(&device_num, _q);
|
||||
p_base = dvmh_get_natural_base(&device_num, _p);
|
||||
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
|
||||
a_base = dvmh_get_natural_base(&device_num, _a);
|
||||
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
|
||||
|
||||
// Fill 'device' headers
|
||||
dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma);
|
||||
dvmh_fill_header_(&device_num, q_base, _q, d_q);
|
||||
dvmh_fill_header_(&device_num, p_base, _p, d_p);
|
||||
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
|
||||
dvmh_fill_header_(&device_num, a_base, _a, d_a);
|
||||
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
|
||||
@@ -1778,8 +1779,8 @@ extern "C" {
|
||||
}
|
||||
loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
|
||||
blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
|
||||
overallBlocks = blocksS[0];
|
||||
restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
|
||||
overallBlocks = blocksS[0]* dvmh_get_warp_size(loop_ref);;
|
||||
restBlocks = overallBlocks;
|
||||
addBlocks = 0;
|
||||
blocks = dim3(1, 1, 1);
|
||||
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
|
||||
@@ -1798,11 +1799,11 @@ extern "C" {
|
||||
}
|
||||
if (idxTypeInKernel == rt_INT)
|
||||
{
|
||||
loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
addBlocks += blocks.x;
|
||||
restBlocks -= blocks.x;
|
||||
@@ -2098,10 +2099,10 @@ extern "C" {
|
||||
|
||||
// CUDA handler for loop on line 605
|
||||
|
||||
void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _z[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
|
||||
{
|
||||
void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
void *r_base, *z_base, *colidx_base, *a_base, *rowstr_base;
|
||||
DvmType d_r[4], d_z[4], d_colidx[4], d_a[4], d_rowstr[4];
|
||||
DvmType idxTypeInKernel;
|
||||
dim3 blocks, threads;
|
||||
cudaStream_t stream;
|
||||
@@ -2113,15 +2114,15 @@ extern "C" {
|
||||
device_num = loop_get_device_num_(loop_ref);
|
||||
|
||||
// Get 'natural' bases
|
||||
z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
|
||||
r_base = dvmh_get_natural_base(&device_num, _r);
|
||||
z_base = dvmh_get_natural_base(&device_num, _z);
|
||||
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
|
||||
a_base = dvmh_get_natural_base(&device_num, _a);
|
||||
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
|
||||
|
||||
// Fill 'device' headers
|
||||
dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
|
||||
dvmh_fill_header_(&device_num, r_base, _r, d_r);
|
||||
dvmh_fill_header_(&device_num, z_base, _z, d_z);
|
||||
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
|
||||
dvmh_fill_header_(&device_num, a_base, _a, d_a);
|
||||
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
|
||||
@@ -2155,7 +2156,7 @@ extern "C" {
|
||||
blocks = dim3(1, 1, 1);
|
||||
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
|
||||
maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
|
||||
|
||||
|
||||
// GPU execution
|
||||
while (restBlocks > 0)
|
||||
{
|
||||
@@ -2169,15 +2170,15 @@ extern "C" {
|
||||
}
|
||||
if (idxTypeInKernel == rt_INT)
|
||||
{
|
||||
loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
|
||||
}
|
||||
addBlocks += blocks.x;
|
||||
restBlocks -= blocks.x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -550,11 +550,11 @@ c The conj grad iteration loop
|
||||
c---->
|
||||
c---------------------------------------------------------------------
|
||||
do cgit = 1, cgitmax
|
||||
|
||||
d = 0.0d0
|
||||
! DVM$ interval 11
|
||||
CDVM$ region
|
||||
|
||||
CDVM$ parallel (j) on p(j), private(sum,k), remote_access(p(:))
|
||||
!WARNING: for many processes, remote_access(p(:)) is needed
|
||||
CDVM$ parallel (j) on p(j), private(sum,k)
|
||||
do j=1,lastrow-firstrow+1
|
||||
sum = 0.d0
|
||||
do k=rowstr(j),rowstr(j+1)-1
|
||||
@@ -570,7 +570,7 @@ CDVM$ parallel (j) on q(j), reduction(SUM(d))
|
||||
CDVM$ end region
|
||||
alpha = rho / d
|
||||
rho0 = rho
|
||||
|
||||
! DVM$ end interval
|
||||
rho = 0.0d0
|
||||
CDVM$ region
|
||||
CDVM$ parallel (j) on r(j), private(d), reduction(SUM(rho))
|
||||
@@ -598,10 +598,10 @@ c Compute residual norm explicitly: ||r|| = ||x - A.z||
|
||||
c First, form A.z
|
||||
c The partition submatrix-vector multiply
|
||||
c---------------------------------------------------------------------
|
||||
|
||||
!WARNING: for many processes, remote_access(z(:)) is needed
|
||||
sum = 0.0d0
|
||||
CDVM$ region
|
||||
CDVM$ parallel (j) on r(j), private(d,k),remote_access(z(:))
|
||||
CDVM$ parallel (j) on r(j), private(d,k)
|
||||
do j=1,lastrow-firstrow+1
|
||||
d = 0.d0
|
||||
do k=rowstr(j),rowstr(j+1)-1
|
||||
|
||||
Reference in New Issue
Block a user