diff --git a/dvm/fdvm/trunk/Sage/h/tag b/dvm/fdvm/trunk/Sage/h/tag index 77be3b4..2f30999 100644 --- a/dvm/fdvm/trunk/Sage/h/tag +++ b/dvm/fdvm/trunk/Sage/h/tag @@ -236,6 +236,7 @@ #define DVM_EXIT_INTERVAL_DIR 639 /* DVM-F */ #define DVM_TEMPLATE_CREATE_DIR 640 /* DVM-F */ #define DVM_TEMPLATE_DELETE_DIR 641 /* DVM-F */ +#define PRIVATE_AR_DECL 642 /* DVM-F */ /***************** variant tags for low level nodes ********************/ diff --git a/dvm/fdvm/trunk/Sage/h/tag.h b/dvm/fdvm/trunk/Sage/h/tag.h index 38e9115..abba34b 100644 --- a/dvm/fdvm/trunk/Sage/h/tag.h +++ b/dvm/fdvm/trunk/Sage/h/tag.h @@ -238,7 +238,8 @@ script using "tag". Run make tag.h to regenerate this file */ tag [ DVM_EXIT_INTERVAL_DIR ] = "DVM_EXIT_INTERVAL_DIR"; tag [ DVM_TEMPLATE_CREATE_DIR ] = "DVM_TEMPLATE_CREATE_DIR"; tag [ DVM_TEMPLATE_DELETE_DIR ] = "DVM_TEMPLATE_DELETE_DIR"; - + tag [ PRIVATE_AR_DECL ] = "PRIVATE_AR_DECL"; + /***************** variant tags for low level nodes ********************/ tag [ INT_VAL ] = "INT_VAL"; diff --git a/dvm/fdvm/trunk/Sage/lib/include/unparseC++.def b/dvm/fdvm/trunk/Sage/lib/include/unparseC++.def index ae74bc3..ef81403 100644 --- a/dvm/fdvm/trunk/Sage/lib/include/unparseC++.def +++ b/dvm/fdvm/trunk/Sage/lib/include/unparseC++.def @@ -139,6 +139,8 @@ DEFNODECODE(CONT_STAT, "%CMNT%PUTTABcontinue;%NL", 's',0,BIFNODE) DEFNODECODE(VAR_DECL, "%CMNT%SETFLAG(VARDECL)%IF (%CHECKFLAG(ENUM) == %NULL)%IF (%CHECKFLAG(CLASSDECL) != %NULL)%PROTECTION%ENDIF%PUTTAB%DECLSPEC%TYPE %ENDIF%LL1%IF (%CHECKFLAG(ENUM) == %NULL);%ENDIF%UNSETFLAG(VARDECL)%NL", 's',0,BIFNODE) +DEFNODECODE(PRIVATE_AR_DECL, "%CMNT%PUTTABPrivateArray<%LL1,%LL2> %LL3;%NL", +'s',0,BIFNODE) DEFNODECODE(PARAM_DECL, "%ERROR", 's',0,BIFNODE) DEFNODECODE(COMM_STAT, "%ERROR", diff --git a/dvm/fdvm/trunk/fdvm/acc.cpp b/dvm/fdvm/trunk/fdvm/acc.cpp index 1f1729f..02e9cbd 100644 --- a/dvm/fdvm/trunk/fdvm/acc.cpp +++ b/dvm/fdvm/trunk/fdvm/acc.cpp @@ -736,6 +736,12 @@ SgSymbol *RedCountSymbol(SgStatement *scope) } +char *PointerNameForPrivateArray(SgSymbol *symb) +{ + char *name = new char[strlen(symb->identifier())+4]; + sprintf(name, "_%s_p", symb->identifier()); + return name; +} SgSymbol *OverallBlocksSymbol() { @@ -1224,7 +1230,7 @@ int TestLocal(SgExpression *list) return (0); } -int is_deleted_module_symbol(SgSymbol *s) +int is_deleted_module_symbol(SgSymbol *s) // deleted because it was renamed (parser/sym.c: function delete_symbol()) { if (!strcmp("***", s->identifier())) return 1; @@ -1243,11 +1249,11 @@ void EnterDataRegionForVariablesInMainProgram(SgStatement *st) } s = cur_func->symbol()->next(); while (IS_BY_USE(s)) - { + { if (!is_deleted_module_symbol(s) && IS_ARRAY(s) && s->variant() == VARIABLE_NAME && !IS_ALLOCATABLE(s) && !IS_POINTER_F90(s) && !HEADER(s) ) st->insertStmtAfter(*DataEnter(new SgVarRefExp(s),ConstRef(0)),*st->controlParent()); s = s->next(); - } + } } void ExitDataRegionForVariablesInMainProgram(SgStatement *st) @@ -2688,6 +2694,7 @@ void ACC_CreateParallelLoop(int ipl, SgStatement *first_do, int nloop, SgStateme // creating private_list private_list = clause[PRIVATE_] ? clause[PRIVATE_]->lhs() : NULL; + dost = InnerMostLoop(first_do, nloop); // error checking @@ -2700,7 +2707,8 @@ void ACC_CreateParallelLoop(int ipl, SgStatement *first_do, int nloop, SgStateme for_shadow_compute = clause[SHADOW_COMPUTE_] ? 1 : 0; // for optimization of shadow_compute uses_list = UsesList(dost->lexNext(), lastStmtOfDo(dost)); RefInExpr(IsRedBlack(nloop), _READ_); // add to uses_list variables used in start-expression of redblack loop - UsesInPrivateArrayDeclarations(private_list); // add to uses_list variables used in private array declarations + if (!options.isOn(C_CUDA)) + UsesInPrivateArrayDeclarations(private_list); // add to uses_list variables used in private array declarations if(USE_STATEMENTS_ARE_REQUIRED) // || !IN_COMPUTE_REGION) CorrectUsesList(); for_shadow_compute = 0; @@ -2934,8 +2942,8 @@ int CreateLoopForSequence(SgStatement *first) } void doStatementsToPerformByHandler(int ilh, SgSymbol *adapter_symb, SgSymbol *hostproc_symb,int is_parloop,int interface) -{ SgExpression *arg_list, *base_list, *copy_uses_list, *copy_arg_list, *red_dim_list, *red_bound_list; - int numb, numb_r, numb_b; +{ SgExpression *arg_list, *base_list, *copy_uses_list, *copy_arg_list, *red_dim_list, *red_bound_list, *private_dim_list=NULL, *private_bound_list=NULL; + int numb=0, numb_r=0, numb_b=0, numb_p_dim=0, numb_p_bound=0; SgStatement *st_register; copy_uses_list = uses_list ? &(uses_list->copy()) : NULL; //!!! @@ -2944,9 +2952,16 @@ void doStatementsToPerformByHandler(int ilh, SgSymbol *adapter_symb, SgSymbol * arg_list = AddListToList(arg_list, ArrayArgumentList()); copy_arg_list = arg_list ? &(arg_list->copy()) : NULL; red_dim_list = DimSizeListOfReductionArrays(); - red_bound_list = BoundListOfReductionArrays(); - numb_b = ListElemNumber(red_bound_list); numb_r = ListElemNumber(red_dim_list); + red_bound_list = BoundListOfReductionArrays(); // !!! to change + numb_b = ListElemNumber(red_bound_list); + private_bound_list = BoundListOfPrivateArrays(); + numb_p_bound = ListElemNumber(private_bound_list); + if (options.isOn(C_CUDA)) + { + private_dim_list = DimSizeListOfPrivateArrays(); + numb_p_dim = ListElemNumber(private_dim_list); + } numb = ListElemNumber(arg_list) + ListElemNumber(uses_list); // register CUDA-handler @@ -2955,13 +2970,14 @@ void doStatementsToPerformByHandler(int ilh, SgSymbol *adapter_symb, SgSymbol * arg_list = AddListToList(arg_list, copy_uses_list); arg_list = AddListToList(arg_list, red_dim_list); + arg_list = AddListToList(arg_list, private_dim_list); if(interface == 1) { - InsertNewStatementAfter(RegisterHandler_H(ilh, DeviceTypeConst(CUDA), ConstRef(0), adapter_symb->next(), 0, numb + numb_r), cur_st, cur_st->controlParent()); /* OpenMP */ - AddListToList(cur_st->expr(0), arg_list); + InsertNewStatementAfter(RegisterHandler_H(ilh, DeviceTypeConst(CUDA), ConstRef(0), adapter_symb->next(), 0, numb + numb_r + numb_p_dim), cur_st, cur_st->controlParent()); /* OpenMP */ + AddListToList(cur_st->expr(0), arg_list); } else { - SgExpression *efun = HandlerFunc(adapter_symb->next(), numb + numb_r, arg_list); + SgExpression *efun = HandlerFunc(adapter_symb->next(), numb + numb_r + numb_p_dim, arg_list); InsertNewStatementAfter(RegisterHandler_H2(ilh, DeviceTypeConst(CUDA), ConstRef(0), efun), cur_st, cur_st->controlParent()); /* OpenMP */ } } @@ -2974,15 +2990,15 @@ void doStatementsToPerformByHandler(int ilh, SgSymbol *adapter_symb, SgSymbol * copy_uses_list = uses_list ? &(uses_list->copy()) : NULL; copy_arg_list = AddListToList(copy_arg_list, copy_uses_list); copy_arg_list = AddListToList(copy_arg_list, red_bound_list); - + copy_arg_list = AddListToList(copy_arg_list, private_bound_list); if(interface == 1) { - InsertNewStatementAfter(RegisterHandler_H(ilh, DeviceTypeConst(HOST), DVM000(iht), hostproc_symb, 0, numb+numb_b), cur_st, cur_st->controlParent()); /* OpenMP */ + InsertNewStatementAfter(RegisterHandler_H(ilh, DeviceTypeConst(HOST), DVM000(iht), hostproc_symb, 0, numb+numb_b+numb_p_bound), cur_st, cur_st->controlParent()); /* OpenMP */ AddListToList(cur_st->expr(0), copy_arg_list); } else { - SgExpression *efun = HandlerFunc(hostproc_symb, numb+numb_b, copy_arg_list); + SgExpression *efun = HandlerFunc(hostproc_symb, numb+numb_b+numb_p_bound, copy_arg_list); InsertNewStatementAfter(RegisterHandler_H2(ilh, DeviceTypeConst(HOST), DVM000(iht), efun), cur_st, cur_st->controlParent()); /* OpenMP */ } cur_st->addComment(OpenMpComment_HandlerType(iht)); @@ -3015,7 +3031,7 @@ SgExpression *DimSizeListOfReductionArrays() //arg = SizeFunction(rsl->redvar,idim); Error("Assumed-size array: %s", rsl->redvar->identifier(), 162, dvm_parallel_dir); else - arg = SizeFunctionWithKind(rsl->redvar, idim, len_DvmType); + arg = DvmType_Ref(SizeFunctionWithKind(rsl->redvar, idim, len_DvmType)); ell = new SgExprListExp(*arg); ell->setRhs(el); el = ell; @@ -3036,6 +3052,27 @@ SgExpression *DimSizeListOfReductionArrays() return(arg_list); } +SgExpression *DimSizeListOfPrivateArrays() +{ + int i; + SgExpression *pl, *arg_list=NULL; + SgSymbol *s; + if (!private_list) + return(NULL); + for (pl = private_list; pl; pl = pl->rhs()) + { + s = pl->lhs()->symbol(); + if (isSgArrayType(s->type()) && !TestArrayShape(s)) + { + for (i=0; icopy()) ); - + sl = AddListToList( sl, new SgExprListExp(*LBOUNDFunction(ar,i+1)) ); + if(!isConstantBound(ar,i,0)) - sl = AddListToList( sl, new SgExprListExp(UpperBound(ar,i)->copy()) ); + sl = AddListToList( sl, new SgExprListExp(*UBOUNDFunction(ar,i+1)) ); } return(sl); } @@ -3079,6 +3116,19 @@ SgExpression * BoundListOfReductionArrays() return bound_list; } +SgExpression * BoundListOfPrivateArrays() +{ + SgExpression *pl, *bound_list=NULL; + SgSymbol *s; + for (pl = private_list; pl; pl = pl->rhs()) + { + s = pl->lhs()->symbol(); + if (isSgArrayType(s->type())) + bound_list = AddListToList(bound_list, CreateBoundListOfArray(s)); + } + return bound_list; +} + void ReplaceCaseStatement(SgStatement *first) { SgStatement *stmt, *last_st; @@ -5676,6 +5726,15 @@ SgStatement *Create_Host_Across_Loop_Subroutine(SgSymbol *sHostProc) tail = red_bound_list; } + // add dummy arguments for private arrays + if(private_list) + { + SgExpression * private_dummy_list; + AddListToList(arg_list, private_dummy_list = DummyListForPrivateArrays(st_hedr)); + if(!tail) + tail = private_dummy_list; + } + // create get_dependency_mask function declaration stmt = fdvm[GET_DEP_MASK_F]->makeVarDeclStmt(); stmt->expr(1)->setType(tdvm); @@ -5733,8 +5792,8 @@ SgStatement *Create_Host_Across_Loop_Subroutine(SgSymbol *sHostProc) SgVarRefExp *which_run_expr = new SgVarRefExp(which_run); stmt = new SgAssignStmt(*which_run_expr, *fen); st_end->insertStmtBefore(*stmt, *st_hedr); - //stmt = PrintStat(which_run_expr); - //st_end->insertStmtBefore(*stmt, *st_hedr); + //stmt = PrintStat(which_run_expr); + //st_end->insertStmtBefore(*stmt, *st_hedr); // create argument list of handler's call SgExpression *new_arg_list = &st_hedr->expr(0)->copy(); @@ -5868,6 +5927,7 @@ SgStatement *Create_Host_Loop_Subroutine_Main (SgSymbol *sHostProc) if (!tail) tail = copy_uses_list; } + // add dummy arguments for reductions if(red_list) { SgExpression * red_bound_list; @@ -5876,6 +5936,15 @@ SgStatement *Create_Host_Loop_Subroutine_Main (SgSymbol *sHostProc) tail = red_bound_list; } + // add dummy arguments for private arrays + if(private_list) + { + SgExpression * private_dummy_list; + AddListToList(arg_list, private_dummy_list = DummyListForPrivateArrays(st_hedr)); + if(!tail) + tail = private_dummy_list; + } + // create external statement stmt = new SgStatement(EXTERN_STAT); el = new SgExprListExp(*new SgVarRefExp(fdvm[GET_REMOTE_BUF])); @@ -6006,6 +6075,7 @@ SgStatement *Create_Host_Loop_Subroutine(SgSymbol *sHostProc, int dependency) if (!tail) tail = copy_uses_list; } + // add dummy arguments for reductions if(red_list) { SgExpression * red_bound_list; @@ -6013,6 +6083,14 @@ SgStatement *Create_Host_Loop_Subroutine(SgSymbol *sHostProc, int dependency) if(!tail) tail = red_bound_list; } + // add dummy arguments for private arrays + if(private_list) + { + SgExpression * private_dummy_list; + AddListToList(arg_list, private_dummy_list = DummyListForPrivateArrays(st_hedr)); + if(!tail) + tail = private_dummy_list; + } // create external statement stmt = new SgStatement(EXTERN_STAT); @@ -6177,9 +6255,8 @@ SgStatement *Create_Host_Loop_Subroutine(SgSymbol *sHostProc, int dependency) for (el = private_list; el; el = el->rhs()) { SgSymbol *sp = el->lhs()->symbol(); - //if(HEADER(sp)) // dvm-array is declared as dummy argument - // continue; - DeclareSymbolInHostHandler(sp, st_hedr, NULL); + SgSymbol *sph = isSgArrayType(sp->type()) ? *(SgSymbol **)(el->lhs()->attributeValue(0, PRIVATE_ARRAY)) : sp; + DeclareSymbolInHostHandler(sp, st_hedr, sph); } // SgExprListExp *indexes = NULL; /* OpenMP */ @@ -6918,6 +6995,22 @@ int ExplicitShape(SgExpression *eShape) return 1; } +int TestArrayShape(SgSymbol *ar) +{ + int i; + SgExpression *esize = NULL; + for(i=1; i<=Rank(ar); i++) + { + //calculating size of i-th dimension + esize = ReplaceParameter(ArrayDimSize(ar, i)); + //if(err && esize && esize->variant()==STAR_RANGE) + // return 0; //Error("Assumed-size array: %s",ar->identifier(),162,stmt); + if(!esize || !esize->isInteger()) + return 0; + } + return 1; +} + SgSymbol *ArraySymbolInHostHandler(SgSymbol *ar, SgStatement *scope) { SgSymbol *soff; @@ -6926,7 +7019,7 @@ SgSymbol *ArraySymbolInHostHandler(SgSymbol *ar, SgStatement *scope) rank = Rank(ar); soff = ArraySymbol(ar->identifier(), ar->type()->baseType(), NULL, scope); - if(!ExplicitShape(isSgArrayType(ar->type())->getDimList())) + if (!options.isOn(C_CUDA) && !ExplicitShape(isSgArrayType(ar->type())->getDimList())) Error("Illegal array bound of private array %s", ar->identifier(), 442, dvm_parallel_dir); for (i = 0; i < rank; i++) @@ -7363,6 +7456,26 @@ SgExpression * DummyListForReductionArrays(SgStatement *st_hedr) return dummy_list; } +SgExpression * DummyListForPrivateArrays(SgStatement *st_hedr) +{ + SgExpression *dummy_list = NULL, *pl; + SgSymbol *s; + for (pl=private_list; pl;pl=pl->rhs()) + { + s = pl->lhs()->symbol(); + if (isSgArrayType(s->type())) + { + SgType *tp = s->type()->baseType(); + SgSymbol *new_ar = ArraySymbol(s->identifier(), tp, NULL, st_hedr); + dummy_list = AddListToList(dummy_list, CreateDummyBoundListOfArray(s, new_ar, st_hedr)); + SgSymbol **satr = new (SgSymbol *); + *satr = new_ar; + pl->lhs()->addAttribute(PRIVATE_ARRAY, (void *)satr, sizeof(SgSymbol *) ); + } + } + return dummy_list; +} + /***************************************************************************************/ /*ACC*/ /* Creating and Inserting New Statement in the Program */ @@ -8253,7 +8366,8 @@ SgExpression *CreateKernelDummyList(SgSymbol *s_red_count_k, std::vector ] - + if (private_list) + arg_list = AddListToList(arg_list, CreatePrivateDummyList()); //[+ dummys for private arrays ] for (size_t i = 0; i < lowI.size(); ++i) { ae = new SgExprListExp(*new SgVarRefExp(lowI[i])); @@ -9058,6 +9172,8 @@ SgExpression *CreateKernelDummyList(SgSymbol *s_red_count_k, SgType *idxTypeInKe } if (uses_list) arg_list = AddListToList(arg_list, CreateUsesDummyList()); //[+ ] + if (private_list) + arg_list = AddListToList(arg_list, CreatePrivateDummyList()); //[+ dummys for private arrays ] return arg_list; } @@ -9219,6 +9335,40 @@ SgExpression *CreateUsesDummyList() return(arg_list); } +SgExpression *CreatePrivateDummyList() +{ + SgSymbol *s_dummy, *s; + SgExpression *el, *ae; + SgExpression *arg_list = NULL; + if (!options.isOn(C_CUDA) || !sizeOfPrivateArraysInBytes()) + return NULL; + for (el = private_list; el; el = el->rhs()) + { + s = el->lhs()->symbol(); + if (!IS_ARRAY(s)) + continue; + s_dummy = ArraySymbol(PointerNameForPrivateArray(s), C_Type(s->type()->baseType()), NULL, kernel_st); + ae = new SgArrayRefExp(*s_dummy, *new SgExprListExp()); + ae->setType(s_dummy->type()); + arg_list = AddListToList(arg_list, new SgExprListExp(*ae)); + SgSymbol **satr = new (SgSymbol *); + *satr = s_dummy; + el->lhs()->addAttribute(PRIVATE_POINTER, (void *)satr, sizeof(SgSymbol *) ); + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela=ela->rhs()) + arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); //AddListToList(arg_list, &(ela->copy())); + + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela=ela->rhs()) + arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); //AddListToList(arg_list, &(ela->copy())); + } + } + + return(arg_list); +} SgExpression *CreateRedDummyList() { @@ -9642,23 +9792,85 @@ void DeclareInternalPrivateVars() void DeclarePrivateVars() { - SgStatement *st = NULL; - SgExpression *var = NULL; - // declare private(local in kernel) variables + SgStatement *st = NULL, *st_first=NULL; + SgExpression *var = NULL, *e; + SgSymbol *s; + SgExpression *e_all_private_size = sizeOfPrivateArraysInBytes(); + // declare private variables for (var = private_list; var; var = var->rhs()) { - if (isParDoIndexVar(var->lhs()->symbol())) continue; // declared as index variable of parallel loop - //if (HEADER(var->lhs()->symbol())) continue; // dvm-array declared as dummy argument - st = Declaration_Statement(SymbolInKernel(var->lhs()->symbol())); - kernel_st->insertStmtAfter(*st); - } - if (!st) - return; + s = var->lhs()->symbol(); + if (isParDoIndexVar(s)) continue; // declared as index variable of parallel loop + //if (HEADER(var->lhs()->symbol())) continue; // dvm-array declared as dummy argument + if (!options.isOn(C_CUDA) || !IS_ARRAY(s) || !e_all_private_size ) + { + st = Declaration_Statement(SymbolInKernel(s)); + kernel_st->insertStmtAfter(*st); + st_first = st; + } + else + { + SgSymbol *s_dims=NULL; + st = new SgStatement(PRIVATE_AR_DECL); + kernel_st->insertStmtAfter(*st); + st_first = st; + e = new SgExpression(TYPE_OP); + e->setType(C_Type(s->type()->baseType())); + st->setExpression(0, e); + + e = new SgValueExp(Rank(s)); + st->setExpression(1, e); + if (Rank(s)>1) + { + char *name = new char[strlen(s->identifier())+7]; + sprintf(name, "_%s_dims", s->identifier()); + s_dims = ArraySymbol(name, C_UnsignedLongLongType(), new SgValueExp(Rank(s)-1), kernel_st); + SgExpression *einit = new SgExpression(INIT_LIST); + SgExpression *elist = NULL; + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) var->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela->rhs(); ela = ela->rhs()) + { + SgExpression *ed = new SgVarRefExp(ela->lhs()->lhs()->symbol()); + elist = AddListToList(new SgExprListExp(*ed), elist); + } + } + else + { + for (int i=Rank(s)-1; i; i--) + elist = AddListToList(elist, Calculate(ArrayDimSize(s,i))); + } + einit->setLhs(elist); + SgStatement *st_dims = makeSymbolDeclarationWithInit(s_dims, einit);//Declaration_Statement(s_dims); + kernel_st->insertStmtAfter(*st_dims); + st_first = st_dims; + } + SgSymbol *s_new = & s->copy(); + SYMB_SCOPE(s_new->thesymb) = kernel_st->thebif; + SgFunctionCallExp *efc = new SgFunctionCallExp(*s_new); + if (s_dims) + { + efc->addArg(*new SgVarRefExp(s_dims)); + } + SgSymbol **satr = (SgSymbol **) var->lhs()->attributeValue(0, PRIVATE_POINTER); + if (satr) + { + SgSymbol *sp = *satr; + efc->addArg(*new SgVarRefExp(sp)); //e->setLhs(new SgExprListExp(*new SgVarRefExp(sp))); + } + st->setExpression(2, efc); + } + } + if (!st_first) + return; + if (options.isOn(C_CUDA)) - st->addComment("// Private variables"); + st_first->addComment("// Private variables"); else - st->addComment("! Private variables\n"); + st_first->addComment("! Private variables\n"); } void DeclareUsedVars() @@ -11332,14 +11544,39 @@ SgExpression *BlockDimsProduct() return &(*new SgRecordRefExp(*s_blockdim, "x") * *new SgRecordRefExp(*s_blockdim, "y") * *new SgRecordRefExp(*s_blockdim, "z")); } +reduction_operation_list *ElementOfReductionStruct(SgSymbol *ar) +{ + reduction_operation_list *rl; + for (rl=red_struct_list; rl; rl=rl->next) + if (!strcmp(rl->redvar->identifier(), ar->identifier())) + return rl; + return red_struct_list; +} + +SgExpression *ElementOfPrivateList(SgSymbol *ar) +{ + SgExpression *el; + for (el=private_list; el; el=el->rhs()) + if (!strcmp(el->lhs()->symbol()->identifier(), ar->identifier())) + return el->lhs(); + return private_list->lhs(); +} + SgExpression *LowerShiftForArrays (SgSymbol *ar, int i, int type) { SgExpression *e = isConstantBound(ar, i, 1); - if(e) return e; - if(type==0) //private array - e = new SgValueExp(1); + if (e) return e; + if (type==0) //private array + { + SgExpression **eatr = (SgExpression **)ElementOfPrivateList(ar)->attributeValue(0, L_BOUNDS); + SgExprListExp *ebounds = (SgExprListExp *)*eatr; + e = new SgVarRefExp(ebounds->elem(i)->lhs()->symbol()); + } else // reduction array - e = &(((SgExprListExp *)red_struct_list->lowBound_arg)->elem(i)->copy()); + { + SgExprListExp *el = ((SgExprListExp *) ElementOfReductionStruct(ar)->lowBound_arg); + e = &( el->elem(i)->copy() ); + } return e; } @@ -11636,6 +11873,11 @@ SgType * C_LongLongType() return(new SgDescriptType(*new SgType(T_LONG), BIT_LONG)); } +SgType * C_UnsignedLongLongType() +{ + return( new SgDescriptType(*new SgType(T_LONG), BIT_UNSIGNED | BIT_LONG)); //TYPE_LONG_SHORT(type->thetype) = BIT_UNSIGNED & BIT_LONG; +} + SgType * C_DvmType() { if (!type_DvmType) @@ -13013,16 +13255,16 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) symb_list *sl; SgStatement *st_hedr, *st_end, *stmt, *do_while, *first_exec, *st_base = NULL, *st_call, *cur; SgExpression *fe, *ae, *arg_list, *el, *e, *er; - SgExpression *espec; + SgExpression *espec, *e_all_private_size = NULL; SgFunctionCallExp *fcall; //SgStatement *fileHeaderSt; - SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *red_first, *uses_first, *scalar_first; + SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *red_first, *uses_first, *scalar_first, *private_first; SgSymbol *s_stream = NULL, *s_blocks = NULL, *s_threads = NULL, *s_blocks_info = NULL, *s_red_count = NULL, *s_tmp_var = NULL; SgSymbol *s_dev_num = NULL, *s_shared_mem = NULL, *s_regs = NULL, *s_blocksS = NULL, *s_idxL = NULL, *s_idxH = NULL, *s_step = NULL, *s_idxTypeInKernel = NULL; SgSymbol *s_num_of_red_blocks = NULL, *s_fill_flag = NULL, *s_red_num = NULL, *s_restBlocks = NULL, *s_addBlocks = NULL, *s_overallBlocks = NULL; SgSymbol *s_max_blocks; SgType *typ = NULL; - int ln, num, i, uses_num, shared_mem_count, has_red_array, use_device_num, nbuf; + int ln, num, i, uses_num, shared_mem_count, has_red_array, use_device_num, nbuf, lnp; char *define_name; int pl_rank = ParLoopRank(); h_first = hgpu_first = base_first = red_first = uses_first = scalar_first = NULL; @@ -13038,7 +13280,7 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) first_exec = st_end; // create dummy argument list: - // loop_ref,,, + // loop_ref,,,, typ = C_PointerType(C_Derived_Type(s_DvmhLoopRef)); s_loop_ref = new SgSymbol(VARIABLE_NAME, "loop_ref", *typ, *st_hedr); @@ -13084,7 +13326,7 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) } uses_num = ln; - if (red_list) + if (red_list) // reduction array shapes { reduction_operation_list *rsl; //create dimmesion size list for reduction arrays int idim; @@ -13111,11 +13353,6 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) ae = new SgVarRefExp(sarg); ae->setType(t); el = AddElementToList(el, new SgPointerDerefExp(*ae)); - /* - ell = new SgExprListExp(*new SgPointerDerefExp(*ae)); - ell->setRhs(el); - el = ell; - */ } rsl->dimSize_arg = el; /*arg_list->setRhs(el->copy());*/ @@ -13128,6 +13365,49 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) } } + if (options.isOn(C_CUDA)) // private array shapes + { + int idim; + SgExpression *elp; + SgType *t = C_PointerType(C_DvmType()); + + for (elp=private_list; elp; elp = elp->rhs()) + { + s = elp->lhs()->symbol(); + if (IS_ARRAY(s) && !TestArrayShape(s)) + { + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, DimSizeName(s, idim), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + SgExpression **edim = new (SgExpression *); + *edim = el; + elp->lhs()->addAttribute(DIM_SIZES, (void *)edim, sizeof(SgExpression *) ); + arg_list = AddListToList(arg_list, &el->copy()); + + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, BoundName(s, idim, 1), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + SgExpression **elb = new (SgExpression *); + *elb = el; + elp->lhs()->addAttribute(L_BOUNDS, (void *)elb, sizeof(SgExpression *) ); + arg_list = AddListToList(arg_list, &el->copy()); + + while (arg_list->rhs() != 0) + arg_list = arg_list->rhs(); + } + + } + } // create variable's declarations: ,,,,blocks_info [ or blocksS,idxL,idxH ],stream,blocks,threads if (red_list) { @@ -13212,7 +13492,7 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) btype = loc_type->baseType(); else btype = loc_type; - //!printf("__112\n"); + SgArrayType *typearray = new SgArrayType(*C_Type(btype)); typearray->addRange(*new SgValueExp(loc_el_num)); s_loc_var->setType(*typearray); @@ -13251,7 +13531,7 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) } if (!options.isOn(NO_BL_INFO)) { - s_blocks_info = s = new SgSymbol(VARIABLE_NAME, "blocks_info", *C_PointerType(C_VoidType()), *st_hedr); + s_blocks_info = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("blocks_info"), *C_PointerType(C_VoidType()), *st_hedr); stmt = makeSymbolDeclaration(s); st_hedr->insertStmtAfter(*stmt, *st_hedr); } @@ -13260,13 +13540,13 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) s_blocksS = s = ArraySymbol(TestAndCorrectName("blocksS"), C_DvmType(), new SgValueExp(pl_rank), st_hedr); stmt = makeSymbolDeclaration(s); st_hedr->insertStmtAfter(*stmt, *st_hedr); - s_restBlocks = s = new SgSymbol(VARIABLE_NAME, "restBlocks", *C_Derived_Type(s_cudaStream), *st_hedr); + s_restBlocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("restBlocks"), *C_Derived_Type(s_cudaStream), *st_hedr); addDeclExpList(s, stmt->expr(0)); - s_max_blocks = s = new SgSymbol(VARIABLE_NAME, "maxBlocks", *C_DvmType(), *st_hedr); - addDeclExpList(s, stmt->expr(0)); - s_addBlocks = s = new SgSymbol(VARIABLE_NAME, "addBlocks", *C_Derived_Type(s_cudaStream), *st_hedr); + s_max_blocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("maxBlocks"), *C_DvmType(), *st_hedr); + addDeclExpList(s, stmt->expr(0)); + s_addBlocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("addBlocks"), *C_Derived_Type(s_cudaStream), *st_hedr); addDeclExpList(s, stmt->expr(0)); - s_overallBlocks = s = new SgSymbol(VARIABLE_NAME, "overallBlocks", *C_Derived_Type(s_cudaStream), *st_hedr); + s_overallBlocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("overallBlocks"), *C_Derived_Type(s_cudaStream), *st_hedr); addDeclExpList(s, stmt->expr(0)); s_idxL = s = ArraySymbol(TestAndCorrectName("idxL"), C_DvmType(), new SgValueExp(pl_rank), st_hedr); stmt = makeSymbolDeclaration(s); @@ -13277,15 +13557,15 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) addDeclExpList(s, stmt->expr(0)); } - s_stream = s = new SgSymbol(VARIABLE_NAME, "stream", *C_Derived_Type(s_cudaStream), *st_hedr); + s_stream = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("stream"), *C_Derived_Type(s_cudaStream), *st_hedr); stmt = makeSymbolDeclaration(s); st_hedr->insertStmtAfter(*stmt, *st_hedr); - s_blocks = s = new SgSymbol(VARIABLE_NAME, "blocks", *t_dim3, *st_hedr); + s_blocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("blocks"), *t_dim3, *st_hedr); stmt = makeSymbolDeclaration(s); st_hedr->insertStmtAfter(*stmt, *st_hedr); - s_threads = s = new SgSymbol(VARIABLE_NAME, "threads", *t_dim3, *st_hedr); + s_threads = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("threads"), *t_dim3, *st_hedr); addDeclExpList(s, stmt->expr(0)); s_idxTypeInKernel = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxTypeInKernel"), *C_DvmType(), *st_hedr); @@ -13615,6 +13895,34 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) sdev = sdev->next(); } + e_all_private_size = sizeOfPrivateArraysInBytes(); + if (options.isOn(C_CUDA) && e_all_private_size) + { + for (el=private_list, lnp=0; el; el=el->rhs()) + { + s = el->lhs()->symbol(); + if (IS_ARRAY(s)) + { + sarg = new SgSymbol(VARIABLE_NAME, PointerNameForPrivateArray(s), *C_PointerType(C_VoidType()), *st_hedr); + ae = new SgCastExp(*C_PointerType( C_Type(s->type()->baseType())), *new SgVarRefExp(sarg)); + fcall->addArg(*ae); + if (!lnp) + private_first = sarg; + lnp++; + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela = ela->rhs()) + fcall->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela = ela->rhs()) + fcall->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + } + + } + } + } if (!options.isOn(NO_BL_INFO)) { @@ -13684,13 +13992,38 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) st_call = createKernelCallsInCudaHandler(fcall, s_loop_ref, s_idxTypeInKernel, s_blocks); - SgFunctionCallExp *getProp = new SgFunctionCallExp(*new SgSymbol(FUNCTION_NAME, "loop_cuda_get_device_prop")); - getProp->addArg(*new SgVarRefExp(s_loop_ref)); - getProp->addArg(*new SgKeywordValExp("CUDA_MAX_GRID_X")); + SgExpression *getProp = GetDeviceProp(s_loop_ref, new SgKeywordValExp("CUDA_MAX_GRID_X")); + stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_max_blocks), *getProp)); + st_end->insertStmtBefore(*stmt, *st_hedr); - stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_max_blocks), *getProp)); - st_end->insertStmtBefore(*stmt, *st_hedr); + // insert code for big private arrays + if (options.isOn(C_CUDA) && e_all_private_size) //(e_size = sizeOfPrivateArraysInBytes())) + { + SgSymbol *s_private_size = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("privateSizeForBlock"), *C_DvmType(), *st_hedr); + stmt = makeSymbolDeclaration(s_private_size); + st_end->insertStmtBefore(*stmt, *st_hedr); + SgSymbol *s_total_threads = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("totalThreads"), *C_DvmType(), *st_hedr); + addDeclExpList(s_total_threads, stmt->expr(0)); + + SgExpression *e_threads = &(*new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z")); + SgExpression *e_private_size_for_block = &(*e_threads * *e_all_private_size); + stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_private_size), *e_private_size_for_block)); + st_end->insertStmtBefore(*stmt, *st_hedr); + + SgExpression *e_maxBlocks = GetMaxBlocks(s_loop_ref, s_max_blocks, s_private_size); + stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_max_blocks), *e_maxBlocks)); + st_end->insertStmtBefore(*stmt, *st_hedr); + + SgFunctionCallExp *fmin = new SgFunctionCallExp(*new SgSymbol(FUNCTION_NAME, "min", *C_DvmType(), *st_hedr)); + fmin->addArg(*new SgVarRefExp(s_max_blocks)); + fmin->addArg(*new SgVarRefExp(s_restBlocks)); + SgExpression *e_total_threads = &((e_threads->copy()) * *fmin); + stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_total_threads), *e_total_threads)); + st_end->insertStmtBefore(*stmt, *st_hedr); + // Get private arrays + GetMemoryForPrivateArrays(private_first, s_loop_ref, lnp, st_end, st_hedr, new SgVarRefExp(s_total_threads)); + } if (currentLoop && currentLoop->irregularAnalysisIsOn()) { stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s_max_blocks), *new SgVarRefExp(*s_max_blocks) / *new SgValueExp(warpSize) * *new SgValueExp(warpSize))); @@ -13701,7 +14034,7 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) do_while = new SgWhileStmt(operator > (*new SgVarRefExp(s_restBlocks), *new SgValueExp(0)), *st_call); st_end->insertStmtBefore(*do_while, *st_hedr); do_while->addComment("// GPU execution"); - stmt = IfForHeader(s_restBlocks, s_blocks, s_max_blocks); + stmt = IfForHeader(s_restBlocks, s_blocks, s_max_blocks); st_call->insertStmtBefore(*stmt, *do_while); stmt = new SgCExpStmt(*new SgExpression(MINUS_ASSGN_OP, new SgVarRefExp(*s_restBlocks), new SgRecordRefExp(*s_blocks, "x"), NULL)); st_call->insertStmtAfter(*stmt, *do_while); @@ -13710,6 +14043,14 @@ SgStatement *Create_C_Adapter_Function(SgSymbol *sadapter) /* ------ block for finish reductions ----*/ if (red_list) InsertFinishReductionCalls(st_end, s_loop_ref, s_red_num); + + // to dispose private arrays + if (options.isOn(C_CUDA) && e_all_private_size) + for (s = private_first, ln = 0; ln < lnp; s = s->next(), ln++) // private arrays + { + stmt = new SgCExpStmt(*DisposePrivateArray(s_loop_ref, s)); + st_end->insertStmtBefore(*stmt, *st_hedr); + } } if (options.isOn(C_CUDA)) @@ -13986,7 +14327,7 @@ SgStatement *Create_C_Adapter_Function_For_Sequence(SgSymbol *sadapter, SgStatem sdev = sdev->next(); } - // insetr kernel call + // inset kernel call stmt = createKernelCallsInCudaHandler(fcall, s_loop_ref, s_idxTypeInKernel, s_blocks); /* ------- WHILE (loop_cuda_do(DvmhLoopRef *InDvmhLoop, dim3 *OutBlocks, dim3 *OutThreads, cudaStream_t *OutStream, CudaIndexType **InOutBlocks) != 0) ----*/ @@ -13998,6 +14339,99 @@ SgStatement *Create_C_Adapter_Function_For_Sequence(SgSymbol *sadapter, SgStatem return(st_hedr); } +void GetMemoryForPrivateArrays(SgSymbol *private_first, SgSymbol *s_loop_ref, int nump, SgStatement *st_end, SgStatement *st_hedr, SgExpression *e_totalThreads) +{ + SgSymbol *s; + SgExpression *el; + SgStatement *stmt; + int ln; + if (!private_first) + return; + SgStatement *st_decl = makeSymbolDeclaration(private_first); + st_end->insertStmtBefore(*st_decl, *st_hedr); + st_decl->addComment("// Get private arrays"); + + for (s = private_first, el = private_list, ln = 0; ln < nump; s = s->next(), el = el->rhs(), ln++) // private arrays + { + while (!IS_ARRAY(el->lhs()->symbol())) + el = el->rhs(); + if (ln) + addDeclExpList(s, st_decl->expr(0)); + SgExpression **esizes = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *elength = esizes ? &( *ProductOfDimSizeArgs(*esizes) * *sizeOfElementInBytes(el->lhs()->symbol())) : ArrayLength(el->lhs()->symbol(), dvm_parallel_dir, 0); + SgExpression *e_bytes = &(*elength * *e_totalThreads); + stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*s), *GetPrivateArray(s_loop_ref, e_bytes))); + st_end->insertStmtBefore(*stmt, *st_hedr); + } +} + +SgExpression *sizeOfElementInBytes(SgSymbol *symb) +{ + int isz = TypeSize(symb->type()->baseType()); + if (isz <= 0 ) + Error("Illegal type of private array %s, not implemented yet for GPU",symb->identifier(), 592, dvm_parallel_dir); + return (new SgValueExp(isz)); +} + +SgExpression *sizeOfPrivateArraysInBytes() +{ + SgExpression *el, *e_size = NULL; + int isize = 0; + //if (newVars.size() != 0) + //{ + // correctPrivateList(RESTORE); + // newVars.clear(); + //} + for (el = private_list; el; el = el->rhs()) + { + SgSymbol *symb = el->lhs()->symbol(); + if (IS_ARRAY(symb)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *esa; + if (eatr) + esa = &(*ProductOfDimSizeArgs(*eatr) * *sizeOfElementInBytes(symb)); + else + esa = &(*ArrayLengthInElems(symb, dvm_parallel_dir, 1) * *sizeOfElementInBytes(symb)); //ArrayLength(symb, dvm_parallel_dir, 1); + if (e_size) + e_size = &( *e_size + *esa ); + else + e_size = esa; + + // if (e_size) + // e_size = &( *e_size + *ArrayLengthInElems(symb, dvm_parallel_dir, 1) * *sizeOfElementInBytes(symb)); + // else + // e_size = &( *ArrayLengthInElems(symb, dvm_parallel_dir, 1) * *sizeOfElementInBytes(symb)); + } + } + if (e_size && e_size->isInteger()) // calculating length if it is possible + { + int i_size = e_size->valueInteger(); + e_size = new SgValueExp(i_size); + if (i_size > 512) + return e_size; + else + return NULL; + } + + return e_size; +} + +SgExpression *ProductOfDimSizeArgs(SgExpression *esizes) +{ + SgExpression *el, *eprod = NULL; + for (el=esizes; el; el=el->rhs()) + { + if (eprod) + eprod = &(*eprod * SgDerefOp(*new SgVarRefExp(el->lhs()->lhs()->symbol()))); + + else + eprod = &SgDerefOp(*new SgVarRefExp(el->lhs()->lhs()->symbol())); + } + return eprod; +} + + SgStatement *AssignBlocksSElement(int i, int pl_rank, SgSymbol *s_blocksS, SgSymbol *s_idxL, SgSymbol *s_idxH, SgSymbol *s_step, SgSymbol *s_threads) { SgExpression *e=NULL, *estep=NULL; diff --git a/dvm/fdvm/trunk/fdvm/acc_across.cpp b/dvm/fdvm/trunk/fdvm/acc_across.cpp index e30425f..7351ac2 100644 --- a/dvm/fdvm/trunk/fdvm/acc_across.cpp +++ b/dvm/fdvm/trunk/fdvm/acc_across.cpp @@ -633,7 +633,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) // clear information allRegNames.clear(); - SgStatement *st_hedr, *st_end, *first_exec, *stmt; + SgStatement *st_hedr=NULL, *st_end, *first_exec, *stmt; vector cuda_kernel; SgExpression *fe, *ae, *el, *arg_list; SgType *typ; @@ -698,8 +698,9 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) kernel_symbNew += "_long"; else if (rtTypes[t] == rt_LLONG) kernel_symbNew += "_llong"; - + cuda_kernel[t] = CreateLoopKernelAcross(new SgSymbol(FUNCTION_NAME, kernel_symbNew.c_str(), *C_VoidType(), *block_C), &retValueForKernel[t], indexTypeInKernel(rtTypes[t])); + if (options.isOn(RTC)) { acc_call_list = ACC_RTC_ExpandCallList(acc_call_list); @@ -723,7 +724,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) // if only type ~ 1 across symb bool ifOne = true; for (size_t i = 0; i < allVariants.size(); ++i) - { + { if (allVariants[i].acrossV != 1) ifOne = false; } @@ -734,7 +735,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) dontGenConvertXY = false; for (size_t i = 0; i < allVariants.size(); ++i) - { + { #if debugMode printf("%d case\n", allVariants[i].type); #endif @@ -758,7 +759,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) kernel_symb += "_long"; else if (rtTypes[k] == rt_LLONG) kernel_symb += "_llong"; - + if (tmp.acrossV == 1 && tmp.type == 1) { if (k == 0) // create CUDA handler once @@ -788,7 +789,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) } } } - + if (newVars.size() != 0) { correctPrivateList(RESTORE); @@ -839,7 +840,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) first_exec = st_end; mywarn("start: create dummy argument list "); - // create dummy argument list: loop_ref, , + // create dummy argument list: loop_ref, , , typ = C_PointerType(C_Derived_Type(s_DvmhLoopRef)); s_loop_ref = new SgSymbol(VARIABLE_NAME, "loop_ref", *typ, *st_hedr); argsForVariantFunction.push_back(s_loop_ref); @@ -864,7 +865,7 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) arg_list->setRhs(*new SgExprListExp(*ae)); arg_list = arg_list->rhs(); } - + for (el = uses_list; el; el = el->rhs()) // { s = el->lhs()->symbol(); @@ -879,6 +880,46 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) arg_list->setRhs(*new SgExprListExp(*ae)); arg_list = arg_list->rhs(); } + + if (options.isOn(C_CUDA)) // + { + int idim; + SgExpression *elp; + SgType *t = C_PointerType(C_DvmType()); + + for (elp=private_list; elp; elp = elp->rhs()) + { + s = elp->lhs()->symbol(); + if (IS_ARRAY(s) && !TestArrayShape(s)) + { + el = NULL; + for (idim = 1; idim<=Rank(s); idim++) + { + sarg = new SgSymbol(VARIABLE_NAME, DimSizeName(s, idim), *t, *st_hedr); + argsForVariantFunction.push_back(sarg); + ae = new SgVarRefExp(sarg); + ae->setType(t); + ae = new SgPointerDerefExp(*ae); + arg_list->setRhs(*new SgExprListExp(*ae)); + arg_list = arg_list->rhs(); + + } + el = NULL; + for (idim = 1; idim<=Rank(s); idim++) + { + sarg = new SgSymbol(VARIABLE_NAME, BoundName(s, idim, 1), *t, *st_hedr); + argsForVariantFunction.push_back(sarg); + ae = new SgVarRefExp(sarg); + ae->setType(t); + ae = new SgPointerDerefExp(*ae); + arg_list->setRhs(*new SgExprListExp(*ae)); + arg_list = arg_list->rhs(); + } + } + + } + } + mywarn(" end: create dummy argument list "); mywarn("start: create IF BLOCK "); @@ -1112,9 +1153,10 @@ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter) mywarn(" end: create IF BLOCK "); } + if (options.isOn(C_CUDA)) RenamingCudaFunctionVariables(st_hedr, s_loop_ref, 0); //(st_hedr, current_symbol->next(), 0); - + return NULL; } @@ -1127,14 +1169,14 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap SgSymbol **reduction_ptr; SgSymbol *lowI, *highI, *idxI; symb_list *sl; - SgStatement *st_hedr, *st_end, *stmt, *first_exec; - SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *er; - SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first; + SgStatement *st_hedr, *st_end, *stmt, *first_exec, *stmt_save; + SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *er, *e_all_private_size = NULL; + SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first, *private_first=NULL; SgSymbol *s_blocks, *s_threads, *s_dev_num, *s_tmp_var, *idxTypeInKernel; SgType *typ; SgFunctionCallExp *funcCall; vector dvm_array_headers; - int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays = 0, nbuf = 0; + int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays = 0, nbuf = 0, lnp = 0; // init block reduction_ptr = NULL; @@ -1206,8 +1248,58 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap } uses_num = ln; - mywarn(" end: create dummy argument list "); + if (options.isOn(C_CUDA)) // + { + int idim; + SgExpression *elp; + SgType *t = C_PointerType(C_DvmType()); + for (elp=private_list; elp; elp = elp->rhs()) + { + s = elp->lhs()->symbol(); + if (IS_ARRAY(s) && !TestArrayShape(s)) + { + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, DimSizeName(s, idim), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + arg_list = AddListToList(arg_list, &el->copy()); + if (!elp->lhs()->attributeValue(0, DIM_SIZES)) + { + SgExpression **edim = new (SgExpression *); + *edim = el; + elp->lhs()->addAttribute(DIM_SIZES, (void *)edim, sizeof(SgExpression *) ); + } + + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, BoundName(s, idim, 1), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + arg_list = AddListToList(arg_list, &el->copy()); + if (!elp->lhs()->attributeValue(0, L_BOUNDS)) + { + SgExpression **elb = new (SgExpression *); + *elb = el; + elp->lhs()->addAttribute(L_BOUNDS, (void *)elb, sizeof(SgExpression *) ); + } + + while (arg_list->rhs() != 0) + arg_list = arg_list->rhs(); + } + + } + } + + mywarn(" end: create dummy argument list "); + // create variable's declarations: ,,,,,blocks_info [ or blocksS,idxL,idxH ],stream,blocks,threads if (red_list) // reduction section { mywarn("start: in reduction section "); @@ -1269,7 +1361,7 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap btype = loc_type->baseType(); else btype = loc_type; - //!printf("__112\n"); + SgArrayType *typearray = new SgArrayType(*C_Type(btype)); typearray->addRange(*new SgValueExp(loc_el_num)); s_loc_var->setType(*typearray); @@ -1282,7 +1374,6 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap st_hedr->insertStmtAfter(*stmt, *st_hedr); } - //!printf("__113\n"); /*--- executable statements: register reductions in RTS ---*/ e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1)); stmt = new SgCExpStmt(*e); @@ -1438,10 +1529,12 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap st_end->insertStmtBefore(*stmt, *st_hedr); stmt->addComment("// Get bounds"); mywarn(" end: create assigns"); - + stmt_save = stmt; + stmt = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_blocks, "x"), *new SgValueExp(1))); st_end->insertStmtBefore(*stmt, *st_hedr); stmt->addComment("// Start counting"); + SgStatement *st_where = stmt; stmt = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_threads, "x"), *new SgValueExp(1))); st_end->insertStmtBefore(*stmt, *st_hedr); @@ -1543,6 +1636,35 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap } } + e_all_private_size = sizeOfPrivateArraysInBytes(); + if (options.isOn(C_CUDA) && e_all_private_size) + { + for (el=private_list, lnp=0; el; el=el->rhs()) + { + s = el->lhs()->symbol(); + if (IS_ARRAY(s)) + { + sarg = new SgSymbol(VARIABLE_NAME, PointerNameForPrivateArray(s), *C_PointerType(C_VoidType()), *st_hedr); + ae = new SgCastExp(*C_PointerType( C_Type(s->type()->baseType())), *new SgVarRefExp(sarg)); + funcCall->addArg(*ae); + if (!lnp) + private_first = sarg; + lnp++; + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela = ela->rhs()) + funcCall->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela = ela->rhs()) + funcCall->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + } + + } + } + } + for (int i = 0; i < acrossV + loopV; ++i) { funcCall->addArg(*new SgArrayRefExp(*lowI, *new SgValueExp(i))); @@ -1557,7 +1679,7 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap if (red_list) { ln = 0; - for (er = red_list; er; er = er->rhs(), ++ln) + for (er = red_list, s = red_first; er; er = er->rhs(), ++ln, s=s->next()) { funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("cudaMemcpy")); funcCall->addArg(SgAddrOp(*new SgVarRefExp(&(er->lhs()->rhs()->symbol()->copy())))); @@ -1571,9 +1693,10 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap stmt = new SgCExpStmt(*e); st_end->insertStmtBefore(*stmt, *st_hedr); - stmt = new SgCExpStmt(*RedPost(s_loop_ref, s_tmp_var, &(er->lhs()->rhs()->symbol()->copy()), NULL)); // loop_red_post_ - st_end->insertStmtBefore(*stmt, *st_hedr); - } + stmt = new SgCExpStmt(*RedPost(s_loop_ref, s_tmp_var, s, NULL)); // loop_red_post_ + st_end->insertStmtBefore(*stmt, *st_hedr); + } + ln = 0; for (er = red_list; er; er = er->rhs(), ++ln) { @@ -1585,6 +1708,18 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap stmt->addComment("// Free temporary variables"); } } + // insert code for big private arrays + if (options.isOn(C_CUDA) && e_all_private_size) + { + GetMemoryForPrivateArrays(private_first, s_loop_ref, lnp, st_where, st_hedr, new SgValueExp(1)); + + // to dispose private arrays + for (s = private_first, ln = 0; ln < lnp; s = s->next(), ln++) // private arrays + { + stmt = new SgCExpStmt(*DisposePrivateArray(s_loop_ref, s)); + st_end->insertStmtBefore(*stmt, *st_hedr); + } + } // create args for kernel and return it vector argsKernel(countKernels); for (unsigned i = 0; i < countKernels; ++i) @@ -1594,6 +1729,7 @@ vector Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadap mywarn(" end Adapter Function"); if (options.isOn(C_CUDA)) RenamingCudaFunctionVariables(st_hedr, s_loop_ref, 0); + return argsKernel; } @@ -1653,6 +1789,27 @@ static inline void insertReductionArgs(SgSymbol **reduction_ptr, SgSymbol **redu } } +static void createPrivatePointers(SgSymbol* &private_first, int &lnp, SgStatement* st_hedr, SgExpression* &e_all_private_size) +{ + private_first = NULL; + if (options.isOn(C_CUDA) && (e_all_private_size=sizeOfPrivateArraysInBytes())) + { + SgExpression *el, *ae; + SgSymbol *sarg; + + for (el=private_list, lnp=0; el; el=el->rhs()) + { + SgSymbol *s = el->lhs()->symbol(); + if (IS_ARRAY(s)) + { + sarg = new SgSymbol(VARIABLE_NAME, PointerNameForPrivateArray(s), *C_PointerType(C_VoidType()), *st_hedr); + if (!lnp) + private_first = sarg; + lnp++; + } + } + } +} static void createArgsForKernelForTwoDeps(SgFunctionCallExp*& funcCallKernel, SgSymbol* kernel_symb, SgExpression* espec, SgSymbol*& sg, SgSymbol* hgpu_first, SgSymbol*& sb, SgSymbol* base_first, symb_list*& sl, int& ln, int num, SgExpression*& e, SgSymbol** reduction_ptr, @@ -1660,7 +1817,7 @@ static void createArgsForKernelForTwoDeps(SgFunctionCallExp*& funcCallKernel, Sg SgSymbol* diag, const int& loopV, SgSymbol** num_elems, const int& acrossV, SgSymbol* acrossBase[16], SgSymbol* loopBase[16], SgSymbol* idxI, const vector& loopAcrossSymb, const vector& loopSymb, SgSymbol*& s, SgSymbol* uses_first, SgSymbol*& sdev, SgSymbol* scalar_first, int uses_num, vector& dvm_array_headers, - SgSymbol** addressingParams, SgSymbol** outTypeOfTransformation, SgSymbol* type_of_run, SgSymbol* bIdxs) + SgSymbol** addressingParams, SgSymbol** outTypeOfTransformation, SgSymbol* type_of_run, SgSymbol* bIdxs, SgSymbol* private_first, int lnp) { funcCallKernel = CallKernel(kernel_symb, espec); @@ -1727,6 +1884,32 @@ static void createArgsForKernelForTwoDeps(SgFunctionCallExp*& funcCallKernel, Sg sdev = sdev->next(); } } + + if (options.isOn(C_CUDA) && private_first) // there are big private arrays + { + SgExpression *el, *ae; + SgSymbol *sarg, *sp, *s; + int ln; + for (sp = private_first, el = private_list, ln = 0; ln < lnp; sp = sp->next(), el = el->rhs(), ln++) + { + while (!IS_ARRAY(el->lhs()->symbol())) + el = el->rhs(); + s = el->lhs()->symbol(); + ae = new SgCastExp(*C_PointerType( C_Type(s->type()->baseType())), *new SgVarRefExp(sp)); + funcCallKernel->addArg(*ae); + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + } + + } + } if (options.isOn(AUTO_TFM)) { @@ -1767,14 +1950,14 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt symb_list *sl; SgStatement *st_hedr, *st_end, *stmt, *first_exec; - SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *ex, *er; - SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first; + SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *ex, *er, *e_all_private_size = NULL, *e_totalThreads; + SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first, *private_first; SgSymbol *s_blocks, *s_threads, *s_dev_num, *s_tmp_var, *type_of_run, *s_i = NULL, *s_k = NULL, *s_tmp_var_1; SgSymbol *idxTypeInKernel; SgType *typ; SgFunctionCallExp *funcCall, *funcCallKernel; vector dvm_array_headers; - int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays, nbuf = 0; + int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays, nbuf = 0, lnp; // init block lowI = highI = idxI = elem = red_blocks = shared_mem = stream_t = bIdxs = NULL; @@ -1850,6 +2033,56 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt } uses_num = ln; + if (options.isOn(C_CUDA)) // + { + int idim; + SgExpression *elp; + SgType *t = C_PointerType(C_DvmType()); + + for (elp=private_list; elp; elp = elp->rhs()) + { + s = elp->lhs()->symbol(); + if (IS_ARRAY(s) && !TestArrayShape(s)) + { + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, DimSizeName(s, idim), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + arg_list = AddListToList(arg_list, &el->copy()); + if (!elp->lhs()->attributeValue(0, DIM_SIZES)) + { + SgExpression **edim = new (SgExpression *); + *edim = el; + elp->lhs()->addAttribute(DIM_SIZES, (void *)edim, sizeof(SgExpression *) ); + } + + el = NULL; + for (idim = Rank(s); idim; idim--) + { + sarg = new SgSymbol(VARIABLE_NAME, BoundName(s, idim, 1), *t, *st_hedr); + ae = new SgVarRefExp(sarg); + ae->setType(t); + el = AddElementToList(el, new SgPointerDerefExp(*ae)); + } + arg_list = AddListToList(arg_list, &el->copy()); + if (!elp->lhs()->attributeValue(0, L_BOUNDS)) + { + SgExpression **elb = new (SgExpression *); + *elb = el; + elp->lhs()->addAttribute(L_BOUNDS, (void *)elb, sizeof(SgExpression *) ); + } + + while (arg_list->rhs() != 0) + arg_list = arg_list->rhs(); + } + + } + } + type_of_run = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("type_of_run"), *LongT, *st_hedr); ae = new SgVarRefExp(type_of_run); ae->setType(LongT); @@ -1941,7 +2174,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt btype = loc_type->baseType(); else btype = loc_type; - //!printf("__112\n"); + SgArrayType *typearray = new SgArrayType(*C_Type(btype)); typearray->addRange(*new SgValueExp(loc_el_num)); s_loc_var->setType(*typearray); @@ -1955,7 +2188,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt st_hedr->insertStmtAfter(*stmt, *st_hedr); } - //!printf("__113\n"); + /*--- executable statements: register reductions in RTS ---*/ e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1)); stmt = new SgCExpStmt(*e); @@ -2556,7 +2789,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt e = &SgAssignOp(*new SgVarRefExp(s_blocks), *f); stmt = new SgCExpStmt(*e); st_end->insertStmtBefore(*stmt, *st_hedr); - stmt->addComment("//Start method"); + stmt->addComment("// Start method"); e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len))); stmt = new SgCExpStmt(*e); @@ -2722,7 +2955,6 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt st_end->insertStmtBefore(*stmt, *st_hedr); } } - mywarn("start: in adding args section"); /* args for kernel */ @@ -2781,6 +3013,35 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt sdev = sdev->next(); } } + + e_all_private_size = sizeOfPrivateArraysInBytes(); + if (options.isOn(C_CUDA) && e_all_private_size) + { + for (el=private_list, lnp=0; el; el=el->rhs()) + { + s = el->lhs()->symbol(); + if (IS_ARRAY(s)) + { + sarg = new SgSymbol(VARIABLE_NAME, PointerNameForPrivateArray(s), *C_PointerType(C_VoidType()), *st_hedr); + ae = new SgCastExp(*C_PointerType( C_Type(s->type()->baseType())), *new SgVarRefExp(sarg)); + funcCallKernel->addArg(*ae); + if (!lnp) + private_first = sarg; + lnp++; + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + } + } + } + } + funcCallKernel->addArg(*new SgVarRefExp(type_of_run)); for (int i = 0; i < acrossV + loopV; ++i) funcCallKernel->addArg(*new SgArrayRefExp(*bIdxs, *new SgValueExp(i))); @@ -2816,7 +3077,15 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt SgForStmt *simple; simple = new SgForStmt(&SgAssignOp(*new SgVarRefExp(tmpV), *new SgValueExp(0)), &(*new SgVarRefExp(tmpV1) < *new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[0].len))), expr, stmt); st_end->insertStmtBefore(*simple); + stmt = simple; } + stmt->addComment("// GPU execution"); + if (options.isOn(C_CUDA) && e_all_private_size) + { + e_totalThreads = &(*new SgRecordRefExp(*s_blocks, "x") * *new SgRecordRefExp(*s_blocks, "y") * *new SgRecordRefExp(*s_blocks, "z") * *new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z")); + GetMemoryForPrivateArrays(private_first, s_loop_ref, lnp, stmt, st_hedr, e_totalThreads); + } + } else if (acrossV == 2) // ACROSS with two dependence: generate method { @@ -2972,7 +3241,8 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt mywarn(" end: out red section"); } - + createPrivatePointers(private_first, lnp, st_hedr, e_all_private_size); + GetMemoryForPrivateArrays (private_first, s_loop_ref, lnp, st_end, st_hedr, new SgVarRefExp(q)); mywarn("strat: init bases"); // init bases for (int i = 0; i < acrossV; ++i) @@ -3014,7 +3284,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks, has_red_array, diag, loopV, num_elems, acrossV, acrossBase, loopBase, idxI, loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers, - addressingParams, outTypeOfTransformation, type_of_run, bIdxs); + addressingParams, outTypeOfTransformation, type_of_run, bIdxs, private_first, lnp); stmt = createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks); while_st->insertStmtAfter(*stmt); @@ -3093,7 +3363,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks, has_red_array, q, loopV, num_elems, acrossV, acrossBase, loopBase, idxI, loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers, - addressingParams, outTypeOfTransformation, type_of_run, bIdxs); + addressingParams, outTypeOfTransformation, type_of_run, bIdxs, private_first, lnp); while_st1->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks)); while_st2->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks)); @@ -3105,7 +3375,7 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks, has_red_array, elem, loopV, num_elems, acrossV, acrossBase, loopBase, idxI, loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers, - addressingParams, outTypeOfTransformation, type_of_run, bIdxs); + addressingParams, outTypeOfTransformation, type_of_run, bIdxs, private_first, lnp); while_st3->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks)); while_st4->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks)); @@ -3190,6 +3460,30 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt sdev = sdev->next(); } } + createPrivatePointers(private_first, lnp, st_hedr, e_all_private_size); + if (options.isOn(C_CUDA) && private_first) // there are big private arrays + { + SgSymbol *sp; + for (sp = private_first, el = private_list, ln = 0; ln < lnp; sp = sp->next(), el = el->rhs(), ln++) + { + while (!IS_ARRAY(el->lhs()->symbol())) + el = el->rhs(); + s = el->lhs()->symbol(); + ae = new SgCastExp(*C_PointerType( C_Type(s->type()->baseType())), *new SgVarRefExp(sp)); + funcCallKernel->addArg(*ae); + if (!TestArrayShape(s)) + { + SgExpression **eatr = (SgExpression **) el->lhs()->attributeValue(0, DIM_SIZES); + SgExpression *ela; + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + eatr = (SgExpression **) el->lhs()->attributeValue(0, L_BOUNDS); + for (ela = *eatr; ela; ela = ela->rhs()) + funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(ela->lhs()->lhs()->symbol()))); + } + } + } + funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("MIN")); funcCall->addArg(*new SgVarRefExp(M1)); funcCall->addArg(*new SgVarRefExp(M2)); @@ -3392,6 +3686,18 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt mywarn(" end: out red section"); } + + if (options.isOn(C_CUDA) && private_first) + { + SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX")); + SgFunctionCallExp *f2 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX")); + f1->addArg(*new SgVarRefExp(M1)); + f1->addArg(*new SgVarRefExp(M2)); + f2->addArg(*f1); + f2->addArg(*new SgVarRefExp(M3)); + e_totalThreads = &(*new SgVarRefExp(Emin) * *f2); + GetMemoryForPrivateArrays (private_first, s_loop_ref, lnp, st_end, st_hedr, e_totalThreads); + } int flag_comment = 0; for (int i = 3; i < acrossV; ++i) @@ -3755,6 +4061,13 @@ vector Create_C_Adapter_Function_Across_variants(SgSymbol *sadapt st_end->insertStmtBefore(*stmt, *st_hedr); } } + // to dispose private arrays + if (options.isOn(C_CUDA) && e_all_private_size) + for (s = private_first, ln = 0; ln < lnp; s = s->next(), ln++) // private arrays + { + stmt = new SgCExpStmt(*DisposePrivateArray(s_loop_ref, s)); + st_end->insertStmtBefore(*stmt, *st_hedr); + } // create args for kernel and return it vector argsKernel(countKernels); @@ -3976,6 +4289,9 @@ SgExpression *CreateKernelDummyListAcross(ArgsForKernel *argsKer, SgType *idxTyp if (uses_list) arg_list = AddListToList(arg_list, CreateUsesDummyList()); //[+ ] + if (private_list) + arg_list = AddListToList(arg_list, CreatePrivateDummyList()); //[+ dummys for private arrays ] + if (argsKer->symb.size() >= 3) for (int it = 0; it < argsKer->sizeVars.size(); ++it) arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->sizeVars[it]))); @@ -5802,18 +6118,17 @@ SgSymbol *RedBlockSymbolInKernelAcross(SgSymbol *s, SgType *type) void DeclarationOfReductionBlockInKernelAcross(SgExpression *ered, reduction_operation_list *rsl) { - SgStatement *ass, *newst, *current, *if_st, *while_st, *typedecl, *st, *do_st; - SgExpression *le, *re, *eatr, *cond, *ev; + SgStatement *newst, *current, *if_st, *while_st, *typedecl, *st, *do_st; + SgExpression *eatr, *cond, *ev; SgSymbol *red_var, *red_var_k, *s_block, *loc_var, *sf; SgType *rtype; - int i, ind; //init block - ass = newst = current = if_st = while_st = typedecl = st = do_st = NULL; - le = re = eatr = cond = ev = NULL; + newst = current = if_st = while_st = typedecl = st = do_st = NULL; + eatr = cond = ev = NULL; red_var = red_var_k = s_block = loc_var = sf = NULL; rtype = NULL; - i = ind = loc_el_num = 0; + loc_el_num = 0; //end of init block // analys of reduction operation diff --git a/dvm/fdvm/trunk/fdvm/funcall.cpp b/dvm/fdvm/trunk/fdvm/funcall.cpp index 09b3d3a..2a96aa2 100644 --- a/dvm/fdvm/trunk/fdvm/funcall.cpp +++ b/dvm/fdvm/trunk/fdvm/funcall.cpp @@ -4941,4 +4941,49 @@ SgExpression *RtcSetLang(SgSymbol *s_loop_ref, const int lang) else fe->addArg(*new SgKeywordValExp("UNKNOWN_CUDA")); return(fe); -} \ No newline at end of file +} + +SgExpression *GetDeviceProp(SgSymbol *s_loop_ref, SgExpression *ep) +{// generating function call: + // DvmType loop_cuda_get_device_prop(DvmType *InDvmhLoop, DvmType prop); + + SgFunctionCallExp *fe = new SgFunctionCallExp(*fdvm[GET_DEVICE_PROP]); + + fe->addArg(*new SgVarRefExp(s_loop_ref)); + fe->addArg(*ep); + return(fe); +} + +SgExpression *GetMaxBlocks(SgSymbol *s_loop_ref, SgSymbol *s_max_blocks, SgSymbol *s_needed_bytes) +{// generating function call: + // DvmType loop_cuda_get_max_blocks(DvmType *InDvmhLoop, DvmType maxBlocks, DvmType neededBytesForBlock) + + SgFunctionCallExp *fe = new SgFunctionCallExp(*fdvm[GET_MAX_BLOCKS]); + + fe->addArg(*new SgVarRefExp(s_loop_ref)); + fe->addArg(*new SgVarRefExp(s_max_blocks)); + fe->addArg(*new SgVarRefExp(s_needed_bytes)); + return(fe); +} + +SgExpression *GetPrivateArray(SgSymbol *s_loop_ref, SgExpression *e_bytes) +{// generating function call: + // DvmType *loop_cuda_get_private_array(DvmType *InDvmhLoop, UDvmType neededBytes) + + SgFunctionCallExp *fe = new SgFunctionCallExp(*fdvm[GET_PRIVATE_ARR]); + + fe->addArg(*new SgVarRefExp(s_loop_ref)); + fe->addArg(*e_bytes); + return(fe); +} + +SgExpression *DisposePrivateArray(SgSymbol *s_loop_ref, SgSymbol *s_array) +{// generating function call: + // void loop_cuda_dispose_private_array(DvmType *InDvmhLoop, void *array) + + SgFunctionCallExp *fe = new SgFunctionCallExp(*fdvm[DISPOSE_PRIVATE_AR]); + + fe->addArg(*new SgVarRefExp(s_loop_ref)); + fe->addArg(*new SgVarRefExp(s_array)); + return(fe); +} diff --git a/dvm/fdvm/trunk/include/dvm.h b/dvm/fdvm/trunk/include/dvm.h index 21be837..db50a5f 100644 --- a/dvm/fdvm/trunk/include/dvm.h +++ b/dvm/fdvm/trunk/include/dvm.h @@ -261,6 +261,10 @@ const int END_OF_USE_LIST = 1050; /*ACC*/ const int ROUTINE_ATTR = 1051; /*ACC*/ const int DATA_REGION_SYMB = 1052; /*ACC*/ const int REMOTE_ACCESS_BUF = 1053; /*ACC*/ +const int L_BOUNDS = 1054; /*ACC*/ +const int DIM_SIZES = 1055; /*ACC*/ +const int PRIVATE_ARRAY = 1056; /*ACC*/ +const int PRIVATE_POINTER = 1057; /*ACC*/ const int MAX_LOOP_LEVEL = 20; // 7 - maximal number of loops in parallel loop nest const int MAX_LOOP_NEST = 25; // maximal number of nested loops @@ -1275,6 +1279,7 @@ SgSymbol *isSameRedVar(char *name); SgSymbol *isSameArray(char *name); SgSymbol *isSameIndexVar(char *name); SgType * C_LongLongType(); +SgType * C_UnsignedLongLongType(); SgType * C_DvmType(); SgType * C_CudaIndexType(); char *OpenMpComment_HandlerType(int idvm); @@ -1434,6 +1439,18 @@ SgSymbol *HeaderSymbolForHandler(SgSymbol *ar); void TestRoutineAttribute(SgSymbol *s, SgStatement *routine_interface); int LookForRoutineDir(SgStatement *interfaceFunc); SgStatement *Interface(SgSymbol *s); +SgExpression *sizeOfElementInBytes(SgSymbol *symb); +SgExpression *sizeOfPrivateArraysInBytes(); +SgExpression *ProductOfDimSizeArgs(SgExpression *esizes); +//void doPrivateArrayList(SgExpression *private_arrays, SgStatement *st_hedr); +void addPrivateArrayList(SgFunctionCallExp *fcall, SgExpression *private_arrays, SgStatement *st_hedr); +int TestArrayShape(SgSymbol *ar); +SgExpression *DimSizeListOfPrivateArrays(); +SgExpression *BoundListOfPrivateArrays(); +SgExpression * DummyListForPrivateArrays(SgStatement *st_hedr); +SgExpression *CreatePrivateDummyList(); +char *PointerNameForPrivateArray(SgSymbol *symb); +void GetMemoryForPrivateArrays(SgSymbol *private_first, SgSymbol *s_loop_ref, int nump, SgStatement *st_end, SgStatement *st_hedr, SgExpression *e_totalThreads); /* acc_analyzer.cpp */ //void Private_Vars_Analyzer(SgStatement *firstSt, SgStatement *lastSt); @@ -1900,6 +1917,10 @@ SgStatement *Consistent_H (int il, SgExpression *hedr, SgExpression *axis_list); SgStatement *LoopRemoteAccess_H (int il, SgExpression *hedr, SgSymbol *ar, SgExpression *axis_list); SgStatement *RemoteAccess_H2 (SgExpression *buf_hedr, SgSymbol *ar, SgExpression *ar_hedr, SgExpression *axis_list); SgStatement *GetRemoteBuf (SgSymbol *loop_s, int n, SgSymbol *s_buf_head); +SgExpression *GetDeviceProp(SgSymbol *s_loop_ref, SgExpression *ep); +SgExpression *GetMaxBlocks(SgSymbol *s_loop_ref, SgSymbol *s_max_blocks, SgSymbol *s_needed_bytes); +SgExpression *GetPrivateArray(SgSymbol *s_loop_ref, SgExpression *e_bytes); +SgExpression *DisposePrivateArray(SgSymbol *s_loop_ref, SgSymbol *s_array); /* io.cpp */ void IO_ThroughBuffer(SgSymbol *ar, SgStatement *stmt, SgExpression *eiostat); @@ -2089,7 +2110,6 @@ char *Check_Correct_Name(const char *name); /* acc_f2c.cpp */ void Translate_Fortran_To_C(SgStatement *stat, SgStatement *last, std::vector > &, int); SgStatement* Translate_Fortran_To_C(SgStatement* Stmt, bool isSapforConv = false); - SgSymbol* createNewFunctionSymbol(const char *name); void swapDimentionsInprivateList(void); void createNewFCall(SgExpression*, SgExpression*&, const char*, int); @@ -2103,6 +2123,9 @@ void RenamingNewProcedureVariables(SgSymbol *proc_name); SgSymbol *hasSameNameAsSource(SgSymbol *symb); void RenamingCudaFunctionVariables(SgStatement *first, SgSymbol *k_symb, int replace_flag); void replaceVariableSymbSameNameInStatements(SgStatement *first, SgStatement *last, SgSymbol *symb, SgSymbol *s_new, int replace_flag); +void RenamingCalledProcedureSymbols(SgStatement *header, SgStatement *copy_header); +void RenamingCalledProcedureSymbolsInKernel(SgSymbol *first_symb); + /* acc_across.cpp */ ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter); SgStatement *CreateLoopKernelAcross(SgSymbol*, ArgsForKernel*, SgType*); @@ -2238,7 +2261,7 @@ void ConvertLoopWithLabelToEnddoLoop (SgStatement *stat); /*OMP*/ enum OPTIONS { AUTO_TFM = 0, ONE_THREAD, SPEED_TEST_L0, SPEED_TEST_L1, GPU_O0, GPU_O1, RTC, C_CUDA, OPT_EXP_COMP, O_HOST, NO_CUDA, NO_BL_INFO, LOOP_ANALYSIS, PRIVATE_ANALYSIS, IO_RTS, READ_ALL, NO_REMOTE, NO_PURE_FUNC, - GPU_IRR_ACC, O_PL, O_PL2, NUM_OPT}; + GPU_IRR_ACC, O_PL, O_PL2, BIG_P, NUM_OPT}; // ONE_THREAD - compile one thread CUDA-kernels only for across (TODO for all CUDA-kernels) // SPEED_TEST_L0, SPEED_TEST_L1 - debug options for speed testof CUDA-kernels for across // RTC - enable CUDA run-time compilation of all CUDA-kernels diff --git a/dvm/fdvm/trunk/include/dvm_tag.h b/dvm/fdvm/trunk/include/dvm_tag.h index 204699a..57ad8a4 100644 --- a/dvm/fdvm/trunk/include/dvm_tag.h +++ b/dvm/fdvm/trunk/include/dvm_tag.h @@ -62,7 +62,8 @@ #define DVM_CP_WAIT_DIR 638 #define DVM_EXIT_INTERVAL_DIR 639 #define DVM_TEMPLATE_CREATE_DIR 640 -#define DVM_TEMPLATE_DELETE_DIR 641 +#define DVM_TEMPLATE_DELETE_DIR 641 +#define PRIVATE_AR_DECL 642 #define BLOCK_OP 705 #define NEW_SPEC_OP 706 #define REDUCTION_OP 707 diff --git a/dvm/fdvm/trunk/include/libdvm.h b/dvm/fdvm/trunk/include/libdvm.h index cf521f5..719ee2f 100644 --- a/dvm/fdvm/trunk/include/libdvm.h +++ b/dvm/fdvm/trunk/include/libdvm.h @@ -334,3 +334,7 @@ name_dvm[GUESS_INDEX_TYPE] = "loop_guess_index_type_"; name_dvm[GUESS_INDEX_TYPE_2]="dvmh_loop_guess_index_type_C"; name_dvm[RTC_SET_LANG] = "loop_cuda_rtc_set_lang"; name_dvm[GET_REMOTE_BUF_C] = "dvmh_loop_get_remote_buf_C"; +name_dvm[GET_DEVICE_PROP] = "loop_cuda_get_device_prop"; +name_dvm[GET_MAX_BLOCKS] = "loop_cuda_get_max_blocks"; +name_dvm[GET_PRIVATE_ARR] = "loop_cuda_get_private_array"; +name_dvm[DISPOSE_PRIVATE_AR]="loop_cuda_dispose_private_array"; \ No newline at end of file diff --git a/dvm/fdvm/trunk/include/libnum.h b/dvm/fdvm/trunk/include/libnum.h index 179ff55..d73fd21 100644 --- a/dvm/fdvm/trunk/include/libnum.h +++ b/dvm/fdvm/trunk/include/libnum.h @@ -332,5 +332,9 @@ enum { GUESS_INDEX_TYPE_2, RTC_SET_LANG, GET_REMOTE_BUF_C, + GET_DEVICE_PROP, + GET_MAX_BLOCKS, + GET_PRIVATE_ARR, + DISPOSE_PRIVATE_AR, MAX_LIBFUN_NUM }; diff --git a/dvm/fdvm/trunk/parser/tag b/dvm/fdvm/trunk/parser/tag index 77be3b4..2f30999 100644 --- a/dvm/fdvm/trunk/parser/tag +++ b/dvm/fdvm/trunk/parser/tag @@ -236,6 +236,7 @@ #define DVM_EXIT_INTERVAL_DIR 639 /* DVM-F */ #define DVM_TEMPLATE_CREATE_DIR 640 /* DVM-F */ #define DVM_TEMPLATE_DELETE_DIR 641 /* DVM-F */ +#define PRIVATE_AR_DECL 642 /* DVM-F */ /***************** variant tags for low level nodes ********************/ diff --git a/dvm/fdvm/trunk/parser/tag.h b/dvm/fdvm/trunk/parser/tag.h index 38e9115..abba34b 100644 --- a/dvm/fdvm/trunk/parser/tag.h +++ b/dvm/fdvm/trunk/parser/tag.h @@ -238,7 +238,8 @@ script using "tag". Run make tag.h to regenerate this file */ tag [ DVM_EXIT_INTERVAL_DIR ] = "DVM_EXIT_INTERVAL_DIR"; tag [ DVM_TEMPLATE_CREATE_DIR ] = "DVM_TEMPLATE_CREATE_DIR"; tag [ DVM_TEMPLATE_DELETE_DIR ] = "DVM_TEMPLATE_DELETE_DIR"; - + tag [ PRIVATE_AR_DECL ] = "PRIVATE_AR_DECL"; + /***************** variant tags for low level nodes ********************/ tag [ INT_VAL ] = "INT_VAL";