From e0e139bf642398d1e1b8cfd803ee6ce276404991 Mon Sep 17 00:00:00 2001
From: huangxiaoquan
Date: Wed, 6 Dec 2023 17:51:11 +0800
Subject: [PATCH] Add LLC-Allocation Pass

The LLC allocation pass lets the compiler identify frequently used data
in a program and strengthens its ability to prefetch that data and place
it in the last-level cache (LLC) via the memory accesses of the
corresponding data variables.

Add the -fllc-allocate flag to enable LLC allocation.
---
 gcc/Makefile.in | 1 +
 gcc/cfgloop.h | 3 +
 gcc/common.opt | 4 +
 gcc/config/aarch64/aarch64-sve.md | 48 +-
 gcc/config/aarch64/aarch64.c | 18 +
 gcc/doc/tm.texi | 21 +
 gcc/doc/tm.texi.in | 6 +
 gcc/internal-fn.c | 115 +
 gcc/internal-fn.def | 4 +
 gcc/optabs.def | 2 +
 gcc/params.opt | 53 +
 gcc/passes.def | 1 +
 gcc/target.def | 31 +
 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 +
 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 +
 .../gcc.dg/llc-allocate/llc-allocate.exp | 27 +
 .../llc-allocate/llc-issue-builtin-prefetch.c | 48 +
 .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 +
 .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 +
 .../llc-allocate/llc-tool-insertion-1.c | 48 +
 .../llc-allocate/llc-tool-insertion-2.c | 48 +
 .../llc-allocate/llc-tool-insertion-3.c | 48 +
 .../llc-allocate/llc-tool-insertion-4.c | 47 +
 .../llc-allocate/llc-tool-insertion-5.c | 48 +
 .../llc-allocate/llc-tool-insertion-6.c | 47 +
 .../llc-tool-insertion-7-null-var-name.c | 52 +
 .../llc-tool-insertion-8-tmp-var-name.c | 54 +
 .../gfortran.dg/llc-allocate/llc-3.f90 | 213 ++
 .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 +
 .../llc-trace-multiple-base-var.f90 | 63 +
 .../llc-unknown-type-size-unit.f90 | 58 +
 gcc/timevar.def | 1 +
 gcc/tree-cfg.c | 11 +
 gcc/tree-cfg.h | 1 +
 gcc/tree-pass.h | 1 +
 gcc/tree-scalar-evolution.c | 8 +-
 gcc/tree-scalar-evolution.h | 3 +-
 gcc/tree-ssa-llc-allocate.c | 2898 +++++++++++++++++
 gcc/tree-ssa-loop-niter.c | 38 +-
 gcc/tree-ssa-loop-niter.h | 3 +-
 40 files changed, 4297 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c
 create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c
 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
 create mode 100644 gcc/tree-ssa-llc-allocate.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 2a59acfbe..31bf2cde2 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1594,6 +1594,7 @@ OBJS = \
tree-ssa-loop-array-widen-compare.o \ tree-ssa-loop-crc.o \ tree-ssa-loop-prefetch.o \ + tree-ssa-llc-allocate.o \ tree-ssa-loop-split.o \ tree-ssa-loop-unswitch.o \ tree-ssa-loop.o \ diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 18b404e29..e3ecf5076 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -272,6 +272,9 @@ public: the basic-block from being collected but its index can still be reused. */ basic_block former_header; + + /* Number of latch executions from vectorization. */ + tree vec_nb_iterations; }; /* Set if the loop is known to be infinite. */ diff --git a/gcc/common.opt b/gcc/common.opt index 4db061b44..2dde0f673 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2233,6 +2233,10 @@ Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0 Generate prefetch instructions, if available, for arrays in loops. The prefetch level can control the optimize level to array prefetch. +fllc-allocate +Common Report Var(flag_llc_allocate) Init(-1) Optimization +Generate LLC hint instructions. + fprofile Common Report Var(profile_flag) Enable basic program profiling code. diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index d17a77706..c5b99b6c4 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1940,7 +1940,7 @@ (define_insn "@aarch64_sve_prefetch" [(prefetch (unspec:DI [(match_operand: 0 "register_operand" "Upl") - (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP") (match_operand:DI 2 "const_int_operand")] UNSPEC_SVE_PREFETCH) (match_operand:DI 3 "const_int_operand") @@ -1973,14 +1973,14 @@ ;; 6: the prefetch operator (an svprfop) ;; 7: the normal RTL prefetch rw flag ;; 8: the normal RTL prefetch locality value -(define_insn "@aarch64_sve_gather_prefetch" +(define_insn "@aarch64_sve_gather_prefetch" [(prefetch (unspec:DI [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -1988,12 +1988,12 @@ "TARGET_SVE" { static const char *const insns[][2] = { - "prf", "%0, [%2.s]", - "prf", "%0, [%2.s, #%1]", + "prf", "%0, [%2.s]", + "prf", "%0, [%2.s, #%1]", "prfb", "%0, [%1, %2.s, sxtw]", "prfb", "%0, [%1, %2.s, uxtw]", - "prf", "%0, [%1, %2.s, sxtw %p4]", - "prf", "%0, [%1, %2.s, uxtw %p4]" + "prf", "%0, [%1, %2.s, sxtw %p4]", + "prf", "%0, [%1, %2.s, uxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2002,14 +2002,14 @@ ;; Predicated gather prefetches for 64-bit elements. The value of operand 3 ;; doesn't matter in this case. 
-(define_insn "@aarch64_sve_gather_prefetch" +(define_insn "@aarch64_sve_gather_prefetch" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2017,10 +2017,10 @@ "TARGET_SVE" { static const char *const insns[][2] = { - "prf", "%0, [%2.d]", - "prf", "%0, [%2.d, #%1]", + "prf", "%0, [%2.d]", + "prf", "%0, [%2.d, #%1]", "prfb", "%0, [%1, %2.d]", - "prf", "%0, [%1, %2.d, lsl %p4]" + "prf", "%0, [%1, %2.d, lsl %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2028,7 +2028,7 @@ ) ;; Likewise, but with the offset being sign-extended from 32 bits. -(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" +(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") (match_operand:DI 1 "register_operand" "rk, rk") @@ -2039,8 +2039,8 @@ (match_operand:VNx2DI 2 "register_operand" "w, w")))] UNSPEC_PRED_X) (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2049,7 +2049,7 @@ { static const char *const insns[][2] = { "prfb", "%0, [%1, %2.d, sxtw]", - "prf", "%0, [%1, %2.d, sxtw %p4]" + "prf", "%0, [%1, %2.d, sxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2061,7 +2061,7 @@ ) ;; Likewise, but with the offset being zero-extended from 32 bits. 
-(define_insn "*aarch64_sve_gather_prefetch_uxtw" +(define_insn "*aarch64_sve_gather_prefetch_uxtw" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") (match_operand:DI 1 "register_operand" "rk, rk") @@ -2069,8 +2069,8 @@ (match_operand:VNx2DI 2 "register_operand" "w, w") (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2079,7 +2079,7 @@ { static const char *const insns[][2] = { "prfb", "%0, [%1, %2.d, uxtw]", - "prf", "%0, [%1, %2.d, uxtw %p4]" + "prf", "%0, [%1, %2.d, uxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index dbdc6dffb..aa077ec0a 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -2367,6 +2367,13 @@ aarch64_sve_data_mode_p (machine_mode mode) return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; } +/* Return true if MODE is an full SVE data vector mode. */ +static bool +aarch64_full_sve_data_mode_p (machine_mode mode) +{ + return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA; +} + /* Return the number of defined bytes in one constituent vector of SVE mode MODE, which has vector flags VEC_FLAGS. */ static poly_int64 @@ -24370,6 +24377,17 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks +#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH +#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch + +#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH +#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \ + code_for_aarch64_sve_gather_prefetch + +#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P +#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \ + aarch64_full_sve_data_mode_p + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index b46418d0b..ef3566510 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6122,6 +6122,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter stores. @end deftypefn +@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg}) +This hook should return the decl of a function that implements the +vectorized variant of the function with the @code{combined_fn} code +@var{code} or @code{NULL_TREE} if such a function is not available. +The return type of the vectorized function shall be of vector type +@var{vec_type_out} and the argument types should be @var{vec_type_in}. +@end deftypefn + +@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form}) +This hook should return the decl of a function that implements the +vectorized variant of the function with the @code{combined_fn} code +@var{code} or @code{NULL_TREE} if such a function is not available. +The return type of the vectorized function shall be of vector type +@var{vec_type_out} and the argument types should be @var{vec_type_in}. 
+@end deftypefn + +@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) +This hook should return true if the target hardware architecture +supports a full SVE data vector mode. +@end deftypefn + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 2663547c7..945d0f696 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4195,6 +4195,12 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_BUILTIN_SCATTER +@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH + +@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH + +@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN @hook TARGET_SIMD_CLONE_ADJUST diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 644f234e0..e8a3bb654 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -102,10 +102,12 @@ init_internal_fns () direct_internal_fn. */ #define not_direct { -2, -2, false } #define mask_load_direct { -1, 2, false } +#define mask_prefetch_direct { -1, 2, false } #define load_lanes_direct { -1, -1, false } #define mask_load_lanes_direct { -1, -1, false } #define gather_load_direct { 3, 1, false } #define mask_store_direct { 3, 2, false } +#define gather_prefetch_direct { 3, 1, false } #define store_lanes_direct { 0, 0, false } #define mask_store_lanes_direct { 0, 0, false } #define vec_cond_mask_direct { 0, 0, false } @@ -2520,6 +2522,53 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn +/* Expand MASK_PREFETCH call STMT using optab OPTAB. + .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); + .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); +*/ + +static void +expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) +{ + if (targetm.vectorize.code_for_prefetch == NULL + || targetm.vectorize.prefetch_handleable_mode_p == NULL) + return; + + tree base = gimple_call_arg (stmt, 0); + if (base == NULL_TREE) + return; + + tree maskt = gimple_call_arg (stmt, 2); + tree target = gimple_call_arg (stmt, 3); + tree prfop = gimple_call_arg (stmt, 4); + HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); + /* Bit 3 of the prfop selects stores over loads. */ + HOST_WIDE_INT access = prfop_int & 8; + /* Bits 1 and 2 specify the locality; 0-based for svprfop but + 1-based for PREFETCH. */ + HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; + + machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); + if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) + return; + insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); + + rtx mask = expand_normal (maskt); + rtx base_rtx = expand_normal (base); + /* Convert ptr_mode value X to Pmode. 
*/ + if (ptr_mode == SImode) + base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); + + unsigned i = 0; + class expand_operand ops[5]; + create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); + create_address_operand (&ops[i++], base_rtx); + create_integer_operand (&ops[i++], prfop_int); + create_integer_operand (&ops[i++], access); + create_integer_operand (&ops[i++], locality); + expand_insn (icode, i, ops); +} + /* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB. */ static void @@ -2920,6 +2969,70 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab) emit_move_insn (lhs_rtx, ops[0].value); } +/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. + vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); + .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); +*/ + +static void +expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) +{ + if (targetm.vectorize.code_for_gather_prefetch == NULL + || targetm.vectorize.prefetch_handleable_mode_p == NULL) + return; + + /* Extracting tree nodes, only expand for scalar base and vector index. */ + tree base = gimple_call_arg (stmt, 0); + if (VECTOR_TYPE_P (TREE_TYPE (base))) + return; + tree offset = gimple_call_arg (stmt, 1); + if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) + return; + + tree scale = gimple_call_arg (stmt, 2); + tree mask = gimple_call_arg (stmt, 4); + tree target = gimple_call_arg (stmt, 5); + tree prfop = gimple_call_arg (stmt, 6); + + /* Convert to the rtx node. */ + rtx base_rtx = expand_normal (base); + /* Convert ptr_mode value X to Pmode. */ + if (ptr_mode == SImode) + base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); + rtx offset_rtx = expand_normal (offset); + rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); + rtx mask_rtx = expand_normal (mask); + HOST_WIDE_INT scale_int = tree_to_shwi (scale); + HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); + /* Bit 3 of the prfop selects stores over loads. */ + HOST_WIDE_INT access = prfop_int & 8; + /* Bits 1 and 2 specify the locality; 0-based for svprfop but + 1-based for PREFETCH. */ + HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; + + /* add operand. */ + unsigned int i = 0; + class expand_operand ops[9]; + create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); + create_address_operand (&ops[i++], base_rtx); + create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); + /* Check whether the index has unsigned. */ + create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); + create_integer_operand (&ops[i++], scale_int); + create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); + create_integer_operand (&ops[i++], prfop_int); + create_integer_operand (&ops[i++], access); + create_integer_operand (&ops[i++], locality); + + machine_mode reg_mode = GET_MODE (offset_rtx); + machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); + if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) + return; + insn_code icode = targetm.vectorize.code_for_gather_prefetch + (m_mode, reg_mode); + expand_insn (icode, i, ops); +} + /* Expand DIVMOD() using: a) optab handler for udivmod/sdivmod if it is available. 
b) If optab_handler doesn't exist, generate call to @@ -3210,9 +3323,11 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_cond_binary_optab_supported_p direct_optab_supported_p #define direct_cond_ternary_optab_supported_p direct_optab_supported_p #define direct_mask_load_optab_supported_p direct_optab_supported_p +#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_gather_load_optab_supported_p convert_optab_supported_p +#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p #define direct_mask_store_optab_supported_p direct_optab_supported_p #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 0c6fc3711..cc0f42b98 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -119,6 +119,8 @@ along with GCC; see the file COPYING3. If not see #endif DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) +DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, + maskprefetch, mask_prefetch) DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, vec_mask_load_lanes, mask_load_lanes) @@ -126,6 +128,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, mask_gather_load, gather_load) +DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, + mask_gather_prefetch, gather_prefetch) DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store) DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0, diff --git a/gcc/optabs.def b/gcc/optabs.def index 0c64eb52a..ee25bc3f7 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") OPTAB_CD(maskload_optab, "maskload$a$b") +OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") OPTAB_CD(maskstore_optab, "maskstore$a$b") OPTAB_CD(gather_load_optab, "gather_load$a$b") OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") +OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") OPTAB_CD(scatter_store_optab, "scatter_store$a$b") OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") OPTAB_CD(vec_extract_optab, "vec_extract$a$b") diff --git a/gcc/params.opt b/gcc/params.opt index 2044524a3..c429359e3 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1005,4 +1005,57 @@ Target size of compressed pointer, which should be 8, 16 or 32. Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . +-param=mem-access-ratio= +Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization +Memory access ratio (in percent). + +-param=mem-access-num= +Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization +Memory access num. + +-param=prefetch-offset= +Common Joined UInteger Var(param_prefetch_offset) Init(1024) +IntegerRange(1, 999999) Param Optimization +Prefetch Offset, which is usually a power of two due to cache line size. 
+ +-param=branch-prob-threshold= +Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) +Param Optimization +High Execution Rate Branch Threshold. + +-param=issue-topn= +Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization +Issue topn LLC mem_ref hint. + +-param=force-issue= +Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param +Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. + +-param=llc-capacity-per-core= +Common Joined UInteger Var(param_llc_capacity_per_core) Init(114) IntegerRange(0, 999999) Param +LLC capacity per core. + +-param=target-variables= +Common Joined Var(param_target_variables) Init("") Param Optimization +--param=target-variables=[,,...] Target variables for prefetching, separated by comma, +without space. The representation of a variable can be complex and containing space, please surround +it by quotation marks and escape special characters in Linux. The input length should be no more +than 512 characters. + +-param=use-ref-group-index= +Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization +Prefetch the target variables by their indices in sorted ref_groups, use together with parameter +target-variables. + +-param=mem-ref-index= +Common Joined Var(param_mem_ref_index) Init("") Param Optimization +--param=mem-ref-index=[,,...] Prefetch the target variable at the memory reference +location with the index of customized order, separated by comma, without space. The input length +should be no more than 512 characters. + +-param=filter-kernels= +Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param +Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks +through edges with branch probability no less than param_branch_prob_threshold. + ; This comment is to ensure we retain the blank line above. diff --git a/gcc/passes.def b/gcc/passes.def index df7d65733..ea59fc8ca 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -303,6 +303,7 @@ along with GCC; see the file COPYING3. If not see /* Run IVOPTs after the last pass that uses data-reference analysis as that doesn't handle TARGET_MEM_REFs. */ NEXT_PASS (pass_iv_optimize); + NEXT_PASS (pass_llc_allocate); NEXT_PASS (pass_lim); NEXT_PASS (pass_tree_loop_done); POP_INSERT_PASSES () diff --git a/gcc/target.def b/gcc/target.def index 34d3561bd..351c94c37 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2072,6 +2072,37 @@ DEFHOOK (void *data), default_destroy_cost_data) +/* Function for vector prefetch operation. */ +DEFHOOK +(code_for_prefetch, + "This hook should return the decl of a function that implements the\n\ +vectorized variant of the function with the @code{combined_fn} code\n\ +@var{code} or @code{NULL_TREE} if such a function is not available.\n\ +The return type of the vectorized function shall be of vector type\n\ +@var{vec_type_out} and the argument types should be @var{vec_type_in}.", + insn_code, (machine_mode arg), + NULL) + +/* Function for vector gather prefetch operation. 
*/ +DEFHOOK +(code_for_gather_prefetch, + "This hook should return the decl of a function that implements the\n\ +vectorized variant of the function with the @code{combined_fn} code\n\ +@var{code} or @code{NULL_TREE} if such a function is not available.\n\ +The return type of the vectorized function shall be of vector type\n\ +@var{vec_type_out} and the argument types should be @var{vec_type_in}.", + insn_code, (machine_mode mode_to, machine_mode mode_form), + NULL) + +/* Function to check whether the target hardware architecture supports + a full SVE data vector mode. */ +DEFHOOK +(prefetch_handleable_mode_p, + "This hook should return true if the target hardware architecture\n\ +supports a full SVE data vector mode.", + bool, (machine_mode arg), + NULL) + HOOK_VECTOR_END (vectorize) #undef HOOK_PREFIX diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c new file mode 100644 index 000000000..a4828eaab --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c @@ -0,0 +1,61 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2" } */ + +#include + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 100000 + +int A_i[N]; +int A_j[N]; +double A_data[N]; +double x_data[N]; +double y_data[N]; +int num_rows = N; + +void +MatMult (int *A_i, int *A_j, double *A_data, double *x_data, + int num_rows, double *y_data) +{ + int i = 0; + int j = 0; + double temp = 0; + for (i = 0; i < num_rows; i++) + { + temp = y_data[i]; + for (j = A_i[i]; j < A_i[i+1]; j++) + temp += A_data[j] * x_data[A_j[j]]; + y_data[i] = temp; + } +} + +int +main (int argc, char *argv[]) +{ + int testIter = 2; + + for (int i = 0; i < testIter; i++) + MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp new file mode 100644 index 
000000000..4f34e722f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp @@ -0,0 +1,27 @@ +# Copyright (C) 2022-2023 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +load_lib gcc-dg.exp +load_lib target-supports.exp + +# Initialize `dg'. +dg-init + +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ + "" "-fllc-allocate" + +# All done. +dg-finish diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c new file mode 100644 index 000000000..2a58c501f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c @@ -0,0 +1,48 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=uPtr" } */ + +#include + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. */ + int predecessors; /* Number of predecessors that need + to be visited. 
*/ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c new file mode 100644 index 000000000..27cd574cf --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c @@ -0,0 +1,62 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ + +#include +#include + +#define N 1000 + +long a[N] = {0}; +long b[N] = {0}; +long c[N] = {0}; + +double +referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) +{ + double sum; + for (int cell = 0; cell < nCells; cell++) + { + // Multi-layer pointer + sum += psiPtr[lPtr[cell]]; + psiPtr[uPtr[cell]] = sum; + + // Outer pointer, inner array + sum += psiPtr[b[cell]]; + psiPtr[a[cell]] = sum; + + // Multi-layer array, currently failed tracing at b[cell] and a[cell] + sum += a[b[cell]]; + c[a[cell]] = sum; + + // Outer array, inner pointer, currently failed tracing at lPtr[cell] + sum += a[lPtr[cell]]; + c[lPtr[cell]] = sum; + } + return sum; +} + +int +main (int argc, char *argv[]) +{ + int testIter = 2; + + double *psiPtr = NULL; + int *lPtr = NULL; + int *uPtr = NULL; + psiPtr = (double *) calloc (N, sizeof(double)); + lPtr = (int *) calloc (N, sizeof(int)); + uPtr = (int *) calloc (N, sizeof(int)); + + for (int i = 0; i < testIter; i++) + referenceTrace (psiPtr, lPtr, uPtr, N); + + free (psiPtr); + free (lPtr); + free (uPtr); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Tracing succeeded" 16 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "Tracing failed" 8 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c new file mode 100644 index 000000000..276781c4f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c @@ -0,0 +1,48 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr" } */ + +#include + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, 
double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cellaux\"" } */ + +#include + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. */ + int predecessors; /* Number of predecessors that need + to be visited. 
*/ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" + "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c new file mode 100644 index 000000000..09a525ce1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c @@ -0,0 +1,54 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */ + +#include + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. */ + int predecessors; /* Number of predecessors that need + to be visited. */ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" + "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \"" + " bb_16(D)->aux \"" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 new file mode 100644 index 000000000..ec918e144 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 @@ -0,0 +1,213 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! 
{ dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50" } + +program main + + IMPLICIT NONE + INTEGER :: ids,ide, jds,jde, kds,kde + INTEGER,parameter :: ims=-4,kms=1,jms=-4 + INTEGER,parameter :: ime=210,kme=36,jme=192 + INTEGER :: its,ite, jts,jte, kts,kte + INTEGER :: number_of_small_timesteps,rk_step, rk_order, step + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt + + + REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts + + REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu + + REAL :: rdx,rdy + REAL :: dts, t0, smdiv + REAL :: random1,time_begin,time_end,total_time + + INTEGER :: i, j, k + INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end + INTEGER :: i_endu, j_endv + INTEGER :: interval=1 + INTEGER :: epoch,iter + + LOGICAL :: non_hydrostatic + + data ids, jds, kds, its, jts, kts /6*1/ + data ide, ite /2*205/ + data jde, jte /2*187/ + data kde, kte /2*36/ + + number_of_small_timesteps = 1 + rk_step = 3 + rk_order = 1 + dts = 1. + + rdx = 1. + rdy = 1. + + t0 = 0. + smdiv = 1. + step = 1 + non_hydrostatic = .true. + + call random_number(random1) + interval = random1*100 + interval=1 + + call random_seed(put=(/(i,i=1,10000,interval)/)) + + call random_number(alt) + call random_number(c2a) + call random_number(ph) + call random_number(pm1) + call random_number(mu) + call random_number(muts) + call random_number(dnw) + call random_number(rdnw) + call random_number(znu) + + do iter=1,2 + call calc_p_rho( al, p, ph, & + alt, t_2, t_1, c2a, pm1, & + mu, muts, znu, t0, & + rdnw, dnw, smdiv, & + non_hydrostatic, step, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its,ite, jts,jte, kts,kte ) + + enddo + +end program + + +SUBROUTINE calc_p_rho( al, p, ph, & + alt, t_2, t_1, c2a, pm1, & + mu, muts, znu, t0, & + rdnw, dnw, smdiv, & + non_hydrostatic, step, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its,ite, jts,jte, kts,kte ) + + IMPLICIT NONE ! religion first + !asb +! declarations for the stuff coming in + + INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde + INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme + INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte + + INTEGER, INTENT(IN ) :: step + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & + p + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & + t_2, & + t_1, & + c2a + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 + + REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & + muts + + REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & + rdnw, & + znu + + REAL, INTENT(IN ) :: t0, smdiv + + LOGICAL, INTENT(IN ) :: non_hydrostatic + +! local variables + + INTEGER :: i, j, k + INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end + REAL :: ptmp + + i_start = its + i_end = min(ite,ide-1) + j_start = jts + j_end = min(jte,jde-1) + k_start = kts + k_end = min(kte,kde-1) + + IF (non_hydrostatic) THEN + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + +! al computation is all dry, so ok with moisture + + al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & + +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) + +! this is temporally linearized p, no moisture correction needed + + p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & + /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) + + ENDDO + ENDDO + ENDDO + + ELSE ! 
hydrostatic calculation + + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + p(i,k,j)=mu(i,j)*znu(k) + al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & + /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) + ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & + +mu(i,j)*alt(i,k,j)) + ENDDO + ENDDO + ENDDO + + END IF + +! divergence damping setup + + IF (step == 0) then ! we're initializing small timesteps + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + pm1(i,k,j)=p(i,k,j) + ENDDO + ENDDO + ENDDO + ELSE ! we're in the small timesteps + DO j=j_start, j_end ! and adding div damping component + DO k=k_start, k_end + DO i=i_start, i_end + ptmp = p(i,k,j) + p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) + pm1(i,k,j) = ptmp + ENDDO + ENDDO + ENDDO + END IF + +END SUBROUTINE calc_p_rho + +! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "Tracing succeeded" 48 "llc_allocate" } } +! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } +! { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 3 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } +! { dg-final { scan-tree-dump-times ", size: 0\.000000" 28 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d p \\(0.000000, 3, 0\\) : 8" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d pm1 \\(0.000000, 2, 0\\) : 5" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d ph \\(0.000000, 2, 0\\) : 4" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d al \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d alt \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d t_1 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d t_2 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d c2a \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d mu \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d muts \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp new file mode 100644 index 000000000..068341784 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp @@ -0,0 +1,29 @@ +# Copyright (C) 2022-2023 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# GCC testsuite that uses the `dg.exp' driver. + +load_lib gfortran-dg.exp + +# Initialize `dg'. +dg-init + +# Main loop. +gfortran-dg-runtest [lsort \ + [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" + +# All done. +dg-finish diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 new file mode 100644 index 000000000..23e360540 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 @@ -0,0 +1,63 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } + +MODULE INPUT + IMPLICIT NONE + + INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 + + INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 + REAL(wp), DIMENSION(jpi, jpj) :: e12t + REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n + REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta + +END MODULE INPUT + +PROGRAM MAIN + USE INPUT + + IMPLICIT NONE + + INTEGER :: EPOCH + +! Initialize arrays + + e12t = 1 + fse3t_n = 1 + pta = 1 +! + + DO EPOCH=1,2 + CALL tra_ldf_iso + ENDDO + +END PROGRAM MAIN + +SUBROUTINE tra_ldf_iso + USE INPUT + + IMPLICIT NONE + ! + INTEGER :: ji, jj, jk, jn ! dummy loop indices + REAL(wp) :: zbtr, ztra ! - - + REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw + + DO jn = 1, kjpt + ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 + + DO jk = 1, jpkm1 + DO jj = 2, jpjm1 + DO ji = fs_2, fs_jpim1 ! vector opt. + zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) + ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr + pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra + END DO + END DO + END DO + ! + END DO + ! +END SUBROUTINE tra_ldf_iso + +! { dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "Tracing unusual number or occurrences of base variables. Choose ztfw." 2 "llc_allocate" } } diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 new file mode 100644 index 000000000..d76c75b5b --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 @@ -0,0 +1,58 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! 
{ dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } + +Module module_domain + IMPLICIT NONE + + REAL, PARAMETER :: g = 9.8 + TYPE :: grid_type + REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) + REAL, POINTER :: fnm(:), fnp(:) + END TYPE +END Module + +SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) + + USE module_domain + !USE module_model_constants + + IMPLICIT NONE + + + !TYPE (domain), INTENT(IN) :: grid + INTEGER, INTENT(IN) :: k_start, k_end, ix, iy + REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w + + + INTEGER :: k + REAL :: z0, z1, z2, w1, w2 + REAL, DIMENSION(k_start:k_end) :: z_at_w + REAL, DIMENSION(k_start:k_end-1) :: z + TYPE (grid_type), POINTER :: grid + + + DO k = k_start, k_end + z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g + END DO + + DO k = k_start, k_end-1 + z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) + END DO + + DO k = k_start+1, k_end-1 + p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & + grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) + END DO + + z0 = z_at_w(k_start) + z1 = z(k_start) + z2 = z(k_start+1) + w1 = (z0 - z2)/(z1 - z2) + w2 = 1. - w1 + p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & + w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) + +END SUBROUTINE calc_p8w + +! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } diff --git a/gcc/timevar.def b/gcc/timevar.def index ba86a1b7b..4b643538f 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -207,6 +207,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") +DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") DEFTIMEVAR (TV_PREDCOM , "predictive commoning") DEFTIMEVAR (TV_TREE_CH , "tree copy headers") DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c index d82fe23d8..9eb173d69 100644 --- a/gcc/tree-cfg.c +++ b/gcc/tree-cfg.c @@ -8365,6 +8365,17 @@ print_loops (FILE *file, int verbosity) print_loop_and_siblings (file, bb->loop_father, 0, verbosity); } +/* Dump a loop to file. */ + +void +loop_dump (FILE *file, class loop *loop) +{ + print_loop (file, loop, 0, 0); + fprintf (file, "vec_niter = "); + print_generic_expr (file, loop->vec_nb_iterations); + fprintf (file, "\n"); +} + /* Dump a loop. 
*/ DEBUG_FUNCTION void diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h index beb4997a6..dad0ca0a6 100644 --- a/gcc/tree-cfg.h +++ b/gcc/tree-cfg.h @@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); extern void debug_function (tree, dump_flags_t); extern void print_loops_bb (FILE *, basic_block, int, int); extern void print_loops (FILE *, int); +extern void loop_dump (FILE *file, class loop *loop); extern void debug (class loop &ref); extern void debug (class loop *ptr); extern void debug_verbose (class loop &ref); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 027f8992d..a1e215901 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt); extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); diff --git a/gcc/tree-scalar-evolution.c b/gcc/tree-scalar-evolution.c index edab77827..73ffa0759 100644 --- a/gcc/tree-scalar-evolution.c +++ b/gcc/tree-scalar-evolution.c @@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) the loop body has been executed 6 times. */ tree -number_of_latch_executions (class loop *loop) +number_of_latch_executions (class loop *loop, bool guarantee) { edge exit; class tree_niter_desc niter_desc; @@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) res = chrec_dont_know; exit = single_exit (loop); - if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) + if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, + true, NULL, guarantee)) { may_be_zero = niter_desc.may_be_zero; res = niter_desc.niter; @@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) fprintf (dump_file, "))\n"); } - loop->nb_iterations = res; + if (guarantee) + loop->nb_iterations = res; return res; } diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h index e2fbfb55b..218155650 100644 --- a/gcc/tree-scalar-evolution.h +++ b/gcc/tree-scalar-evolution.h @@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_TREE_SCALAR_EVOLUTION_H #define GCC_TREE_SCALAR_EVOLUTION_H -extern tree number_of_latch_executions (class loop *); +extern tree number_of_latch_executions (class loop *, + bool guarantee = true); extern gcond *get_loop_exit_condition (const class loop *); extern void scev_initialize (void); diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c new file mode 100644 index 000000000..746a1cf95 --- /dev/null +++ b/gcc/tree-ssa-llc-allocate.c @@ -0,0 +1,2898 @@ +/* LLC allocate. + Copyright (C) 2022-2023 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#define INCLUDE_MAP +#define INCLUDE_SET +#define INCLUDE_VECTOR +#define INCLUDE_LIST +#define INCLUDE_ALGORITHM +#define INCLUDE_STRING +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "gimple.h" +#include "predict.h" +#include "tree-pass.h" +#include "gimple-ssa.h" +#include "optabs-query.h" +#include "tree-pretty-print.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "gimplify-me.h" +#include "tree-ssa-loop-ivopts.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-niter.h" +#include "tree-ssa-loop.h" +#include "ssa.h" +#include "tree-into-ssa.h" +#include "cfgloop.h" +#include "tree-scalar-evolution.h" +#include "langhooks.h" +#include "tree-inline.h" +#include "tree-data-ref.h" +#include "diagnostic-core.h" +#include "dbgcnt.h" +#include "gimple-pretty-print.h" +#include "internal-fn.h" +#include "tree-cfg.h" +#include "profile-count.h" + +/* Number of parallel cores. */ +const unsigned int PARALLEL_NUM = 288; + +/* Indirect access weight. */ +const unsigned int INDIRECT_ACCESS_VALUE = 2; + +/* Write memory weight. */ +const unsigned int WRITE_COST = 2; + +/* Prefetch tool input max length. */ +#ifndef PREFETCH_TOOL_INPUT_MAX_LEN +#define PREFETCH_TOOL_INPUT_MAX_LEN 512 +#endif + +/* Prefetch tool number max length. */ +#ifndef PREFETCH_TOOL_NUM_MAX_LEN +#define PREFETCH_TOOL_NUM_MAX_LEN 9 +#endif + +namespace { + +using namespace std; + +/* loop bound info of the memory reference located. */ +struct loop_bound +{ + /* iv tree_node. */ + tree iv; + + /* define stmt of iv. */ + gimple *def_stmt; + + /* loop where stmt is located. */ + class loop *loop; + + /* loop unroll factor. */ + unsigned int unroll; + + /* Number of iterations of loop. */ + tree niters; + + loop_bound (tree t, gimple *stmt) + { + iv = t; + def_stmt = stmt; + loop = loop_containing_stmt (stmt); + unroll = 1; + niters = chrec_dont_know; + } +}; + +/* method of calculating the data size. */ + +enum calc_type +{ + UNHANDLE_CALC = 0, + RUNTIME_CALC, + STATIC_CALC +}; + +/* Describes a info of a memory reference. */ + +struct data_ref +{ + /* The memory reference. */ + tree ref; + + /* Statement where the ref is located. */ + gimple *stmt; + + /* var_decl or param_decl, used for the ref_group. */ + tree var; + + /* Base of the reference. */ + tree base; + + /* Constant offset of the reference. */ + tree offset; + + /* index of the reference. */ + tree index; + + /* Constant step of the reference. */ + tree step; + + /* loop boundary info of each dimension. */ + vector loop_bounds; + + /* memory data size, Unit: MB. */ + double data_size; + + /* method of calculating the data size. */ + calc_type calc_by; + + /* True if the info of ref is traced, and then record it. */ + unsigned int trace_status_p : 1; + + /* True if the loop is vectorized. */ + unsigned int vectorize_p : 1; + + /* True if the memory reference is shared. */ + unsigned int parallel_p : 1; + + /* True if the memory reference is regular. */ + unsigned int regular_p : 1; + + /* True if the memory reference is read. 
*/ + unsigned int read_p : 1; + + data_ref () + { + ref = NULL_TREE; + stmt = NULL; + var = NULL_TREE; + base = NULL_TREE; + offset = NULL_TREE; + index = NULL_TREE; + step = NULL_TREE; + data_size = 0; + calc_by = UNHANDLE_CALC; + trace_status_p = false; + vectorize_p = false; + parallel_p = false; + regular_p = true; + read_p = true; + } +}; + +/* ================ phase 1 get_dense_memory_kernels ================ */ + +/* Add ref node and print. */ + +void +add_ref (vector &references, tree op, gimple *stmt, + bool vectorize_p, bool read_p) +{ + data_ref ref; + ref.ref = op; + ref.stmt = stmt; + ref.vectorize_p = vectorize_p; + ref.read_p = read_p; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, ref.ref, TDF_LINENO); + fprintf (dump_file, "\n"); + } + references.push_back (ref); +} + +/* Get the references from the simple call (vectorization type). */ + +void +get_references_in_gimple_call (gimple *stmt, vector &references) +{ + if (gimple_code (stmt) != GIMPLE_CALL) + return; + + if (gimple_call_internal_p (stmt)) + { + bool read_p = false; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_MASK_GATHER_LOAD: + case IFN_MASK_LOAD: + { + if (gimple_call_lhs (stmt) == NULL_TREE) + return; + read_p = true; + // FALLTHRU + } + case IFN_MASK_STORE: + { + /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; + vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); + + _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; + .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); + + _1 = (sizetype) a_2(D); + vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, + { 0.0, ... }, loop_mask_5); + */ + tree op1 = gimple_call_arg (stmt, 0); + if (TREE_CODE (op1) != SSA_NAME) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "get_references_in_gimple_call: "); + fprintf (dump_file, "find base that not ssa_name: "); + print_generic_expr (dump_file, op1, TDF_LINENO); + fprintf (dump_file, "\n"); + } + return; + } + gimple *op1_def = SSA_NAME_DEF_STMT (op1); + if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN) + { + /* &MEM[base: xx] */ + tree rhs1 = gimple_assign_rhs1 (op1_def); + /* If the definition stmt of the operation is memory + reference type, read it directly. */ + if (TREE_CODE (rhs1) == ADDR_EXPR + && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF) + op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */ + } + + add_ref (references, op1, stmt, true, read_p); + return; + } + default: + return; + } + } +} + +/* Stores the locations of memory references in STMT to REFERENCES. */ + +void +get_references_in_stmt (gimple *stmt, vector &references) +{ + if (!gimple_vuse (stmt)) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "gimple_vuse: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + tree op0 = gimple_assign_lhs (stmt); + tree op1 = gimple_assign_rhs1 (stmt); + tree base = NULL_TREE; + + /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */ + if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1)) + && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base)) + add_ref (references, op1, stmt, false, true); + + if (REFERENCE_CLASS_P (op0) && get_base_address (op0)) + add_ref (references, op0, stmt, false, false); + } + else if (gimple_code (stmt) == GIMPLE_CALL) + get_references_in_gimple_call (stmt, references); + + return; +} + +/* flag of loop filter out. 
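+   Records why a candidate loop is rejected: it references statements or +   nodes located in a different source file, makes a call that is not an +   internal function, appears to be macro-generated, or uses +   conditional-select expressions (VEC_COND_EXPR/MIN_EXPR/MAX_EXPR).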
*/ + +struct loop_filter_out_flag +{ + /* Use external gimple. */ + bool use_ext_gimple; + + /* Use external call. */ + bool use_ext_call; + + /* Use external node. */ + bool use_ext_node; + + /* Use loop defined in macros. */ + bool use_macro_loop; + + /* Use external node. */ + bool use_cond_func; +}; + +/* Check whether an external node is used. */ + +bool use_ext_node_p (const vector &references, + unsigned int &start) +{ + expanded_location cfun_xloc + = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + + unsigned i = start; + start = references.size (); + for (; i < references.size (); i++) + { + data_ref ref = references[i]; + expanded_location xloc = expand_location (ref.stmt->location); + if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "use_ext_node\n\n"); + return true; + } + } + return false; +} + +/* Determine whether to filter out loops by stmt. */ + +bool +filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + const vector &references, unsigned int &start) +{ + /* check use_ext_gimple. */ + expanded_location cfun_xloc + = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + expanded_location xloc = expand_location (stmt->location); + if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "use_ext_gimple: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + loop_filter.use_ext_gimple = true; + return true; + } + + /* check use_ext_call. */ + if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "use_ext_call: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + loop_filter.use_ext_call = true; + return true; + } + + /* check use_macro_loop. */ + if (xloc.file && xloc.column != 1) + loop_filter.use_macro_loop = false; + + /* checke use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); + if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR + || rhs_code == MAX_EXPR) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "use_cond_func: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + loop_filter.use_cond_func = true; + return true; + } + } + + /* check use_ext_node. */ + if (use_ext_node_p (references, start)) + { + loop_filter.use_ext_node = true; + return true; + } + + return false; +} + +/* Dump the flag type of the loop is filtered out. */ + +void +dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) +{ + if (loop_filter.use_ext_gimple) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_ext_gimple\n"); + } + if (loop_filter.use_ext_call) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_ext_call\n"); + } + + if (loop_filter.use_ext_node) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_ext_node\n"); + } + + if (loop_filter.use_macro_loop) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); + } + + if (loop_filter.use_cond_func) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_cond_func\n"); + } +} + +/* Get references in loop. 
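+   Walk every basic block that belongs directly to LOOP, collect the +   memory references of its statements, and stop as soon as one of the +   filtering conditions fires.  Returns true iff no filter was triggered.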
*/ + +bool +get_references_in_loop (vector &references, + loop_filter_out_flag &loop_filter, + class loop *loop) +{ + unsigned int start = 0; + bool filter_out_loop = true; + + /* Analyze each bb in the loop. */ + basic_block *body = get_loop_body_in_dom_order (loop); + for (unsigned i = 0; i < loop->num_nodes; i++) + { + basic_block bb = body[i]; + if (bb->loop_father != loop) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); + gimple_dump_bb (dump_file, bb, 0, dump_flags); + fprintf (dump_file, "\n"); + } + + gimple_stmt_iterator bsi; + for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) + { + gimple *stmt = gsi_stmt (bsi); + get_references_in_stmt (stmt, references); + filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, + references, start); + if (filter_out_loop) + break; + } + if (filter_out_loop) + break; + } + free (body); + return !filter_out_loop; +} + +/* Determine whether the loop is a single path. */ + +bool +single_path_p (class loop *loop, basic_block bb) +{ + if (bb == NULL) + return false; + if (bb == loop->latch) + return true; + + gimple *stmt = last_stmt (bb); + bool res = false; + + if (stmt && gimple_code (stmt) == GIMPLE_COND) + { + gcc_assert (EDGE_COUNT (bb->succs) == 2); + edge true_edge = NULL; + edge false_edge = NULL; + extract_true_false_edges_from_block (bb, &true_edge, &false_edge); + + /* Returns false, if a branch occurs. */ + if (true_edge->dest->loop_father == loop + && false_edge->dest->loop_father == loop) + return false; + + if (true_edge->dest->loop_father == loop) + res = single_path_p (loop, true_edge->dest); + else + res = single_path_p (loop, false_edge->dest); + } + else + { + edge e = find_fallthru_edge (bb->succs); + if (e) + res = single_path_p (loop, e->dest); + } + return res; +} + +/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. + Assume that the HPC data reading and calculation process does not involve + adding branches in loops. Therefore, all bbs of loops are directly used for + calculation (excluding embedded loops) without considering branch weighting. +*/ + +unsigned +estimate_loop_insns (class loop *loop, eni_weights *weights) +{ + basic_block *body = get_loop_body (loop); + gimple_stmt_iterator gsi; + unsigned size = 0, i; + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = body[i]; + if (bb->loop_father != loop) + { + continue; + } + for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) + size += estimate_num_insns (gsi_stmt (gsi), weights); + } + free (body); + + return size; +} + +/* Check whether the memory access is dense. */ + +bool +dense_memory_p (const vector &references, class loop *loop) +{ + int ref_count = references.size (); + unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); + float mem_to_insn_ratio = (float)ref_count / (float)ninsns; + + /* The number of cores to be run and DDR bandwidth information can be + transferred to flexibly adjust the threshold. */ + bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) + && ref_count >= param_mem_access_num); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); + + /* Dump dense memory source code location. 
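+	 The location is emitted as +	 [file:function(start_line-end_line):line:column] so that a dense +	 kernel can be mapped back to its source position.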
*/ + if (ref_count && references[0].stmt->location) + { + expanded_location xloc = expand_location + (references[0].stmt->location); + int fn_start = 0; + if (DECL_SOURCE_LOCATION (current_function_decl)) + fn_start = expand_location ( + DECL_SOURCE_LOCATION (current_function_decl)).line; + int fn_end = fn_start; + if (cfun->function_end_locus) + fn_end = expand_location (cfun->function_end_locus).line; + if (xloc.file) + fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", + xloc.file, fn_name, fn_start, fn_end, + xloc.line, xloc.column); + } + + /* Dump memory dense information. */ + if (dense_mem) + fprintf (dump_file, "dense memory access: "); + else + fprintf (dump_file, "non-dense mem access: "); + fprintf (dump_file, + "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", + ref_count, ninsns, mem_to_insn_ratio); + } + + return dense_mem; +} + +/* Analyze the inner loop and get the loop with dense memory access. */ + +bool +get_dense_memory_kernels (vector &kernels, + map > &kernels_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); + class loop *loop = NULL; + FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) + { + number_of_latch_executions (loop); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n========== Processing loop %d: ==========\n", + loop->num); + loop_dump (dump_file, loop); + flow_loop_dump (loop, dump_file, NULL, 1); + fprintf (dump_file, "loop unroll: %d\n", loop->unroll); + } + + if (get_loop_exit_edges (loop).length () != 1 + || !single_path_p (loop, loop->header)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: loop_branching\n"); + continue; + } + + vector references; + loop_filter_out_flag loop_filter = {false, false, false, true, false}; + + if (!get_references_in_loop (references, loop_filter, loop)) + { + dump_loop_filter_out_flag (loop_filter); + continue; + } + + if (dense_memory_p (references, loop)) + { + kernels_refs[loop] = references; + kernels.push_back (loop); + } + } + return kernels.size () > 0; +} + +/* ================ phase 2 trace_data_refs_info ================ */ + +/* Determine whether the declaration is a non-vectorized. */ + +bool +generic_decl_p (tree expr) +{ + if (expr == NULL_TREE) + return false; + enum tree_code expr_code = TREE_CODE (expr); + if (expr_code != VAR_DECL && expr_code != PARM_DECL + && expr_code != COMPONENT_REF) + return false; + + tree type = TREE_TYPE (expr); + while (type) + { + if (TREE_CODE (type) != VECTOR_TYPE) + /* TREE_TYPE (NODE) ( + CONTAINS_STRUCT_CHECK (NODE, TS_TYPED)->typed.type) */ + type = CONTAINS_STRUCT_CHECK (type, TS_TYPED) ? TREE_TYPE (type) : NULL; + else + return false; + } + return true; +} + +/* Initial worklist preparation for source variable tracing. + Add different initial node based on different gimple statements. 
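+   For a PHI definition every incoming argument is queued; for an +   assignment only the operands that can contribute to the base address +   are queued (both operands of PLUS_EXPR/MINUS_EXPR, the first operand of +   POINTER_PLUS_EXPR, NEGATE_EXPR, NOP_EXPR, SSA_NAME and COMPONENT_REF).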
*/ + +void +add_worklist (vector &worklist, set &walked, gimple *def_stmt) +{ + if (gimple_code (def_stmt) == GIMPLE_PHI) + { + for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) + { + tree node = gimple_phi_arg_def (def_stmt, i); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + } + else if (is_gimple_assign (def_stmt)) + { + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR + || rhs_code == NOP_EXPR || rhs_code == SSA_NAME + || rhs_code == COMPONENT_REF) + { + tree node = gimple_assign_rhs1 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) + { + tree node = gimple_assign_rhs1 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + node = gimple_assign_rhs2 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + else + { + /* unhandled assign rhs_code: _219 = _17 * _70; + _17 = *grid_56(D).sst.span; + _70 = *grid_56(D).sst.dim[0].stride; + */ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unhandled assign rhs_code: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + } + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unsupported tracing stmt: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + } +} + + +/* Tracing source variables: + vectp.1 = a_2(D) + _3; + _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; + vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); + + _1 = (sizetype) b_2(D); + vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, + loop_mask_5); + ... + Due to previous pass optimizations, the current tracing method can find + several source variable candidates. We decide to record them in a map and + later filter out the true base variable by some criteria. +*/ + +void +trace_base_var_helper (tree arg, set &walked, + map& base_var_candid) +{ + if (arg == NULL) + return; + + /* Array type. */ + tree op0 = NULL; + if (TREE_CODE (arg) == ADDR_EXPR + && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "array type\n"); + base_var_candid[op0] += 1; + return; + } + + /* Pointer type. */ + if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "pointer type\n"); + base_var_candid[arg] += 1; + return; + } + + /* SSA_NAME type. 
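+     If the SSA name is tied to a declared pointer variable, record that +     variable as a candidate; otherwise keep tracing through its defining +     statement via the worklist.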
*/ + if (TREE_CODE (arg) != SSA_NAME) + return; + + tree tmp_var = SSA_NAME_VAR (arg); + if (tmp_var && generic_decl_p (tmp_var) + && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ssa pointer type\n"); + base_var_candid[tmp_var] += 1; + return; + } + + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + if (def_stmt == NULL) + return; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, "\t\t: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + + vector worklist; + add_worklist (worklist, walked, def_stmt); + for (unsigned i = 0; i < worklist.size (); ++i) + trace_base_var_helper (worklist[i], walked, base_var_candid); +} + +/* Identify the base variable traced from base address of memory reference. + We recognize that current method could detect several base variable + candidates and the temporary criteria for base variable determination + is that either one of the following statement is true: + 1. The number of base variable candidates is 1; + 2. The number of detected gimple statements for some variable is 1. + We may use other criteria or relax the current criteria + (e.g., criterion 2: 1 -> any odd number). */ + +bool +trace_base_var (tree &var, tree arg, set &walked) +{ + map base_var_candid; + trace_base_var_helper (arg, walked, base_var_candid); + bool is_tracing_unusual = false; + if (base_var_candid.size () == 1) + var = base_var_candid.begin ()->first; + else + { + is_tracing_unusual = true; + for (const pair& base_var_count : base_var_candid) + if (base_var_count.second == 1) + var = base_var_count.first; + } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Traced variables at "); + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, ":\n"); + for (const pair& base_var_count : base_var_candid) + fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first), + base_var_count.second); + fprintf (dump_file, "\n"); + + if (var == NULL_TREE) + fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); + else if (is_tracing_unusual && var != NULL_TREE) + fprintf (dump_file, "Tracing unusual number or occurrences of base " + "variables. Choose %s.\n", get_name (var)); + } + return var != NULL_TREE; +} + +/* Tracing direct memory reference information. */ + +bool +trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) +{ + if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF) + return false; + + /* Direct memory access, regardless of whether it is in vectorized form, + can be determined through TARGET_MEM_REF. */ + mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); + mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); + mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); + mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); + + set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; + + traced_ref_stmt.insert (mem_ref.stmt); + return true; +} + +/* Recursively trace and check whether the definition stmt of the + index operand is a recorded stmt in direct access tracing. + If true, it is an indirect access. 
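+   For example, in c[a[i]] the load a[i] is recorded while tracing direct +   accesses, so the index of c[...] resolves to a recorded statement and +   c[a[i]] is treated as an indirect access.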
*/ + +bool +trace_indirect_operand (tree arg, set &traced_ref_stmt) +{ + if (TREE_CODE (arg) != SSA_NAME) + return false; + + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + + if (traced_ref_stmt.count (def_stmt)) + return true; + + if (!def_stmt || !is_gimple_assign (def_stmt)) + return false; + + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array + type indirect memory access. Please check examples before function + trace_indirect_ptr and trace_indirect_array. */ + if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR + && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR + && rhs_code != ARRAY_REF) + return false; + + tree op = NULL_TREE; + ssa_op_iter iter; + FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) + { + if (trace_indirect_operand (op, traced_ref_stmt)) + return true; + } + return false; +} + +/* Trace the pointer of the indirect memory access: + 1) obtain the base address of the indirect memory access. + 2) ensure that the index has been traced in the direct memory access. + + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in + direct access + _4 = (long unsigned int) _1; + _5 = _4 * 8; + _6 = p(D) + _5; // get base + _7 = *_6; // start tracing +*/ + +bool +trace_indirect_ptr (tree &base, tree &index, tree arg, + set traced_ref_stmt) +{ + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + + if (!def_stmt || !is_gimple_assign (def_stmt)) + return false; + + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + if (rhs_code != POINTER_PLUS_EXPR) + return false; + + /* POINTER_PLUS_EXPR, The first operand is always a pointer/reference type. + The second operand is always an unsigned integer type compatible with + sizetype. */ + base = gimple_assign_rhs1 (def_stmt); + index = gimple_assign_rhs2 (def_stmt); + + return trace_indirect_operand (index, traced_ref_stmt); +} + +/* Trace the array of the indirect memory access: + 1) obtain the base address of the indirect memory access. + 2) ensure that the index has been traced in the direct memory access. + + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in + direct access + _4 = (integer(kind=8)) _1; + _5 = _4 + 135; + _6 = p[_5]; // start tracing +*/ + +bool +trace_indirect_array (tree &base, tree &index, + set traced_ref_stmt, tree ref) +{ + if (TREE_CODE (ref) != ARRAY_REF) + return false; + base = TREE_OPERAND (ref, 0); + index = TREE_OPERAND (ref, 1); + return trace_indirect_operand (index, traced_ref_stmt); +} + +/* Tracing indirect memory reference information. + Include tracing of base addresses and source variable. + _x(ssa name) -> a_2(base addr) -> a(src var) */ + +bool +trace_indirect_mem_ref (data_ref &mem_ref, + set &traced_ref_stmt) +{ + /* Processing of vectorization types. */ + if (mem_ref.vectorize_p) + { + tree op = gimple_call_arg (mem_ref.stmt, 1); + if (trace_indirect_operand (op, traced_ref_stmt)) + { + mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); + mem_ref.regular_p = false; + set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; + return true; + } + return false; + } + + /* Processing of non-vectorized types. 
*/ + tree op = NULL_TREE; + ssa_op_iter iter; + FOR_EACH_SSA_TREE_OPERAND (op, mem_ref.stmt, iter, SSA_OP_USE) + { + + /* Array type: + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; + _4 = c[_1]; + + Pointer type: + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; + _4 = (long unsigned int) _1; + _5 = _4 * 8; + _6 = p(D) + _5; + _7 = *_6; + */ + tree base = NULL_TREE; + tree index = NULL_TREE; + if (trace_indirect_array (base, index, traced_ref_stmt, mem_ref.ref) + || trace_indirect_ptr (base, index, op, traced_ref_stmt)) + { + /* ARRAY_REF, The first operand is the array; + the second is the index. */ + mem_ref.base = base; + mem_ref.index = index; + mem_ref.regular_p = false; + set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; + return true; + } + } + + return false; +} + +/* Trace references base info: + 1) Parallel analysis + 2) Memory access rule analysis + 3) Tracing base address and source variable of memory references + We will extend parallel analysis later. +*/ + +void +trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) +{ + enum tree_code ref_code = TREE_CODE (mem_ref.ref); + if (/* Vectorized and non-vectorized direct access. */ + ref_code != TARGET_MEM_REF + /* non-vectorized indirect memory access. */ + && ref_code != MEM_REF && ref_code != ARRAY_REF + /* vectorized indirect memory access. */ + && ref_code != SSA_NAME) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ref is another tree-code: "); + fprintf (dump_file, "stmt: "); + print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); + fprintf (dump_file, "ref: "); + print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); + fprintf (dump_file, "\n"); + } + return; + } + + /* 1) Direct and indirect access traces and traces source variables. */ + if (!trace_direct_mem_ref (mem_ref, traced_ref_stmt) + && !trace_indirect_mem_ref (mem_ref, traced_ref_stmt)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Tracing failed.\n\n"); + return; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Tracing succeeded.\n\n"); + mem_ref.trace_status_p = true; +} + +/* Tracing and sorting reference groups. */ + +void +trace_data_refs_info (vector &kernels, + map > &loop_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); + + set traced_ref_stmt; + + for (unsigned i = 0; i < kernels.size (); ++i) + { + class loop* loop = kernels[i]; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); + for (unsigned j = 0; j < loop_refs[loop].size (); ++j) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "trace_references_base_info %d:\n", j); + print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + trace_ref_info (loop_refs[loop][j], traced_ref_stmt); + } + } +} + +/* ================ phase 3 analyze_nested_kernels ================ */ + +/* Return the inner most type for arrays and pointers of TYPE. */ + +tree +inner_type (tree type) +{ + while (POINTER_TYPE_P (type) + || TREE_CODE (type) == ARRAY_TYPE) + type = TREE_TYPE (type); + return type; +} + +/* Check whether the input iv is the loop dimension boundary. 
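+   The candidate must be an integer SSA name defined by a two-argument PHI +   whose incoming values come one from the current loop and one from the +   immediately enclosing loop; the outer-loop value is returned through +   OUTER_LOOP_T.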
*/ + +bool +loop_bound_iv_p (tree t, tree &outer_loop_t) +{ + if (t == NULL || TREE_CODE (t) != SSA_NAME + || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) + return false; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); + if (gimple_code (def_stmt) != GIMPLE_PHI) + return false; + + /* Filter scenarios with only two phi inputs. */ + if (gimple_phi_num_args (def_stmt) != 2) + return false; + + gphi *phi_stmt = as_a (def_stmt); + basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src; + basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src; + + class loop *loop = loop_containing_stmt (def_stmt); + bool res = false; + /* Two phi inputs, one from the current loop and one from the outer loop. */ + if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop))) + { + outer_loop_t = gimple_phi_arg_def (def_stmt, 1); + res = true; + } + else if ((src1->loop_father == loop) + && (src0->loop_father == loop_outer (loop))) + { + outer_loop_t = gimple_phi_arg_def (def_stmt, 0); + res = true; + } + + if (res) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "===> "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + return true; + } + return false; +} + +/* add worklist and walked list. */ + +void +add_worklist_walked (vector &worklist, set &walked, tree node) +{ + if (!walked.count (node)) + { + worklist.push_back (node); + /* Avoid phi node cycle introduction, which makes the worklist unable + to end. */ + walked.insert (node); + } +} + +/* check bound iv and add worklist. */ + +void +check_bound_iv_and_add_worklist (vector &worklist, set &walked, + tree t, data_ref &mem_ref) +{ + if (TREE_CODE (t) != SSA_NAME) + return; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); + if (def_stmt == NULL) + return; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, t, TDF_SLIM); + fprintf (dump_file, "\t\t: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + + if (gimple_code (def_stmt) == GIMPLE_PHI) + { + tree out_loop_t = NULL_TREE; + if (loop_bound_iv_p (t, out_loop_t)) + { + mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); + add_worklist_walked (worklist, walked, out_loop_t); + } + } + else if (is_gimple_assign (def_stmt)) + { + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + + /* unary. */ + if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) + add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); + else if (rhs_code == POINTER_PLUS_EXPR) + add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); + + /* binary. */ + else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR + || rhs_code == MULT_EXPR) + { + add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); + add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); + } + } +} + +/* DFS trace the loop bound of iv. */ + +bool +trace_loop_bound_iv (data_ref &mem_ref) +{ + /* Indirect memory access, the size cannot be determined based on the loop + boundary. */ + if (!mem_ref.regular_p) + return false; + + /* Determine and record the boundary iv of the current index, + but do not trace it. */ + tree outer_loop_t = NULL_TREE; + if (loop_bound_iv_p (mem_ref.index, outer_loop_t)) + mem_ref.loop_bounds.push_back ( + loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); + + vector worklist; + worklist.push_back (mem_ref.base); + set walked; + + while (worklist.size ()) + { + tree t = worklist.back (); + worklist.pop_back (); + + /* add worklist. 
*/ + check_bound_iv_and_add_worklist (worklist, walked, t, mem_ref); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nmem_ref access dimension: %ld\n", + mem_ref.loop_bounds.size ()); + + return mem_ref.loop_bounds.size () > 0; +} + +/* dump loop bound. */ + +void +loop_bound_dump (FILE *file, loop_bound &lb) +{ + class loop *loop = lb.loop; + fprintf (file, "loop_bound: loop_%d (", loop->num); + if (loop->header) + fprintf (file, "header = %d", loop->header->index); + else + { + fprintf (file, "deleted)\n"); + return; + } + if (loop->latch) + fprintf (file, ", latch = %d", loop->latch->index); + fprintf (file, ", lb_niters = "); + print_generic_expr (file, lb.niters); + fprintf (file, ")\n"); +} + +/* static calculate data size. */ + +void +static_calculate_data_size (data_ref &mem_ref) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nstatic_calculate_data_size\n"); + + tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); + HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0; + for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) + { + HOST_WIDE_INT est_niter = tree_to_uhwi (mem_ref.loop_bounds[i].niters); + unsigned int unroll = mem_ref.loop_bounds[i].unroll; + if (i == 0) + { + /* The unit conversion between byte, kilobytes, and megabytes is + 1024. */ + mem_ref.data_size = double (type_size + * est_niter * unroll) / 1024 / 1024; + } + else + mem_ref.data_size *= est_niter * unroll; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size); + } +} + +/* Recursive tracing and creating of dominant nodes. */ + +tree +trace_and_create_dominate_expr (tree expr, class loop *outermost) +{ + if (expr == NULL_TREE || is_gimple_constant (expr)) + return expr; + + if (TREE_CODE (expr) != SSA_NAME) + return NULL_TREE; + + if (SSA_NAME_IS_DEFAULT_DEF (expr)) + return expr; + + gimple *stmt = SSA_NAME_DEF_STMT (expr); + basic_block def_bb = gimple_bb (stmt); + if (def_bb == NULL || def_bb->loop_father == NULL) + return NULL_TREE; + + if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb)) + return expr; + + if (gimple_code (stmt) != GIMPLE_ASSIGN) + return NULL_TREE; + + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); + tree_code_class code_class = TREE_CODE_CLASS (rhs_code); + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); + tree rhs1 = trace_and_create_dominate_expr + (gimple_assign_rhs1 (stmt), outermost); + if (rhs1 == NULL_TREE) + return NULL_TREE; + + if (code_class == tcc_unary) + { + tree expr_new = build1 (rhs_code, type, rhs1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + else if (code_class == tcc_binary) + { + tree rhs2 = trace_and_create_dominate_expr + (gimple_assign_rhs2 (stmt), outermost); + if (rhs2 == NULL_TREE) + return NULL_TREE; + + tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + + return NULL_TREE; +} + +/* Recursive parsing and craating of nodes in expr expressions. 
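+   SSA operands are rebuilt through trace_and_create_dominate_expr so that +   the resulting expression only uses values whose definitions dominate the +   header of the outermost recorded loop.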
*/ + +tree +parse_and_create_expr (tree expr, class loop *outermost) +{ + if (expr == NULL_TREE || expr == chrec_dont_know + || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR) + { + /* tcc_expression (e.g., &q) situation combined with tcc_unary. */ + if (TREE_CODE (expr) == ADDR_EXPR && dump_file + && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "tcc_expression case in ADDR_EXPR: "); + print_generic_expr (dump_file, expr, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr; + } + + if (TREE_CODE (expr) == SSA_NAME) + return trace_and_create_dominate_expr (expr, outermost); + else if (EXPR_P (expr)) + { + enum tree_code tree_code = TREE_CODE (expr); + tree_code_class code_class = TREE_CODE_CLASS (tree_code); + tree type = TREE_TYPE (expr); + tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost); + if (op1 == NULL_TREE) + return NULL_TREE; + + if (code_class == tcc_unary) + { + tree expr_new = build1 (tree_code, type, op1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + else if (code_class == tcc_binary) + { + tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost); + if (op2 == NULL_TREE) + return NULL_TREE; + + tree expr_new = fold_build2 (tree_code, type, op1, op2); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + } + return NULL_TREE; +} + +/* Trace and creat dominate loop bounds. */ + +void +trace_and_create_dominate_loop_bounds (data_ref &mem_ref) +{ + /* Check whether the niters is a loop dominant. + If not, trace and determine whether the result is dominant. If yes, create + the expr of the dominant node. + */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); + + /* Determine the relationship between the boundary of the innermost loop and + the dominant of the outer loop and the processing. */ + loop_bound &outermost = mem_ref.loop_bounds.back (); + for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) + { + loop_bound ¤t = mem_ref.loop_bounds[i]; + tree &niters = current.niters; + if (TREE_CODE (niters) == COND_EXPR) + niters = TREE_OPERAND (niters, 1); + + niters = parse_and_create_expr (niters, outermost.loop); + + if (niters == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); + fprintf (dump_file, "Tracing loop bound failed at dimension %d", + i); + } + mem_ref.calc_by = UNHANDLE_CALC; + break; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); + } +} + +/* trace the dimension and corresponding loop bounds of mem_ref. + This function is used to supplement the information of mem_ref.loop_bounds. +*/ + +void +trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) +{ + /* In the same loop, some memory access dimensions are different. Remove + variables with fewer dimensions. + Previous cyclic filtering conditions and memory access node records and + tracing. + The false result is also processed. + */ + if (dump_file) + fprintf (dump_file, "\ncalculate_data_size\n"); + + /* Trace the loop bound iv of ref to determine the dimension. */ + /* Record data from the loop perspective to avoid repeated tracing. 
*/ + if (!trace_loop_bound_iv (mem_ref)) + return; + + /* The traced mem_ref may have multiple dimensions, which corresponds to + multiple loops. */ + /* And in the dimension-by-dimensional analysis, the computable way is + continuously reduced. */ + mem_ref.calc_by = STATIC_CALC; + for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) + { + class loop *loop = mem_ref.loop_bounds[i].loop; + tree &niters = mem_ref.loop_bounds[i].niters; + + /* Set NULL_TREE to ensure that nb_iterations are retraced and + vec_nb_iterations are also extracted. */ + loop->nb_iterations = NULL_TREE; + niters = number_of_latch_executions (loop, false); + if (dump_file && (dump_flags & TDF_DETAILS)) + loop_dump (dump_file, loop); + + if (loop->unroll) + { + if (loop->unroll == USHRT_MAX && dump_file + && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); + mem_ref.loop_bounds[i].unroll = loop->unroll; + } + + if ((niters == chrec_dont_know) && loop->vec_nb_iterations + && (loop->vec_nb_iterations != chrec_dont_know)) + niters = loop->vec_nb_iterations; + if (dump_file && (dump_flags & TDF_DETAILS)) + loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); + + if (niters == NULL_TREE || niters == chrec_dont_know) + mem_ref.calc_by = min (mem_ref.calc_by, UNHANDLE_CALC); + else if (TREE_CODE (niters) != INTEGER_CST) + mem_ref.calc_by = min (mem_ref.calc_by, RUNTIME_CALC); + else + mem_ref.calc_by = min (mem_ref.calc_by, STATIC_CALC); + } + + if (mem_ref.calc_by == RUNTIME_CALC) + trace_and_create_dominate_loop_bounds (mem_ref); + else if (mem_ref.calc_by == STATIC_CALC) + static_calculate_data_size (mem_ref); +} + +/* analyze nested kernels. + 1. multidimension loop analyze. + 2. extended outer loop analyze. + Later we will extend outer loop analysis. +*/ + +bool +analyze_nested_kernels (vector &kernels, + map > &loop_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); + + for (unsigned i = 0; i < kernels.size (); ++i) + { + class loop* loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index); + for (unsigned j = 0; j < loop_refs[loop].size (); ++j) + { + if (loop_refs[loop][j].trace_status_p == false) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\ntrace_reference_dimension at mem_ref " + "index %d in loop %d:\n", j, loop->num); + print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]); + } + + } + return true; +} + +/* ================ phase 4 filter_and_sort_kernels ================ */ + +/* Get the edge probability information of each basic block in the loop. */ + +float +get_edge_prob (edge e, float minimum) +{ + float fvalue = 0; + + profile_probability probability = e->probability; + if (probability.initialized_p ()) + { + fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); + if (fvalue < minimum && probability.to_reg_br_prob_base ()) + fvalue = minimum; + } + return fvalue; +} + +/* Get the next bb with a high branch probability. */ + +basic_block +next_high_probability_bb (basic_block bb) +{ + if (bb == NULL) + return NULL; + + /* Limit the minimum probability value. 
*/ + const float MINNUM_PROB = 0.00001f; + float minimum = MINNUM_PROB; + + gimple *stmt = last_stmt (bb); + if (stmt && gimple_code (stmt) == GIMPLE_COND) + { + edge true_edge = NULL; + edge false_edge = NULL; + extract_true_false_edges_from_block (bb, &true_edge, &false_edge); + + float true_edge_prob = get_edge_prob (true_edge, minimum); + float false_edge_prob = get_edge_prob (false_edge, minimum); + /* If the content of the branch does not include the candidate + kernel, the branch probability may not be limited. */ + /* The edge_prob may have precision error during static prediction, + so we need to relax the limit before comparison. */ + if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) + && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) + return true_edge->dest; + else if ((false_edge_prob >= (param_branch_prob_threshold / 100.0) + - minimum) && flow_bb_inside_loop_p (bb->loop_father, + false_edge->dest)) + return false_edge->dest; + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "No high probability bb:"); + fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", + bb->index, true_edge_prob, false_edge_prob); + } + return NULL; + } + } + else + { + edge e = find_fallthru_edge (bb->succs); + if (e) + return e->dest; + } + return NULL; +} + + +/* Dump loop header bb. */ + +void +dump_loop_headers (const char *name, vector &loops) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n\n%s:\n", name); + fprintf (dump_file, "{ "); + for (unsigned int i = 0; i < loops.size (); i++) + fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); + fprintf (dump_file, "}\n\n"); + } +} + +/* Combine and sort candidate loops. */ + +bool +filter_and_sort_kernels (vector &sorted_kernels, + vector &kernels) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); + + set end_bb; + list walked_header_bb; /* Used to record nested loops. */ + + for (unsigned i = 0; i < kernels.size (); ++i) + end_bb.insert (kernels[i]->header); + + dump_loop_headers ("kernels", kernels); + + if (!param_filter_kernels) + { + for (vector::iterator it = kernels.begin (); + it != kernels.end (); ++it) + sorted_kernels.push_back (*it); + } + else + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); + + while (bb) + { + if (bb == NULL) + return false; + if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) + break; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d ", bb->index); + + /* bb is not the head of the loop, go to the next. */ + if (bb != bb->loop_father->header) + { + bb = next_high_probability_bb (bb); + continue; + } + + /* bb is the head of the loop. */ + if (bb != walked_header_bb.back ()) + { + if (end_bb.count (bb)) + { + sorted_kernels.push_back (bb->loop_father); + bb = single_exit (bb->loop_father)->dest; + continue; + } + if (loop_outer (bb->loop_father) != NULL + && get_loop_exit_edges (bb->loop_father).length () != 1) + return false; + walked_header_bb.push_back (bb); + bb = next_high_probability_bb (bb); + continue; + } + else + { + walked_header_bb.pop_back (); + bb = single_exit (bb->loop_father)->dest; + continue; + } + } + } + + dump_loop_headers ("sorted_kernels", sorted_kernels); + return true; +} + +/* ================ phase 5 record_and_sort_ref_groups ================ */ +/* Memory reference score, different aspects of one memory reference. */ + +struct ref_score +{ + /* certain memory reference. 
*/ + data_ref d_ref; + + /* local count for bb where memory reference is located. */ + gcov_type bb_count; + + /* line-location of memory reference. */ + int line; +}; + + +/* Memory reference group, different reference of the same variable. */ + +struct ref_group +{ + /* source variables. */ + tree var; + + /* variable size, Unit: MB. */ + double var_size; + + /* first ref for insert hint. */ + data_ref first_use; + + /* reuse scores of variables. */ + unsigned int reuse_level; + + /* method of calculating the var size. */ + calc_type calc_by; + + /* memory reference index for specific variable. */ + unsigned int mem_ref_index; + + /* Accessing Reference Records in Different Modes (key_index): + 000: write, random, non-parallel + 001: write, random, parallel + 010: write, regular, non-parallel + 011: write, regular, parallel + 100: read, random, non-parallel + 101: read, random, parallel + 110: read, regular, non-parallel + 111: read, regular, parallel + */ + map > ref_use; + + /* scores for different memory references. */ + vector ref_scores; + + ref_group () + { + var = NULL_TREE; + var_size = 0; + reuse_level = 0; + calc_by = UNHANDLE_CALC; + mem_ref_index = 0; + } +}; + +/* calculate reuse level. */ + +unsigned int +calculate_reuse_level (map > &var_use) +{ + unsigned int level = 0; + for (map >::iterator it = var_use.begin (); + it != var_use.end (); ++it) + { + unsigned int parallel = 1; + unsigned int regular = 1; + unsigned int cost = 1; + + if ((*it).second[0].parallel_p) + parallel = PARALLEL_NUM; + if (!(*it).second[0].regular_p) + regular = INDIRECT_ACCESS_VALUE; + if (!(*it).second[0].read_p) + cost = WRITE_COST; + + /* In serial reuse, we will later check whether they are in the + same cacheline. If yes, delete the reuse. For details, see the + reuse analysis of prefetching and eliminate redundancy. */ + unsigned int add = parallel * ((*it).second.size () * (cost + regular)); + level += add; + if (add && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d : %d * (%ld * (%d + %d)) = %d\n", + (*it).first, parallel, (*it).second.size (), cost, regular, add); + } + return level; +} + +/* Comparison of reference reuse level. */ + +bool +ref_group_reuse_cmp (const ref_group &a, const ref_group &b) +{ + return a.reuse_level > b.reuse_level; +} + +/* Sort reference groups. */ + +void +sort_ref_groups (vector &ref_groups, + map &ref_groups_map) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); + + for (map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { + (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use); + ref_groups.push_back ((*it).second); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); + fprintf (dump_file, " : %d\n", (*it).second.reuse_level); + } + } + + sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nsorted ref_groups:\n"); + fprintf (dump_file, "rank var (data_size, num_of_mem_ref, need_tmp_name):" + " reuse_level_score\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + fprintf (dump_file, "%d ", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + int need_tmp_name = !get_name (ref_groups[i].var) ? 
1 : 0; + fprintf (dump_file, " (%lf, %lu, %d)", ref_groups[i].var_size, + ref_groups[i].ref_scores.size (), need_tmp_name); + fprintf (dump_file, " : %d\n", ref_groups[i].reuse_level); + } + fprintf (dump_file, "\n"); + + fprintf (dump_file, "first_use:\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + fprintf (dump_file, "%d ", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + fprintf (dump_file, " : "); + if (!ref_groups[i].first_use.vectorize_p) + print_generic_expr (dump_file, ref_groups[i].first_use.ref, + TDF_SLIM); + else + print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, + TDF_SLIM); + fprintf (dump_file, "\n"); + } + fprintf (dump_file, "\n"); + } +} + +/* Attributes of variable data. */ + +enum data_attribute +{ + DA_PARALLEL = 0, + DA_REGULAR, + DA_READ +}; + +/* Record memory reference by use mode. + If the reference group is not found, create a group. */ + +void +record_mem_ref (map &ref_groups, data_ref &mem_ref) +{ + unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) + + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); + + if (!ref_groups.count (mem_ref.var)) + { + ref_group ref_group; + ref_group.var = mem_ref.var; + ref_group.first_use = mem_ref; + ref_groups[mem_ref.var] = ref_group; + } + + /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by. + Runtime issue requires the specified mem_ref's calc_by to be >= 1. + Temporarily modified ref_group's first_use after sorting mem_refs. */ + ref_groups[mem_ref.var].calc_by = max (ref_groups[mem_ref.var].calc_by, + mem_ref.calc_by); + ref_groups[mem_ref.var].var_size = max (ref_groups[mem_ref.var].var_size, + mem_ref.data_size); + ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); + + ref_score ref_level{ mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), + expand_location (mem_ref.stmt->location).line }; + ref_groups[mem_ref.var].ref_scores.push_back (ref_level); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "recorded in: "); + print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); + fprintf (dump_file, ":%d:%ld\n", index, + ref_groups[mem_ref.var].ref_use[index].size () - 1); + + fprintf (dump_file, "base: "); + print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); + + fprintf (dump_file, ", index: "); + print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); + + fprintf (dump_file, ", step: "); + if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) + fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, + int_cst_value (mem_ref.step)); + else + print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); + + fprintf (dump_file, ", offset: "); + if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) + fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, + int_cst_value (mem_ref.offset)); + else + print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); + fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); + + fprintf (dump_file, ", size: %lf", mem_ref.data_size); + fprintf (dump_file, "\n\n"); + } +} + +/* Rank data reference index level by the scheme of source code line number. */ + +bool +data_ref_reuse_cmp (const ref_score &a, const ref_score &b) +{ + return a.line < b.line; +} + +/* Sort data reference index level within one reference group in non-decreasing + order of the customized sorting scheme. 
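+   The current scheme orders references by source line number, so +   mem_ref_index 0 denotes the textually first reference of a variable; +   first_use and calc_by are then refreshed from that entry.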
*/ + +void +sort_mem_ref_in_ref_group (map &ref_groups_map) +{ + if (dump_file) + fprintf (dump_file, "\nsorted data_references:\n"); + for (map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { + vector &ref_scores = (*it).second.ref_scores; + stable_sort (ref_scores.begin (), ref_scores.end (), data_ref_reuse_cmp); + /* Update ref_group's first_use and calc_by with the first mem_ref after + sorting. */ + (*it).second.first_use = (*it).second.ref_scores[0].d_ref; + (*it).second.calc_by = (*it).second.first_use.calc_by; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).first, TDF_SLIM); + fprintf (dump_file, " : %lu\n", ref_scores.size ()); + for (unsigned int i = 0; i < ref_scores.size (); ++i) + { + fprintf (dump_file, "mem_ref_index %u: ", i); + print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, + TDF_LINENO); + } + fprintf (dump_file, "\n\n"); + } + } +} + +/* Tracing and sorting reference groups. */ + +bool +record_and_sort_ref_groups (vector &ref_groups, + vector &kernels, + map > &loop_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); + + map ref_groups_map; + + for (unsigned i = 0; i < kernels.size (); ++i) + { + class loop* loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); + for (unsigned j = 0; j < loop_refs[loop].size (); ++j) + { + if (loop_refs[loop][j].trace_status_p) + record_mem_ref (ref_groups_map, loop_refs[loop][j]); + } + } + + /* Sort mem_ref within ref_group by local count and update first_use's + data_ref, stable sort. */ + sort_mem_ref_in_ref_group (ref_groups_map); + sort_ref_groups (ref_groups, ref_groups_map); + + return ref_groups.size () > 0; +} + +/* ================ phase 6 issue_llc_hint ================ */ + +/* Issue vectorized mask prefetch gimple. */ + +void +issue_mask_prefetch (gimple *stmt) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert svprfd.\n"); + + /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); + .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); + */ + tree dataref_ptr = gimple_call_arg (stmt, 0); + tree scale = gimple_call_arg (stmt, 1); + tree final_mask = gimple_call_arg (stmt, 2); + tree target = NULL_TREE; + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) + target = gimple_call_arg (stmt, 3); + else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) + target = gimple_call_lhs (stmt); + /* 4: PLDL3KEEP. */ + tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); + + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + /* target: vector_type - XXX_type. */ + if (target == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } + HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + + gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, + 5, addr, scale, final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Issue vectorized mask gather prefetch gimple. 
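+   A .MASK_GATHER_PREFETCH call with prfop 4 (PLDL3KEEP) is inserted right +   after the gather load, with the base pointer advanced by +   param_prefetch_offset elements of the loaded vector's element type.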
*/ + +void +issue_mask_gather_prefetch (gimple *stmt) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); + + /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, + loop_mask_4); */ + tree dataref_ptr = gimple_call_arg (stmt, 0); + tree vec_offset = gimple_call_arg (stmt, 1); + tree scale = gimple_call_arg (stmt, 2); + tree zero = gimple_call_arg (stmt, 3); + tree final_mask = gimple_call_arg (stmt, 4); + tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); + tree target = gimple_call_lhs (stmt); + + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (target == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } + HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + + gcall *call = gimple_build_call_internal + (IFN_MASK_GATHER_PREFETCH, 7, addr, + vec_offset, scale, zero, final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Issue builtin prefetch gimple. */ + +void +issue_builtin_prefetch (data_ref &mem_ref) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert prfm.\n"); + /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ + gimple* stmt = mem_ref.stmt; + tree dataref_ptr = mem_ref.base; + tree data_idx = mem_ref.index; + tree scale = mem_ref.step; + tree offset = mem_ref.offset; + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (scale == NULL_TREE) + { + /* _190 = (void *) ivtmp.444_221; + Cannot detect size unit at (void *). */ + scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); + if (scale == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " + "variable. Stop builtin_prefetch.\n\n"); + return; + } + } + + data_idx = data_idx ? data_idx : size_zero_node; + data_idx = build1 (NOP_EXPR, TREE_TYPE (scale), data_idx); + tree displacement = fold_build2 (MULT_EXPR, TREE_TYPE (scale), data_idx, + scale); + if (offset != NULL_TREE && TREE_CODE (offset) != TREE_CODE (size_zero_node)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: offset's TREE_TYPE is not integer_cst: " + "%s\nStop builtin_prefetch.\n", + get_tree_code_name (TREE_CODE (offset))); + return; + } + offset = offset ? 
offset : size_zero_node; + offset = build1 (NOP_EXPR, TREE_TYPE (scale), offset); + dataref_ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), + dataref_ptr, offset); + tree addr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), + dataref_ptr, displacement); + HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi (scale); + + addr = fold_build_pointer_plus_hwi (addr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + /* __builtin_prefetch (_68, 0, 1); + 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality + (high means strong locality) */ + gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), + 3, addr, integer_zero_node, integer_one_node); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Retrieve memory reference at the specific index. */ + +data_ref +get_data_ref_at_idx (ref_group &var_ref_group) +{ + unsigned int mem_ref_size = static_cast( + var_ref_group.ref_scores.size ()); + if (strlen (param_mem_ref_index) == 0) + return var_ref_group.first_use; + else + { + /* Insert prefetch hint at highly-likely-used location with the given + index. */ + if (var_ref_group.mem_ref_index >= mem_ref_size) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The target data_ref index is out " + "of range. Use top index instead!\n"); + return var_ref_group.ref_scores[0].d_ref; + } + return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref; + } +} + +/* Static form insertion and issue instruction. We may check the + determination of the ARM SVE architecture before SVE hint insertion. */ + +void +static_issue (vector &ref_groups, int num_issue_var) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static issue\n"); + + for (int i = 0; i < num_issue_var; ++i) + { + data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]); + if (mem_ref.vectorize_p) + { + enum internal_fn ifn_code = gimple_call_internal_fn + (mem_ref.stmt); + if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) + issue_mask_prefetch (mem_ref.stmt); + else if (ifn_code == IFN_MASK_GATHER_LOAD) + issue_mask_gather_prefetch (mem_ref.stmt); + else + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "other vectorized internal function\n"); + } + else + issue_builtin_prefetch (mem_ref); + } +} + +/* Generate the stmts for calculating the size. Later we will consider nested + multi-branches scenarios and check more information of niters when it is + a COND_EXPR. */ + +tree +calc_stmts_gen (vector &ref_groups, gimple_seq &cond_expr_stmt_list, + int num_issue_var) +{ + /* Accumulated keep size. */ + tree total_size = build_real_from_int_cst + (double_type_node, integer_zero_node); + for (int i = 0; i < num_issue_var; ++i) + { + data_ref &mem_ref = ref_groups[i].first_use; + tree var = mem_ref.var; + for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) + { + tree niters = mem_ref.loop_bounds[j].niters; + + /* COND_EXPR. */ + if (TREE_CODE (niters) == COND_EXPR) + niters = TREE_OPERAND (niters, 1); + tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); + /* _190 = (void *) ivtmp.444_221; + Cannot detect size unit at (void *). 
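+	 In that case a unit of one byte is assumed below.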
+          if (unit == NULL_TREE)
+            {
+              if (dump_file && (dump_flags & TDF_DETAILS))
+                {
+                  fprintf (dump_file, "WARNING: Cannot detect size unit "
+                           "(use 1 byte) for variable %s: ", get_name (var));
+                  print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
+                  fprintf (dump_file, "\n");
+                }
+              unit = size_one_node;
+            }
+          unit = build1 (NOP_EXPR, TREE_TYPE (niters), unit);
+          tree size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters,
+                                   unit);
+          size = build1 (FLOAT_EXPR, double_type_node, size);
+          total_size = fold_build2
+            (PLUS_EXPR, double_type_node, total_size, size);
+        }
+    }
+  /* Create a stmt list for the size calculation.  */
+  tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024);
+  div = build1 (NOP_EXPR, double_type_node, div);
+  total_size = fold_build2 (RDIV_EXPR, double_type_node, total_size, div);
+
+  tree threshold = build_int_cst (TREE_TYPE (integer_zero_node),
+                                  param_llc_capacity_per_core / 2);
+  threshold = build_real_from_int_cst (double_type_node, threshold);
+  tree cond_expr = fold_build2
+    (LE_EXPR, boolean_type_node, total_size, threshold);
+
+  /* Convert cond_expr to a stmt list.  */
+  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
+    &cond_expr_stmt_list, is_gimple_condexpr, NULL_TREE);
+  return cond_expr;
+}
+
+/* Runtime-form insertion and instruction issue.  */
+
+void
+runtime_issue (vector<ref_group> &ref_groups, int num_issue_var)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "runtime issue\n");
+
+  if (ref_groups.size () == 0)
+    return;
+  data_ref &mem_ref = ref_groups[0].first_use;
+  class loop *loop = mem_ref.loop_bounds.back ().loop;
+  /* Ensure that the variables are in the same loop.  */
+  for (int i = 1; i < num_issue_var; ++i)
+    {
+      data_ref &mem_ref = ref_groups[i].first_use;
+      if (loop != mem_ref.loop_bounds.back ().loop)
+        {
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "topn var are not in the same loop\n");
+          return;
+        }
+    }
+  if (loop == NULL)
+    return;
+
+  /* If the exit edge points to a bb with multiple predecessors, split the
+     exit edge and create a new bb, so that the exit edge points to a bb with
+     a single predecessor.  */
+  edge e = single_exit (loop);
+  if (e == NULL)
+    return;
+  if (!single_pred_p (e->dest))
+    {
+      split_loop_exit_edge (e, true);
+      if (dump_enabled_p ())
+        dump_printf (MSG_NOTE, "split exit edge\n");
+    }
+
+  gimple_seq cond_expr_stmt_list = NULL;
+  tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list,
+                                   num_issue_var);
+
+  /* Use the previous cond and generate a new branch and a copy of the
+     loop.  */
+  basic_block condition_bb = NULL;
+  profile_probability prob = profile_probability::likely ();
+  initialize_original_copy_tables ();
+  class loop *nloop = loop_version (loop, cond_expr, &condition_bb,
+    prob, prob.invert (), prob, prob.invert (), true);
+  free_original_copy_tables ();
+
+  /* Insert the generated stmt list before cond_expr.  */
+  gimple_stmt_iterator cond_exp_gsi;
+  if (cond_expr_stmt_list)
+    {
+      cond_exp_gsi = gsi_last_bb (condition_bb);
+      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
+                             GSI_SAME_STMT);
+    }
+  update_ssa (TODO_update_ssa);
+
+  /* Perform hint issue for branches that meet the conditions.  */
+  static_issue (ref_groups, num_issue_var);
+}
+
+/* Issue llc hints through prefetch instructions.  */
+
+void
+issue_llc_hint (vector<ref_group> &ref_groups)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "issue_llc_hint:\n");
+
+  /* 1. If the issue-topn and force-issue options are given, the top N vars
+        are forcibly allocated and no runtime branch is generated.
+     2. If the issue-topn option is given and the size of the top N vars is
+        statically known, the top N vars are statically allocated and no
+        runtime branch is generated.
+     3. If the issue-topn option is given and the size of the top N vars is
+        statically unknown but can be determined at run time, the top N vars
+        are dynamically allocated and runtime branches are generated (this
+        also depends on the screening of the innermost variable boundary
+        type).
+     4. If the size cannot be determined even at run time, e.g., for indirect
+        accesses, the optimization is skipped.  */
+  if (ref_groups.size () == 0)
+    return;
+
+  int num_issue_var = min (param_issue_topn,
+                           static_cast<int> (ref_groups.size ()));
+  if (num_issue_var < param_issue_topn
+      && dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) "
+               "ref_group(s) is found for llc hint.\n",
+               num_issue_var, param_issue_topn);
+    }
+  if (param_force_issue)
+    {
+      if (strlen (param_target_variables) > 0)
+        static_issue (ref_groups, static_cast<int> (ref_groups.size ()));
+      else
+        static_issue (ref_groups, num_issue_var);
+      return;
+    }
+  calc_type topn_calc_type = STATIC_CALC;
+  for (int i = 0; i < num_issue_var; ++i)
+    topn_calc_type = min (topn_calc_type, ref_groups[i].calc_by);
+
+  if (topn_calc_type == STATIC_CALC)
+    {
+      /* Before a static issue, we still need to collect the data size of all
+         target variables and compare the sum with the LLC cache size.  */
+      double prefetch_data_size = 0.;
+      for (int i = 0; i < num_issue_var; ++i)
+        prefetch_data_size += ref_groups[i].var_size;
+      if (prefetch_data_size <= (double) param_llc_capacity_per_core * 0.8)
+        static_issue (ref_groups, num_issue_var);
+      else
+        if (dump_file && (dump_flags & TDF_DETAILS))
+          fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
+                   "size: %lf > %lf.\n", prefetch_data_size,
+                   (double) param_llc_capacity_per_core * 0.8);
+    }
+  else if (topn_calc_type == RUNTIME_CALC)
+    runtime_issue (ref_groups, num_issue_var);
+  else
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "unhandled issue scene\n");
+    }
+}
+
+/* ==================== phase entry ==================== */
+/* Check whether a string can be converted to an unsigned integer.  */
+
+bool is_unsigned_int (const string &s)
+{
+  if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN)
+    return false;
+
+  for (unsigned int i = 0; i < s.size (); ++i)
+    {
+      if (s[i] < '0' || s[i] > '9')
+        return false;
+    }
+  return true;
+}
+
+/* Parse a substring separated by a comma.  If the substring is valid and
+   non-empty, store it as a parsed element.  */
+
+bool
+parse_string_helper (const string &substr, vector<string>& str_elts,
+                     bool check_unsigned, size_t start, size_t end)
+{
+  if (substr == "" && dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "WARNING: The input string from %lu to %lu is "
+             "empty.\n", start, end);
+  else if (check_unsigned && !is_unsigned_int (substr))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "ERROR: not an unsigned integer: %s\n",
+                 substr.c_str ());
+      str_elts.clear ();
+      return false;
+    }
+  else
+    str_elts.push_back (substr);
+  return true;
+}
+
+/* Parse a user input string, separated by commas.  */
+
+void
+parse_string (const string &s, vector<string>& str_elts,
+              bool check_unsigned = false)
+{
+  string delim = ",";
+  size_t start = 0;
+  size_t end = s.find (delim);
+  string substr = s.substr (start, end - start);
+  while (end != string::npos)
+    {
+      if (!parse_string_helper (substr, str_elts, check_unsigned, start, end))
+        return;
+      start = end + delim.size ();
+      end = s.find (delim, start);
+      substr = s.substr (start, end - start);
+    }
+  parse_string_helper (substr, str_elts, check_unsigned, start, end);
+}
+
+/* Parse the user input of target variables and memory indices and create a
+   map that assigns a target variable to a memory index.  */
+
+void
+parse_param_inputs (map<string, unsigned int> &var2mem_idx)
+{
+  /* The user input length should be limited.  */
+  if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN
+       || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN)
+      && dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "INVALID INPUT: The user inputs for target variables "
+             "and/or memory reference indices are too long for parsing.\n");
+
+  vector<string> var_names;
+  string target_variables = param_target_variables;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Start parsing target variables:\n");
+  if (param_use_ref_group_index)
+    parse_string (target_variables, var_names, true);
+  else
+    parse_string (target_variables, var_names, false);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Finish parsing target variables.\n\n");
+
+  vector<string> var_mem_indices;
+  string mem_indices = param_mem_ref_index;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Start parsing memory reference indices:\n");
+  parse_string (mem_indices, var_mem_indices, true);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Finish parsing memory reference indices.\n\n");
+
+  /* Construct a map of var_name: var_mem_index.  */
+  if (var_names.size () > 0)
+    {
+      if (var_mem_indices.size () < var_names.size ())
+        {
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "WARNING: The number of provided memory "
+                     "reference indices is less than that of target "
+                     "variables.\nUse the top index for all variables "
+                     "instead.\n");
+          for (string& var_name : var_names)
+            var2mem_idx[var_name] = 0;
+        }
+      else
+        {
+          if (var_mem_indices.size () > var_names.size ()
+              && dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "WARNING: The number of target variables is "
+                     "less than that of memory reference indices.\n");
+          for (unsigned int i = 0; i < var_names.size (); ++i)
+            {
+              var2mem_idx[var_names[i]] = static_cast<unsigned int> (
+                atoi (var_mem_indices[i].c_str ()));
+            }
+        }
+    }
+}
+
+/* Filter reference groups by only selecting target variables from the user
+   input.  There are two options for prefetching target variables:
+   1. Specify the variable name parsed by the pass, which you can double-check
+      in the "sorted ref_groups" section of the dump file.
+   2. Specify the variable rank shown in the "sorted ref_groups" section of
+      the dump file.  */
+
+void
+prefetch_variables (const vector<ref_group>& ref_groups,
+                    vector<ref_group>& reduced_ref_groups)
+{
+  map<unsigned int, unsigned int> ref_group2mem_idx;
+
+  map<string, unsigned int> var2mem_idx;  /* externally defined.  */
+  parse_param_inputs (var2mem_idx);
+
+  if (param_use_ref_group_index)
+    {
+      /* Use the ref_group index in the "sorted ref_groups" section to specify
+         the variable.  */
+      /* Collect the variables in "reduced_ref_group" only if their indices
+         show up in the "sorted ref_groups" section.  */
+      for (const pair<const string, unsigned int> &var_mem_idx : var2mem_idx)
+        {
+          unsigned int var_idx = static_cast<unsigned int> (atoi (
+            var_mem_idx.first.c_str ()));
+          if (var_idx < ref_groups.size ())
+            ref_group2mem_idx[var_idx] = var_mem_idx.second;
+          else if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "WARNING: The index \"%u\" does not show "
+                     "up in the ref_groups.\n", var_idx);
+        }
+    }
+  else
+    {
+      /* Use the variable name shown in the "sorted ref_groups" section to
+         specify the variable:
+         var2ref_group_idx + var2mem_idx -> ref_group2mem_idx.  */
+      /* Create a map that assigns the variable name to its corresponding
+         ref_group index.  */
+      map<string, unsigned int> var2ref_group_idx;  /* internally detected.  */
+      for (unsigned int i = 0; i < ref_groups.size (); ++i)
+        {
+          const ref_group &curr_ref_group = ref_groups[i];
+          const int UINT_MAX_DIGIT = 11;
+          /* Unrecognizable variable name related to the ref_group.  */
+          if (!get_name (curr_ref_group.var))
+            {
+              /* If the variable name does not have a string representation,
+                 we can rename it to "tmp_var_" + ref_group index.  */
+              char group_idx[UINT_MAX_DIGIT];
+              sprintf (group_idx, "%u", i);
+              string tmp_var_name = "tmp_var_" + std::string (group_idx);
+              if (dump_file && (dump_flags & TDF_DETAILS))
+                {
+                  fprintf (dump_file, "Unrecognizable variable name at "
+                           "ref_group index %u.\nThe tree expression for "
+                           "variable is: ", i);
+                  print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
+                  fprintf (dump_file, "\n");
+                }
+              var2ref_group_idx[tmp_var_name] = i;
+            }
+          else
+            var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i;
+        }
+      /* Collect the variables in "reduced_ref_group" only if they show up in
+         the ref_groups.  */
+      for (const pair<const string, unsigned int> &var_mem_idx : var2mem_idx)
+        {
+          if (var2ref_group_idx.count (var_mem_idx.first))
+            {
+              unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first];
+              ref_group2mem_idx[ref_group_idx] = var_mem_idx.second;
+            }
+          else if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "WARNING: Target variable \" %s \" does "
+                     "not show up in the ref_groups. Check whether it needs "
+                     "temporary variable name.\n",
+                     var_mem_idx.first.c_str ());
+        }
+    }
+
+  for (const pair<const unsigned int, unsigned int> &ref_group_mem_idx :
+       ref_group2mem_idx)
+    {
+      ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first];
+      curr_ref_group.mem_ref_index = ref_group_mem_idx.second;
+      reduced_ref_groups.push_back (curr_ref_group);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "\nNOTICE: Prefetching target variable \" ");
+          print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
+          fprintf (dump_file, " \" at ref_group index %u and memory location "
+                   "index %u.\n", ref_group_mem_idx.first,
+                   ref_group_mem_idx.second);
+        }
+    }
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "\n\n");
+}
+
+
+/* The LLC intelligent allocation consists of 6 steps.  */
+
+void
+llc_allocate (void)
+{
+  map > kernels_refs;
+  vector kernels;
+  if (!get_dense_memory_kernels (kernels, kernels_refs))
+    return;
+
+  trace_data_refs_info (kernels, kernels_refs);
+
+  if (!analyze_nested_kernels (kernels, kernels_refs))
+    return;
+
+  vector sorted_kernels;
+  if (!filter_and_sort_kernels (sorted_kernels, kernels))
+    return;
+
+  vector<ref_group> ref_groups;
+  if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs))
+    return;
+
+  if (strlen (param_target_variables) > 0)
+    {
+      /* If "param_target_variables" is not empty, we will issue the parsed
+         target variables compulsorily.  */
+      param_force_issue = true;
+      vector<ref_group> reduced_ref_groups;
+      prefetch_variables (ref_groups, reduced_ref_groups);
+      issue_llc_hint (reduced_ref_groups);
+    }
+  else
+    issue_llc_hint (ref_groups);
+}
+
+/* Check whether the function is an operator overloading function.  */
+
+bool
+operator_func_p (function *fn)
+{
+  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
+
+  if (fn_name && strncmp (fn_name, "operator", 8) == 0)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "operator_func: %s ", fn_name);
+
+      return true;
+    }
+  return false;
+}
+
+/* Check whether the function's file location is known.  */
+
+bool
+func_location_p (function *fn)
+{
+  expanded_location fn_decl_xloc
+    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
+  expanded_location fn_xloc
+    = expand_location (fn->function_start_locus);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "fn->function_start_locus = %d \n",
+               fn->function_start_locus);
+      fprintf (dump_file, "fn_xloc.file = %s \n",
+               fn_xloc.file ? fn_xloc.file : "NULL");
+      fprintf (dump_file, "fn_decl_xloc.file = %s \n",
+               fn_decl_xloc.file ? fn_decl_xloc.file : "NULL");
+      fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n",
+               LOCATION_FILE (input_location) ? LOCATION_FILE (input_location)
+                                              : "NULL");
+    }
+  if (fn_decl_xloc.file == NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "Function location unknown, skip analysis \n");
+      return false;
+    }
+  /* Newly generated functions are filtered out, such as constant propagation
+     clones func.constprop ().  */
+  if (LOCATION_FILE (input_location) != fn_decl_xloc.file)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "Function location non-local, skip analysis \n");
+      return false;
+    }
+  return true;
+}
+
+/* Dump function information.  */
+
+void
+dump_function_info (function *fn)
+{
+  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "\nfn_name: %s\n", fn_name);
+      expanded_location cfun_xloc
+        = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
+      if (cfun_xloc.line)
+        {
+          if (cfun_xloc.file)
+            fprintf (dump_file, "[%s:%d:%d]\n",
+                     cfun_xloc.file, cfun_xloc.line, cfun_xloc.column);
+        }
+      fprintf (dump_file, "\n");
+      flow_loops_dump (dump_file, NULL, 1);
+      fprintf (dump_file, "\n");
+    }
+}
+
+/* Dump the parameters of the pass.  */
+
+void
+dump_param (void)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "LLC allocate parameters:\n");
+      fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size);
+      fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
+               param_l1_cache_size * 1024 / param_l1_cache_line_size,
+               param_l1_cache_size);
+      fprintf (dump_file, " L1 cache line size: %d\n",
+               param_l1_cache_line_size);
+      fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
+      fprintf (dump_file, " min mem_access_ratio: %d \n",
+               param_mem_access_ratio);
+      fprintf (dump_file, " min mem_access_num: %d \n",
+               param_mem_access_num);
+      fprintf (dump_file, "\n");
+    }
+}
+
+const pass_data pass_data_llc_allocate =
+{
+  GIMPLE_PASS,		/* type.  */
+  "llc_allocate",	/* name.  */
+  OPTGROUP_LOOP,	/* optinfo_flags.  */
+  TV_TREE_PREFETCH,	/* tv_id.  */
+  (PROP_cfg | PROP_ssa),	/* properties_required.  */
+  0,			/* properties_provided.  */
+  0,			/* properties_destroyed.  */
+  0,			/* todo_flags_start.  */
+  0,			/* todo_flags_finish.  */
+};
+
+class pass_llc_allocate : public gimple_opt_pass
+{
+public:
+  pass_llc_allocate (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_llc_allocate, ctxt)
+  {}
+
+  /* opt_pass methods.  */
+  virtual bool gate (function *)
+  {
+    return (optimize >= 2 && flag_llc_allocate > 0);
+  }
+  virtual unsigned int execute (function *);
+
+}; // class pass_llc_allocate
+
+unsigned int
+pass_llc_allocate::execute (function *fn)
+{
+  unsigned int ret = 0;
+
+  if (!targetm.have_prefetch ()
+      || targetm.vectorize.code_for_prefetch == NULL
+      || targetm.vectorize.prefetch_handleable_mode_p == NULL
+      || targetm.vectorize.code_for_gather_prefetch == NULL)
+    return 0;
+
+  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
+    {
+      tree type = build_function_type_list (void_type_node,
+                                            const_ptr_type_node, NULL_TREE);
+      tree decl = add_builtin_function ("__builtin_prefetch", type,
+                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
+                                        NULL, NULL_TREE);
+      DECL_IS_NOVOPS (decl) = true;
+      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
+    }
+
+  dump_param ();
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "llc_allocate: %s\n",
+             IDENTIFIER_POINTER (DECL_NAME (fn->decl)));
+
+  if (number_of_loops (fn) <= 1 || !func_location_p (fn)
+      || operator_func_p (fn))
+    return ret;
+
+  dump_function_info (fn);
+
+  llc_allocate ();
+
+  return ret;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_llc_allocate (gcc::context *ctxt)
+{
+  return new pass_llc_allocate (ctxt);
+}
diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
index 7775bc727..c500d5e20 100644
--- a/gcc/tree-ssa-loop-niter.c
+++ b/gcc/tree-ssa-loop-niter.c
@@ -2384,6 +2384,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit)
   return true;
 }
 
+/* Returns whether the number of vectorized iterations for the loop can be
+   estimated from the given IR and updates the corresponding loop attribute,
+   e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });  */
+
+bool
+number_of_iterations_vect (class loop *loop, tree lhs, tree rhs)
+{
+  loop->vec_nb_iterations = chrec_dont_know;
+
+  if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME)
+      || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME))
+    return false;
+
+  tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs;
+  gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
+
+  if (gimple_code (def_stmt) != GIMPLE_CALL
+      || !gimple_call_internal_p (def_stmt))
+    return false;
+
+  internal_fn ifn = gimple_call_internal_fn (def_stmt);
+  if (ifn != IFN_WHILE_ULT)
+    return false;
+
+  gcall *call = dyn_cast <gcall *> (def_stmt);
+  tree niters = gimple_call_arg (call, 1);
+  loop->vec_nb_iterations = niters;
+
+  return true;
+}
+
 /* Stores description of number of iterations of LOOP derived from EXIT
    (an exit edge of the LOOP) in NITER.  Returns true if some useful
    information could be derived (and fields of NITER have meaning described
@@ -2454,6 +2485,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
   op1 = gimple_cond_rhs (stmt);
   type = TREE_TYPE (op0);
 
+  if (TREE_CODE (type) == VECTOR_TYPE)
+    number_of_iterations_vect (loop, op0, op1);
+
   if (TREE_CODE (type) != INTEGER_TYPE
       && !POINTER_TYPE_P (type))
     return false;
@@ -2730,14 +2764,14 @@ bool
 number_of_iterations_exit (class loop *loop, edge exit,
                            class tree_niter_desc *niter,
                            bool warn, bool every_iteration,
-                           basic_block *body)
+                           basic_block *body, bool guarantee)
 {
   gcond *stmt;
   if (!number_of_iterations_exit_assumptions (loop, exit, niter,
                                               &stmt, every_iteration, body))
     return false;
 
-  if (integer_nonzerop (niter->assumptions))
+  if (integer_nonzerop (niter->assumptions) || guarantee == false)
     return true;
 
   if (warn && dump_enabled_p ())
diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
index eb8d15794..d38472e52 100644
--- a/gcc/tree-ssa-loop-niter.h
+++ b/gcc/tree-ssa-loop-niter.h
@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
 extern bool number_of_iterations_exit (class loop *, edge,
                                        class tree_niter_desc *niter, bool,
                                        bool every_iteration = true,
-                                       basic_block * = NULL);
+                                       basic_block * = NULL,
+                                       bool guarantee = true);
 extern bool number_of_iterations_exit_assumptions (class loop *, edge,
                                                    class tree_niter_desc *,
                                                    gcond **, bool = true,
-- 
2.33.0
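
For reference, the kind of kernel the pass is meant to pick up, and one way to exercise the new flag, is sketched below.  Only -fllc-allocate itself is defined by this patch and the -O2 requirement comes from the pass gate above; the dump-flag spelling, file name, and array size are illustrative assumptions rather than part of the patch.

/* llc-usage-sketch.c -- hypothetical example, not part of the patch.
   A dense streaming kernel: one loop whose memory traffic is dominated
   by a few large arrays, which is the pattern the LLC-allocation pass
   analyzes for prefetch hints.

   Possible invocation (the dump-flag spelling follows the usual GCC
   convention for a pass named "llc_allocate" and is an assumption):

     gcc -O2 -fllc-allocate -fdump-tree-llc_allocate-details \
         llc-usage-sketch.c -o llc-usage-sketch  */

#include <stdlib.h>

#define N (1 << 22)

double
sum_scaled (const double *a, const double *b, long n)
{
  double s = 0.0;
  for (long i = 0; i < n; i++)
    s += a[i] * b[i];  /* Candidate references for an LLC prefetch hint.  */
  return s;
}

int
main (void)
{
  double *a = (double *) calloc (N, sizeof (double));
  double *b = (double *) calloc (N, sizeof (double));
  if (!a || !b)
    return 1;
  double s = sum_scaled (a, b, N);
  free (a);
  free (b);
  return s != 0.0;
}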