From 599d6f94c11fd906cfbabbd7ba4e5e2e5642cac9 Mon Sep 17 00:00:00 2001 From: yzyssdd Date: Tue, 28 May 2024 10:43:20 +0800 Subject: [PATCH 2/2] Add prefetch level parameter to specify the last level cache. Add l4 inst and deja case --- gcc/builtins.c | 82 +++++++++++++++++++ gcc/builtins.def | 1 + gcc/config/aarch64/aarch64-protos.h | 6 +- gcc/config/aarch64/aarch64.md | 39 +++++++++ gcc/dce.c | 1 + gcc/hsa-gen.c | 4 +- gcc/ipa-pure-const.c | 1 + gcc/params.opt | 5 ++ gcc/print-rtl.c | 6 ++ gcc/rtl.def | 9 ++ gcc/rtl.h | 4 + gcc/rtlanal.c | 2 + gcc/sched-deps.c | 4 +- gcc/target-insns.def | 1 + .../llc-prefetch-full-pldl1keep.c | 15 ++++ .../llc-prefetch-full-pldl1strm.c | 15 ++++ .../llc-prefetch-full-pldl2keep.c | 15 ++++ .../llc-prefetch-full-pldl2strm.c | 15 ++++ .../llc-prefetch-full-pldl3keep.c | 15 ++++ .../llc-prefetch-full-pldl3strm.c | 15 ++++ .../llc-prefetch-full-pldl4keep.c | 15 ++++ .../llc-prefetch-full-pldl4strm.c | 15 ++++ .../llc-prefetch-full-pstl1keep.c | 15 ++++ .../llc-prefetch-full-pstl1strm.c | 15 ++++ .../llc-prefetch-full-pstl2keep.c | 15 ++++ .../llc-prefetch-full-pstl2strm.c | 15 ++++ .../llc-prefetch-full-pstl3keep.c | 15 ++++ .../llc-prefetch-full-pstl3strm.c | 15 ++++ .../llc-prefetch-full-pstl4keep.c | 15 ++++ .../llc-prefetch-full-pstl4strm.c | 15 ++++ gcc/tree-ssa-llc-allocate.c | 54 ++++++++++-- 31 files changed, 449 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c create mode 
100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c diff --git a/gcc/builtins.c b/gcc/builtins.c index 1b1c75cc1..ffbb2cae9 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -1463,6 +1463,85 @@ expand_builtin_prefetch (tree exp) emit_insn (op0); } +/* Expand a call to __builtin_prefetch_full (ADDR [, RW [, PRFOP]]). */ + +static void +expand_builtin_prefetch_full (tree exp) +{ + tree arg0, arg1, arg2; + int nargs; + rtx op0, op1, op2; + + if (!validate_arglist (exp, POINTER_TYPE, 0)) + return; + + arg0 = CALL_EXPR_ARG (exp, 0); + + /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to + zero (read) and argument 2 (prfop selector, 0-7) defaults to 3, i.e. + PLDL2STRM on aarch64. */ + nargs = call_expr_nargs (exp); + if (nargs > 1) + arg1 = CALL_EXPR_ARG (exp, 1); + else + arg1 = integer_zero_node; + if (nargs > 2) + arg2 = CALL_EXPR_ARG (exp, 2); + else + arg2 = integer_three_node; + + /* Argument 0 is an address. */ + op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); + + /* Argument 1 (read/write flag) must be a compile-time constant int. */ + if (TREE_CODE (arg1) != INTEGER_CST) + { + error ("second argument to %<__builtin_prefetch_full%> must be a " + "constant"); + arg1 = integer_zero_node; + } + op1 = expand_normal (arg1); + /* Argument 1 must be either zero or one. 
*/ + if (INTVAL (op1) != 0 && INTVAL (op1) != 1) + { + warning (0, "invalid second argument to %<__builtin_prefetch_full%>;" + " using zero"); + op1 = const0_rtx; + } + + /* Argument 2 (prfop selector) must be a compile-time constant int. */ + if (TREE_CODE (arg2) != INTEGER_CST) + { + error ("third argument to %<__builtin_prefetch_full%> must be a " + "constant"); + arg2 = integer_zero_node; + } + op2 = expand_normal (arg2); + /* Argument 2 must be 0-7. */ + if (INTVAL (op2) < 0 || INTVAL (op2) > 7) + { + warning (0, "invalid third argument to %<__builtin_prefetch_full%>; " + "using zero"); + op2 = const0_rtx; + } + + if (targetm.have_prefetch_full ()) + { + class expand_operand ops[3]; + + create_address_operand (&ops[0], op0); + create_integer_operand (&ops[1], INTVAL (op1)); + create_integer_operand (&ops[2], INTVAL (op2)); + if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops)) + return; + } + + /* Don't do anything with direct references to volatile memory, but + generate code to handle other side effects. */ + if (!MEM_P (op0) && side_effects_p (op0)) + emit_insn (op0); +} + /* Get a MEM rtx for expression EXP which is the address of an operand to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is the maximum length of the block of memory that might be accessed or @@ -8386,6 +8465,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, case BUILT_IN_PREFETCH: expand_builtin_prefetch (exp); return const0_rtx; + case BUILT_IN_PREFETCH_FULL: + expand_builtin_prefetch_full (exp); + return const0_rtx; case BUILT_IN_INIT_TRAMPOLINE: return expand_builtin_init_trampoline (exp, true); diff --git a/gcc/builtins.def b/gcc/builtins.def index ee67ac15d..b89cec11f 100644 --- a/gcc/builtins.def +++ b/gcc/builtins.def @@ -927,6 +927,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) +DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 1a4fc2028..c8388f902 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -455,12 +455,16 @@ extern struct tune_params aarch64_tune_params; T (PLDL2STRM, pldl2strm, 3) \ T (PLDL3KEEP, pldl3keep, 4) \ T (PLDL3STRM, pldl3strm, 5) \ + T (PLDL4KEEP, pldl4keep, 6) \ + T (PLDL4STRM, pldl4strm, 7) \ T (PSTL1KEEP, pstl1keep, 8) \ T (PSTL1STRM, pstl1strm, 9) \ T (PSTL2KEEP, pstl2keep, 10) \ T (PSTL2STRM, pstl2strm, 11) \ T (PSTL3KEEP, pstl3keep, 12) \ - T (PSTL3STRM, pstl3strm, 13) + T (PSTL3STRM, 
pstl3strm, 13) \ + T (PSTL4KEEP, pstl4keep, 14) \ + T (PSTL4STRM, pstl4strm, 15) #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, enum aarch64_svpattern { diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 38af8d000..2ec1c5d19 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -831,6 +831,45 @@ [(set_attr "type" "load_4")] ) +(define_insn "prefetch_full" + [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp") + (match_operand:QI 1 "const_int_operand" "") + (match_operand:QI 2 "const_int_operand" ""))] + "" + { + const char * pftype[2][8] = + { + {"prfm\\tPLDL1KEEP, %0", + "prfm\\tPLDL1STRM, %0", + "prfm\\tPLDL2KEEP, %0", + "prfm\\tPLDL2STRM, %0", + "prfm\\tPLDL3KEEP, %0", + "prfm\\tPLDL3STRM, %0", + "prfm\\tPLDL4KEEP, %0", + "prfm\\tPLDL4STRM, %0"}, + {"prfm\\tPSTL1KEEP, %0", + "prfm\\tPSTL1STRM, %0", + "prfm\\tPSTL2KEEP, %0", + "prfm\\tPSTL2STRM, %0", + "prfm\\tPSTL3KEEP, %0", + "prfm\\tPSTL3STRM, %0", + "prfm\\tPSTL4KEEP, %0", + "prfm\\tPSTL4STRM, %0"}, + }; + + int prfop = INTVAL (operands[2]); + int rw = INTVAL (operands[1]); + /* Both operands index PFTYPE, so reject out-of-range values. */ + gcc_assert (IN_RANGE (prfop, 0, 7) && IN_RANGE (rw, 0, 1)); + /* PRFM accepts the same addresses as a 64-bit LDR so wrap + the address into a DImode MEM so that aarch64_print_operand knows + how to print it. 
*/ + operands[0] = gen_rtx_MEM (DImode, operands[0]); + return pftype[rw][prfop]; + } + [(set_attr "type" "load_4")] +) + (define_insn "trap" [(trap_if (const_int 1) (const_int 8))] "" diff --git a/gcc/dce.c b/gcc/dce.c index a6a1599b5..aaa63b63a 100644 --- a/gcc/dce.c +++ b/gcc/dce.c @@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body) switch (GET_CODE (body)) { case PREFETCH: + case PREFETCH_FULL: case TRAP_IF: /* The UNSPEC case was added here because the ia-64 claims that USEs do not work after reload and generates UNSPECS rather diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c index 767badab6..c121aee8d 100644 --- a/gcc/hsa-gen.c +++ b/gcc/hsa-gen.c @@ -5309,7 +5309,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) /* Prefetch pass can create type-mismatching prefetch builtin calls which fail the gimple_call_builtin_p test above. Handle them here. */ - if (fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH)) + if (fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH) + || fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH_FULL)) return; if (hsa_callable_function_p (function_decl)) @@ -5723,6 +5724,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) break; } case BUILT_IN_PREFETCH: + case BUILT_IN_PREFETCH_FULL: break; default: { diff --git a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c index 564c6629c..0dc8e60a8 100644 --- a/gcc/ipa-pure-const.c +++ b/gcc/ipa-pure-const.c @@ -534,6 +534,7 @@ special_builtin_state (enum pure_const_state_e *state, bool *looping, *state = IPA_CONST; return true; case BUILT_IN_PREFETCH: + case BUILT_IN_PREFETCH_FULL: *looping = true; *state = IPA_CONST; return true; diff --git a/gcc/params.opt b/gcc/params.opt index 0c9a270b4..f128ae6a4 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1059,6 +1059,11 @@ Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Par Maximum number of outer loops allowed to extend outer loops for loops that cannot recognize inner loop boundaries. 
+-param=llc-level= +Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) +Param Optimization +Specifies the HBM cache level. + -param=filter-mode= Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. diff --git a/gcc/print-rtl.c b/gcc/print-rtl.c index 611ea079c..4443caf4a 100644 --- a/gcc/print-rtl.c +++ b/gcc/print-rtl.c @@ -1549,6 +1549,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose) op[1] = XEXP (x, 1); op[2] = XEXP (x, 2); break; + case PREFETCH_FULL: + fun = "prefetch_full"; + op[0] = XEXP (x, 0); + op[1] = XEXP (x, 1); + op[2] = XEXP (x, 2); + break; case UNSPEC: case UNSPEC_VOLATILE: { diff --git a/gcc/rtl.def b/gcc/rtl.def index 9754333ea..30fd1cf81 100644 --- a/gcc/rtl.def +++ b/gcc/rtl.def @@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA) whose prefetch instructions do not support them. */ DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA) +/* Memory prefetch, with attributes supported on some targets. + Operand 1 is the address of the memory to fetch. + Operand 2 is 1 for a write access, 0 otherwise. + Operand 3 is the prfop selector, 0-7 (cache level and keep/stream policy). + + The attributes specified by operands 2 and 3 are ignored for targets + whose prefetch instructions do not support them. */ +DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA) + /* ---------------------------------------------------------------------- At the top level of an instruction (perhaps under PARALLEL). ---------------------------------------------------------------------- */ diff --git a/gcc/rtl.h b/gcc/rtl.h index b29afca8d..fbcd05562 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -2804,6 +2804,10 @@ do { \ #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \ (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil) +/* True if the PREFETCH_FULL rtx RTX is flagged to be a scheduling barrier. 
*/ +#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \ + (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil) + /* Indicate whether the machine has any sort of auto increment addressing. If not, we can avoid checking for REG_INC notes. */ diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index 0ebde7622..63bf1bf58 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -1195,6 +1195,7 @@ reg_referenced_p (const_rtx x, const_rtx body) return reg_overlap_mentioned_p (x, TRAP_CONDITION (body)); case PREFETCH: + case PREFETCH_FULL: return reg_overlap_mentioned_p (x, XEXP (body, 0)); case UNSPEC: @@ -2007,6 +2008,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data) return; case PREFETCH: + case PREFETCH_FULL: (*fun) (&XEXP (body, 0), data); return; diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c index 331af5ffd..cb5a64ed9 100644 --- a/gcc/sched-deps.c +++ b/gcc/sched-deps.c @@ -2720,7 +2720,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn) break; case PREFETCH: - if (PREFETCH_SCHEDULE_BARRIER_P (x)) + case PREFETCH_FULL: + if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x)) + || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x))) reg_pending_barrier = TRUE_BARRIER; /* Prefetch insn contains addresses only. 
So if the prefetch address has no registers, there will be no dependencies on diff --git a/gcc/target-insns.def b/gcc/target-insns.def index 4d7eb92cf..e80361f0a 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (probe_stack, (rtx x0)) DEF_TARGET_INSN (probe_stack_address, (rtx x0)) DEF_TARGET_INSN (prologue, (void)) diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c new file mode 100644 index 000000000..c0fa2db2f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,0); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL1KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c new file mode 100644 index 000000000..bcd1113d1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + 
for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,1); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL1STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c new file mode 100644 index 000000000..46702bfbc --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,2); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL2KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c new file mode 100644 index 000000000..e359ad178 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,3); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL2STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c new file mode 100644 index 000000000..0a9dae090 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 
-march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,4); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL3KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c new file mode 100644 index 000000000..58db40ba1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,5); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL3STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c new file mode 100644 index 000000000..6f6b7bbd4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,6); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL4KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c new file mode 
100644 index 000000000..b69b4a5e6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,7); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL4STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c new file mode 100644 index 000000000..f5a474eb5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,0); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL1KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c new file mode 100644 index 000000000..6798824a9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,1); + val[i]=i+1; + 
} +} + +/* { dg-final { scan-assembler "PSTL1STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c new file mode 100644 index 000000000..c19fcc830 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,2); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL2KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c new file mode 100644 index 000000000..dde160a28 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,3); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL2STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c new file mode 100644 index 000000000..fa698243d --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno 
--param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,4); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL3KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c new file mode 100644 index 000000000..653f7786e --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,5); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL3STRM" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c new file mode 100644 index 000000000..16a3b6552 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,6); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL4KEEP" } } */ + diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c new file mode 100644 index 000000000..60d671bf5 --- /dev/null +++ 
b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c @@ -0,0 +1,15 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,7); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL4STRM" } } */ + diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c index 107d5da26..75501f41c 100644 --- a/gcc/tree-ssa-llc-allocate.c +++ b/gcc/tree-ssa-llc-allocate.c @@ -3271,8 +3271,19 @@ issue_mask_prefetch (gimple *stmt) target = gimple_call_arg (stmt, 3); else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) target = gimple_call_lhs (stmt); - /* 4: PLDL3KEEP. */ - tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); + tree prfop = NULL_TREE; + if (param_llc_level == 3) + /* for simulation, 4: PLDL3KEEP. */ + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); + else if (param_llc_level == 4) + /* 6: PLDL4KEEP. */ + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } /* add offset. 
*/ gimple_stmt_iterator si = gsi_for_stmt (stmt); @@ -3310,9 +3321,19 @@ issue_mask_gather_prefetch (gimple *stmt) tree scale = gimple_call_arg (stmt, 2); tree zero = gimple_call_arg (stmt, 3); tree final_mask = gimple_call_arg (stmt, 4); - tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); - tree target = gimple_call_lhs (stmt); + tree prfop = NULL_TREE; + if (param_llc_level == 3) // for simulation + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP + else if (param_llc_level == 4) + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } + tree target = gimple_call_lhs (stmt); /* add offset. */ gimple_stmt_iterator si = gsi_for_stmt (stmt); if (target == NULL_TREE) @@ -3373,8 +3394,27 @@ issue_builtin_prefetch (data_ref &mem_ref) /* __builtin_prefetch (_68, 0, 1); 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality (high means strong locality) */ - gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), 3, - addr, integer_zero_node, integer_one_node); + gcall *call = NULL; + if (param_llc_level == 3) + { + /* for simulation. + BUILT_IN_PREFETCH (addr, rw, locality). 
*/ + call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), + 3, addr, integer_zero_node, integer_one_node); + } + else if (param_llc_level == 4) + { + tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); + call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), + 3, addr, integer_zero_node, prfop); + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } + gsi_insert_after (&si, call, GSI_SAME_STMT); update_ssa (TODO_update_ssa_only_virtuals); } @@ -3724,7 +3764,7 @@ issue_llc_hint (std::vector &ref_groups, fprintf (dump_file, "issue_llc_hint:\n"); /* 1) If the issue-topn and force-issue options are available, top N var is - forcibly allocated and no runtime branch is generated. + forcibly allocated, and no runtime branch is generated. 2) If the issue-topn option is available and the size of top N var is statically known, top N is statically allocated and no runtime branch is generated. -- 2.33.0