4906 lines
162 KiB
Diff
4906 lines
162 KiB
Diff
From e0e139bf642398d1e1b8cfd803ee6ce276404991 Mon Sep 17 00:00:00 2001
|
||
From: huangxiaoquan <huangxiaoquan1@huawei.com>
|
||
Date: Wed, 6 Dec 2023 17:51:11 +0800
|
||
Subject: [PATCH] Add LLC-Allocation Pass LLC allocation allows the compiler to
|
||
identify frequently-used data in the program and strengthens the ability to
|
||
prefetch and distribute it to the last level cache (LLC) through memory
|
||
accesses of the corresponding data variables. Add flag -fllc-allocate to
|
||
enable LLC allocation.
|
||
|
||
---
|
||
gcc/Makefile.in | 1 +
|
||
gcc/cfgloop.h | 3 +
|
||
gcc/common.opt | 4 +
|
||
gcc/config/aarch64/aarch64-sve.md | 48 +-
|
||
gcc/config/aarch64/aarch64.c | 18 +
|
||
gcc/doc/tm.texi | 21 +
|
||
gcc/doc/tm.texi.in | 6 +
|
||
gcc/internal-fn.c | 115 +
|
||
gcc/internal-fn.def | 4 +
|
||
gcc/optabs.def | 2 +
|
||
gcc/params.opt | 53 +
|
||
gcc/passes.def | 1 +
|
||
gcc/target.def | 31 +
|
||
gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 +
|
||
gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 +
|
||
.../gcc.dg/llc-allocate/llc-allocate.exp | 27 +
|
||
.../llc-allocate/llc-issue-builtin-prefetch.c | 48 +
|
||
.../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 +
|
||
.../gcc.dg/llc-allocate/llc-ref-trace.c | 62 +
|
||
.../llc-allocate/llc-tool-insertion-1.c | 48 +
|
||
.../llc-allocate/llc-tool-insertion-2.c | 48 +
|
||
.../llc-allocate/llc-tool-insertion-3.c | 48 +
|
||
.../llc-allocate/llc-tool-insertion-4.c | 47 +
|
||
.../llc-allocate/llc-tool-insertion-5.c | 48 +
|
||
.../llc-allocate/llc-tool-insertion-6.c | 47 +
|
||
.../llc-tool-insertion-7-null-var-name.c | 52 +
|
||
.../llc-tool-insertion-8-tmp-var-name.c | 54 +
|
||
.../gfortran.dg/llc-allocate/llc-3.f90 | 213 ++
|
||
.../gfortran.dg/llc-allocate/llc-allocate.exp | 29 +
|
||
.../llc-trace-multiple-base-var.f90 | 63 +
|
||
.../llc-unknown-type-size-unit.f90 | 58 +
|
||
gcc/timevar.def | 1 +
|
||
gcc/tree-cfg.c | 11 +
|
||
gcc/tree-cfg.h | 1 +
|
||
gcc/tree-pass.h | 1 +
|
||
gcc/tree-scalar-evolution.c | 8 +-
|
||
gcc/tree-scalar-evolution.h | 3 +-
|
||
gcc/tree-ssa-llc-allocate.c | 2898 +++++++++++++++++
|
||
gcc/tree-ssa-loop-niter.c | 38 +-
|
||
gcc/tree-ssa-loop-niter.h | 3 +-
|
||
40 files changed, 4297 insertions(+), 31 deletions(-)
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c
|
||
create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c
|
||
create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
|
||
create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
|
||
create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
|
||
create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
|
||
create mode 100644 gcc/tree-ssa-llc-allocate.c
|
||
|
||
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
|
||
index 2a59acfbe..31bf2cde2 100644
|
||
--- a/gcc/Makefile.in
|
||
+++ b/gcc/Makefile.in
|
||
@@ -1594,6 +1594,7 @@ OBJS = \
|
||
tree-ssa-loop-array-widen-compare.o \
|
||
tree-ssa-loop-crc.o \
|
||
tree-ssa-loop-prefetch.o \
|
||
+ tree-ssa-llc-allocate.o \
|
||
tree-ssa-loop-split.o \
|
||
tree-ssa-loop-unswitch.o \
|
||
tree-ssa-loop.o \
|
||
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
|
||
index 18b404e29..e3ecf5076 100644
|
||
--- a/gcc/cfgloop.h
|
||
+++ b/gcc/cfgloop.h
|
||
@@ -272,6 +272,9 @@ public:
|
||
the basic-block from being collected but its index can still be
|
||
reused. */
|
||
basic_block former_header;
|
||
+
|
||
+ /* Number of latch executions from vectorization. */
|
||
+ tree vec_nb_iterations;
|
||
};
|
||
|
||
/* Set if the loop is known to be infinite. */
|
||
diff --git a/gcc/common.opt b/gcc/common.opt
|
||
index 4db061b44..2dde0f673 100644
|
||
--- a/gcc/common.opt
|
||
+++ b/gcc/common.opt
|
||
@@ -2233,6 +2233,10 @@ Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0
|
||
Generate prefetch instructions, if available, for arrays in loops. The prefetch
|
||
level can control the optimize level to array prefetch.
|
||
|
||
+fllc-allocate
|
||
+Common Report Var(flag_llc_allocate) Init(-1) Optimization
|
||
+Generate LLC hint instructions.
|
||
+
|
||
fprofile
|
||
Common Report Var(profile_flag)
|
||
Enable basic program profiling code.
|
||
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
|
||
index d17a77706..c5b99b6c4 100644
|
||
--- a/gcc/config/aarch64/aarch64-sve.md
|
||
+++ b/gcc/config/aarch64/aarch64-sve.md
|
||
@@ -1940,7 +1940,7 @@
|
||
(define_insn "@aarch64_sve_prefetch<mode>"
|
||
[(prefetch (unspec:DI
|
||
[(match_operand:<VPRED> 0 "register_operand" "Upl")
|
||
- (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP<Vesize>")
|
||
+ (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP<Vesize>")
|
||
(match_operand:DI 2 "const_int_operand")]
|
||
UNSPEC_SVE_PREFETCH)
|
||
(match_operand:DI 3 "const_int_operand")
|
||
@@ -1973,14 +1973,14 @@
|
||
;; 6: the prefetch operator (an svprfop)
|
||
;; 7: the normal RTL prefetch rw flag
|
||
;; 8: the normal RTL prefetch locality value
|
||
-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx4SI_ONLY:mode>"
|
||
+(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx4SI_ONLY:mode>"
|
||
[(prefetch (unspec:DI
|
||
[(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
|
||
- (match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg<SVE_FULL_I:Vesize>, rk, rk, rk, rk")
|
||
+ (match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg<SVE_FULL:Vesize>, rk, rk, rk, rk")
|
||
(match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w")
|
||
(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
|
||
- (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
|
||
- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
|
||
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
|
||
+ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
|
||
(match_operand:DI 6 "const_int_operand")]
|
||
UNSPEC_SVE_PREFETCH_GATHER)
|
||
(match_operand:DI 7 "const_int_operand")
|
||
@@ -1988,12 +1988,12 @@
|
||
"TARGET_SVE"
|
||
{
|
||
static const char *const insns[][2] = {
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%2.s]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%2.s, #%1]",
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%2.s]",
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%2.s, #%1]",
|
||
"prfb", "%0, [%1, %2.s, sxtw]",
|
||
"prfb", "%0, [%1, %2.s, uxtw]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
|
||
};
|
||
const char *const *parts = insns[which_alternative];
|
||
return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
|
||
@@ -2002,14 +2002,14 @@
|
||
|
||
;; Predicated gather prefetches for 64-bit elements. The value of operand 3
|
||
;; doesn't matter in this case.
|
||
-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>"
|
||
+(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>"
|
||
[(prefetch (unspec:DI
|
||
[(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl")
|
||
- (match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg<SVE_FULL_I:Vesize>, rk, rk")
|
||
+ (match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg<SVE_FULL:Vesize>, rk, rk")
|
||
(match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w")
|
||
(match_operand:DI 3 "const_int_operand")
|
||
- (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, i")
|
||
- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
|
||
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, i")
|
||
+ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
|
||
(match_operand:DI 6 "const_int_operand")]
|
||
UNSPEC_SVE_PREFETCH_GATHER)
|
||
(match_operand:DI 7 "const_int_operand")
|
||
@@ -2017,10 +2017,10 @@
|
||
"TARGET_SVE"
|
||
{
|
||
static const char *const insns[][2] = {
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%2.d]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%2.d, #%1]",
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%2.d]",
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%2.d, #%1]",
|
||
"prfb", "%0, [%1, %2.d]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, lsl %p4]"
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, lsl %p4]"
|
||
};
|
||
const char *const *parts = insns[which_alternative];
|
||
return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
|
||
@@ -2028,7 +2028,7 @@
|
||
)
|
||
|
||
;; Likewise, but with the offset being sign-extended from 32 bits.
|
||
-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_sxtw"
|
||
+(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_sxtw"
|
||
[(prefetch (unspec:DI
|
||
[(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
|
||
(match_operand:DI 1 "register_operand" "rk, rk")
|
||
@@ -2039,8 +2039,8 @@
|
||
(match_operand:VNx2DI 2 "register_operand" "w, w")))]
|
||
UNSPEC_PRED_X)
|
||
(match_operand:DI 3 "const_int_operand")
|
||
- (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
|
||
- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
|
||
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
|
||
+ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
|
||
(match_operand:DI 6 "const_int_operand")]
|
||
UNSPEC_SVE_PREFETCH_GATHER)
|
||
(match_operand:DI 7 "const_int_operand")
|
||
@@ -2049,7 +2049,7 @@
|
||
{
|
||
static const char *const insns[][2] = {
|
||
"prfb", "%0, [%1, %2.d, sxtw]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
|
||
};
|
||
const char *const *parts = insns[which_alternative];
|
||
return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
|
||
@@ -2061,7 +2061,7 @@
|
||
)
|
||
|
||
;; Likewise, but with the offset being zero-extended from 32 bits.
|
||
-(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_uxtw"
|
||
+(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_uxtw"
|
||
[(prefetch (unspec:DI
|
||
[(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
|
||
(match_operand:DI 1 "register_operand" "rk, rk")
|
||
@@ -2069,8 +2069,8 @@
|
||
(match_operand:VNx2DI 2 "register_operand" "w, w")
|
||
(match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate"))
|
||
(match_operand:DI 3 "const_int_operand")
|
||
- (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
|
||
- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
|
||
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
|
||
+ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
|
||
(match_operand:DI 6 "const_int_operand")]
|
||
UNSPEC_SVE_PREFETCH_GATHER)
|
||
(match_operand:DI 7 "const_int_operand")
|
||
@@ -2079,7 +2079,7 @@
|
||
{
|
||
static const char *const insns[][2] = {
|
||
"prfb", "%0, [%1, %2.d, uxtw]",
|
||
- "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
|
||
+ "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
|
||
};
|
||
const char *const *parts = insns[which_alternative];
|
||
return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
|
||
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
|
||
index dbdc6dffb..aa077ec0a 100644
|
||
--- a/gcc/config/aarch64/aarch64.c
|
||
+++ b/gcc/config/aarch64/aarch64.c
|
||
@@ -2367,6 +2367,13 @@ aarch64_sve_data_mode_p (machine_mode mode)
|
||
return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
|
||
}
|
||
|
||
+/* Return true if MODE is a full SVE data vector mode. */
|
||
+static bool
|
||
+aarch64_full_sve_data_mode_p (machine_mode mode)
|
||
+{
|
||
+ return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA;
|
||
+}
|
||
+
|
||
/* Return the number of defined bytes in one constituent vector of
|
||
SVE mode MODE, which has vector flags VEC_FLAGS. */
|
||
static poly_int64
|
||
@@ -24370,6 +24377,17 @@ aarch64_libgcc_floating_mode_supported_p
|
||
#undef TARGET_ASM_FUNCTION_EPILOGUE
|
||
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
|
||
|
||
+#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH
|
||
+#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch
|
||
+
|
||
+#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
|
||
+#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \
|
||
+ code_for_aarch64_sve_gather_prefetch
|
||
+
|
||
+#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
|
||
+#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \
|
||
+ aarch64_full_sve_data_mode_p
|
||
+
|
||
struct gcc_target targetm = TARGET_INITIALIZER;
|
||
|
||
#include "gt-aarch64.h"
|
||
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
|
||
index b46418d0b..ef3566510 100644
|
||
--- a/gcc/doc/tm.texi
|
||
+++ b/gcc/doc/tm.texi
|
||
@@ -6122,6 +6122,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
|
||
stores.
|
||
@end deftypefn
|
||
|
||
+@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg})
|
||
+This hook should return the decl of a function that implements the
|
||
+vectorized variant of the function with the @code{combined_fn} code
|
||
+@var{code} or @code{NULL_TREE} if such a function is not available.
|
||
+The return type of the vectorized function shall be of vector type
|
||
+@var{vec_type_out} and the argument types should be @var{vec_type_in}.
|
||
+@end deftypefn
|
||
+
|
||
+@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form})
|
||
+This hook should return the decl of a function that implements the
|
||
+vectorized variant of the function with the @code{combined_fn} code
|
||
+@var{code} or @code{NULL_TREE} if such a function is not available.
|
||
+The return type of the vectorized function shall be of vector type
|
||
+@var{vec_type_out} and the argument types should be @var{vec_type_in}.
|
||
+@end deftypefn
|
||
+
|
||
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg})
|
||
+This hook should return true if the target hardware architecture
|
||
+supports a full SVE data vector mode.
|
||
+@end deftypefn
|
||
+
|
||
@deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
|
||
This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
|
||
fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
|
||
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
|
||
index 2663547c7..945d0f696 100644
|
||
--- a/gcc/doc/tm.texi.in
|
||
+++ b/gcc/doc/tm.texi.in
|
||
@@ -4195,6 +4195,12 @@ address; but often a machine-dependent strategy can generate better code.
|
||
|
||
@hook TARGET_VECTORIZE_BUILTIN_SCATTER
|
||
|
||
+@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH
|
||
+
|
||
+@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
|
||
+
|
||
+@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
|
||
+
|
||
@hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
|
||
|
||
@hook TARGET_SIMD_CLONE_ADJUST
|
||
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
|
||
index 644f234e0..e8a3bb654 100644
|
||
--- a/gcc/internal-fn.c
|
||
+++ b/gcc/internal-fn.c
|
||
@@ -102,10 +102,12 @@ init_internal_fns ()
|
||
direct_internal_fn. */
|
||
#define not_direct { -2, -2, false }
|
||
#define mask_load_direct { -1, 2, false }
|
||
+#define mask_prefetch_direct { -1, 2, false }
|
||
#define load_lanes_direct { -1, -1, false }
|
||
#define mask_load_lanes_direct { -1, -1, false }
|
||
#define gather_load_direct { 3, 1, false }
|
||
#define mask_store_direct { 3, 2, false }
|
||
+#define gather_prefetch_direct { 3, 1, false }
|
||
#define store_lanes_direct { 0, 0, false }
|
||
#define mask_store_lanes_direct { 0, 0, false }
|
||
#define vec_cond_mask_direct { 0, 0, false }
|
||
@@ -2520,6 +2522,53 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
|
||
|
||
#define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
|
||
|
||
+/* Expand MASK_PREFETCH call STMT using optab OPTAB.
|
||
+ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102);
|
||
+ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4);
|
||
+*/
|
||
+
|
||
+static void
|
||
+expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
|
||
+{
|
||
+ if (targetm.vectorize.code_for_prefetch == NULL
|
||
+ || targetm.vectorize.prefetch_handleable_mode_p == NULL)
|
||
+ return;
|
||
+
|
||
+ tree base = gimple_call_arg (stmt, 0);
|
||
+ if (base == NULL_TREE)
|
||
+ return;
|
||
+
|
||
+ tree maskt = gimple_call_arg (stmt, 2);
|
||
+ tree target = gimple_call_arg (stmt, 3);
|
||
+ tree prfop = gimple_call_arg (stmt, 4);
|
||
+ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop);
|
||
+ /* Bit 3 of the prfop selects stores over loads. */
|
||
+ HOST_WIDE_INT access = prfop_int & 8;
|
||
+ /* Bits 1 and 2 specify the locality; 0-based for svprfop but
|
||
+ 1-based for PREFETCH. */
|
||
+ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1;
|
||
+
|
||
+ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target));
|
||
+ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode))
|
||
+ return;
|
||
+ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode);
|
||
+
|
||
+ rtx mask = expand_normal (maskt);
|
||
+ rtx base_rtx = expand_normal (base);
|
||
+  /* Extend the ptr_mode base address to Pmode if needed. */
|
||
+ if (ptr_mode == SImode)
|
||
+ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode);
|
||
+
|
||
+ unsigned i = 0;
|
||
+ class expand_operand ops[5];
|
||
+ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt)));
|
||
+ create_address_operand (&ops[i++], base_rtx);
|
||
+ create_integer_operand (&ops[i++], prfop_int);
|
||
+ create_integer_operand (&ops[i++], access);
|
||
+ create_integer_operand (&ops[i++], locality);
|
||
+ expand_insn (icode, i, ops);
|
||
+}
|
||
+
|
||
/* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB. */
|
||
|
||
static void
|
||
@@ -2920,6 +2969,70 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
|
||
emit_move_insn (lhs_rtx, ops[0].value);
|
||
}
|
||
|
||
+/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB.
|
||
+ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87);
|
||
+ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4);
|
||
+*/
|
||
+
|
||
+static void
|
||
+expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
|
||
+{
|
||
+ if (targetm.vectorize.code_for_gather_prefetch == NULL
|
||
+ || targetm.vectorize.prefetch_handleable_mode_p == NULL)
|
||
+ return;
|
||
+
|
||
+ /* Extracting tree nodes, only expand for scalar base and vector index. */
|
||
+ tree base = gimple_call_arg (stmt, 0);
|
||
+ if (VECTOR_TYPE_P (TREE_TYPE (base)))
|
||
+ return;
|
||
+ tree offset = gimple_call_arg (stmt, 1);
|
||
+ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false)
|
||
+ return;
|
||
+
|
||
+ tree scale = gimple_call_arg (stmt, 2);
|
||
+ tree mask = gimple_call_arg (stmt, 4);
|
||
+ tree target = gimple_call_arg (stmt, 5);
|
||
+ tree prfop = gimple_call_arg (stmt, 6);
|
||
+
|
||
+ /* Convert to the rtx node. */
|
||
+ rtx base_rtx = expand_normal (base);
|
||
+  /* Extend the ptr_mode base address to Pmode if needed. */
|
||
+ if (ptr_mode == SImode)
|
||
+ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode);
|
||
+ rtx offset_rtx = expand_normal (offset);
|
||
+ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target)));
|
||
+ rtx mask_rtx = expand_normal (mask);
|
||
+ HOST_WIDE_INT scale_int = tree_to_shwi (scale);
|
||
+ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop);
|
||
+ /* Bit 3 of the prfop selects stores over loads. */
|
||
+ HOST_WIDE_INT access = prfop_int & 8;
|
||
+ /* Bits 1 and 2 specify the locality; 0-based for svprfop but
|
||
+ 1-based for PREFETCH. */
|
||
+ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1;
|
||
+
|
||
+  /* Add operands. */
|
||
+ unsigned int i = 0;
|
||
+ class expand_operand ops[9];
|
||
+ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
|
||
+ create_address_operand (&ops[i++], base_rtx);
|
||
+ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
|
||
+  /* Check whether the index type is unsigned. */
|
||
+ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
|
||
+ create_integer_operand (&ops[i++], scale_int);
|
||
+ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx));
|
||
+ create_integer_operand (&ops[i++], prfop_int);
|
||
+ create_integer_operand (&ops[i++], access);
|
||
+ create_integer_operand (&ops[i++], locality);
|
||
+
|
||
+ machine_mode reg_mode = GET_MODE (offset_rtx);
|
||
+ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target));
|
||
+ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode))
|
||
+ return;
|
||
+ insn_code icode = targetm.vectorize.code_for_gather_prefetch
|
||
+ (m_mode, reg_mode);
|
||
+ expand_insn (icode, i, ops);
|
||
+}
|
||
+
|
||
/* Expand DIVMOD() using:
|
||
a) optab handler for udivmod/sdivmod if it is available.
|
||
b) If optab_handler doesn't exist, generate call to
|
||
@@ -3210,9 +3323,11 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
|
||
#define direct_cond_binary_optab_supported_p direct_optab_supported_p
|
||
#define direct_cond_ternary_optab_supported_p direct_optab_supported_p
|
||
#define direct_mask_load_optab_supported_p direct_optab_supported_p
|
||
+#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p
|
||
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
|
||
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
|
||
#define direct_gather_load_optab_supported_p convert_optab_supported_p
|
||
+#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p
|
||
#define direct_mask_store_optab_supported_p direct_optab_supported_p
|
||
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
|
||
#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
|
||
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
|
||
index 0c6fc3711..cc0f42b98 100644
|
||
--- a/gcc/internal-fn.def
|
||
+++ b/gcc/internal-fn.def
|
||
@@ -119,6 +119,8 @@ along with GCC; see the file COPYING3. If not see
|
||
#endif
|
||
|
||
DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load)
|
||
+DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF,
|
||
+ maskprefetch, mask_prefetch)
|
||
DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
|
||
DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
|
||
vec_mask_load_lanes, mask_load_lanes)
|
||
@@ -126,6 +128,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
|
||
DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
|
||
DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
|
||
mask_gather_load, gather_load)
|
||
+DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF,
|
||
+ mask_gather_prefetch, gather_prefetch)
|
||
|
||
DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
|
||
DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
|
||
diff --git a/gcc/optabs.def b/gcc/optabs.def
|
||
index 0c64eb52a..ee25bc3f7 100644
|
||
--- a/gcc/optabs.def
|
||
+++ b/gcc/optabs.def
|
||
@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b")
|
||
OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
|
||
OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
|
||
OPTAB_CD(maskload_optab, "maskload$a$b")
|
||
+OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b")
|
||
OPTAB_CD(maskstore_optab, "maskstore$a$b")
|
||
OPTAB_CD(gather_load_optab, "gather_load$a$b")
|
||
OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
|
||
+OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b")
|
||
OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
|
||
OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b")
|
||
OPTAB_CD(vec_extract_optab, "vec_extract$a$b")
|
||
diff --git a/gcc/params.opt b/gcc/params.opt
|
||
index 2044524a3..c429359e3 100644
|
||
--- a/gcc/params.opt
|
||
+++ b/gcc/params.opt
|
||
@@ -1005,4 +1005,57 @@ Target size of compressed pointer, which should be 8, 16 or 32.
|
||
Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization
|
||
Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 .
|
||
|
||
+-param=mem-access-ratio=
|
||
+Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization
|
||
+Memory access ratio (in percent).
|
||
+
|
||
+-param=mem-access-num=
|
||
+Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization
|
||
+Memory access num.
|
||
+
|
||
+-param=prefetch-offset=
|
||
+Common Joined UInteger Var(param_prefetch_offset) Init(1024)
|
||
+IntegerRange(1, 999999) Param Optimization
|
||
+Prefetch Offset, which is usually a power of two due to cache line size.
|
||
+
|
||
+-param=branch-prob-threshold=
|
||
+Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100)
|
||
+Param Optimization
|
||
+High Execution Rate Branch Threshold.
|
||
+
|
||
+-param=issue-topn=
|
||
+Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization
|
||
+Issue topn LLC mem_ref hint.
|
||
+
|
||
+-param=force-issue=
|
||
+Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param
|
||
+Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches.
|
||
+
|
||
+-param=llc-capacity-per-core=
|
||
+Common Joined UInteger Var(param_llc_capacity_per_core) Init(114) IntegerRange(0, 999999) Param
|
||
+LLC capacity per core.
|
||
+
|
||
+-param=target-variables=
|
||
+Common Joined Var(param_target_variables) Init("") Param Optimization
|
||
+--param=target-variables=<var>[,<var>,...] Target variables for prefetching, separated by comma,
|
||
+without space. The representation of a variable can be complex and containing space, please surround
|
||
+it by quotation marks and escape special characters in Linux. The input length should be no more
|
||
+than 512 characters.
|
||
+
|
||
+-param=use-ref-group-index=
|
||
+Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization
|
||
+Prefetch the target variables by their indices in sorted ref_groups, use together with parameter
|
||
+target-variables.
|
||
+
|
||
+-param=mem-ref-index=
|
||
+Common Joined Var(param_mem_ref_index) Init("") Param Optimization
|
||
+--param=mem-ref-index=<idx>[,<idx>,...] Prefetch the target variable at the memory reference
|
||
+location with the index of customized order, separated by comma, without space. The input length
|
||
+should be no more than 512 characters.
|
||
+
|
||
+-param=filter-kernels=
|
||
+Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param
|
||
+Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks
|
||
+through edges with branch probability no less than param_branch_prob_threshold.
|
||
+
|
||
; This comment is to ensure we retain the blank line above.
|
||
diff --git a/gcc/passes.def b/gcc/passes.def
|
||
index df7d65733..ea59fc8ca 100644
|
||
--- a/gcc/passes.def
|
||
+++ b/gcc/passes.def
|
||
@@ -303,6 +303,7 @@ along with GCC; see the file COPYING3. If not see
|
||
/* Run IVOPTs after the last pass that uses data-reference analysis
|
||
as that doesn't handle TARGET_MEM_REFs. */
|
||
NEXT_PASS (pass_iv_optimize);
|
||
+ NEXT_PASS (pass_llc_allocate);
|
||
NEXT_PASS (pass_lim);
|
||
NEXT_PASS (pass_tree_loop_done);
|
||
POP_INSERT_PASSES ()
|
||
diff --git a/gcc/target.def b/gcc/target.def
|
||
index 34d3561bd..351c94c37 100644
|
||
--- a/gcc/target.def
|
||
+++ b/gcc/target.def
|
||
@@ -2072,6 +2072,37 @@ DEFHOOK
|
||
(void *data),
|
||
default_destroy_cost_data)
|
||
|
||
+/* Function for vector prefetch operation. */
|
||
+DEFHOOK
|
||
+(code_for_prefetch,
|
||
+ "This hook should return the decl of a function that implements the\n\
|
||
+vectorized variant of the function with the @code{combined_fn} code\n\
|
||
+@var{code} or @code{NULL_TREE} if such a function is not available.\n\
|
||
+The return type of the vectorized function shall be of vector type\n\
|
||
+@var{vec_type_out} and the argument types should be @var{vec_type_in}.",
|
||
+ insn_code, (machine_mode arg),
|
||
+ NULL)
|
||
+
|
||
+/* Function for vector gather prefetch operation. */
|
||
+DEFHOOK
|
||
+(code_for_gather_prefetch,
|
||
+ "This hook should return the decl of a function that implements the\n\
|
||
+vectorized variant of the function with the @code{combined_fn} code\n\
|
||
+@var{code} or @code{NULL_TREE} if such a function is not available.\n\
|
||
+The return type of the vectorized function shall be of vector type\n\
|
||
+@var{vec_type_out} and the argument types should be @var{vec_type_in}.",
|
||
+ insn_code, (machine_mode mode_to, machine_mode mode_form),
|
||
+ NULL)
|
||
+
|
||
+/* Function to check whether the target hardware architecture supports
|
||
+ a full SVE data vector mode. */
|
||
+DEFHOOK
|
||
+(prefetch_handleable_mode_p,
|
||
+ "This hook should return true if the target hardware architecture\n\
|
||
+supports a full SVE data vector mode.",
|
||
+ bool, (machine_mode arg),
|
||
+ NULL)
|
||
+
|
||
HOOK_VECTOR_END (vectorize)
|
||
|
||
#undef HOOK_PREFIX
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
|
||
new file mode 100644
|
||
index 000000000..a4828eaab
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
|
||
@@ -0,0 +1,61 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 5 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "Tracing succeeded" 29 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static_data_size:" 7 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 3 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){3}\}" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times ", size: (?!(0\.000000))" 7 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 19 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d ApsiPtr \\(1.003952, 5, 0\\) : 17" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d psiPtr \\(1.003952, 3, 0\\) : 8" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d diagPtr \\(1.003952, 1, 0\\) : 2" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d lowerPtr \\(2.933319, 1, 0\\) : 2" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d upperPtr \\(2.933319, 1, 0\\) : 2" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d lPtr \\(1.466660, 1, 0\\) : 2" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump "\\d uPtr \\(1.466660, 1, 0\\) : 2" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert svprfd" 4 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
|
||
new file mode 100644
|
||
index 000000000..9bc6cc32b
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
|
||
@@ -0,0 +1,54 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param force-issue=1" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 100000
|
||
+
|
||
+int A_i[N];
|
||
+int A_j[N];
|
||
+double A_data[N];
|
||
+double x_data[N];
|
||
+double y_data[N];
|
||
+int num_rows = N;
|
||
+
|
||
+void
|
||
+MatMult (int *A_i, int *A_j, double *A_data, double *x_data,
|
||
+ int num_rows, double *y_data)
|
||
+{
|
||
+ int i = 0;
|
||
+ int j = 0;
|
||
+ double temp = 0;
|
||
+ for (i = 0; i < num_rows; i++)
|
||
+ {
|
||
+ temp = y_data[i];
|
||
+ for (j = A_i[i]; j < A_i[i+1]; j++)
|
||
+ temp += A_data[j] * x_data[A_j[j]];
|
||
+ y_data[i] = temp;
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i = 0; i < testIter; i++)
|
||
+ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
|
||
new file mode 100644
|
||
index 000000000..4f34e722f
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
|
||
@@ -0,0 +1,27 @@
|
||
+# Copyright (C) 2022-2023 Free Software Foundation, Inc.
|
||
+
|
||
+# This program is free software; you can redistribute it and/or modify
|
||
+# it under the terms of the GNU General Public License as published by
|
||
+# the Free Software Foundation; either version 3 of the License, or
|
||
+# (at your option) any later version.
|
||
+#
|
||
+# This program is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
+# GNU General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU General Public License
|
||
+# along with GCC; see the file COPYING3. If not see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+load_lib gcc-dg.exp
|
||
+load_lib target-supports.exp
|
||
+
|
||
+# Initialize `dg'.
|
||
+dg-init
|
||
+
|
||
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
|
||
+ "" "-fllc-allocate"
|
||
+
|
||
+# All done.
|
||
+dg-finish
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c
|
||
new file mode 100644
|
||
index 000000000..2a58c501f
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c
|
||
@@ -0,0 +1,48 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=uPtr" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert prfm" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "\[&\]?uPtr(?:_\\d+\\(D\\))? \\+ \\d{4};" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "\[&\]?uPtr(?:_\\d+\\(D\\))? \\+ \[_\]\\d{1,4};" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
|
||
new file mode 100644
|
||
index 000000000..e87f343dc
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
|
||
@@ -0,0 +1,50 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+typedef struct stack_def
|
||
+{
|
||
+ int top; /* index to top stack element */
|
||
+ unsigned long reg_set; /* set of live registers */
|
||
+ unsigned char reg[128]; /* register - stack mapping */
|
||
+} *stack;
|
||
+
|
||
+typedef struct block_info_def
|
||
+{
|
||
+ struct stack_def stack_in; /* Input stack configuration. */
|
||
+ struct stack_def stack_out; /* Output stack configuration. */
|
||
+ unsigned long out_reg_set; /* Stack regs live on output. */
|
||
+ int done; /* True if block already converted. */
|
||
+ int predecessors; /* Number of predecessors that need
|
||
+ to be visited. */
|
||
+} *block_info;
|
||
+
|
||
+typedef struct basic_block_def
|
||
+{
|
||
+ void *aux;
|
||
+} *basic_block;
|
||
+
|
||
+unsigned char
|
||
+convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high)
|
||
+{
|
||
+ stack output_stack;
|
||
+
|
||
+ output_stack = &(((block_info) bb->aux)->stack_in);
|
||
+ if (value_reg_low == -1)
|
||
+ output_stack->top = -1;
|
||
+ else
|
||
+ {
|
||
+ int reg;
|
||
+ output_stack->top = value_reg_high - value_reg_low;
|
||
+ for (reg = value_reg_low; reg <= value_reg_high; ++reg)
|
||
+ {
|
||
+ (output_stack->reg + 16)[value_reg_high - reg] = reg;
|
||
+ output_stack->reg_set |= (unsigned long) 1 << reg;
|
||
+ }
|
||
+ }
|
||
+ return output_stack->reg[0];
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
|
||
new file mode 100644
|
||
index 000000000..27cd574cf
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
|
||
@@ -0,0 +1,62 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+#include <stdlib.h>
|
||
+
|
||
+#define N 1000
|
||
+
|
||
+long a[N] = {0};
|
||
+long b[N] = {0};
|
||
+long c[N] = {0};
|
||
+
|
||
+double
|
||
+referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells)
|
||
+{
|
||
+ double sum;
|
||
+ for (int cell = 0; cell < nCells; cell++)
|
||
+ {
|
||
+ // Multi-layer pointer
|
||
+ sum += psiPtr[lPtr[cell]];
|
||
+ psiPtr[uPtr[cell]] = sum;
|
||
+
|
||
+ // Outer pointer, inner array
|
||
+ sum += psiPtr[b[cell]];
|
||
+ psiPtr[a[cell]] = sum;
|
||
+
|
||
+ // Multi-layer array, currently failed tracing at b[cell] and a[cell]
|
||
+ sum += a[b[cell]];
|
||
+ c[a[cell]] = sum;
|
||
+
|
||
+ // Outer array, inner pointer, currently failed tracing at lPtr[cell]
|
||
+ sum += a[lPtr[cell]];
|
||
+ c[lPtr[cell]] = sum;
|
||
+ }
|
||
+ return sum;
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int testIter = 2;
|
||
+
|
||
+ double *psiPtr = NULL;
|
||
+ int *lPtr = NULL;
|
||
+ int *uPtr = NULL;
|
||
+ psiPtr = (double *) calloc (N, sizeof(double));
|
||
+ lPtr = (int *) calloc (N, sizeof(int));
|
||
+ uPtr = (int *) calloc (N, sizeof(int));
|
||
+
|
||
+ for (int i = 0; i < testIter; i++)
|
||
+ referenceTrace (psiPtr, lPtr, uPtr, N);
|
||
+
|
||
+ free (psiPtr);
|
||
+ free (lPtr);
|
||
+ free (uPtr);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "Tracing succeeded" 16 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "Tracing failed" 8 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c
|
||
new file mode 100644
|
||
index 000000000..276781c4f
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c
|
||
@@ -0,0 +1,48 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \" lPtr \"" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert prfm" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c
|
||
new file mode 100644
|
||
index 000000000..57c76f4a6
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c
|
||
@@ -0,0 +1,48 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=,lPtr, --param mem-ref-index=5" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "WARNING: The target data_ref index is out of range." 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert prfm" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c
|
||
new file mode 100644
|
||
index 000000000..d9c053566
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c
|
||
@@ -0,0 +1,48 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr,uPtr,, --param mem-ref-index=5" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-not "WARNING: The number of provided memory reference indices is less" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert prfm" 4 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c
|
||
new file mode 100644
|
||
index 000000000..b87f9903d
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c
|
||
@@ -0,0 +1,47 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr --param use-ref-group-index=1" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "ERROR: not an unsigned integer" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c
|
||
new file mode 100644
|
||
index 000000000..d07836765
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c
|
||
@@ -0,0 +1,48 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=1 --param use-ref-group-index=1" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \" psiPtr \"" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c
|
||
new file mode 100644
|
||
index 000000000..c0a6afe5b
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c
|
||
@@ -0,0 +1,47 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=3,a --param use-ref-group-index=1" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+#define N 131590
|
||
+#define F 384477
|
||
+
|
||
+double diagPtr[N];
|
||
+double psiPtr[N];
|
||
+double ApsiPtr[N];
|
||
+int lPtr[F];
|
||
+int uPtr[F];
|
||
+double lowerPtr[F];
|
||
+double upperPtr[F];
|
||
+
|
||
+void
|
||
+AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr,
|
||
+ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces)
|
||
+{
|
||
+ for (int cell=0; cell<nCells; cell++)
|
||
+ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
|
||
+
|
||
+ for (int face=0; face<nFaces; face++)
|
||
+ {
|
||
+ ApsiPtr[uPtr[face]] += lowerPtr[face]*psiPtr[lPtr[face]];
|
||
+ ApsiPtr[lPtr[face]] += upperPtr[face]*psiPtr[uPtr[face]];
|
||
+ }
|
||
+}
|
||
+
|
||
+int
|
||
+main (int argc, char *argv[])
|
||
+{
|
||
+ int nCells = N;
|
||
+ int nFaces = F;
|
||
+ int testIter = 2;
|
||
+
|
||
+ for (int i=0; i<testIter; i++)
|
||
+ AMUL (diagPtr,psiPtr,ApsiPtr,lPtr,uPtr,lowerPtr,upperPtr,nCells,nFaces);
|
||
+
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-times "ERROR: not an unsigned integer" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c
|
||
new file mode 100644
|
||
index 000000000..4ad331626
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c
|
||
@@ -0,0 +1,52 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=\"bb_16(D)->aux\"" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+typedef struct stack_def
|
||
+{
|
||
+ int top; /* index to top stack element */
|
||
+ unsigned long reg_set; /* set of live registers */
|
||
+ unsigned char reg[128]; /* register - stack mapping */
|
||
+} *stack;
|
||
+
|
||
+typedef struct block_info_def
|
||
+{
|
||
+ struct stack_def stack_in; /* Input stack configuration. */
|
||
+ struct stack_def stack_out; /* Output stack configuration. */
|
||
+ unsigned long out_reg_set; /* Stack regs live on output. */
|
||
+ int done; /* True if block already converted. */
|
||
+ int predecessors; /* Number of predecessors that need
|
||
+ to be visited. */
|
||
+} *block_info;
|
||
+
|
||
+typedef struct basic_block_def
|
||
+{
|
||
+ void *aux;
|
||
+} *basic_block;
|
||
+
|
||
+unsigned char
|
||
+convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high)
|
||
+{
|
||
+ stack output_stack;
|
||
+
|
||
+ output_stack = &(((block_info) bb->aux)->stack_in);
|
||
+ if (value_reg_low == -1)
|
||
+ output_stack->top = -1;
|
||
+ else
|
||
+ {
|
||
+ int reg;
|
||
+ output_stack->top = value_reg_high - value_reg_low;
|
||
+ for (reg = value_reg_low; reg <= value_reg_high; ++reg)
|
||
+ {
|
||
+ (output_stack->reg + 16)[value_reg_high - reg] = reg;
|
||
+ output_stack->reg_set |= (unsigned long) 1 << reg;
|
||
+ }
|
||
+ }
|
||
+ return output_stack->reg[0];
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c
|
||
new file mode 100644
|
||
index 000000000..09a525ce1
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c
|
||
@@ -0,0 +1,54 @@
|
||
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
|
||
+/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */
|
||
+
|
||
+#include <stdio.h>
|
||
+
|
||
+typedef struct stack_def
|
||
+{
|
||
+ int top; /* index to top stack element */
|
||
+ unsigned long reg_set; /* set of live registers */
|
||
+ unsigned char reg[128]; /* register - stack mapping */
|
||
+} *stack;
|
||
+
|
||
+typedef struct block_info_def
|
||
+{
|
||
+ struct stack_def stack_in; /* Input stack configuration. */
|
||
+ struct stack_def stack_out; /* Output stack configuration. */
|
||
+ unsigned long out_reg_set; /* Stack regs live on output. */
|
||
+ int done; /* True if block already converted. */
|
||
+ int predecessors; /* Number of predecessors that need
|
||
+ to be visited. */
|
||
+} *block_info;
|
||
+
|
||
+typedef struct basic_block_def
|
||
+{
|
||
+ void *aux;
|
||
+} *basic_block;
|
||
+
|
||
+unsigned char
|
||
+convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high)
|
||
+{
|
||
+ stack output_stack;
|
||
+
|
||
+ output_stack = &(((block_info) bb->aux)->stack_in);
|
||
+ if (value_reg_low == -1)
|
||
+ output_stack->top = -1;
|
||
+ else
|
||
+ {
|
||
+ int reg;
|
||
+ output_stack->top = value_reg_high - value_reg_low;
|
||
+ for (reg = value_reg_low; reg <= value_reg_high; ++reg)
|
||
+ {
|
||
+ (output_stack->reg + 16)[value_reg_high - reg] = reg;
|
||
+ output_stack->reg_set |= (unsigned long) 1 << reg;
|
||
+ }
|
||
+ }
|
||
+ return output_stack->reg[0];
|
||
+}
|
||
+
|
||
+/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \" bb_16\\(D\\)->aux \"" 1 "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */
|
||
+/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */
|
||
diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
|
||
new file mode 100644
|
||
index 000000000..ec918e144
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
|
||
@@ -0,0 +1,213 @@
|
||
+! { dg-do compile { target { aarch64*-*-linux* } } }
|
||
+! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50" }
|
||
+
|
||
+program main
|
||
+
|
||
+ IMPLICIT NONE
|
||
+ INTEGER :: ids,ide, jds,jde, kds,kde
|
||
+ INTEGER,parameter :: ims=-4,kms=1,jms=-4
|
||
+ INTEGER,parameter :: ime=210,kme=36,jme=192
|
||
+ INTEGER :: its,ite, jts,jte, kts,kte
|
||
+ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt
|
||
+
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts
|
||
+
|
||
+ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu
|
||
+
|
||
+ REAL :: rdx,rdy
|
||
+ REAL :: dts, t0, smdiv
|
||
+ REAL :: random1,time_begin,time_end,total_time
|
||
+
|
||
+ INTEGER :: i, j, k
|
||
+ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end
|
||
+ INTEGER :: i_endu, j_endv
|
||
+ INTEGER :: interval=1
|
||
+ INTEGER :: epoch,iter
|
||
+
|
||
+ LOGICAL :: non_hydrostatic
|
||
+
|
||
+ data ids, jds, kds, its, jts, kts /6*1/
|
||
+ data ide, ite /2*205/
|
||
+ data jde, jte /2*187/
|
||
+ data kde, kte /2*36/
|
||
+
|
||
+ number_of_small_timesteps = 1
|
||
+ rk_step = 3
|
||
+ rk_order = 1
|
||
+ dts = 1.
|
||
+
|
||
+ rdx = 1.
|
||
+ rdy = 1.
|
||
+
|
||
+ t0 = 0.
|
||
+ smdiv = 1.
|
||
+ step = 1
|
||
+ non_hydrostatic = .true.
|
||
+
|
||
+ call random_number(random1)
|
||
+ interval = random1*100
|
||
+ interval=1
|
||
+
|
||
+ call random_seed(put=(/(i,i=1,10000,interval)/))
|
||
+
|
||
+ call random_number(alt)
|
||
+ call random_number(c2a)
|
||
+ call random_number(ph)
|
||
+ call random_number(pm1)
|
||
+ call random_number(mu)
|
||
+ call random_number(muts)
|
||
+ call random_number(dnw)
|
||
+ call random_number(rdnw)
|
||
+ call random_number(znu)
|
||
+
|
||
+ do iter=1,2
|
||
+ call calc_p_rho( al, p, ph, &
|
||
+ alt, t_2, t_1, c2a, pm1, &
|
||
+ mu, muts, znu, t0, &
|
||
+ rdnw, dnw, smdiv, &
|
||
+ non_hydrostatic, step, &
|
||
+ ids, ide, jds, jde, kds, kde, &
|
||
+ ims, ime, jms, jme, kms, kme, &
|
||
+ its,ite, jts,jte, kts,kte )
|
||
+
|
||
+ enddo
|
||
+
|
||
+end program
|
||
+
|
||
+
|
||
+SUBROUTINE calc_p_rho( al, p, ph, &
|
||
+ alt, t_2, t_1, c2a, pm1, &
|
||
+ mu, muts, znu, t0, &
|
||
+ rdnw, dnw, smdiv, &
|
||
+ non_hydrostatic, step, &
|
||
+ ids, ide, jds, jde, kds, kde, &
|
||
+ ims, ime, jms, jme, kms, kme, &
|
||
+ its,ite, jts,jte, kts,kte )
|
||
+
|
||
+ IMPLICIT NONE ! religion first
|
||
+ !asb
|
||
+! declarations for the stuff coming in
|
||
+
|
||
+ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde
|
||
+ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme
|
||
+ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte
|
||
+
|
||
+ INTEGER, INTENT(IN ) :: step
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, &
|
||
+ p
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, &
|
||
+ t_2, &
|
||
+ t_1, &
|
||
+ c2a
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1
|
||
+
|
||
+ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, &
|
||
+ muts
|
||
+
|
||
+ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, &
|
||
+ rdnw, &
|
||
+ znu
|
||
+
|
||
+ REAL, INTENT(IN ) :: t0, smdiv
|
||
+
|
||
+ LOGICAL, INTENT(IN ) :: non_hydrostatic
|
||
+
|
||
+! local variables
|
||
+
|
||
+ INTEGER :: i, j, k
|
||
+ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end
|
||
+ REAL :: ptmp
|
||
+
|
||
+ i_start = its
|
||
+ i_end = min(ite,ide-1)
|
||
+ j_start = jts
|
||
+ j_end = min(jte,jde-1)
|
||
+ k_start = kts
|
||
+ k_end = min(kte,kde-1)
|
||
+
|
||
+ IF (non_hydrostatic) THEN
|
||
+ DO j=j_start, j_end
|
||
+ DO k=k_start, k_end
|
||
+ DO i=i_start, i_end
|
||
+
|
||
+! al computation is all dry, so ok with moisture
|
||
+
|
||
+ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) &
|
||
+ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j)))
|
||
+
|
||
+! this is temporally linearized p, no moisture correction needed
|
||
+
|
||
+ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) &
|
||
+ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j))
|
||
+
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+
|
||
+ ELSE ! hydrostatic calculation
|
||
+
|
||
+ DO j=j_start, j_end
|
||
+ DO k=k_start, k_end
|
||
+ DO i=i_start, i_end
|
||
+ p(i,k,j)=mu(i,j)*znu(k)
|
||
+ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) &
|
||
+ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j)
|
||
+ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) &
|
||
+ +mu(i,j)*alt(i,k,j))
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+
|
||
+ END IF
|
||
+
|
||
+! divergence damping setup
|
||
+
|
||
+ IF (step == 0) then ! we're initializing small timesteps
|
||
+ DO j=j_start, j_end
|
||
+ DO k=k_start, k_end
|
||
+ DO i=i_start, i_end
|
||
+ pm1(i,k,j)=p(i,k,j)
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ ELSE ! we're in the small timesteps
|
||
+ DO j=j_start, j_end ! and adding div damping component
|
||
+ DO k=k_start, k_end
|
||
+ DO i=i_start, i_end
|
||
+ ptmp = p(i,k,j)
|
||
+ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j))
|
||
+ pm1(i,k,j) = ptmp
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ ENDDO
|
||
+ END IF
|
||
+
|
||
+END SUBROUTINE calc_p_rho
|
||
+
|
||
+! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "Tracing succeeded" 48 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 3 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times ", size: 0\.000000" 28 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d p \\(0.000000, 3, 0\\) : 8" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d pm1 \\(0.000000, 2, 0\\) : 5" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d ph \\(0.000000, 2, 0\\) : 4" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d al \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d alt \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d t_1 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d t_2 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d c2a \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d mu \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "\\d muts \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } }
|
||
diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
|
||
new file mode 100644
|
||
index 000000000..068341784
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
|
||
@@ -0,0 +1,29 @@
|
||
+# Copyright (C) 2022-2023 Free Software Foundation, Inc.
|
||
+
|
||
+# This program is free software; you can redistribute it and/or modify
|
||
+# it under the terms of the GNU General Public License as published by
|
||
+# the Free Software Foundation; either version 3 of the License, or
|
||
+# (at your option) any later version.
|
||
+#
|
||
+# This program is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
+# GNU General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU General Public License
|
||
+# along with GCC; see the file COPYING3. If not see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+# GCC testsuite that uses the `dg.exp' driver.
|
||
+
|
||
+load_lib gfortran-dg.exp
|
||
+
|
||
+# Initialize `dg'.
|
||
+dg-init
|
||
+
|
||
+# Main loop.
|
||
+gfortran-dg-runtest [lsort \
|
||
+ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" ""
|
||
+
|
||
+# All done.
|
||
+dg-finish
|
||
diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
|
||
new file mode 100644
|
||
index 000000000..23e360540
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
|
||
@@ -0,0 +1,63 @@
|
||
+! { dg-do compile { target { aarch64*-*-linux* } } }
|
||
+! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" }
|
||
+
|
||
+MODULE INPUT
|
||
+ IMPLICIT NONE
|
||
+
|
||
+ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2
|
||
+
|
||
+ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2
|
||
+ REAL(wp), DIMENSION(jpi, jpj) :: e12t
|
||
+ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n
|
||
+ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta
|
||
+
|
||
+END MODULE INPUT
|
||
+
|
||
+PROGRAM MAIN
|
||
+ USE INPUT
|
||
+
|
||
+ IMPLICIT NONE
|
||
+
|
||
+ INTEGER :: EPOCH
|
||
+
|
||
+! Initialize arrays
|
||
+
|
||
+ e12t = 1
|
||
+ fse3t_n = 1
|
||
+ pta = 1
|
||
+!
|
||
+
|
||
+ DO EPOCH=1,2
|
||
+ CALL tra_ldf_iso
|
||
+ ENDDO
|
||
+
|
||
+END PROGRAM MAIN
|
||
+
|
||
+SUBROUTINE tra_ldf_iso
|
||
+ USE INPUT
|
||
+
|
||
+ IMPLICIT NONE
|
||
+ !
|
||
+ INTEGER :: ji, jj, jk, jn ! dummy loop indices
|
||
+ REAL(wp) :: zbtr, ztra ! - -
|
||
+ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw
|
||
+
|
||
+ DO jn = 1, kjpt
|
||
+ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0
|
||
+
|
||
+ DO jk = 1, jpkm1
|
||
+ DO jj = 2, jpjm1
|
||
+ DO ji = fs_2, fs_jpim1 ! vector opt.
|
||
+ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk))
|
||
+ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr
|
||
+ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra
|
||
+ END DO
|
||
+ END DO
|
||
+ END DO
|
||
+ !
|
||
+ END DO
|
||
+ !
|
||
+END SUBROUTINE tra_ldf_iso
|
||
+
|
||
+! { dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "Tracing unusual number or occurrences of base variables. Choose ztfw." 2 "llc_allocate" } }
|
||
diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
|
||
new file mode 100644
|
||
index 000000000..d76c75b5b
|
||
--- /dev/null
|
||
+++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
|
||
@@ -0,0 +1,58 @@
|
||
+! { dg-do compile { target { aarch64*-*-linux* } } }
|
||
+! { dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" }
|
||
+
|
||
+Module module_domain
|
||
+ IMPLICIT NONE
|
||
+
|
||
+ REAL, PARAMETER :: g = 9.8
|
||
+ TYPE :: grid_type
|
||
+ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:)
|
||
+ REAL, POINTER :: fnm(:), fnp(:)
|
||
+ END TYPE
|
||
+END Module
|
||
+
|
||
+SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end)
|
||
+
|
||
+ USE module_domain
|
||
+ !USE module_model_constants
|
||
+
|
||
+ IMPLICIT NONE
|
||
+
|
||
+
|
||
+ !TYPE (domain), INTENT(IN) :: grid
|
||
+ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy
|
||
+ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w
|
||
+
|
||
+
|
||
+ INTEGER :: k
|
||
+ REAL :: z0, z1, z2, w1, w2
|
||
+ REAL, DIMENSION(k_start:k_end) :: z_at_w
|
||
+ REAL, DIMENSION(k_start:k_end-1) :: z
|
||
+ TYPE (grid_type), POINTER :: grid
|
||
+
|
||
+
|
||
+ DO k = k_start, k_end
|
||
+ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g
|
||
+ END DO
|
||
+
|
||
+ DO k = k_start, k_end-1
|
||
+ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1))
|
||
+ END DO
|
||
+
|
||
+ DO k = k_start+1, k_end-1
|
||
+ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + &
|
||
+ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy))
|
||
+ END DO
|
||
+
|
||
+ z0 = z_at_w(k_start)
|
||
+ z1 = z(k_start)
|
||
+ z2 = z(k_start+1)
|
||
+ w1 = (z0 - z2)/(z1 - z2)
|
||
+ w2 = 1. - w1
|
||
+ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + &
|
||
+ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy))
|
||
+
|
||
+END SUBROUTINE calc_p8w
|
||
+
|
||
+! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } }
|
||
+! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } }
|
||
diff --git a/gcc/timevar.def b/gcc/timevar.def
|
||
index ba86a1b7b..4b643538f 100644
|
||
--- a/gcc/timevar.def
|
||
+++ b/gcc/timevar.def
|
||
@@ -207,6 +207,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution")
|
||
DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences")
|
||
DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching")
|
||
DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization")
|
||
+DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation")
|
||
DEFTIMEVAR (TV_PREDCOM , "predictive commoning")
|
||
DEFTIMEVAR (TV_TREE_CH , "tree copy headers")
|
||
DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop")
|
||
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
|
||
index d82fe23d8..9eb173d69 100644
|
||
--- a/gcc/tree-cfg.c
|
||
+++ b/gcc/tree-cfg.c
|
||
@@ -8365,6 +8365,17 @@ print_loops (FILE *file, int verbosity)
|
||
print_loop_and_siblings (file, bb->loop_father, 0, verbosity);
|
||
}
|
||
|
||
+/* Dump a loop to file. */
|
||
+
|
||
+void
|
||
+loop_dump (FILE *file, class loop *loop)
|
||
+{
|
||
+ print_loop (file, loop, 0, 0);
|
||
+ fprintf (file, "vec_niter = ");
|
||
+ print_generic_expr (file, loop->vec_nb_iterations);
|
||
+ fprintf (file, "\n");
|
||
+}
|
||
+
|
||
/* Dump a loop. */
|
||
|
||
DEBUG_FUNCTION void
|
||
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
|
||
index beb4997a6..dad0ca0a6 100644
|
||
--- a/gcc/tree-cfg.h
|
||
+++ b/gcc/tree-cfg.h
|
||
@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t);
|
||
extern void debug_function (tree, dump_flags_t);
|
||
extern void print_loops_bb (FILE *, basic_block, int, int);
|
||
extern void print_loops (FILE *, int);
|
||
+extern void loop_dump (FILE *file, class loop *loop);
|
||
extern void debug (class loop &ref);
|
||
extern void debug (class loop *ptr);
|
||
extern void debug_verbose (class loop &ref);
|
||
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
|
||
index 027f8992d..a1e215901 100644
|
||
--- a/gcc/tree-pass.h
|
||
+++ b/gcc/tree-pass.h
|
||
@@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
|
||
+extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
|
||
extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
|
||
diff --git a/gcc/tree-scalar-evolution.c b/gcc/tree-scalar-evolution.c
|
||
index edab77827..73ffa0759 100644
|
||
--- a/gcc/tree-scalar-evolution.c
|
||
+++ b/gcc/tree-scalar-evolution.c
|
||
@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts)
|
||
the loop body has been executed 6 times. */
|
||
|
||
tree
|
||
-number_of_latch_executions (class loop *loop)
|
||
+number_of_latch_executions (class loop *loop, bool guarantee)
|
||
{
|
||
edge exit;
|
||
class tree_niter_desc niter_desc;
|
||
@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop)
|
||
res = chrec_dont_know;
|
||
exit = single_exit (loop);
|
||
|
||
- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false))
|
||
+ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false,
|
||
+ true, NULL, guarantee))
|
||
{
|
||
may_be_zero = niter_desc.may_be_zero;
|
||
res = niter_desc.niter;
|
||
@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop)
|
||
fprintf (dump_file, "))\n");
|
||
}
|
||
|
||
- loop->nb_iterations = res;
|
||
+ if (guarantee)
|
||
+ loop->nb_iterations = res;
|
||
return res;
|
||
}
|
||
|
||
diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h
|
||
index e2fbfb55b..218155650 100644
|
||
--- a/gcc/tree-scalar-evolution.h
|
||
+++ b/gcc/tree-scalar-evolution.h
|
||
@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. If not see
|
||
#ifndef GCC_TREE_SCALAR_EVOLUTION_H
|
||
#define GCC_TREE_SCALAR_EVOLUTION_H
|
||
|
||
-extern tree number_of_latch_executions (class loop *);
|
||
+extern tree number_of_latch_executions (class loop *,
|
||
+ bool guarantee = true);
|
||
extern gcond *get_loop_exit_condition (const class loop *);
|
||
|
||
extern void scev_initialize (void);
|
||
diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c
|
||
new file mode 100644
|
||
index 000000000..746a1cf95
|
||
--- /dev/null
|
||
+++ b/gcc/tree-ssa-llc-allocate.c
|
||
@@ -0,0 +1,2898 @@
|
||
+/* LLC allocate.
|
||
+ Copyright (C) 2022-2023 Free Software Foundation, Inc.
|
||
+
|
||
+This file is part of GCC.
|
||
+
|
||
+GCC is free software; you can redistribute it and/or modify it
|
||
+under the terms of the GNU General Public License as published by the
|
||
+Free Software Foundation; either version 3, or (at your option) any
|
||
+later version.
|
||
+
|
||
+GCC is distributed in the hope that it will be useful, but WITHOUT
|
||
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
+for more details.
|
||
+
|
||
+You should have received a copy of the GNU General Public License
|
||
+along with GCC; see the file COPYING3. If not see
|
||
+<http://www.gnu.org/licenses/>. */
|
||
+
|
||
+#include "config.h"
|
||
+#define INCLUDE_MAP
|
||
+#define INCLUDE_SET
|
||
+#define INCLUDE_VECTOR
|
||
+#define INCLUDE_LIST
|
||
+#define INCLUDE_ALGORITHM
|
||
+#define INCLUDE_STRING
|
||
+#include "system.h"
|
||
+#include "coretypes.h"
|
||
+#include "backend.h"
|
||
+#include "target.h"
|
||
+#include "rtl.h"
|
||
+#include "tree.h"
|
||
+#include "gimple.h"
|
||
+#include "predict.h"
|
||
+#include "tree-pass.h"
|
||
+#include "gimple-ssa.h"
|
||
+#include "optabs-query.h"
|
||
+#include "tree-pretty-print.h"
|
||
+#include "fold-const.h"
|
||
+#include "stor-layout.h"
|
||
+#include "gimplify.h"
|
||
+#include "gimple-iterator.h"
|
||
+#include "gimplify-me.h"
|
||
+#include "tree-ssa-loop-ivopts.h"
|
||
+#include "tree-ssa-loop-manip.h"
|
||
+#include "tree-ssa-loop-niter.h"
|
||
+#include "tree-ssa-loop.h"
|
||
+#include "ssa.h"
|
||
+#include "tree-into-ssa.h"
|
||
+#include "cfgloop.h"
|
||
+#include "tree-scalar-evolution.h"
|
||
+#include "langhooks.h"
|
||
+#include "tree-inline.h"
|
||
+#include "tree-data-ref.h"
|
||
+#include "diagnostic-core.h"
|
||
+#include "dbgcnt.h"
|
||
+#include "gimple-pretty-print.h"
|
||
+#include "internal-fn.h"
|
||
+#include "tree-cfg.h"
|
||
+#include "profile-count.h"
|
||
+
|
||
+/* Number of parallel cores. */
|
||
+const unsigned int PARALLEL_NUM = 288;
|
||
+
|
||
+/* Indirect access weight. */
|
||
+const unsigned int INDIRECT_ACCESS_VALUE = 2;
|
||
+
|
||
+/* Write memory weight. */
|
||
+const unsigned int WRITE_COST = 2;
|
||
+
|
||
+/* Prefetch tool input max length. */
|
||
+#ifndef PREFETCH_TOOL_INPUT_MAX_LEN
|
||
+#define PREFETCH_TOOL_INPUT_MAX_LEN 512
|
||
+#endif
|
||
+
|
||
+/* Prefetch tool number max length. */
|
||
+#ifndef PREFETCH_TOOL_NUM_MAX_LEN
|
||
+#define PREFETCH_TOOL_NUM_MAX_LEN 9
|
||
+#endif
|
||
+
|
||
+namespace {
|
||
+
|
||
+using namespace std;
|
||
+
|
||
+/* loop bound info of the memory reference located. */
|
||
+struct loop_bound
|
||
+{
|
||
+ /* iv tree_node. */
|
||
+ tree iv;
|
||
+
|
||
+ /* define stmt of iv. */
|
||
+ gimple *def_stmt;
|
||
+
|
||
+ /* loop where stmt is located. */
|
||
+ class loop *loop;
|
||
+
|
||
+ /* loop unroll factor. */
|
||
+ unsigned int unroll;
|
||
+
|
||
+ /* Number of iterations of loop. */
|
||
+ tree niters;
|
||
+
|
||
+ loop_bound (tree t, gimple *stmt)
|
||
+ {
|
||
+ iv = t;
|
||
+ def_stmt = stmt;
|
||
+ loop = loop_containing_stmt (stmt);
|
||
+ unroll = 1;
|
||
+ niters = chrec_dont_know;
|
||
+ }
|
||
+};
|
||
+
|
||
+/* method of calculating the data size. */
|
||
+
|
||
+enum calc_type
|
||
+{
|
||
+ UNHANDLE_CALC = 0,
|
||
+ RUNTIME_CALC,
|
||
+ STATIC_CALC
|
||
+};
|
||
+
|
||
+/* Describes a info of a memory reference. */
|
||
+
|
||
+struct data_ref
|
||
+{
|
||
+ /* The memory reference. */
|
||
+ tree ref;
|
||
+
|
||
+ /* Statement where the ref is located. */
|
||
+ gimple *stmt;
|
||
+
|
||
+ /* var_decl or param_decl, used for the ref_group. */
|
||
+ tree var;
|
||
+
|
||
+ /* Base of the reference. */
|
||
+ tree base;
|
||
+
|
||
+ /* Constant offset of the reference. */
|
||
+ tree offset;
|
||
+
|
||
+ /* index of the reference. */
|
||
+ tree index;
|
||
+
|
||
+ /* Constant step of the reference. */
|
||
+ tree step;
|
||
+
|
||
+ /* loop boundary info of each dimension. */
|
||
+ vector<loop_bound> loop_bounds;
|
||
+
|
||
+ /* memory data size, Unit: MB. */
|
||
+ double data_size;
|
||
+
|
||
+ /* method of calculating the data size. */
|
||
+ calc_type calc_by;
|
||
+
|
||
+ /* True if the info of ref is traced, and then record it. */
|
||
+ unsigned int trace_status_p : 1;
|
||
+
|
||
+ /* True if the loop is vectorized. */
|
||
+ unsigned int vectorize_p : 1;
|
||
+
|
||
+ /* True if the memory reference is shared. */
|
||
+ unsigned int parallel_p : 1;
|
||
+
|
||
+ /* True if the memory reference is regular. */
|
||
+ unsigned int regular_p : 1;
|
||
+
|
||
+ /* True if the memory reference is read. */
|
||
+ unsigned int read_p : 1;
|
||
+
|
||
+ data_ref ()
|
||
+ {
|
||
+ ref = NULL_TREE;
|
||
+ stmt = NULL;
|
||
+ var = NULL_TREE;
|
||
+ base = NULL_TREE;
|
||
+ offset = NULL_TREE;
|
||
+ index = NULL_TREE;
|
||
+ step = NULL_TREE;
|
||
+ data_size = 0;
|
||
+ calc_by = UNHANDLE_CALC;
|
||
+ trace_status_p = false;
|
||
+ vectorize_p = false;
|
||
+ parallel_p = false;
|
||
+ regular_p = true;
|
||
+ read_p = true;
|
||
+ }
|
||
+};
|
||
+
|
||
+/* ================ phase 1 get_dense_memory_kernels ================ */
|
||
+
|
||
+/* Add ref node and print. */
|
||
+
|
||
+void
|
||
+add_ref (vector<data_ref> &references, tree op, gimple *stmt,
|
||
+ bool vectorize_p, bool read_p)
|
||
+{
|
||
+ data_ref ref;
|
||
+ ref.ref = op;
|
||
+ ref.stmt = stmt;
|
||
+ ref.vectorize_p = vectorize_p;
|
||
+ ref.read_p = read_p;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, ref.ref, TDF_LINENO);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ references.push_back (ref);
|
||
+}
|
||
+
|
||
+/* Get the references from the simple call (vectorization type). */
|
||
+
|
||
+void
|
||
+get_references_in_gimple_call (gimple *stmt, vector<data_ref> &references)
|
||
+{
|
||
+ if (gimple_code (stmt) != GIMPLE_CALL)
|
||
+ return;
|
||
+
|
||
+ if (gimple_call_internal_p (stmt))
|
||
+ {
|
||
+ bool read_p = false;
|
||
+ switch (gimple_call_internal_fn (stmt))
|
||
+ {
|
||
+ case IFN_MASK_GATHER_LOAD:
|
||
+ case IFN_MASK_LOAD:
|
||
+ {
|
||
+ if (gimple_call_lhs (stmt) == NULL_TREE)
|
||
+ return;
|
||
+ read_p = true;
|
||
+ // FALLTHRU
|
||
+ }
|
||
+ case IFN_MASK_STORE:
|
||
+ {
|
||
+ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B];
|
||
+ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4);
|
||
+
|
||
+ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B];
|
||
+ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2);
|
||
+
|
||
+ _1 = (sizetype) a_2(D);
|
||
+ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8,
|
||
+ { 0.0, ... }, loop_mask_5);
|
||
+ */
|
||
+ tree op1 = gimple_call_arg (stmt, 0);
|
||
+ if (TREE_CODE (op1) != SSA_NAME)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "get_references_in_gimple_call: ");
|
||
+ fprintf (dump_file, "find base that not ssa_name: ");
|
||
+ print_generic_expr (dump_file, op1, TDF_LINENO);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return;
|
||
+ }
|
||
+ gimple *op1_def = SSA_NAME_DEF_STMT (op1);
|
||
+ if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN)
|
||
+ {
|
||
+ /* &MEM[base: xx] */
|
||
+ tree rhs1 = gimple_assign_rhs1 (op1_def);
|
||
+ /* If the definition stmt of the operation is memory
|
||
+ reference type, read it directly. */
|
||
+ if (TREE_CODE (rhs1) == ADDR_EXPR
|
||
+ && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF)
|
||
+ op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */
|
||
+ }
|
||
+
|
||
+ add_ref (references, op1, stmt, true, read_p);
|
||
+ return;
|
||
+ }
|
||
+ default:
|
||
+ return;
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Stores the locations of memory references in STMT to REFERENCES. */
|
||
+
|
||
+void
|
||
+get_references_in_stmt (gimple *stmt, vector<data_ref> &references)
|
||
+{
|
||
+ if (!gimple_vuse (stmt))
|
||
+ return;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "gimple_vuse: ");
|
||
+ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
|
||
+ }
|
||
+
|
||
+ if (gimple_code (stmt) == GIMPLE_ASSIGN)
|
||
+ {
|
||
+ tree op0 = gimple_assign_lhs (stmt);
|
||
+ tree op1 = gimple_assign_rhs1 (stmt);
|
||
+ tree base = NULL_TREE;
|
||
+
|
||
+ /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */
|
||
+ if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1))
|
||
+ && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base))
|
||
+ add_ref (references, op1, stmt, false, true);
|
||
+
|
||
+ if (REFERENCE_CLASS_P (op0) && get_base_address (op0))
|
||
+ add_ref (references, op0, stmt, false, false);
|
||
+ }
|
||
+ else if (gimple_code (stmt) == GIMPLE_CALL)
|
||
+ get_references_in_gimple_call (stmt, references);
|
||
+
|
||
+ return;
|
||
+}
|
||
+
|
||
+/* flag of loop filter out. */
|
||
+
|
||
+struct loop_filter_out_flag
|
||
+{
|
||
+ /* Use external gimple. */
|
||
+ bool use_ext_gimple;
|
||
+
|
||
+ /* Use external call. */
|
||
+ bool use_ext_call;
|
||
+
|
||
+ /* Use external node. */
|
||
+ bool use_ext_node;
|
||
+
|
||
+ /* Use loop defined in macros. */
|
||
+ bool use_macro_loop;
|
||
+
|
||
+ /* Use external node. */
|
||
+ bool use_cond_func;
|
||
+};
|
||
+
|
||
+/* Check whether an external node is used. */
|
||
+
|
||
+bool use_ext_node_p (const vector<data_ref> &references,
|
||
+ unsigned int &start)
|
||
+{
|
||
+ expanded_location cfun_xloc
|
||
+ = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
|
||
+
|
||
+ unsigned i = start;
|
||
+ start = references.size ();
|
||
+ for (; i < references.size (); i++)
|
||
+ {
|
||
+ data_ref ref = references[i];
|
||
+ expanded_location xloc = expand_location (ref.stmt->location);
|
||
+ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "use_ext_node\n\n");
|
||
+ return true;
|
||
+ }
|
||
+ }
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* Determine whether to filter out loops by stmt. */
|
||
+
|
||
+bool
|
||
+filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt,
|
||
+ const vector<data_ref> &references, unsigned int &start)
|
||
+{
|
||
+ /* check use_ext_gimple. */
|
||
+ expanded_location cfun_xloc
|
||
+ = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
|
||
+ expanded_location xloc = expand_location (stmt->location);
|
||
+ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "use_ext_gimple: ");
|
||
+ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
|
||
+ }
|
||
+ loop_filter.use_ext_gimple = true;
|
||
+ return true;
|
||
+ }
|
||
+
|
||
+ /* check use_ext_call. */
|
||
+ if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "use_ext_call: ");
|
||
+ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
|
||
+ }
|
||
+ loop_filter.use_ext_call = true;
|
||
+ return true;
|
||
+ }
|
||
+
|
||
+ /* check use_macro_loop. */
|
||
+ if (xloc.file && xloc.column != 1)
|
||
+ loop_filter.use_macro_loop = false;
|
||
+
|
||
+ /* checke use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */
|
||
+ if (gimple_code (stmt) == GIMPLE_ASSIGN)
|
||
+ {
|
||
+ enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
|
||
+ if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR
|
||
+ || rhs_code == MAX_EXPR)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "use_cond_func: ");
|
||
+ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
|
||
+ }
|
||
+ loop_filter.use_cond_func = true;
|
||
+ return true;
|
||
+ }
|
||
+ }
|
||
+
|
||
+ /* check use_ext_node. */
|
||
+ if (use_ext_node_p (references, start))
|
||
+ {
|
||
+ loop_filter.use_ext_node = true;
|
||
+ return true;
|
||
+ }
|
||
+
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* Dump the flag type of the loop is filtered out. */
|
||
+
|
||
+void
|
||
+dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter)
|
||
+{
|
||
+ if (loop_filter.use_ext_gimple)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: use_ext_gimple\n");
|
||
+ }
|
||
+ if (loop_filter.use_ext_call)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: use_ext_call\n");
|
||
+ }
|
||
+
|
||
+ if (loop_filter.use_ext_node)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: use_ext_node\n");
|
||
+ }
|
||
+
|
||
+ if (loop_filter.use_macro_loop)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: use_macro_loop\n");
|
||
+ }
|
||
+
|
||
+ if (loop_filter.use_cond_func)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: use_cond_func\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Get references in loop. */
|
||
+
|
||
+bool
|
||
+get_references_in_loop (vector<data_ref> &references,
|
||
+ loop_filter_out_flag &loop_filter,
|
||
+ class loop *loop)
|
||
+{
|
||
+ unsigned int start = 0;
|
||
+ bool filter_out_loop = true;
|
||
+
|
||
+ /* Analyze each bb in the loop. */
|
||
+ basic_block *body = get_loop_body_in_dom_order (loop);
|
||
+ for (unsigned i = 0; i < loop->num_nodes; i++)
|
||
+ {
|
||
+ basic_block bb = body[i];
|
||
+ if (bb->loop_father != loop)
|
||
+ continue;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i);
|
||
+ gimple_dump_bb (dump_file, bb, 0, dump_flags);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+
|
||
+ gimple_stmt_iterator bsi;
|
||
+ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
|
||
+ {
|
||
+ gimple *stmt = gsi_stmt (bsi);
|
||
+ get_references_in_stmt (stmt, references);
|
||
+ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt,
|
||
+ references, start);
|
||
+ if (filter_out_loop)
|
||
+ break;
|
||
+ }
|
||
+ if (filter_out_loop)
|
||
+ break;
|
||
+ }
|
||
+ free (body);
|
||
+ return !filter_out_loop;
|
||
+}
|
||
+
|
||
+/* Determine whether the loop is a single path. */
|
||
+
|
||
+bool
|
||
+single_path_p (class loop *loop, basic_block bb)
|
||
+{
|
||
+ if (bb == NULL)
|
||
+ return false;
|
||
+ if (bb == loop->latch)
|
||
+ return true;
|
||
+
|
||
+ gimple *stmt = last_stmt (bb);
|
||
+ bool res = false;
|
||
+
|
||
+ if (stmt && gimple_code (stmt) == GIMPLE_COND)
|
||
+ {
|
||
+ gcc_assert (EDGE_COUNT (bb->succs) == 2);
|
||
+ edge true_edge = NULL;
|
||
+ edge false_edge = NULL;
|
||
+ extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
|
||
+
|
||
+ /* Returns false, if a branch occurs. */
|
||
+ if (true_edge->dest->loop_father == loop
|
||
+ && false_edge->dest->loop_father == loop)
|
||
+ return false;
|
||
+
|
||
+ if (true_edge->dest->loop_father == loop)
|
||
+ res = single_path_p (loop, true_edge->dest);
|
||
+ else
|
||
+ res = single_path_p (loop, false_edge->dest);
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ edge e = find_fallthru_edge (bb->succs);
|
||
+ if (e)
|
||
+ res = single_path_p (loop, e->dest);
|
||
+ }
|
||
+ return res;
|
||
+}
|
||
+
|
||
+/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS.
|
||
+ Assume that the HPC data reading and calculation process does not involve
|
||
+ adding branches in loops. Therefore, all bbs of loops are directly used for
|
||
+ calculation (excluding embedded loops) without considering branch weighting.
|
||
+*/
|
||
+
|
||
+unsigned
|
||
+estimate_loop_insns (class loop *loop, eni_weights *weights)
|
||
+{
|
||
+ basic_block *body = get_loop_body (loop);
|
||
+ gimple_stmt_iterator gsi;
|
||
+ unsigned size = 0, i;
|
||
+
|
||
+ for (i = 0; i < loop->num_nodes; i++)
|
||
+ {
|
||
+ basic_block bb = body[i];
|
||
+ if (bb->loop_father != loop)
|
||
+ {
|
||
+ continue;
|
||
+ }
|
||
+ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
|
||
+ size += estimate_num_insns (gsi_stmt (gsi), weights);
|
||
+ }
|
||
+ free (body);
|
||
+
|
||
+ return size;
|
||
+}
|
||
+
|
||
+/* Check whether the memory access is dense. */
|
||
+
|
||
+bool
|
||
+dense_memory_p (const vector<data_ref> &references, class loop *loop)
|
||
+{
|
||
+ int ref_count = references.size ();
|
||
+ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights);
|
||
+ float mem_to_insn_ratio = (float)ref_count / (float)ninsns;
|
||
+
|
||
+ /* The number of cores to be run and DDR bandwidth information can be
|
||
+ transferred to flexibly adjust the threshold. */
|
||
+ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0)
|
||
+ && ref_count >= param_mem_access_num);
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl));
|
||
+
|
||
+ /* Dump dense memory source code location. */
|
||
+ if (ref_count && references[0].stmt->location)
|
||
+ {
|
||
+ expanded_location xloc = expand_location
|
||
+ (references[0].stmt->location);
|
||
+ int fn_start = 0;
|
||
+ if (DECL_SOURCE_LOCATION (current_function_decl))
|
||
+ fn_start = expand_location (
|
||
+ DECL_SOURCE_LOCATION (current_function_decl)).line;
|
||
+ int fn_end = fn_start;
|
||
+ if (cfun->function_end_locus)
|
||
+ fn_end = expand_location (cfun->function_end_locus).line;
|
||
+ if (xloc.file)
|
||
+ fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ",
|
||
+ xloc.file, fn_name, fn_start, fn_end,
|
||
+ xloc.line, xloc.column);
|
||
+ }
|
||
+
|
||
+ /* Dump memory dense information. */
|
||
+ if (dense_mem)
|
||
+ fprintf (dump_file, "dense memory access: ");
|
||
+ else
|
||
+ fprintf (dump_file, "non-dense mem access: ");
|
||
+ fprintf (dump_file,
|
||
+ "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n",
|
||
+ ref_count, ninsns, mem_to_insn_ratio);
|
||
+ }
|
||
+
|
||
+ return dense_mem;
|
||
+}
|
||
+
|
||
+/* Analyze the inner loop and get the loop with dense memory access. */
|
||
+
|
||
+bool
|
||
+get_dense_memory_kernels (vector<class loop *> &kernels,
|
||
+ map<class loop *, vector<data_ref> > &kernels_refs)
|
||
+{
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n");
|
||
+ class loop *loop = NULL;
|
||
+ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
|
||
+ {
|
||
+ number_of_latch_executions (loop);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\n========== Processing loop %d: ==========\n",
|
||
+ loop->num);
|
||
+ loop_dump (dump_file, loop);
|
||
+ flow_loop_dump (loop, dump_file, NULL, 1);
|
||
+ fprintf (dump_file, "loop unroll: %d\n", loop->unroll);
|
||
+ }
|
||
+
|
||
+ if (get_loop_exit_edges (loop).length () != 1
|
||
+ || !single_path_p (loop, loop->header))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "non-dense mem access: loop_branching\n");
|
||
+ continue;
|
||
+ }
|
||
+
|
||
+ vector<data_ref> references;
|
||
+ loop_filter_out_flag loop_filter = {false, false, false, true, false};
|
||
+
|
||
+ if (!get_references_in_loop (references, loop_filter, loop))
|
||
+ {
|
||
+ dump_loop_filter_out_flag (loop_filter);
|
||
+ continue;
|
||
+ }
|
||
+
|
||
+ if (dense_memory_p (references, loop))
|
||
+ {
|
||
+ kernels_refs[loop] = references;
|
||
+ kernels.push_back (loop);
|
||
+ }
|
||
+ }
|
||
+ return kernels.size () > 0;
|
||
+}
|
||
+
|
||
+/* ================ phase 2 trace_data_refs_info ================ */
|
||
+
|
||
+/* Determine whether the declaration is a non-vectorized. */
|
||
+
|
||
+bool
|
||
+generic_decl_p (tree expr)
|
||
+{
|
||
+ if (expr == NULL_TREE)
|
||
+ return false;
|
||
+ enum tree_code expr_code = TREE_CODE (expr);
|
||
+ if (expr_code != VAR_DECL && expr_code != PARM_DECL
|
||
+ && expr_code != COMPONENT_REF)
|
||
+ return false;
|
||
+
|
||
+ tree type = TREE_TYPE (expr);
|
||
+ while (type)
|
||
+ {
|
||
+ if (TREE_CODE (type) != VECTOR_TYPE)
|
||
+ /* TREE_TYPE (NODE) (
|
||
+ CONTAINS_STRUCT_CHECK (NODE, TS_TYPED)->typed.type) */
|
||
+ type = CONTAINS_STRUCT_CHECK (type, TS_TYPED) ? TREE_TYPE (type) : NULL;
|
||
+ else
|
||
+ return false;
|
||
+ }
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* Initial worklist preparation for source variable tracing.
|
||
+ Add different initial node based on different gimple statements. */
|
||
+
|
||
+void
|
||
+add_worklist (vector<tree> &worklist, set<tree> &walked, gimple *def_stmt)
|
||
+{
|
||
+ if (gimple_code (def_stmt) == GIMPLE_PHI)
|
||
+ {
|
||
+ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++)
|
||
+ {
|
||
+ tree node = gimple_phi_arg_def (def_stmt, i);
|
||
+ if (!walked.count (node))
|
||
+ {
|
||
+ worklist.push_back (node);
|
||
+ walked.insert (node);
|
||
+ }
|
||
+ }
|
||
+ }
|
||
+ else if (is_gimple_assign (def_stmt))
|
||
+ {
|
||
+ tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
|
||
+ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR
|
||
+ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME
|
||
+ || rhs_code == COMPONENT_REF)
|
||
+ {
|
||
+ tree node = gimple_assign_rhs1 (def_stmt);
|
||
+ if (!walked.count (node))
|
||
+ {
|
||
+ worklist.push_back (node);
|
||
+ walked.insert (node);
|
||
+ }
|
||
+ }
|
||
+ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR)
|
||
+ {
|
||
+ tree node = gimple_assign_rhs1 (def_stmt);
|
||
+ if (!walked.count (node))
|
||
+ {
|
||
+ worklist.push_back (node);
|
||
+ walked.insert (node);
|
||
+ }
|
||
+ node = gimple_assign_rhs2 (def_stmt);
|
||
+ if (!walked.count (node))
|
||
+ {
|
||
+ worklist.push_back (node);
|
||
+ walked.insert (node);
|
||
+ }
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ /* unhandled assign rhs_code: _219 = _17 * _70;
|
||
+ _17 = *grid_56(D).sst.span;
|
||
+ _70 = *grid_56(D).sst.dim[0].stride;
|
||
+ */
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "unhandled assign rhs_code: ");
|
||
+ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ }
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "unsupported tracing stmt: ");
|
||
+ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+
|
||
+/* Tracing source variables:
|
||
+ vectp.1 = a_2(D) + _3;
|
||
+ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B];
|
||
+ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7);
|
||
+
|
||
+ _1 = (sizetype) b_2(D);
|
||
+ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... },
|
||
+ loop_mask_5);
|
||
+ ...
|
||
+ Due to previous pass optimizations, the current tracing method can find
|
||
+ several source variable candidates. We decide to record them in a map and
|
||
+ later filter out the true base variable by some criteria.
|
||
+*/
|
||
+
|
||
+void
|
||
+trace_base_var_helper (tree arg, set<tree> &walked,
|
||
+ map<tree, int>& base_var_candid)
|
||
+{
|
||
+ if (arg == NULL)
|
||
+ return;
|
||
+
|
||
+ /* Array type. */
|
||
+ tree op0 = NULL;
|
||
+ if (TREE_CODE (arg) == ADDR_EXPR
|
||
+ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "array type\n");
|
||
+ base_var_candid[op0] += 1;
|
||
+ return;
|
||
+ }
|
||
+
|
||
+ /* Pointer type. */
|
||
+ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "pointer type\n");
|
||
+ base_var_candid[arg] += 1;
|
||
+ return;
|
||
+ }
|
||
+
|
||
+ /* SSA_NAME type. */
|
||
+ if (TREE_CODE (arg) != SSA_NAME)
|
||
+ return;
|
||
+
|
||
+ tree tmp_var = SSA_NAME_VAR (arg);
|
||
+ if (tmp_var && generic_decl_p (tmp_var)
|
||
+ && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "ssa pointer type\n");
|
||
+ base_var_candid[tmp_var] += 1;
|
||
+ return;
|
||
+ }
|
||
+
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (arg);
|
||
+ if (def_stmt == NULL)
|
||
+ return;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, arg, TDF_SLIM);
|
||
+ fprintf (dump_file, "\t\t: ");
|
||
+ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
|
||
+ }
|
||
+
|
||
+ vector<tree> worklist;
|
||
+ add_worklist (worklist, walked, def_stmt);
|
||
+ for (unsigned i = 0; i < worklist.size (); ++i)
|
||
+ trace_base_var_helper (worklist[i], walked, base_var_candid);
|
||
+}
|
||
+
|
||
+/* Identify the base variable traced from base address of memory reference.
|
||
+ We recognize that current method could detect several base variable
|
||
+ candidates and the temporary criteria for base variable determination
|
||
+ is that either one of the following statement is true:
|
||
+ 1. The number of base variable candidates is 1;
|
||
+ 2. The number of detected gimple statements for some variable is 1.
|
||
+ We may use other criteria or relax the current criteria
|
||
+ (e.g., criterion 2: 1 -> any odd number). */
|
||
+
|
||
+bool
|
||
+trace_base_var (tree &var, tree arg, set<tree> &walked)
|
||
+{
|
||
+ map<tree, int> base_var_candid;
|
||
+ trace_base_var_helper (arg, walked, base_var_candid);
|
||
+ bool is_tracing_unusual = false;
|
||
+ if (base_var_candid.size () == 1)
|
||
+ var = base_var_candid.begin ()->first;
|
||
+ else
|
||
+ {
|
||
+ is_tracing_unusual = true;
|
||
+ for (const pair<tree, int>& base_var_count : base_var_candid)
|
||
+ if (base_var_count.second == 1)
|
||
+ var = base_var_count.first;
|
||
+ }
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "Traced variables at ");
|
||
+ print_generic_expr (dump_file, arg, TDF_SLIM);
|
||
+ fprintf (dump_file, ":\n");
|
||
+ for (const pair<tree, int>& base_var_count : base_var_candid)
|
||
+ fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first),
|
||
+ base_var_count.second);
|
||
+ fprintf (dump_file, "\n");
|
||
+
|
||
+ if (var == NULL_TREE)
|
||
+ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n");
|
||
+ else if (is_tracing_unusual && var != NULL_TREE)
|
||
+ fprintf (dump_file, "Tracing unusual number or occurrences of base "
|
||
+ "variables. Choose %s.\n", get_name (var));
|
||
+ }
|
||
+ return var != NULL_TREE;
|
||
+}
|
||
+
|
||
+/* Tracing direct memory reference information. */
|
||
+
|
||
+bool
|
||
+trace_direct_mem_ref (data_ref &mem_ref, set <gimple *> &traced_ref_stmt)
|
||
+{
|
||
+ if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF)
|
||
+ return false;
|
||
+
|
||
+ /* Direct memory access, regardless of whether it is in vectorized form,
|
||
+ can be determined through TARGET_MEM_REF. */
|
||
+ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0);
|
||
+ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1);
|
||
+ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2);
|
||
+ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3);
|
||
+
|
||
+ set<tree> walked;
|
||
+ if (mem_ref.var == NULL_TREE
|
||
+ && !trace_base_var (mem_ref.var, mem_ref.base, walked))
|
||
+ return false;
|
||
+
|
||
+ traced_ref_stmt.insert (mem_ref.stmt);
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* Recursively trace and check whether the definition stmt of the
|
||
+ index operand is a recorded stmt in direct access tracing.
|
||
+ If true, it is an indirect access. */
|
||
+
|
||
+bool
|
||
+trace_indirect_operand (tree arg, set<gimple *> &traced_ref_stmt)
|
||
+{
|
||
+ if (TREE_CODE (arg) != SSA_NAME)
|
||
+ return false;
|
||
+
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (arg);
|
||
+
|
||
+ if (traced_ref_stmt.count (def_stmt))
|
||
+ return true;
|
||
+
|
||
+ if (!def_stmt || !is_gimple_assign (def_stmt))
|
||
+ return false;
|
||
+
|
||
+ tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
|
||
+ /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array
|
||
+ type indirect memory access. Please check examples before function
|
||
+ trace_indirect_ptr and trace_indirect_array. */
|
||
+ if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR
|
||
+ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR
|
||
+ && rhs_code != ARRAY_REF)
|
||
+ return false;
|
||
+
|
||
+ tree op = NULL_TREE;
|
||
+ ssa_op_iter iter;
|
||
+ FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE)
|
||
+ {
|
||
+ if (trace_indirect_operand (op, traced_ref_stmt))
|
||
+ return true;
|
||
+ }
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* Trace the pointer of the indirect memory access:
|
||
+ 1) obtain the base address of the indirect memory access.
|
||
+ 2) ensure that the index has been traced in the direct memory access.
|
||
+
|
||
+ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in
|
||
+ direct access
|
||
+ _4 = (long unsigned int) _1;
|
||
+ _5 = _4 * 8;
|
||
+ _6 = p(D) + _5; // get base
|
||
+ _7 = *_6; // start tracing
|
||
+*/
|
||
+
|
||
+bool
|
||
+trace_indirect_ptr (tree &base, tree &index, tree arg,
|
||
+ set<gimple *> traced_ref_stmt)
|
||
+{
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (arg);
|
||
+
|
||
+ if (!def_stmt || !is_gimple_assign (def_stmt))
|
||
+ return false;
|
||
+
|
||
+ tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
|
||
+ if (rhs_code != POINTER_PLUS_EXPR)
|
||
+ return false;
|
||
+
|
||
+ /* POINTER_PLUS_EXPR, The first operand is always a pointer/reference type.
|
||
+ The second operand is always an unsigned integer type compatible with
|
||
+ sizetype. */
|
||
+ base = gimple_assign_rhs1 (def_stmt);
|
||
+ index = gimple_assign_rhs2 (def_stmt);
|
||
+
|
||
+ return trace_indirect_operand (index, traced_ref_stmt);
|
||
+}
|
||
+
|
||
+/* Trace the array of the indirect memory access:
|
||
+ 1) obtain the base address of the indirect memory access.
|
||
+ 2) ensure that the index has been traced in the direct memory access.
|
||
+
|
||
+ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in
|
||
+ direct access
|
||
+ _4 = (integer(kind=8)) _1;
|
||
+ _5 = _4 + 135;
|
||
+ _6 = p[_5]; // start tracing
|
||
+*/
|
||
+
|
||
+bool
|
||
+trace_indirect_array (tree &base, tree &index,
|
||
+ set<gimple *> traced_ref_stmt, tree ref)
|
||
+{
|
||
+ if (TREE_CODE (ref) != ARRAY_REF)
|
||
+ return false;
|
||
+ base = TREE_OPERAND (ref, 0);
|
||
+ index = TREE_OPERAND (ref, 1);
|
||
+ return trace_indirect_operand (index, traced_ref_stmt);
|
||
+}
|
||
+
|
||
+/* Tracing indirect memory reference information.
|
||
+ Include tracing of base addresses and source variable.
|
||
+ _x(ssa name) -> a_2(base addr) -> a(src var) */
|
||
+
|
||
+bool
|
||
+trace_indirect_mem_ref (data_ref &mem_ref,
|
||
+ set <gimple *> &traced_ref_stmt)
|
||
+{
|
||
+ /* Processing of vectorization types. */
|
||
+ if (mem_ref.vectorize_p)
|
||
+ {
|
||
+ tree op = gimple_call_arg (mem_ref.stmt, 1);
|
||
+ if (trace_indirect_operand (op, traced_ref_stmt))
|
||
+ {
|
||
+ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0);
|
||
+ mem_ref.regular_p = false;
|
||
+ set<tree> walked;
|
||
+ if (mem_ref.var == NULL_TREE
|
||
+ && !trace_base_var (mem_ref.var, mem_ref.base, walked))
|
||
+ return false;
|
||
+ return true;
|
||
+ }
|
||
+ return false;
|
||
+ }
|
||
+
|
||
+ /* Processing of non-vectorized types. */
|
||
+ tree op = NULL_TREE;
|
||
+ ssa_op_iter iter;
|
||
+ FOR_EACH_SSA_TREE_OPERAND (op, mem_ref.stmt, iter, SSA_OP_USE)
|
||
+ {
|
||
+
|
||
+ /* Array type:
|
||
+ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B];
|
||
+ _4 = c[_1];
|
||
+
|
||
+ Pointer type:
|
||
+ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B];
|
||
+ _4 = (long unsigned int) _1;
|
||
+ _5 = _4 * 8;
|
||
+ _6 = p(D) + _5;
|
||
+ _7 = *_6;
|
||
+ */
|
||
+ tree base = NULL_TREE;
|
||
+ tree index = NULL_TREE;
|
||
+ if (trace_indirect_array (base, index, traced_ref_stmt, mem_ref.ref)
|
||
+ || trace_indirect_ptr (base, index, op, traced_ref_stmt))
|
||
+ {
|
||
+ /* ARRAY_REF, The first operand is the array;
|
||
+ the second is the index. */
|
||
+ mem_ref.base = base;
|
||
+ mem_ref.index = index;
|
||
+ mem_ref.regular_p = false;
|
||
+ set<tree> walked;
|
||
+ if (mem_ref.var == NULL_TREE
|
||
+ && !trace_base_var (mem_ref.var, mem_ref.base, walked))
|
||
+ return false;
|
||
+ return true;
|
||
+ }
|
||
+ }
|
||
+
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* Trace references base info:
|
||
+ 1) Parallel analysis
|
||
+ 2) Memory access rule analysis
|
||
+ 3) Tracing base address and source variable of memory references
|
||
+ We will extend parallel analysis later.
|
||
+*/
|
||
+
|
||
+void
|
||
+trace_ref_info (data_ref &mem_ref, set <gimple *> &traced_ref_stmt)
|
||
+{
|
||
+ enum tree_code ref_code = TREE_CODE (mem_ref.ref);
|
||
+ if (/* Vectorized and non-vectorized direct access. */
|
||
+ ref_code != TARGET_MEM_REF
|
||
+ /* non-vectorized indirect memory access. */
|
||
+ && ref_code != MEM_REF && ref_code != ARRAY_REF
|
||
+ /* vectorized indirect memory access. */
|
||
+ && ref_code != SSA_NAME)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "ref is another tree-code: ");
|
||
+ fprintf (dump_file, "stmt: ");
|
||
+ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO);
|
||
+ fprintf (dump_file, "ref: ");
|
||
+ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return;
|
||
+ }
|
||
+
|
||
+ /* 1) Direct and indirect access traces and traces source variables. */
|
||
+ if (!trace_direct_mem_ref (mem_ref, traced_ref_stmt)
|
||
+ && !trace_indirect_mem_ref (mem_ref, traced_ref_stmt))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Tracing failed.\n\n");
|
||
+ return;
|
||
+ }
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Tracing succeeded.\n\n");
|
||
+ mem_ref.trace_status_p = true;
|
||
+}
|
||
+
|
||
+/* Tracing and sorting reference groups. */
|
||
+
|
||
+void
|
||
+trace_data_refs_info (vector<class loop *> &kernels,
|
||
+ map<class loop*, vector<data_ref> > &loop_refs)
|
||
+{
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n");
|
||
+
|
||
+ set <gimple *> traced_ref_stmt;
|
||
+
|
||
+ for (unsigned i = 0; i < kernels.size (); ++i)
|
||
+ {
|
||
+ class loop* loop = kernels[i];
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "loop header %d:\n", loop->header->index);
|
||
+ for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "trace_references_base_info %d:\n", j);
|
||
+ print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ trace_ref_info (loop_refs[loop][j], traced_ref_stmt);
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+/* ================ phase 3 analyze_nested_kernels ================ */
|
||
+
|
||
+/* Return the inner most type for arrays and pointers of TYPE. */
|
||
+
|
||
+tree
|
||
+inner_type (tree type)
|
||
+{
|
||
+ while (POINTER_TYPE_P (type)
|
||
+ || TREE_CODE (type) == ARRAY_TYPE)
|
||
+ type = TREE_TYPE (type);
|
||
+ return type;
|
||
+}
|
||
+
|
||
+/* Check whether the input iv is the loop dimension boundary. */
|
||
+
|
||
+bool
|
||
+loop_bound_iv_p (tree t, tree &outer_loop_t)
|
||
+{
|
||
+ if (t == NULL || TREE_CODE (t) != SSA_NAME
|
||
+ || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE)
|
||
+ return false;
|
||
+
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (t);
|
||
+ if (gimple_code (def_stmt) != GIMPLE_PHI)
|
||
+ return false;
|
||
+
|
||
+ /* Filter scenarios with only two phi inputs. */
|
||
+ if (gimple_phi_num_args (def_stmt) != 2)
|
||
+ return false;
|
||
+
|
||
+ gphi *phi_stmt = as_a <gphi *> (def_stmt);
|
||
+ basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src;
|
||
+ basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src;
|
||
+
|
||
+ class loop *loop = loop_containing_stmt (def_stmt);
|
||
+ bool res = false;
|
||
+ /* Two phi inputs, one from the current loop and one from the outer loop. */
|
||
+ if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop)))
|
||
+ {
|
||
+ outer_loop_t = gimple_phi_arg_def (def_stmt, 1);
|
||
+ res = true;
|
||
+ }
|
||
+ else if ((src1->loop_father == loop)
|
||
+ && (src0->loop_father == loop_outer (loop)))
|
||
+ {
|
||
+ outer_loop_t = gimple_phi_arg_def (def_stmt, 0);
|
||
+ res = true;
|
||
+ }
|
||
+
|
||
+ if (res)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "===> ");
|
||
+ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
|
||
+ }
|
||
+ return true;
|
||
+ }
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* add worklist and walked list. */
|
||
+
|
||
+void
|
||
+add_worklist_walked (vector<tree> &worklist, set<tree> &walked, tree node)
|
||
+{
|
||
+ if (!walked.count (node))
|
||
+ {
|
||
+ worklist.push_back (node);
|
||
+ /* Avoid phi node cycle introduction, which makes the worklist unable
|
||
+ to end. */
|
||
+ walked.insert (node);
|
||
+ }
|
||
+}
|
||
+
|
||
+/* check bound iv and add worklist. */
|
||
+
|
||
+void
|
||
+check_bound_iv_and_add_worklist (vector<tree> &worklist, set<tree> &walked,
|
||
+ tree t, data_ref &mem_ref)
|
||
+{
|
||
+ if (TREE_CODE (t) != SSA_NAME)
|
||
+ return;
|
||
+
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (t);
|
||
+ if (def_stmt == NULL)
|
||
+ return;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, t, TDF_SLIM);
|
||
+ fprintf (dump_file, "\t\t: ");
|
||
+ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
|
||
+ }
|
||
+
|
||
+ if (gimple_code (def_stmt) == GIMPLE_PHI)
|
||
+ {
|
||
+ tree out_loop_t = NULL_TREE;
|
||
+ if (loop_bound_iv_p (t, out_loop_t))
|
||
+ {
|
||
+ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt));
|
||
+ add_worklist_walked (worklist, walked, out_loop_t);
|
||
+ }
|
||
+ }
|
||
+ else if (is_gimple_assign (def_stmt))
|
||
+ {
|
||
+ tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
|
||
+
|
||
+ /* unary. */
|
||
+ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR)
|
||
+ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt));
|
||
+ else if (rhs_code == POINTER_PLUS_EXPR)
|
||
+ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt));
|
||
+
|
||
+ /* binary. */
|
||
+ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR
|
||
+ || rhs_code == MULT_EXPR)
|
||
+ {
|
||
+ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt));
|
||
+ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt));
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+/* DFS trace the loop bound of iv. */
|
||
+
|
||
+bool
|
||
+trace_loop_bound_iv (data_ref &mem_ref)
|
||
+{
|
||
+ /* Indirect memory access, the size cannot be determined based on the loop
|
||
+ boundary. */
|
||
+ if (!mem_ref.regular_p)
|
||
+ return false;
|
||
+
|
||
+ /* Determine and record the boundary iv of the current index,
|
||
+ but do not trace it. */
|
||
+ tree outer_loop_t = NULL_TREE;
|
||
+ if (loop_bound_iv_p (mem_ref.index, outer_loop_t))
|
||
+ mem_ref.loop_bounds.push_back (
|
||
+ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index)));
|
||
+
|
||
+ vector<tree> worklist;
|
||
+ worklist.push_back (mem_ref.base);
|
||
+ set<tree> walked;
|
||
+
|
||
+ while (worklist.size ())
|
||
+ {
|
||
+ tree t = worklist.back ();
|
||
+ worklist.pop_back ();
|
||
+
|
||
+ /* add worklist. */
|
||
+ check_bound_iv_and_add_worklist (worklist, walked, t, mem_ref);
|
||
+ }
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\nmem_ref access dimension: %ld\n",
|
||
+ mem_ref.loop_bounds.size ());
|
||
+
|
||
+ return mem_ref.loop_bounds.size () > 0;
|
||
+}
|
||
+
|
||
+/* dump loop bound. */
|
||
+
|
||
+void
|
||
+loop_bound_dump (FILE *file, loop_bound &lb)
|
||
+{
|
||
+ class loop *loop = lb.loop;
|
||
+ fprintf (file, "loop_bound: loop_%d (", loop->num);
|
||
+ if (loop->header)
|
||
+ fprintf (file, "header = %d", loop->header->index);
|
||
+ else
|
||
+ {
|
||
+ fprintf (file, "deleted)\n");
|
||
+ return;
|
||
+ }
|
||
+ if (loop->latch)
|
||
+ fprintf (file, ", latch = %d", loop->latch->index);
|
||
+ fprintf (file, ", lb_niters = ");
|
||
+ print_generic_expr (file, lb.niters);
|
||
+ fprintf (file, ")\n");
|
||
+}
|
||
+
|
||
+/* static calculate data size. */
|
||
+
|
||
+void
|
||
+static_calculate_data_size (data_ref &mem_ref)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\nstatic_calculate_data_size\n");
|
||
+
|
||
+ tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
|
||
+ HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0;
|
||
+ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
|
||
+ {
|
||
+ HOST_WIDE_INT est_niter = tree_to_uhwi (mem_ref.loop_bounds[i].niters);
|
||
+ unsigned int unroll = mem_ref.loop_bounds[i].unroll;
|
||
+ if (i == 0)
|
||
+ {
|
||
+ /* The unit conversion between byte, kilobytes, and megabytes is
|
||
+ 1024. */
|
||
+ mem_ref.data_size = double (type_size
|
||
+ * est_niter * unroll) / 1024 / 1024;
|
||
+ }
|
||
+ else
|
||
+ mem_ref.data_size *= est_niter * unroll;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size);
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Recursive tracing and creating of dominant nodes. */
|
||
+
|
||
+tree
|
||
+trace_and_create_dominate_expr (tree expr, class loop *outermost)
|
||
+{
|
||
+ if (expr == NULL_TREE || is_gimple_constant (expr))
|
||
+ return expr;
|
||
+
|
||
+ if (TREE_CODE (expr) != SSA_NAME)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ if (SSA_NAME_IS_DEFAULT_DEF (expr))
|
||
+ return expr;
|
||
+
|
||
+ gimple *stmt = SSA_NAME_DEF_STMT (expr);
|
||
+ basic_block def_bb = gimple_bb (stmt);
|
||
+ if (def_bb == NULL || def_bb->loop_father == NULL)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb))
|
||
+ return expr;
|
||
+
|
||
+ if (gimple_code (stmt) != GIMPLE_ASSIGN)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
|
||
+ tree_code_class code_class = TREE_CODE_CLASS (rhs_code);
|
||
+ tree type = TREE_TYPE (gimple_assign_lhs (stmt));
|
||
+ tree rhs1 = trace_and_create_dominate_expr
|
||
+ (gimple_assign_rhs1 (stmt), outermost);
|
||
+ if (rhs1 == NULL_TREE)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ if (code_class == tcc_unary)
|
||
+ {
|
||
+ tree expr_new = build1 (rhs_code, type, rhs1);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "expr_new = ");
|
||
+ print_generic_expr (dump_file, expr_new, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return expr_new;
|
||
+ }
|
||
+ else if (code_class == tcc_binary)
|
||
+ {
|
||
+ tree rhs2 = trace_and_create_dominate_expr
|
||
+ (gimple_assign_rhs2 (stmt), outermost);
|
||
+ if (rhs2 == NULL_TREE)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "expr_new = ");
|
||
+ print_generic_expr (dump_file, expr_new, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return expr_new;
|
||
+ }
|
||
+
|
||
+ return NULL_TREE;
|
||
+}
|
||
+
|
||
+/* Recursive parsing and craating of nodes in expr expressions. */
|
||
+
|
||
+tree
|
||
+parse_and_create_expr (tree expr, class loop *outermost)
|
||
+{
|
||
+ if (expr == NULL_TREE || expr == chrec_dont_know
|
||
+ || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR)
|
||
+ {
|
||
+ /* tcc_expression (e.g., &q) situation combined with tcc_unary. */
|
||
+ if (TREE_CODE (expr) == ADDR_EXPR && dump_file
|
||
+ && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "tcc_expression case in ADDR_EXPR: ");
|
||
+ print_generic_expr (dump_file, expr, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return expr;
|
||
+ }
|
||
+
|
||
+ if (TREE_CODE (expr) == SSA_NAME)
|
||
+ return trace_and_create_dominate_expr (expr, outermost);
|
||
+ else if (EXPR_P (expr))
|
||
+ {
|
||
+ enum tree_code tree_code = TREE_CODE (expr);
|
||
+ tree_code_class code_class = TREE_CODE_CLASS (tree_code);
|
||
+ tree type = TREE_TYPE (expr);
|
||
+ tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost);
|
||
+ if (op1 == NULL_TREE)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ if (code_class == tcc_unary)
|
||
+ {
|
||
+ tree expr_new = build1 (tree_code, type, op1);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "expr_new = ");
|
||
+ print_generic_expr (dump_file, expr_new, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return expr_new;
|
||
+ }
|
||
+ else if (code_class == tcc_binary)
|
||
+ {
|
||
+ tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost);
|
||
+ if (op2 == NULL_TREE)
|
||
+ return NULL_TREE;
|
||
+
|
||
+ tree expr_new = fold_build2 (tree_code, type, op1, op2);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "expr_new = ");
|
||
+ print_generic_expr (dump_file, expr_new, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ return expr_new;
|
||
+ }
|
||
+ }
|
||
+ return NULL_TREE;
|
||
+}
|
||
+
|
||
+/* Trace and creat dominate loop bounds. */
|
||
+
|
||
+void
|
||
+trace_and_create_dominate_loop_bounds (data_ref &mem_ref)
|
||
+{
|
||
+ /* Check whether the niters is a loop dominant.
|
||
+ If not, trace and determine whether the result is dominant. If yes, create
|
||
+ the expr of the dominant node.
|
||
+ */
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n");
|
||
+
|
||
+ /* Determine the relationship between the boundary of the innermost loop and
|
||
+ the dominant of the outer loop and the processing. */
|
||
+ loop_bound &outermost = mem_ref.loop_bounds.back ();
|
||
+ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
|
||
+ {
|
||
+ loop_bound ¤t = mem_ref.loop_bounds[i];
|
||
+ tree &niters = current.niters;
|
||
+ if (TREE_CODE (niters) == COND_EXPR)
|
||
+ niters = TREE_OPERAND (niters, 1);
|
||
+
|
||
+ niters = parse_and_create_expr (niters, outermost.loop);
|
||
+
|
||
+ if (niters == NULL_TREE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
|
||
+ fprintf (dump_file, "Tracing loop bound failed at dimension %d",
|
||
+ i);
|
||
+ }
|
||
+ mem_ref.calc_by = UNHANDLE_CALC;
|
||
+ break;
|
||
+ }
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
|
||
+ }
|
||
+}
|
||
+
|
||
+/* trace the dimension and corresponding loop bounds of mem_ref.
|
||
+ This function is used to supplement the information of mem_ref.loop_bounds.
|
||
+*/
|
||
+
|
||
+void
|
||
+trace_ref_dimension_and_loop_bounds (data_ref &mem_ref)
|
||
+{
|
||
+ /* In the same loop, some memory access dimensions are different. Remove
|
||
+ variables with fewer dimensions.
|
||
+ Previous cyclic filtering conditions and memory access node records and
|
||
+ tracing.
|
||
+ The false result is also processed.
|
||
+ */
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\ncalculate_data_size\n");
|
||
+
|
||
+ /* Trace the loop bound iv of ref to determine the dimension. */
|
||
+ /* Record data from the loop perspective to avoid repeated tracing. */
|
||
+ if (!trace_loop_bound_iv (mem_ref))
|
||
+ return;
|
||
+
|
||
+ /* The traced mem_ref may have multiple dimensions, which corresponds to
|
||
+ multiple loops. */
|
||
+ /* And in the dimension-by-dimensional analysis, the computable way is
|
||
+ continuously reduced. */
|
||
+ mem_ref.calc_by = STATIC_CALC;
|
||
+ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
|
||
+ {
|
||
+ class loop *loop = mem_ref.loop_bounds[i].loop;
|
||
+ tree &niters = mem_ref.loop_bounds[i].niters;
|
||
+
|
||
+ /* Set NULL_TREE to ensure that nb_iterations are retraced and
|
||
+ vec_nb_iterations are also extracted. */
|
||
+ loop->nb_iterations = NULL_TREE;
|
||
+ niters = number_of_latch_executions (loop, false);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ loop_dump (dump_file, loop);
|
||
+
|
||
+ if (loop->unroll)
|
||
+ {
|
||
+ if (loop->unroll == USHRT_MAX && dump_file
|
||
+ && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX);
|
||
+ mem_ref.loop_bounds[i].unroll = loop->unroll;
|
||
+ }
|
||
+
|
||
+ if ((niters == chrec_dont_know) && loop->vec_nb_iterations
|
||
+ && (loop->vec_nb_iterations != chrec_dont_know))
|
||
+ niters = loop->vec_nb_iterations;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
|
||
+
|
||
+ if (niters == NULL_TREE || niters == chrec_dont_know)
|
||
+ mem_ref.calc_by = min (mem_ref.calc_by, UNHANDLE_CALC);
|
||
+ else if (TREE_CODE (niters) != INTEGER_CST)
|
||
+ mem_ref.calc_by = min (mem_ref.calc_by, RUNTIME_CALC);
|
||
+ else
|
||
+ mem_ref.calc_by = min (mem_ref.calc_by, STATIC_CALC);
|
||
+ }
|
||
+
|
||
+ if (mem_ref.calc_by == RUNTIME_CALC)
|
||
+ trace_and_create_dominate_loop_bounds (mem_ref);
|
||
+ else if (mem_ref.calc_by == STATIC_CALC)
|
||
+ static_calculate_data_size (mem_ref);
|
||
+}
|
||
+
|
||
+/* analyze nested kernels.
|
||
+ 1. multidimension loop analyze.
|
||
+ 2. extended outer loop analyze.
|
||
+ Later we will extend outer loop analysis.
|
||
+*/
|
||
+
|
||
+bool
|
||
+analyze_nested_kernels (vector<class loop *> &kernels,
|
||
+ map<class loop*, vector<data_ref> > &loop_refs)
|
||
+{
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n");
|
||
+
|
||
+ for (unsigned i = 0; i < kernels.size (); ++i)
|
||
+ {
|
||
+ class loop* loop = kernels[i];
|
||
+ if (loop_refs.count (loop) == 0)
|
||
+ continue;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index);
|
||
+ for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
|
||
+ {
|
||
+ if (loop_refs[loop][j].trace_status_p == false)
|
||
+ continue;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\ntrace_reference_dimension at mem_ref "
|
||
+ "index %d in loop %d:\n", j, loop->num);
|
||
+ print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]);
|
||
+ }
|
||
+
|
||
+ }
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* ================ phase 4 filter_and_sort_kernels ================ */
|
||
+
|
||
+/* Get the edge probability information of each basic block in the loop. */
|
||
+
|
||
+float
|
||
+get_edge_prob (edge e, float minimum)
|
||
+{
|
||
+ float fvalue = 0;
|
||
+
|
||
+ profile_probability probability = e->probability;
|
||
+ if (probability.initialized_p ())
|
||
+ {
|
||
+ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE);
|
||
+ if (fvalue < minimum && probability.to_reg_br_prob_base ())
|
||
+ fvalue = minimum;
|
||
+ }
|
||
+ return fvalue;
|
||
+}
|
||
+
|
||
+/* Get the next bb with a high branch probability. */
|
||
+
|
||
+basic_block
|
||
+next_high_probability_bb (basic_block bb)
|
||
+{
|
||
+ if (bb == NULL)
|
||
+ return NULL;
|
||
+
|
||
+ /* Limit the minimum probability value. */
|
||
+ const float MINNUM_PROB = 0.00001f;
|
||
+ float minimum = MINNUM_PROB;
|
||
+
|
||
+ gimple *stmt = last_stmt (bb);
|
||
+ if (stmt && gimple_code (stmt) == GIMPLE_COND)
|
||
+ {
|
||
+ edge true_edge = NULL;
|
||
+ edge false_edge = NULL;
|
||
+ extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
|
||
+
|
||
+ float true_edge_prob = get_edge_prob (true_edge, minimum);
|
||
+ float false_edge_prob = get_edge_prob (false_edge, minimum);
|
||
+ /* If the content of the branch does not include the candidate
|
||
+ kernel, the branch probability may not be limited. */
|
||
+ /* The edge_prob may have precision error during static prediction,
|
||
+ so we need to relax the limit before comparison. */
|
||
+ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum)
|
||
+ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest))
|
||
+ return true_edge->dest;
|
||
+ else if ((false_edge_prob >= (param_branch_prob_threshold / 100.0)
|
||
+ - minimum) && flow_bb_inside_loop_p (bb->loop_father,
|
||
+ false_edge->dest))
|
||
+ return false_edge->dest;
|
||
+ else
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "No high probability bb:");
|
||
+ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n",
|
||
+ bb->index, true_edge_prob, false_edge_prob);
|
||
+ }
|
||
+ return NULL;
|
||
+ }
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ edge e = find_fallthru_edge (bb->succs);
|
||
+ if (e)
|
||
+ return e->dest;
|
||
+ }
|
||
+ return NULL;
|
||
+}
|
||
+
|
||
+
|
||
+/* Dump loop header bb. */
|
||
+
|
||
+void
|
||
+dump_loop_headers (const char *name, vector<class loop *> &loops)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\n\n%s:\n", name);
|
||
+ fprintf (dump_file, "{ ");
|
||
+ for (unsigned int i = 0; i < loops.size (); i++)
|
||
+ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index);
|
||
+ fprintf (dump_file, "}\n\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Combine and sort candidate loops. */
|
||
+
|
||
+bool
|
||
+filter_and_sort_kernels (vector<class loop *> &sorted_kernels,
|
||
+ vector<class loop *> &kernels)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n");
|
||
+
|
||
+ set<basic_block> end_bb;
|
||
+ list<basic_block> walked_header_bb; /* Used to record nested loops. */
|
||
+
|
||
+ for (unsigned i = 0; i < kernels.size (); ++i)
|
||
+ end_bb.insert (kernels[i]->header);
|
||
+
|
||
+ dump_loop_headers ("kernels", kernels);
|
||
+
|
||
+ if (!param_filter_kernels)
|
||
+ {
|
||
+ for (vector<class loop *>::iterator it = kernels.begin ();
|
||
+ it != kernels.end (); ++it)
|
||
+ sorted_kernels.push_back (*it);
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
|
||
+
|
||
+ while (bb)
|
||
+ {
|
||
+ if (bb == NULL)
|
||
+ return false;
|
||
+ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun))
|
||
+ break;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "%d ", bb->index);
|
||
+
|
||
+ /* bb is not the head of the loop, go to the next. */
|
||
+ if (bb != bb->loop_father->header)
|
||
+ {
|
||
+ bb = next_high_probability_bb (bb);
|
||
+ continue;
|
||
+ }
|
||
+
|
||
+ /* bb is the head of the loop. */
|
||
+ if (bb != walked_header_bb.back ())
|
||
+ {
|
||
+ if (end_bb.count (bb))
|
||
+ {
|
||
+ sorted_kernels.push_back (bb->loop_father);
|
||
+ bb = single_exit (bb->loop_father)->dest;
|
||
+ continue;
|
||
+ }
|
||
+ if (loop_outer (bb->loop_father) != NULL
|
||
+ && get_loop_exit_edges (bb->loop_father).length () != 1)
|
||
+ return false;
|
||
+ walked_header_bb.push_back (bb);
|
||
+ bb = next_high_probability_bb (bb);
|
||
+ continue;
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ walked_header_bb.pop_back ();
|
||
+ bb = single_exit (bb->loop_father)->dest;
|
||
+ continue;
|
||
+ }
|
||
+ }
|
||
+ }
|
||
+
|
||
+ dump_loop_headers ("sorted_kernels", sorted_kernels);
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* ================ phase 5 record_and_sort_ref_groups ================ */
|
||
+/* Memory reference score, different aspects of one memory reference. */
|
||
+
|
||
+struct ref_score
|
||
+{
|
||
+ /* certain memory reference. */
|
||
+ data_ref d_ref;
|
||
+
|
||
+ /* local count for bb where memory reference is located. */
|
||
+ gcov_type bb_count;
|
||
+
|
||
+ /* line-location of memory reference. */
|
||
+ int line;
|
||
+};
|
||
+
|
||
+
|
||
+/* Memory reference group, different reference of the same variable. */
|
||
+
|
||
+struct ref_group
|
||
+{
|
||
+ /* source variables. */
|
||
+ tree var;
|
||
+
|
||
+ /* variable size, Unit: MB. */
|
||
+ double var_size;
|
||
+
|
||
+ /* first ref for insert hint. */
|
||
+ data_ref first_use;
|
||
+
|
||
+ /* reuse scores of variables. */
|
||
+ unsigned int reuse_level;
|
||
+
|
||
+ /* method of calculating the var size. */
|
||
+ calc_type calc_by;
|
||
+
|
||
+ /* memory reference index for specific variable. */
|
||
+ unsigned int mem_ref_index;
|
||
+
|
||
+ /* Accessing Reference Records in Different Modes (key_index):
|
||
+ 000: write, random, non-parallel
|
||
+ 001: write, random, parallel
|
||
+ 010: write, regular, non-parallel
|
||
+ 011: write, regular, parallel
|
||
+ 100: read, random, non-parallel
|
||
+ 101: read, random, parallel
|
||
+ 110: read, regular, non-parallel
|
||
+ 111: read, regular, parallel
|
||
+ */
|
||
+ map<int, vector<data_ref> > ref_use;
|
||
+
|
||
+ /* scores for different memory references. */
|
||
+ vector<ref_score> ref_scores;
|
||
+
|
||
+ ref_group ()
|
||
+ {
|
||
+ var = NULL_TREE;
|
||
+ var_size = 0;
|
||
+ reuse_level = 0;
|
||
+ calc_by = UNHANDLE_CALC;
|
||
+ mem_ref_index = 0;
|
||
+ }
|
||
+};
|
||
+
|
||
+/* calculate reuse level. */
|
||
+
|
||
+unsigned int
|
||
+calculate_reuse_level (map<int, vector<data_ref> > &var_use)
|
||
+{
|
||
+ unsigned int level = 0;
|
||
+ for (map<int, vector<data_ref> >::iterator it = var_use.begin ();
|
||
+ it != var_use.end (); ++it)
|
||
+ {
|
||
+ unsigned int parallel = 1;
|
||
+ unsigned int regular = 1;
|
||
+ unsigned int cost = 1;
|
||
+
|
||
+ if ((*it).second[0].parallel_p)
|
||
+ parallel = PARALLEL_NUM;
|
||
+ if (!(*it).second[0].regular_p)
|
||
+ regular = INDIRECT_ACCESS_VALUE;
|
||
+ if (!(*it).second[0].read_p)
|
||
+ cost = WRITE_COST;
|
||
+
|
||
+ /* In serial reuse, we will later check whether they are in the
|
||
+ same cacheline. If yes, delete the reuse. For details, see the
|
||
+ reuse analysis of prefetching and eliminate redundancy. */
|
||
+ unsigned int add = parallel * ((*it).second.size () * (cost + regular));
|
||
+ level += add;
|
||
+ if (add && dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "%d : %d * (%ld * (%d + %d)) = %d\n",
|
||
+ (*it).first, parallel, (*it).second.size (), cost, regular, add);
|
||
+ }
|
||
+ return level;
|
||
+}
|
||
+
|
||
+/* Comparison of reference reuse level. */
|
||
+
|
||
+bool
|
||
+ref_group_reuse_cmp (const ref_group &a, const ref_group &b)
|
||
+{
|
||
+ return a.reuse_level > b.reuse_level;
|
||
+}
|
||
+
|
||
+/* Sort reference groups. */
|
||
+
|
||
+void
|
||
+sort_ref_groups (vector<ref_group> &ref_groups,
|
||
+ map<tree, ref_group> &ref_groups_map)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n");
|
||
+
|
||
+ for (map<tree, ref_group>::iterator it = ref_groups_map.begin ();
|
||
+ it != ref_groups_map.end (); ++it)
|
||
+ {
|
||
+ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use);
|
||
+ ref_groups.push_back ((*it).second);
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM);
|
||
+ fprintf (dump_file, " : %d\n", (*it).second.reuse_level);
|
||
+ }
|
||
+ }
|
||
+
|
||
+ sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp);
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\nsorted ref_groups:\n");
|
||
+ fprintf (dump_file, "rank var (data_size, num_of_mem_ref, need_tmp_name):"
|
||
+ " reuse_level_score\n");
|
||
+ for (unsigned int i = 0; i < ref_groups.size (); ++i)
|
||
+ {
|
||
+ fprintf (dump_file, "%d ", i);
|
||
+ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM);
|
||
+ int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0;
|
||
+ fprintf (dump_file, " (%lf, %lu, %d)", ref_groups[i].var_size,
|
||
+ ref_groups[i].ref_scores.size (), need_tmp_name);
|
||
+ fprintf (dump_file, " : %d\n", ref_groups[i].reuse_level);
|
||
+ }
|
||
+ fprintf (dump_file, "\n");
|
||
+
|
||
+ fprintf (dump_file, "first_use:\n");
|
||
+ for (unsigned int i = 0; i < ref_groups.size (); ++i)
|
||
+ {
|
||
+ fprintf (dump_file, "%d ", i);
|
||
+ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM);
|
||
+ fprintf (dump_file, " : ");
|
||
+ if (!ref_groups[i].first_use.vectorize_p)
|
||
+ print_generic_expr (dump_file, ref_groups[i].first_use.ref,
|
||
+ TDF_SLIM);
|
||
+ else
|
||
+ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt,
|
||
+ TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Attributes of variable data. */
|
||
+
|
||
+enum data_attribute
|
||
+{
|
||
+ DA_PARALLEL = 0,
|
||
+ DA_REGULAR,
|
||
+ DA_READ
|
||
+};
|
||
+
|
||
+/* Record memory reference by use mode.
|
||
+ If the reference group is not found, create a group. */
|
||
+
|
||
+void
|
||
+record_mem_ref (map<tree, ref_group> &ref_groups, data_ref &mem_ref)
|
||
+{
|
||
+ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL)
|
||
+ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ);
|
||
+
|
||
+ if (!ref_groups.count (mem_ref.var))
|
||
+ {
|
||
+ ref_group ref_group;
|
||
+ ref_group.var = mem_ref.var;
|
||
+ ref_group.first_use = mem_ref;
|
||
+ ref_groups[mem_ref.var] = ref_group;
|
||
+ }
|
||
+
|
||
+ /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by.
|
||
+ Runtime issue requires the specified mem_ref's calc_by to be >= 1.
|
||
+ Temporarily modified ref_group's first_use after sorting mem_refs. */
|
||
+ ref_groups[mem_ref.var].calc_by = max (ref_groups[mem_ref.var].calc_by,
|
||
+ mem_ref.calc_by);
|
||
+ ref_groups[mem_ref.var].var_size = max (ref_groups[mem_ref.var].var_size,
|
||
+ mem_ref.data_size);
|
||
+ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref);
|
||
+
|
||
+ ref_score ref_level{ mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (),
|
||
+ expand_location (mem_ref.stmt->location).line };
|
||
+ ref_groups[mem_ref.var].ref_scores.push_back (ref_level);
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "recorded in: ");
|
||
+ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM);
|
||
+ fprintf (dump_file, ":%d:%ld\n", index,
|
||
+ ref_groups[mem_ref.var].ref_use[index].size () - 1);
|
||
+
|
||
+ fprintf (dump_file, "base: ");
|
||
+ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM);
|
||
+
|
||
+ fprintf (dump_file, ", index: ");
|
||
+ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM);
|
||
+
|
||
+ fprintf (dump_file, ", step: ");
|
||
+ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step))
|
||
+ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
|
||
+ int_cst_value (mem_ref.step));
|
||
+ else
|
||
+ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM);
|
||
+
|
||
+ fprintf (dump_file, ", offset: ");
|
||
+ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset))
|
||
+ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
|
||
+ int_cst_value (mem_ref.offset));
|
||
+ else
|
||
+ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM);
|
||
+ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write");
|
||
+
|
||
+ fprintf (dump_file, ", size: %lf", mem_ref.data_size);
|
||
+ fprintf (dump_file, "\n\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Rank data reference index level by the scheme of source code line number. */
|
||
+
|
||
+bool
|
||
+data_ref_reuse_cmp (const ref_score &a, const ref_score &b)
|
||
+{
|
||
+ return a.line < b.line;
|
||
+}
|
||
+
|
||
+/* Sort data reference index level within one reference group in non-decreasing
|
||
+ order of the customized sorting scheme. */
|
||
+
|
||
+void
|
||
+sort_mem_ref_in_ref_group (map<tree, ref_group> &ref_groups_map)
|
||
+{
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\nsorted data_references:\n");
|
||
+ for (map<tree, ref_group>::iterator it = ref_groups_map.begin ();
|
||
+ it != ref_groups_map.end (); ++it)
|
||
+ {
|
||
+ vector<ref_score> &ref_scores = (*it).second.ref_scores;
|
||
+ stable_sort (ref_scores.begin (), ref_scores.end (), data_ref_reuse_cmp);
|
||
+ /* Update ref_group's first_use and calc_by with the first mem_ref after
|
||
+ sorting. */
|
||
+ (*it).second.first_use = (*it).second.ref_scores[0].d_ref;
|
||
+ (*it).second.calc_by = (*it).second.first_use.calc_by;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ print_generic_expr (dump_file, (*it).first, TDF_SLIM);
|
||
+ fprintf (dump_file, " : %lu\n", ref_scores.size ());
|
||
+ for (unsigned int i = 0; i < ref_scores.size (); ++i)
|
||
+ {
|
||
+ fprintf (dump_file, "mem_ref_index %u: ", i);
|
||
+ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0,
|
||
+ TDF_LINENO);
|
||
+ }
|
||
+ fprintf (dump_file, "\n\n");
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Tracing and sorting reference groups. */
|
||
+
|
||
+bool
|
||
+record_and_sort_ref_groups (vector<ref_group> &ref_groups,
|
||
+ vector<class loop *> &kernels,
|
||
+ map<class loop*, vector<data_ref> > &loop_refs)
|
||
+{
|
||
+ if (dump_file)
|
||
+ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n");
|
||
+
|
||
+ map<tree, ref_group> ref_groups_map;
|
||
+
|
||
+ for (unsigned i = 0; i < kernels.size (); ++i)
|
||
+ {
|
||
+ class loop* loop = kernels[i];
|
||
+ if (loop_refs.count (loop) == 0)
|
||
+ continue;
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "loop header %d:\n", loop->header->index);
|
||
+ for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
|
||
+ {
|
||
+ if (loop_refs[loop][j].trace_status_p)
|
||
+ record_mem_ref (ref_groups_map, loop_refs[loop][j]);
|
||
+ }
|
||
+ }
|
||
+
|
||
+ /* Sort mem_ref within ref_group by local count and update first_use's
|
||
+ data_ref, stable sort. */
|
||
+ sort_mem_ref_in_ref_group (ref_groups_map);
|
||
+ sort_ref_groups (ref_groups, ref_groups_map);
|
||
+
|
||
+ return ref_groups.size () > 0;
|
||
+}
|
||
+
|
||
+/* ================ phase 6 issue_llc_hint ================ */
|
||
+
|
||
+/* Issue vectorized mask prefetch gimple. */
|
||
+
|
||
+void
|
||
+issue_mask_prefetch (gimple *stmt)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "insert svprfd.\n");
|
||
+
|
||
+ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3);
|
||
+ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6);
|
||
+ */
|
||
+ tree dataref_ptr = gimple_call_arg (stmt, 0);
|
||
+ tree scale = gimple_call_arg (stmt, 1);
|
||
+ tree final_mask = gimple_call_arg (stmt, 2);
|
||
+ tree target = NULL_TREE;
|
||
+ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
|
||
+ target = gimple_call_arg (stmt, 3);
|
||
+ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
|
||
+ target = gimple_call_lhs (stmt);
|
||
+ /* 4: PLDL3KEEP. */
|
||
+ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4);
|
||
+
|
||
+ /* add offset. */
|
||
+ gimple_stmt_iterator si = gsi_for_stmt (stmt);
|
||
+ /* target: vector_type - XXX_type. */
|
||
+ if (target == NULL_TREE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "unhandled scene: target vect is null");
|
||
+ return;
|
||
+ }
|
||
+ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi
|
||
+ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target))));
|
||
+ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance);
|
||
+ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true,
|
||
+ NULL, true, GSI_SAME_STMT);
|
||
+
|
||
+ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH,
|
||
+ 5, addr, scale, final_mask, target, prfop);
|
||
+ gsi_insert_after (&si, call, GSI_SAME_STMT);
|
||
+ update_ssa (TODO_update_ssa_only_virtuals);
|
||
+}
|
||
+
|
||
+/* Issue vectorized mask gather prefetch gimple. */
|
||
+
|
||
+void
|
||
+issue_mask_gather_prefetch (gimple *stmt)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "insert svprfd_gather_uxindex.\n");
|
||
+
|
||
+ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... },
|
||
+ loop_mask_4); */
|
||
+ tree dataref_ptr = gimple_call_arg (stmt, 0);
|
||
+ tree vec_offset = gimple_call_arg (stmt, 1);
|
||
+ tree scale = gimple_call_arg (stmt, 2);
|
||
+ tree zero = gimple_call_arg (stmt, 3);
|
||
+ tree final_mask = gimple_call_arg (stmt, 4);
|
||
+ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4);
|
||
+ tree target = gimple_call_lhs (stmt);
|
||
+
|
||
+ /* add offset. */
|
||
+ gimple_stmt_iterator si = gsi_for_stmt (stmt);
|
||
+ if (target == NULL_TREE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "unhandled scene: target vect is null");
|
||
+ return;
|
||
+ }
|
||
+ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi
|
||
+ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target))));
|
||
+ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance);
|
||
+ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true,
|
||
+ NULL, true, GSI_SAME_STMT);
|
||
+
|
||
+ gcall *call = gimple_build_call_internal
|
||
+ (IFN_MASK_GATHER_PREFETCH, 7, addr,
|
||
+ vec_offset, scale, zero, final_mask, target, prfop);
|
||
+ gsi_insert_after (&si, call, GSI_SAME_STMT);
|
||
+ update_ssa (TODO_update_ssa_only_virtuals);
|
||
+}
|
||
+
|
||
+/* Issue builtin prefetch gimple. */
|
||
+
|
||
+void
|
||
+issue_builtin_prefetch (data_ref &mem_ref)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "insert prfm.\n");
|
||
+ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */
|
||
+ gimple* stmt = mem_ref.stmt;
|
||
+ tree dataref_ptr = mem_ref.base;
|
||
+ tree data_idx = mem_ref.index;
|
||
+ tree scale = mem_ref.step;
|
||
+ tree offset = mem_ref.offset;
|
||
+ /* add offset. */
|
||
+ gimple_stmt_iterator si = gsi_for_stmt (stmt);
|
||
+ if (scale == NULL_TREE)
|
||
+ {
|
||
+ /* _190 = (void *) ivtmp.444_221;
|
||
+ Cannot detect size unit at (void *). */
|
||
+ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
|
||
+ if (scale == NULL_TREE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching "
|
||
+ "variable. Stop builtin_prefetch.\n\n");
|
||
+ return;
|
||
+ }
|
||
+ }
|
||
+
|
||
+ data_idx = data_idx ? data_idx : size_zero_node;
|
||
+ data_idx = build1 (NOP_EXPR, TREE_TYPE (scale), data_idx);
|
||
+ tree displacement = fold_build2 (MULT_EXPR, TREE_TYPE (scale), data_idx,
|
||
+ scale);
|
||
+ if (offset != NULL_TREE && TREE_CODE (offset) != TREE_CODE (size_zero_node))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: offset's TREE_TYPE is not integer_cst: "
|
||
+ "%s\nStop builtin_prefetch.\n",
|
||
+ get_tree_code_name (TREE_CODE (offset)));
|
||
+ return;
|
||
+ }
|
||
+ offset = offset ? offset : size_zero_node;
|
||
+ offset = build1 (NOP_EXPR, TREE_TYPE (scale), offset);
|
||
+ dataref_ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr),
|
||
+ dataref_ptr, offset);
|
||
+ tree addr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr),
|
||
+ dataref_ptr, displacement);
|
||
+ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi (scale);
|
||
+
|
||
+ addr = fold_build_pointer_plus_hwi (addr, distance);
|
||
+ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true,
|
||
+ NULL, true, GSI_SAME_STMT);
|
||
+ /* __builtin_prefetch (_68, 0, 1);
|
||
+ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality
|
||
+ (high means strong locality) */
|
||
+ gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
|
||
+ 3, addr, integer_zero_node, integer_one_node);
|
||
+ gsi_insert_after (&si, call, GSI_SAME_STMT);
|
||
+ update_ssa (TODO_update_ssa_only_virtuals);
|
||
+}
|
||
+
|
||
+/* Retrieve memory reference at the specific index. */
|
||
+
|
||
+data_ref
|
||
+get_data_ref_at_idx (ref_group &var_ref_group)
|
||
+{
|
||
+ unsigned int mem_ref_size = static_cast<unsigned int>(
|
||
+ var_ref_group.ref_scores.size ());
|
||
+ if (strlen (param_mem_ref_index) == 0)
|
||
+ return var_ref_group.first_use;
|
||
+ else
|
||
+ {
|
||
+ /* Insert prefetch hint at highly-likely-used location with the given
|
||
+ index. */
|
||
+ if (var_ref_group.mem_ref_index >= mem_ref_size)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: The target data_ref index is out "
|
||
+ "of range. Use top index instead!\n");
|
||
+ return var_ref_group.ref_scores[0].d_ref;
|
||
+ }
|
||
+ return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref;
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Static form insertion and issue instruction. We may check the
|
||
+ determination of the ARM SVE architecture before SVE hint insertion. */
|
||
+
|
||
+void
|
||
+static_issue (vector<ref_group> &ref_groups, int num_issue_var)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "static issue\n");
|
||
+
|
||
+ for (int i = 0; i < num_issue_var; ++i)
|
||
+ {
|
||
+ data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]);
|
||
+ if (mem_ref.vectorize_p)
|
||
+ {
|
||
+ enum internal_fn ifn_code = gimple_call_internal_fn
|
||
+ (mem_ref.stmt);
|
||
+ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD)
|
||
+ issue_mask_prefetch (mem_ref.stmt);
|
||
+ else if (ifn_code == IFN_MASK_GATHER_LOAD)
|
||
+ issue_mask_gather_prefetch (mem_ref.stmt);
|
||
+ else
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "other vectorized internal function\n");
|
||
+ }
|
||
+ else
|
||
+ issue_builtin_prefetch (mem_ref);
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Generate the stmts for calculating the size. Later we will consider nested
|
||
+ multi-branches scenarios and check more information of niters when it is
|
||
+ a COND_EXPR. */
|
||
+
|
||
+tree
|
||
+calc_stmts_gen (vector<ref_group> &ref_groups, gimple_seq &cond_expr_stmt_list,
|
||
+ int num_issue_var)
|
||
+{
|
||
+ /* Accumulated keep size. */
|
||
+ tree total_size = build_real_from_int_cst
|
||
+ (double_type_node, integer_zero_node);
|
||
+ for (int i = 0; i < num_issue_var; ++i)
|
||
+ {
|
||
+ data_ref &mem_ref = ref_groups[i].first_use;
|
||
+ tree var = mem_ref.var;
|
||
+ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j)
|
||
+ {
|
||
+ tree niters = mem_ref.loop_bounds[j].niters;
|
||
+
|
||
+ /* COND_EXPR. */
|
||
+ if (TREE_CODE (niters) == COND_EXPR)
|
||
+ niters = TREE_OPERAND (niters, 1);
|
||
+ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var)));
|
||
+ /* _190 = (void *) ivtmp.444_221;
|
||
+ Cannot detect size unit at (void *). */
|
||
+ if (unit == NULL_TREE)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "WARNING: Cannot detect size unit "
|
||
+ "(use 1 byte) for variable %s: ", get_name (var));
|
||
+ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+ unit = size_one_node;
|
||
+ }
|
||
+ unit = build1 (NOP_EXPR, TREE_TYPE (niters), unit);
|
||
+ tree size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, unit);
|
||
+ size = build1 (FLOAT_EXPR, double_type_node, size);
|
||
+ total_size = fold_build2
|
||
+ (PLUS_EXPR, double_type_node, total_size, size);
|
||
+ }
|
||
+ }
|
||
+ /* Create a stmt list for size calculation. */
|
||
+ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024);
|
||
+ div = build1 (NOP_EXPR, double_type_node, div);
|
||
+ total_size = fold_build2 (RDIV_EXPR, double_type_node, total_size, div);
|
||
+
|
||
+ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node),
|
||
+ param_llc_capacity_per_core / 2);
|
||
+ threshold = build_real_from_int_cst (double_type_node, threshold);
|
||
+ tree cond_expr = fold_build2
|
||
+ (LE_EXPR, boolean_type_node, total_size, threshold);
|
||
+
|
||
+ /* Convert cond_expr to stmt list. */
|
||
+ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
|
||
+ &cond_expr_stmt_list, is_gimple_condexpr, NULL_TREE);
|
||
+ return cond_expr;
|
||
+}
|
||
+
|
||
+/* Runtime form insertion and issue instruction. */
|
||
+
|
||
+void
|
||
+runtime_issue (vector<ref_group> &ref_groups, int num_issue_var)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "runtime issue\n");
|
||
+
|
||
+ if (ref_groups.size () == 0)
|
||
+ return;
|
||
+ data_ref &mem_ref = ref_groups[0].first_use;
|
||
+ class loop *loop = mem_ref.loop_bounds.back ().loop;
|
||
+ /* Ensure that variables are in the same loop. */
|
||
+ for (int i = 1; i < num_issue_var; ++i)
|
||
+ {
|
||
+ data_ref &mem_ref = ref_groups[i].first_use;
|
||
+ if (loop != mem_ref.loop_bounds.back ().loop)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "topn var are not in the same loop\n");
|
||
+ return;
|
||
+ }
|
||
+ }
|
||
+ if (loop == NULL)
|
||
+ return;
|
||
+
|
||
+ /* If the exit edge points to bb with multiple inputs, split the exit edge
|
||
+ and create a new bb, make the exit edge point to bb only single input. */
|
||
+ edge e = single_exit (loop);
|
||
+ if (e == NULL)
|
||
+ return;
|
||
+ if (!single_pred_p (e->dest))
|
||
+ {
|
||
+ split_loop_exit_edge (e, true);
|
||
+ if (dump_enabled_p ())
|
||
+ dump_printf (MSG_NOTE, "split exit edge\n");
|
||
+ }
|
||
+
|
||
+ gimple_seq cond_expr_stmt_list = NULL;
|
||
+ tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list,
|
||
+ num_issue_var);
|
||
+
|
||
+ /* Use the previous cond and generate a new branch and copy loop. */
|
||
+ basic_block condition_bb = NULL;
|
||
+ profile_probability prob = profile_probability::likely ();
|
||
+ initialize_original_copy_tables ();
|
||
+ class loop *nloop = loop_version (loop, cond_expr, &condition_bb,
|
||
+ prob, prob.invert (), prob, prob.invert (), true);
|
||
+ free_original_copy_tables ();
|
||
+
|
||
+ /* Insert the generated stmt list before cond_expr. */
|
||
+ gimple_stmt_iterator cond_exp_gsi;
|
||
+ if (cond_expr_stmt_list)
|
||
+ {
|
||
+ cond_exp_gsi = gsi_last_bb (condition_bb);
|
||
+ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
|
||
+ GSI_SAME_STMT);
|
||
+ }
|
||
+ update_ssa (TODO_update_ssa);
|
||
+
|
||
+ /* Perform hint issue for branches that meet conditions. */
|
||
+ static_issue (ref_groups, num_issue_var);
|
||
+}
|
||
+
|
||
+/* Issue llc hints through prefetch instructions. */
|
||
+
|
||
+void
|
||
+issue_llc_hint (vector<ref_group> &ref_groups)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "issue_llc_hint:\n");
|
||
+
|
||
+ /* 1. If the issue-topn and force-issue options are available, top N var is
|
||
+ forcibly allocated and no runtime branch is generated.
|
||
+ 2. If the issue-topn option is available and the size of top N var is
|
||
+ statically known, top N is statically allocated and no runtime branch
|
||
+ is generated.
|
||
+ 3. If the issue-topn option is available and the size of the top N var is
|
||
+ unknown, but them is dynamically known, the top N is dynamically
|
||
+ allocated and generate runtime branches. (also depends on the screening
|
||
+ of the innermost variable boundary type)
|
||
+ 4. If the dynamic runtime cannot know the size, such as indirect access,
|
||
+ optimization is skipped.
|
||
+ */
|
||
+ if (ref_groups.size () == 0)
|
||
+ return;
|
||
+
|
||
+ int num_issue_var = min (param_issue_topn,
|
||
+ static_cast<int>(ref_groups.size ()));
|
||
+ if (num_issue_var < param_issue_topn
|
||
+ && dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) "
|
||
+ "ref_group(s) is found for llc hint.\n",
|
||
+ num_issue_var, param_issue_topn);
|
||
+ }
|
||
+ if (param_force_issue)
|
||
+ {
|
||
+ if (strlen (param_target_variables) > 0)
|
||
+ static_issue (ref_groups, static_cast<int>(ref_groups.size ()));
|
||
+ else
|
||
+ static_issue (ref_groups, num_issue_var);
|
||
+ return;
|
||
+ }
|
||
+ calc_type topn_calc_type = STATIC_CALC;
|
||
+ for (int i = 0; i < num_issue_var; ++i)
|
||
+ topn_calc_type = min (topn_calc_type, ref_groups[i].calc_by);
|
||
+
|
||
+ if (topn_calc_type == STATIC_CALC)
|
||
+ {
|
||
+ /* Before static issue, we still need to collect data size of all target
|
||
+ variables and compare the summation with LLC cache size. */
|
||
+ double prefetch_data_size = 0.;
|
||
+ for (int i = 0; i < num_issue_var; ++i)
|
||
+ prefetch_data_size += ref_groups[i].var_size;
|
||
+ if (prefetch_data_size <= (double) param_llc_capacity_per_core * 0.8)
|
||
+ static_issue (ref_groups, num_issue_var);
|
||
+ else
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
|
||
+ "size: %lf > %lf.\n", prefetch_data_size,
|
||
+ (double) param_llc_capacity_per_core * 0.8);
|
||
+ }
|
||
+ else if (topn_calc_type == RUNTIME_CALC)
|
||
+ runtime_issue (ref_groups, num_issue_var);
|
||
+ else
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "unhandled issue scene\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* ==================== phase entry ==================== */
|
||
+/* Check whether a string can be converted to an unsigned integer. */
|
||
+
|
||
+bool is_unsigned_int (const string &s)
|
||
+{
|
||
+ if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN)
|
||
+ return false;
|
||
+
|
||
+ for (unsigned int i = 0; i < s.size (); ++i)
|
||
+ {
|
||
+ if (s[i] < '0' || s[i] > '9')
|
||
+ return false;
|
||
+ }
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* Parse a substring separated by comma. If the substring is valid and
|
||
+ non-empty, store it as a parsed element. */
|
||
+
|
||
+bool
|
||
+parse_string_helper (const string &substr, vector<string>& str_elts,
|
||
+ bool check_unsigned, size_t start, size_t end)
|
||
+{
|
||
+ if (substr == "" && dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: The input string from %lu to %lu is "
|
||
+ "empty.\n", start, end);
|
||
+ else if (check_unsigned && !is_unsigned_int (substr))
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "ERROR: not an unsigned integer: %s\n",
|
||
+ substr.c_str ());
|
||
+ str_elts.clear ();
|
||
+ return false;
|
||
+ }
|
||
+ else
|
||
+ str_elts.push_back (substr);
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* Parse a user input string, separated by comma. */
|
||
+
|
||
+void
|
||
+parse_string (const string &s, vector<string>& str_elts,
|
||
+ bool check_unsigned = false)
|
||
+{
|
||
+ string delim = ",";
|
||
+ size_t start = 0;
|
||
+ size_t end = s.find (delim);
|
||
+ string substr = s.substr (start, end - start);
|
||
+ while (end != string::npos)
|
||
+ {
|
||
+ if (!parse_string_helper (substr, str_elts, check_unsigned, start, end))
|
||
+ return;
|
||
+ start = end + delim.size ();
|
||
+ end = s.find (delim, start);
|
||
+ substr = s.substr (start, end - start);
|
||
+ }
|
||
+ parse_string_helper (substr, str_elts, check_unsigned, start, end);
|
||
+}
|
||
+
|
||
+/* Parse user input of target variables and memory indices and create a map
|
||
+ that assigns a target variable to a memory index. */
|
||
+
|
||
+void
|
||
+parse_param_inputs (map<string, unsigned int> &var2mem_idx)
|
||
+{
|
||
+ /* The user input length should have an input length limit. */
|
||
+ if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN
|
||
+ || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN)
|
||
+ && dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "INVALID INPUT: The user inputs for target variables "
|
||
+ "and/or memory reference indices are too long for parsing.\n");
|
||
+
|
||
+ vector<string> var_names;
|
||
+ string target_variables = param_target_variables;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Start parsing target variables:\n");
|
||
+ if (param_use_ref_group_index)
|
||
+ parse_string (target_variables, var_names, true);
|
||
+ else
|
||
+ parse_string (target_variables, var_names, false);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Finish parsing target variables.\n\n");
|
||
+
|
||
+ vector<string> var_mem_indices;
|
||
+ string mem_indices = param_mem_ref_index;
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Start parsing memory reference indices:\n");
|
||
+ parse_string (mem_indices, var_mem_indices, true);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Finish parsing memory reference indices.\n\n");
|
||
+
|
||
+ /* Construct a map of var_name: var_mem_index. */
|
||
+ if (var_names.size () > 0)
|
||
+ {
|
||
+ if (var_mem_indices.size () < var_names.size ())
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: The number of provided memory "
|
||
+ "reference indices is less than that of target "
|
||
+ "variables.\nUse the top index for all variables "
|
||
+ "instead.\n");
|
||
+ for (string& var_name : var_names)
|
||
+ var2mem_idx[var_name] = 0;
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ if (var_mem_indices.size () > var_names.size ()
|
||
+ && dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: The number of target variables is "
|
||
+ "less than that of memory reference indices.\n");
|
||
+ for (unsigned int i = 0; i < var_names.size (); ++i)
|
||
+ {
|
||
+ var2mem_idx[var_names[i]] = static_cast<unsigned int>(
|
||
+ atoi (var_mem_indices[i].c_str ()));
|
||
+ }
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+/* Filter reference groups by only selecting target variables from the user
|
||
+ input. There are two options for prefetching target variables:
|
||
+ 1. Specify variable name parsed by the pass, which you can double-check at
|
||
+ "sorted ref_groups" section in the dump file.
|
||
+ 2. Specify variable rank exhibited at "sorted ref_groups" section in the
|
||
+ dump file.
|
||
+*/
|
||
+
|
||
+void
|
||
+prefetch_variables (const vector<ref_group>& ref_groups,
|
||
+ vector<ref_group>& reduced_ref_groups)
|
||
+{
|
||
+ map<unsigned int, unsigned int> ref_group2mem_idx;
|
||
+
|
||
+ map<string, unsigned int> var2mem_idx; /* externally defined. */
|
||
+ parse_param_inputs (var2mem_idx);
|
||
+
|
||
+ if (param_use_ref_group_index)
|
||
+ {
|
||
+ /* Use ref_group index at "sorted ref_groups" section to specify
|
||
+ variable. */
|
||
+ /* Collect the variables in "reduced_ref_group" only if their indices
|
||
+ show up at "sorted ref_groups" section. */
|
||
+ for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
|
||
+ {
|
||
+ unsigned int var_idx = static_cast<unsigned int>(atoi (
|
||
+ var_mem_idx.first.c_str ()));
|
||
+ if (var_idx < ref_groups.size ())
|
||
+ ref_group2mem_idx[var_idx] = var_mem_idx.second;
|
||
+ else if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: The index \"%u\" does not show "
|
||
+ "up in the ref_groups.\n", var_idx);
|
||
+ }
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ /* Use variable name shown up at "sorted ref_groups" section to specify
|
||
+ variable:
|
||
+ var2ref_group_idx + var2mem_idx -> ref_group2mem_idx. */
|
||
+ /* Create a map that assigns the variable name to its corresponding
|
||
+ ref_group index. */
|
||
+ map<string, unsigned int> var2ref_group_idx; /* internally detected. */
|
||
+ for (unsigned int i = 0; i < ref_groups.size (); ++i)
|
||
+ {
|
||
+ const ref_group &curr_ref_group = ref_groups[i];
|
||
+ const int UINT_MAX_DIGIT = 10;
|
||
+ /* Unrecognizable variable name related to ref_group. */
|
||
+ if (!get_name (curr_ref_group.var))
|
||
+ {
|
||
+ /* If the variable name does not have a string representation,
|
||
+ we can rename it by "tmp_var_" + <sorted_ref_group_index>. */
|
||
+ char group_idx[UINT_MAX_DIGIT];
|
||
+ sprintf (group_idx, "%u", i);
|
||
+ string tmp_var_name = "tmp_var_" + std::string (group_idx);
|
||
+ fprintf (dump_file, "Unrecognizable variable name at ref_group "
|
||
+ "index %u.\nThe tree expression for variable is: ", i);
|
||
+ print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
|
||
+ fprintf (dump_file, "\n");
|
||
+ var2ref_group_idx[tmp_var_name] = i;
|
||
+ }
|
||
+ else
|
||
+ var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i;
|
||
+ }
|
||
+ /* Collect the variables in "reduced_ref_group" only if they show up in
|
||
+ the ref_groups. */
|
||
+ for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
|
||
+ {
|
||
+ if (var2ref_group_idx.count (var_mem_idx.first))
|
||
+ {
|
||
+ unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first];
|
||
+ ref_group2mem_idx[ref_group_idx] = var_mem_idx.second;
|
||
+ }
|
||
+ else if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "WARNING: Target variable \" %s \" does "
|
||
+ "not show up in the ref_groups. Check whether it needs "
|
||
+ "temporary variable name.\n",
|
||
+ var_mem_idx.first.c_str ());
|
||
+ }
|
||
+ }
|
||
+
|
||
+ for (const pair<unsigned int, unsigned int> &ref_group_mem_idx :
|
||
+ ref_group2mem_idx)
|
||
+ {
|
||
+ ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first];
|
||
+ curr_ref_group.mem_ref_index = ref_group_mem_idx.second;
|
||
+ reduced_ref_groups.push_back (curr_ref_group);
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\nNOTICE: Prefetching target variable \" ");
|
||
+ print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
|
||
+ fprintf (dump_file, " \" at ref_group index %u and memory location "
|
||
+ "index %u.\n", ref_group_mem_idx.first,
|
||
+ ref_group_mem_idx.second);
|
||
+ }
|
||
+ }
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "\n\n");
|
||
+}
|
||
+
|
||
+
|
||
+/* The LLC intelligent allocation consists of 6 steps. */
|
||
+
|
||
+void
|
||
+llc_allocate (void)
|
||
+{
|
||
+ map<class loop *, vector<data_ref> > kernels_refs;
|
||
+ vector<class loop *> kernels;
|
||
+ if (!get_dense_memory_kernels (kernels, kernels_refs))
|
||
+ return;
|
||
+
|
||
+ trace_data_refs_info (kernels, kernels_refs);
|
||
+
|
||
+ if (!analyze_nested_kernels (kernels, kernels_refs))
|
||
+ return;
|
||
+
|
||
+ vector<class loop *> sorted_kernels;
|
||
+ if (!filter_and_sort_kernels (sorted_kernels, kernels))
|
||
+ return;
|
||
+
|
||
+ vector<ref_group> ref_groups;
|
||
+ if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs))
|
||
+ return;
|
||
+
|
||
+ if (strlen (param_target_variables) > 0)
|
||
+ {
|
||
+ /* If "param_target_variables" is not empty, we will issue parsed target
|
||
+ variables compulsorily. */
|
||
+ param_force_issue = true;
|
||
+ vector<ref_group> reduced_ref_groups;
|
||
+ prefetch_variables (ref_groups, reduced_ref_groups);
|
||
+ issue_llc_hint (reduced_ref_groups);
|
||
+ }
|
||
+ else
|
||
+ issue_llc_hint (ref_groups);
|
||
+}
|
||
+
|
||
+/* Check whether the function is an operator reloading function. */
|
||
+
|
||
+bool
|
||
+operator_func_p (function *fn)
|
||
+{
|
||
+ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
|
||
+
|
||
+ if (fn_name && strncmp (fn_name, "operator", 8) == 0)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "operator_func: %s ", fn_name);
|
||
+
|
||
+ return true;
|
||
+ }
|
||
+ return false;
|
||
+}
|
||
+
|
||
+/* Check whether the function file location is known. */
|
||
+
|
||
+bool
|
||
+func_location_p (function *fn)
|
||
+{
|
||
+ expanded_location fn_decl_xloc
|
||
+ = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
|
||
+ expanded_location fn_xloc
|
||
+ = expand_location (fn->function_start_locus);
|
||
+
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "fn->function_start_locus = %d \n",
|
||
+ fn->function_start_locus);
|
||
+ fprintf (dump_file, "fn_xloc.file = %s \n",
|
||
+ fn_xloc.file ? fn_xloc.file : "NULL");
|
||
+ fprintf (dump_file, "fn_decl_xloc.file = %s \n",
|
||
+ fn_decl_xloc.file ? fn_decl_xloc.file : "NULL");
|
||
+ fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n",
|
||
+ LOCATION_FILE (input_location) ? LOCATION_FILE (input_location)
|
||
+ : "NULL");
|
||
+ }
|
||
+ if (fn_decl_xloc.file == NULL)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Function location unknown, skip analysis \n");
|
||
+ return false;
|
||
+ }
|
||
+ /* Newly generated functions are filtered out, such as function constant
|
||
+ propagation func.constprop (). */
|
||
+ if (LOCATION_FILE (input_location) != fn_decl_xloc.file)
|
||
+ {
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "Function location non-local, skip analysis \n");
|
||
+ return false;
|
||
+ }
|
||
+ return true;
|
||
+}
|
||
+
|
||
+/* Dump function information. */
|
||
+
|
||
+void
|
||
+dump_function_info (function *fn)
|
||
+{
|
||
+ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "\nfn_name: %s\n", fn_name);
|
||
+ expanded_location cfun_xloc
|
||
+ = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
|
||
+ if (cfun_xloc.line)
|
||
+ {
|
||
+ if (cfun_xloc.file)
|
||
+ fprintf (dump_file, "[%s:%d:%d]\n",
|
||
+ cfun_xloc.file, cfun_xloc.line, cfun_xloc.column);
|
||
+ }
|
||
+ fprintf (dump_file, "\n");
|
||
+ flow_loops_dump (dump_file, NULL, 1);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+/* dump param. */
|
||
+
|
||
+void
|
||
+dump_param (void)
|
||
+{
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ {
|
||
+ fprintf (dump_file, "LLC allocate parameters:\n");
|
||
+ fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size);
|
||
+ fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
|
||
+ param_l1_cache_size * 1024 / param_l1_cache_line_size,
|
||
+ param_l1_cache_size);
|
||
+ fprintf (dump_file, " L1 cache line size: %d\n",
|
||
+ param_l1_cache_line_size);
|
||
+ fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
|
||
+ fprintf (dump_file, " min mem_access_ratio: %d \n",
|
||
+ param_mem_access_ratio);
|
||
+ fprintf (dump_file, " min mem_access_num: %d \n",
|
||
+ param_mem_access_num);
|
||
+ fprintf (dump_file, "\n");
|
||
+ }
|
||
+}
|
||
+
|
||
+const pass_data pass_data_llc_allocate =
|
||
+{
|
||
+ GIMPLE_PASS, /* type. */
|
||
+ "llc_allocate", /* name. */
|
||
+ OPTGROUP_LOOP, /* optinfo_flags. */
|
||
+ TV_TREE_PREFETCH, /* tv_id. */
|
||
+ (PROP_cfg | PROP_ssa), /* properties_required. */
|
||
+ 0, /* properties_provided. */
|
||
+ 0, /* properties_destroyed. */
|
||
+ 0, /* todo_flags_start. */
|
||
+ 0, /* todo_flags_finish. */
|
||
+};
|
||
+
|
||
+class pass_llc_allocate : public gimple_opt_pass
|
||
+{
|
||
+public:
|
||
+ pass_llc_allocate (gcc::context *ctxt)
|
||
+ : gimple_opt_pass (pass_data_llc_allocate, ctxt)
|
||
+ {}
|
||
+
|
||
+ /* opt_pass methods. */
|
||
+ virtual bool gate (function *)
|
||
+ {
|
||
+ return (optimize >= 2 && flag_llc_allocate > 0);
|
||
+ }
|
||
+ virtual unsigned int execute (function *);
|
||
+
|
||
+}; // class pass_llc_allocate
|
||
+
|
||
+unsigned int
|
||
+pass_llc_allocate::execute (function *fn)
|
||
+{
|
||
+ unsigned int ret = 0;
|
||
+
|
||
+ if (!targetm.have_prefetch ()
|
||
+ || targetm.vectorize.code_for_prefetch == NULL
|
||
+ || targetm.vectorize.prefetch_handleable_mode_p == NULL
|
||
+ || targetm.vectorize.code_for_gather_prefetch == NULL)
|
||
+ return 0;
|
||
+
|
||
+ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
|
||
+ {
|
||
+ tree type = build_function_type_list (void_type_node,
|
||
+ const_ptr_type_node, NULL_TREE);
|
||
+ tree decl = add_builtin_function ("__builtin_prefetch", type,
|
||
+ BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
|
||
+ NULL, NULL_TREE);
|
||
+ DECL_IS_NOVOPS (decl) = true;
|
||
+ set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
|
||
+ }
|
||
+
|
||
+ dump_param ();
|
||
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
||
+ fprintf (dump_file, "llc_allocate: %s\n",
|
||
+ IDENTIFIER_POINTER (DECL_NAME (fn->decl)));
|
||
+
|
||
+ if (number_of_loops (fn) <= 1 || !func_location_p (fn)
|
||
+ || operator_func_p (fn))
|
||
+ return ret;
|
||
+
|
||
+ dump_function_info (fn);
|
||
+
|
||
+ llc_allocate ();
|
||
+
|
||
+ return ret;
|
||
+}
|
||
+
|
||
+} // anon namespace
|
||
+
|
||
+gimple_opt_pass *
|
||
+make_pass_llc_allocate (gcc::context *ctxt)
|
||
+{
|
||
+ return new pass_llc_allocate (ctxt);
|
||
+}
|
||
diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
|
||
index 7775bc727..c500d5e20 100644
|
||
--- a/gcc/tree-ssa-loop-niter.c
|
||
+++ b/gcc/tree-ssa-loop-niter.c
|
||
@@ -2384,6 +2384,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit)
|
||
return true;
|
||
}
|
||
|
||
+/* Returns whether the number of vectorized iterations for the loop can be
|
||
+ estimated from the given IR and update the corresponding loop attribute,
|
||
+ e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... }); */
|
||
+
|
||
+bool
|
||
+number_of_iterations_vect (class loop *loop, tree lhs, tree rhs)
|
||
+{
|
||
+ loop->vec_nb_iterations = chrec_dont_know;
|
||
+
|
||
+ if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME)
|
||
+ || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME))
|
||
+ return false;
|
||
+
|
||
+ tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs;
|
||
+ gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
|
||
+
|
||
+ if (gimple_code (def_stmt) != GIMPLE_CALL
|
||
+ || !gimple_call_internal_p (def_stmt))
|
||
+ return false;
|
||
+
|
||
+ internal_fn ifn = gimple_call_internal_fn (def_stmt);
|
||
+ if (ifn != IFN_WHILE_ULT)
|
||
+ return false;
|
||
+
|
||
+ gcall *call = dyn_cast<gcall *> (def_stmt);
|
||
+ tree niters = gimple_call_arg (call, 1);
|
||
+ loop->vec_nb_iterations = niters;
|
||
+
|
||
+ return true;
|
||
+}
|
||
+
|
||
/* Stores description of number of iterations of LOOP derived from
|
||
EXIT (an exit edge of the LOOP) in NITER. Returns true if some useful
|
||
information could be derived (and fields of NITER have meaning described
|
||
@@ -2454,6 +2485,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
|
||
op1 = gimple_cond_rhs (stmt);
|
||
type = TREE_TYPE (op0);
|
||
|
||
+ if (TREE_CODE (type) == VECTOR_TYPE)
|
||
+ number_of_iterations_vect (loop, op0, op1);
|
||
+
|
||
if (TREE_CODE (type) != INTEGER_TYPE
|
||
&& !POINTER_TYPE_P (type))
|
||
return false;
|
||
@@ -2730,14 +2764,14 @@ bool
|
||
number_of_iterations_exit (class loop *loop, edge exit,
|
||
class tree_niter_desc *niter,
|
||
bool warn, bool every_iteration,
|
||
- basic_block *body)
|
||
+ basic_block *body, bool guarantee)
|
||
{
|
||
gcond *stmt;
|
||
if (!number_of_iterations_exit_assumptions (loop, exit, niter,
|
||
&stmt, every_iteration, body))
|
||
return false;
|
||
|
||
- if (integer_nonzerop (niter->assumptions))
|
||
+ if (integer_nonzerop (niter->assumptions) || guarantee == false)
|
||
return true;
|
||
|
||
if (warn && dump_enabled_p ())
|
||
diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
|
||
index eb8d15794..d38472e52 100644
|
||
--- a/gcc/tree-ssa-loop-niter.h
|
||
+++ b/gcc/tree-ssa-loop-niter.h
|
||
@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
|
||
extern bool number_of_iterations_exit (class loop *, edge,
|
||
class tree_niter_desc *niter, bool,
|
||
bool every_iteration = true,
|
||
- basic_block * = NULL);
|
||
+ basic_block * = NULL,
|
||
+ bool guarantee = true);
|
||
extern bool number_of_iterations_exit_assumptions (class loop *, edge,
|
||
class tree_niter_desc *,
|
||
gcond **, bool = true,
|
||
--
|
||
2.33.0
|
||
|