From f43bdfbdcfdeb425a0bd303f4787a13323fd2934 Mon Sep 17 00:00:00 2001
From: vchernon
Date: Wed, 27 Sep 2023 11:07:29 +0800
Subject: [PATCH 11/13] Add more flexible check for pointer aliasing during
 vectorization

It takes the minimum of the iteration count and the segment length,
which helps to speed up loops with a small number of iterations when
only the tail can be vectorized.
---
 gcc/params.opt                                |  5 ++
 .../sve/var_stride_flexible_segment_len_1.c   | 23 +++++++
 gcc/tree-data-ref.c                           | 68 +++++++++++++------
 gcc/tree-data-ref.h                           | 11 ++-
 gcc/tree-vect-data-refs.c                     | 14 +++-
 5 files changed, 95 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c

diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..7f335a94b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -964,6 +964,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
 Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
 
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use the minimum of the candidate segment lengths in runtime alias checks.
+Currently the minimum of the iteration count and the vectorized segment length is chosen.
+
 -param=vect-max-version-for-alignment-checks=
 Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+  for (int i = 0; i < SIZE; ++i)
+    x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+   an overlap check that multiplies by (257-1)*4.  */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero.  */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 2cb54def8..8c5f1048c 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -2071,31 +2071,14 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
    same arguments.  Try to optimize cases in which the second access
    is a write and in which some overlap is valid.  */
 
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
 			  const dr_with_seg_len_pair_t &alias_pair)
 {
   const dr_with_seg_len& dr_a = alias_pair.first;
   const dr_with_seg_len& dr_b = alias_pair.second;
 
-  /* Check for cases in which:
-
-     (a) DR_B is always a write;
-     (b) the accesses are well-ordered in both the original and new code
-	 (see the comment above the DR_ALIAS_* flags for details); and
-     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
-  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
-    return false;
-
-  /* Check for equal (but possibly variable) steps.  */
   tree step = DR_STEP (dr_a.dr);
-  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
-    return false;
-
-  /* Make sure that we can operate on sizetype without loss of precision.  */
-  tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
-  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
-    return false;
 
   /* All addresses involved are known to have a common alignment ALIGN.
      We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2112,9 +2095,6 @@ create_waw_or_war_checks (tree *cond_expr,
 				    fold_convert (ssizetype, indicator),
 				    ssize_int (0));
 
-  /* Get lengths in sizetype.  */
-  tree seg_len_a
-    = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
   step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
 
   /* Each access has the following pattern:
@@ -2221,6 +2201,50 @@ create_waw_or_war_checks (tree *cond_expr,
   *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
   if (dump_enabled_p ())
     dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2.  */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+			  const dr_with_seg_len_pair_t &alias_pair)
+{
+  const dr_with_seg_len& dr_a = alias_pair.first;
+  const dr_with_seg_len& dr_b = alias_pair.second;
+
+  /* Check for cases in which:
+
+     (a) DR_B is always a write;
+     (b) the accesses are well-ordered in both the original and new code
+	 (see the comment above the DR_ALIAS_* flags for details); and
+     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
+  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+    return false;
+
+  /* Check for equal (but possibly variable) steps.  */
+  tree step = DR_STEP (dr_a.dr);
+  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+    return false;
+
+  /* Make sure that we can operate on sizetype without loss of precision.  */
+  tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+    return false;
+
+  /* Get lengths in sizetype.  */
+  tree seg_len_a
+    = fold_convert (sizetype,
+		    rewrite_to_non_trapping_overflow (dr_a.seg_len));
+  create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+  if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+    {
+      tree seg_len2_a
+	= fold_convert (sizetype,
+			rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+      tree cond_expr2;
+      create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+      *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+				*cond_expr, cond_expr2);
+    }
   return true;
 }
 
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 771d20fbb..5903ce66a 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -208,12 +208,19 @@ class dr_with_seg_len
 public:
   dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
 		   unsigned int a)
-    : dr (d), seg_len (len), access_size (size), align (a) {}
-
+    : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+    {}
+  dr_with_seg_len (data_reference_p d, tree len, tree len2,
+		   unsigned HOST_WIDE_INT size, unsigned int a)
+    : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+    {}
   data_reference_p dr;
   /* The offset of the last access that needs to be checked minus
      the offset of the first.  */
   tree seg_len;
+  /* The second version of the segment length.  Currently this is used
+     to relax the checks for loops with a small number of iterations.  */
+  tree seg_len2;
   /* A value that, when added to abs (SEG_LEN), gives the total number of
      bytes in the segment.  */
   poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e4466a4f3..1b8a03c9c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3498,6 +3498,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
     {
       poly_uint64 lower_bound;
       tree segment_length_a, segment_length_b;
+      tree segment_length2_a, segment_length2_b;
       unsigned HOST_WIDE_INT access_size_a, access_size_b;
       unsigned int align_a, align_b;
 
@@ -3598,6 +3599,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 	{
 	  segment_length_a = size_zero_node;
 	  segment_length_b = size_zero_node;
+	  segment_length2_a = size_zero_node;
+	  segment_length2_b = size_zero_node;
 	}
       else
 	{
@@ -3606,8 +3609,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 	    length_factor = scalar_loop_iters;
 	  else
 	    length_factor = size_int (vect_factor);
+	  /* In any case we should remember scalar_loop_iters, since it
+	     helps to create a flexible aliasing check for a small
+	     number of iterations.  */
 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+	  segment_length2_a
+	    = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+	  segment_length2_b
+	    = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
 	}
       access_size_a = vect_vfa_access_size (dr_info_a);
       access_size_b = vect_vfa_access_size (dr_info_b);
@@ -3652,9 +3662,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 	}
 
       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
-			    access_size_a, align_a);
+			    segment_length2_a, access_size_a, align_a);
       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
-			    access_size_b, align_b);
+			    segment_length2_b, access_size_b, align_b);
       /* Canonicalize the order to be the one that's needed for accurate
 	 RAW, WAR and WAW flags, in cases where the data references are
 	 well-ordered.  The order doesn't really matter otherwise,
-- 
2.33.0
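
To illustrate the idea in plain C (a minimal sketch, not the GIMPLE the
patch emits; no_overlap, use_vector_loop, seg_len_vf and seg_len_iters are
hypothetical names, and the real WAR/WAW limits also account for access
size, alignment and the allowed overlap): a no-overlap test becomes easier
to satisfy as the segment length shrinks, so OR-ing the check built from
the vectorization-factor segment length with the check built from the
scalar iteration count is equivalent to testing the minimum of the two
lengths, which is what the commit message describes.

/* Sketch of the versioning condition shape under
   --param=vect-alias-flexible-segment-len=1.  Hypothetical names; the
   patch builds the equivalent condition as GIMPLE trees in
   create_waw_or_war_checks / create_waw_or_war_checks2.  */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* True if [a, a + seg_len) and [b, b + seg_len) are disjoint.
   A smaller SEG_LEN makes this easier to satisfy.  */
static bool
no_overlap (uintptr_t a, uintptr_t b, size_t seg_len)
{
  return a + seg_len <= b || b + seg_len <= a;
}

/* Take the vectorized path if either the classic check (segment length
   derived from the vectorization factor, SEG_LEN_VF) or the flexible
   check (segment length derived from the scalar iteration count,
   SEG_LEN_ITERS) succeeds.  Because no_overlap is monotone in the
   length, the disjunction equals
   no_overlap (a, b, MIN (seg_len_vf, seg_len_iters)), so short-running
   loops whose iteration count is below one vector segment can still
   pass the check.  */
static bool
use_vector_loop (uintptr_t a, uintptr_t b,
		 size_t seg_len_vf, size_t seg_len_iters)
{
  return no_overlap (a, b, seg_len_vf)
	 || no_overlap (a, b, seg_len_iters);
}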