!429 Revert last two commits and sync patch from openeuler/gcc

From: @lesleyzheng1103 
Reviewed-by: @huang-xiaoquan 
Signed-off-by: @huang-xiaoquan
openeuler-ci-bot 2024-05-29 04:48:44 +00:00 committed by Gitee
commit d0fd3414e4
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
13 changed files with 14108 additions and 13 deletions

File diff suppressed because it is too large.

@@ -0,0 +1,109 @@
From 7acb88ae27eb3e1af0da866d433968143c7754bd Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 20/33] Perform early if-conversion of simple arithmetic
---
gcc/common.opt | 4 ++++
gcc/match.pd | 25 +++++++++++++++++++
gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
3 files changed, 66 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 6f0ed7cea..6950756fd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1721,6 +1721,10 @@ fif-conversion2
Common Report Var(flag_if_conversion2) Optimization
Perform conversion of conditional jumps to conditional execution.
+fif-conversion-gimple
+Common Report Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 01f81b063..e98cd02e0 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3402,6 +3402,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
)
)
)
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+ (simplify
+ (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+ (switch
+ /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
+ (if (integer_onep (@2))
+ (simple_op @1 (convert (convert:boolean_type_node @0))))
+ /* a = cond ? a + powerof2cst : a ->
+ a = a + ((int) cond) << log2 (powerof2cst) */
+ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+ (with
+ {
+ tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+ }
+ (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+ { shift; })
+ )
+ )
+ )
+ )
+ )
+ )
+)
#endif
#if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+ if (optimizable_int > 5)
+ ++optimizable_int;
+ return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+ if (optimizable_int_pow2 <= 4)
+ optimizable_int_pow2 += 1024;
+ return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+ if (not_optimizable_int_non_pow2 == 1)
+ not_optimizable_int_non_pow2 += 513;
+ return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+ if (not_optimizable_float > 5)
+ not_optimizable_float += 1;
+ return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only. */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2. */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0
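
To illustrate the effect of the new rule, the branchless form it aims for can be written out by hand. The sketch below mirrors test_int_pow2 from the new test and only illustrates the intended rewrite; it is not output of the pass:

    /* a = cond ? a + 1024 : a  becomes  a = a + (((int) cond) << 10),
       since 1024 == 1 << 10 and the boolean condition converts to 0 or 1.  */
    int test_int_pow2_branchless (int a)
    {
      int cond = (a <= 4);       /* condition materialized as 0 or 1 */
      return a + (cond << 10);   /* adds 1024 only when the condition holds */
    }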

@@ -0,0 +1,236 @@
From f788555b23b0b676729bb695af96954fe083e354 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 21/33] Add option to allow matching uaddsub overflow for widen
ops too.
---
gcc/common.opt | 5 ++
gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
gcc/tree-ssa-math-opts.c | 35 +++++++-
3 files changed, 179 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 6950756fd..c2f01bbc0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2989,6 +2989,11 @@ freciprocal-math
Common Report Var(flag_reciprocal_math) SetByCombined Optimization
Same as -fassociative-math for expressions which include division.
+fuaddsub-overflow-match-all
+Common Report Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
; Nonzero means that unsafe floating-point math optimizations are allowed
; for the sake of speed. IEEE compliance is not guaranteed, and operations
; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a + b;
+ uint8_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a + b;
+ uint16_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a + b;
+ uint32_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a + b;
+ uint64_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a + b;
+ uint128_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a - b;
+ uint8_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a - b;
+ uint16_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a - b;
+ uint32_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a - b;
+ uint64_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a - b;
+ uint128_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 4c89fddcf..716bf9e35 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -3290,6 +3290,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
}
}
+/* Check if the corresponding operation has a wider equivalent on the target. */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+ machine_mode wider_mode;
+ FOR_EACH_WIDER_MODE (wider_mode, mode)
+ {
+ machine_mode next_mode;
+ if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+ || (op == smul_optab
+ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+ && (find_widening_optab_handler ((unsignedp
+ ? umul_widen_optab
+ : smul_widen_optab),
+ next_mode, mode))))
+ return true;
+ }
+
+ return false;
+}
/* Helper function of match_uaddsub_overflow. Return 1
if USE_STMT is unsigned overflow check ovf != 0 for
@@ -3390,12 +3411,18 @@ match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
gimple *use_stmt;
gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
+ optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+ machine_mode mode = TYPE_MODE (type);
+ int unsignedp = TYPE_UNSIGNED (type);
if (!INTEGRAL_TYPE_P (type)
- || !TYPE_UNSIGNED (type)
+ || !unsignedp
|| has_zero_uses (lhs)
- || has_single_use (lhs)
- || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
- TYPE_MODE (type)) == CODE_FOR_nothing)
+ || has_single_use (lhs))
+ return false;
+
+ if (optab_handler (op, mode) == CODE_FOR_nothing
+ && (!flag_uaddsub_overflow_match_all
+ || !wider_optab_check_p (op, mode, unsignedp)))
return false;
FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
--
2.33.0
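
For context, the open-coded carry computation matched by match_uaddsub_overflow corresponds to GCC's add-with-overflow built-in; the pass replaces the comparison-based check with an IFN_ADD_OVERFLOW call. A hand-written equivalent of add64 from the new test (illustrative only):

    #include <stdint.h>

    /* Same result as add64 above, written with the overflow built-in.  */
    uint64_t add64_builtin (uint32_t a, uint32_t b)
    {
      uint32_t sum;
      uint64_t carry = __builtin_add_overflow (a, b, &sum) ? 1 : 0;
      return (carry << 32) + sum;
    }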

@@ -0,0 +1,488 @@
From 3be7a26a08772d014f54f7b1a0555ccca91115d6 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Wed, 25 Jan 2023 15:04:07 +0300
Subject: [PATCH 22/33] Match double sized mul pattern
---
gcc/match.pd | 136 +++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++
gcc/tree-ssa-math-opts.c | 80 ++++++++++++
4 files changed, 419 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
diff --git a/gcc/match.pd b/gcc/match.pd
index e98cd02e0..74f8ab999 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6390,3 +6390,139 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
to the number of trailing zeroes. */
(match (ctz_table_index @1 @2 @3)
(rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3))
+
+/* Match multiplication with double sized result.
+
+ Consider the following calculations:
+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
+ * (2^(bit_size/2) * arg1_hi + arg1_lo)
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
+ + arg0_lo * arg1_lo
+
+ The products of the high and low parts fit in bit_size values, thus they are
+ placed in the high and low parts of the result respectively.
+
+ The sum of the mixed products may overflow, so we need to detect that.
+ It also has a bit_size/2 offset, thus it intersects with both the high and
+ low parts of the result; the overflow detection constant is bit_size/2
+ because of this.
+
+ With this info:
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * middle
+ + 2^bit_size * possible_middle_overflow
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
+ + possible_middle_overflow)
+ + 2^(bit_size/2) * middle_lo
+ + arg0_lo * arg1_lo
+
+ The last sum can produce overflow for the high result part. With this:
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
+ + possible_res_lo_overflow + middle_hi)
+ + res_lo
+ = res_hi + res_lo
+
+ This formula is too big to fit into one match pattern with all of the
+ combinations of terms inside it, so there are many helpers for better code
+ readability.
+
+ The simplification basis is res_hi: computing res_lo alone is assumed not
+ to be a practical case for such calculations.
+
+ Overflow handling is done via matching complex calculations:
+ the realpart and imagpart are quite handy here. */
+/* Match low and high parts of the argument. */
+(match (double_size_mul_arg_lo @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1)
+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
+(match (double_size_mul_arg_hi @0 @1)
+ (rshift @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
+
+/* Match various argument parts products. */
+(match (double_size_mul_lolo @0 @1)
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_hihi @0 @1)
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_lohi @0 @1)
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+
+/* Match complex middle sum. */
+(match (double_size_mul_middle_complex @0 @1)
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
+ (if (num_imm_uses (@2) == 2)))
+
+/* Match real middle results. */
+(match (double_size_mul_middle @0 @1)
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
+ (if (num_imm_uses (@2) == 2)))
+(match (double_size_mul_middleres_lo @0 @1)
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+(match (double_size_mul_middleres_hi @0 @1)
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+
+/* Match low result part. */
+/* Number of uses may be < 2 in case when we are interested in
+ high part only. */
+(match (double_size_mul_res_lo_complex @0 @1)
+ (IFN_ADD_OVERFLOW:c@2
+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
+ (if (num_imm_uses (@2) <= 2)))
+(match (double_size_mul_res_lo @0 @1)
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
+
+/* Match overflow terms. */
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
+ (convert@4 (ne@3
+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
+(match (double_size_mul_overflow_check_hi @0 @1)
+ (lshift@6 (convert@5 (ne@4
+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
+ INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3) && single_use (@4) && single_use (@5)
+ && single_use (@6))))
+
+/* Match all possible permutations for high result part calculations. */
+(for op1 (double_size_mul_hihi
+ double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi)
+ op2 (double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi
+ double_size_mul_hihi)
+ op3 (double_size_mul_middleres_hi
+ double_size_mul_hihi
+ double_size_mul_overflow_check_hi)
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@4 (op1:c @0 @1)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (op2:c @0 @1)
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5)))))
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
new file mode 100644
index 000000000..4d475cc8a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
@@ -0,0 +1,141 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
+ proper overflow detection in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+
+uint16_t mul16 (uint8_t a, uint8_t b)
+{
+ uint8_t a_lo = a & 0xF;
+ uint8_t b_lo = b & 0xF;
+ uint8_t a_hi = a >> 4;
+ uint8_t b_hi = b >> 4;
+ uint8_t lolo = a_lo * b_lo;
+ uint8_t lohi = a_lo * b_hi;
+ uint8_t hilo = a_hi * b_lo;
+ uint8_t hihi = a_hi * b_hi;
+ uint8_t middle = hilo + lohi;
+ uint8_t middle_hi = middle >> 4;
+ uint8_t middle_lo = middle << 4;
+ uint8_t res_lo = lolo + middle_lo;
+ uint8_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10 : 0);
+ uint16_t res = ((uint16_t) res_hi) << 8;
+ res += res_lo;
+ return res;
+}
+
+uint32_t mul32 (uint16_t a, uint16_t b)
+{
+ uint16_t a_lo = a & 0xFF;
+ uint16_t b_lo = b & 0xFF;
+ uint16_t a_hi = a >> 8;
+ uint16_t b_hi = b >> 8;
+ uint16_t lolo = a_lo * b_lo;
+ uint16_t lohi = a_lo * b_hi;
+ uint16_t hilo = a_hi * b_lo;
+ uint16_t hihi = a_hi * b_hi;
+ uint16_t middle = hilo + lohi;
+ uint16_t middle_hi = middle >> 8;
+ uint16_t middle_lo = middle << 8;
+ uint16_t res_lo = lolo + middle_lo;
+ uint16_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100 : 0);
+ uint32_t res = ((uint32_t) res_hi) << 16;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64 (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128 (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100000000 : 0);
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64_perm (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128_perm (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
new file mode 100644
index 000000000..cc6e5af25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple is required for proper overflow detection
+ in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res + lolo;
+}
+
+uint256_t mul256 (uint128_t a, uint128_t b)
+{
+ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
+ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
+ uint128_t a_hi = a >> 64;
+ uint128_t b_hi = b >> 64;
+ uint128_t lolo = a_lo * b_lo;
+ uint128_t lohi = a_lo * b_hi;
+ uint128_t hilo = a_hi * b_lo;
+ uint128_t hihi = a_hi * b_hi;
+ uint128_t middle = hilo + lohi;
+ uint128_t middle_hi = middle >> 64;
+ uint128_t middle_lo = middle << 64;
+ uint128_t res_lo = lolo + middle_lo;
+ uint128_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ /* Workaround for the 'constant is too big' warning.  */
+ uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+ overflow_tmp <<= 64;
+ res_hi += overflow_tmp;
+ uint256_t res;
+ res.lo = res_lo;
+ res.hi = res_hi;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 716bf9e35..a81d7501c 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -182,6 +182,9 @@ static struct
/* Number of divmod calls inserted. */
int divmod_calls_inserted;
+
+ /* Number of optimized double sized multiplications. */
+ int double_sized_mul_optimized;
} widen_mul_stats;
/* The instance of "struct occurrence" representing the highest
@@ -3708,6 +3711,78 @@ convert_to_divmod (gassign *stmt)
return true;
}
+/* Pattern matcher for double sized multiplication defined in match.pd. */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+ gimple *use_stmt, *complex_res_lo;
+ gimple_stmt_iterator insert_before;
+ imm_use_iterator use_iter;
+ tree match[4]; // arg0, arg1, res_hi, complex_res_lo
+ tree arg0, arg1, widen_mult, new_type, tmp;
+ tree lhs = gimple_assign_lhs (stmt);
+ location_t loc = UNKNOWN_LOCATION;
+ machine_mode mode;
+
+ if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+ return false;
+
+ new_type = build_nonstandard_integer_type (
+ TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+ mode = TYPE_MODE (new_type);
+
+ /* Return early if the widened multiplication isn't available on the target. */
+ if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+ && !wider_optab_check_p (smul_optab, mode, 1))
+ return false;
+
+ /* Determine the point where the wide multiplication
+ should be inserted. Complex low res is OK since it is required
+ by both high and low part getters, thus it dominates both of them. */
+ complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+ insert_before = gsi_for_stmt (complex_res_lo);
+ gsi_next (&insert_before);
+
+ /* Create the widen multiplication. */
+ arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+ arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+ widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+ MULT_EXPR, arg0, arg1);
+
+ /* Find the mult low part getter. */
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+ if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+ BREAK_FROM_IMM_USE_STMT (use_iter);
+
+ /* Create high and low (if needed) parts extractors. */
+ /* Low part. */
+ if (use_stmt)
+ {
+ loc = gimple_location (use_stmt);
+ tmp = build_and_insert_cast (&insert_before, loc,
+ TREE_TYPE (gimple_get_lhs (use_stmt)),
+ widen_mult);
+ gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+ NOP_EXPR, tmp);
+ gsi_replace (&insert_before, new_stmt, true);
+ }
+
+ /* High part. */
+ loc = gimple_location (stmt);
+ tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+ RSHIFT_EXPR, widen_mult,
+ build_int_cst (new_type,
+ TYPE_PRECISION (new_type) / 2));
+ tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+ gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+ gsi_replace (gsi, new_stmt, true);
+
+ widen_mul_stats.double_sized_mul_optimized++;
+ return true;
+}
+
/* Find integer multiplications where the operands are extended from
smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
where appropriate. */
@@ -3801,6 +3876,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
break;
case PLUS_EXPR:
+ if (convert_double_size_mul (&gsi, stmt))
+ break;
+ __attribute__ ((fallthrough));
case MINUS_EXPR:
if (!convert_plusminus_to_widen (&gsi, stmt, code))
match_uaddsub_overflow (&gsi, stmt, code);
@@ -3892,6 +3970,8 @@ pass_optimize_widening_mul::execute (function *fun)
widen_mul_stats.fmas_inserted);
statistics_counter_event (fun, "divmod calls inserted",
widen_mul_stats.divmod_calls_inserted);
+ statistics_counter_event (fun, "double sized mul optimized",
+ widen_mul_stats.double_sized_mul_optimized);
return cfg_changed ? TODO_cleanup_cfg : 0;
}
--
2.33.0
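
For context, the replacement performed by the new convert_double_size_mul amounts to a single widening multiplication plus shifts. A hand-written sketch for the mul64 case from the test (not compiler output):

    #include <stdint.h>

    /* One widening multiply instead of four partial products and carry fixups.  */
    uint64_t mul64_widened (uint32_t a, uint32_t b)
    {
      uint64_t wide = (uint64_t) a * b;           /* single widen_mult            */
      uint32_t res_lo = (uint32_t) wide;          /* low half, for REALPART uses  */
      uint32_t res_hi = (uint32_t) (wide >> 32);  /* high half, bit_size/2 shift  */
      return ((uint64_t) res_hi << 32) + res_lo;
    }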

File diff suppressed because it is too large.

@@ -0,0 +1,194 @@
From 80b7de670da46d8921118799904cba4a0753bb72 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Wed, 23 Aug 2023 15:03:00 +0300
Subject: [PATCH 09/13] add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.c | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6049adc3f..f4213fd62 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4719,6 +4719,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cbdde11b0..7a00a0817 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12659,6 +12659,21 @@ cost_minus:
return true;
}
+ /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern
+ matches the condition. The costs of cmlt and sub instructions
+ are comparable, so we are not increasing the cost here. */
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+ && GET_CODE (op1) == AND)
+ {
+ rtx op0_subop0 = XEXP (op0, 0);
+ if (rtx_equal_p (op0_subop0, op1))
+ {
+ rtx lshrt_op = XEXP (op0_subop0, 0);
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
+ return true;
+ }
+ }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode, &int_mode)
&& aarch64_rtx_arith_op_extract_p (op1, int_mode))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index bb888461a..c42494036 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -273,6 +273,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Report Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0a7145281..d3be06c6f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1228,7 +1228,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 1754b1eff..de58562a7 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -47,6 +47,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It is inspired by the sources of the x264 codec. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
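
The identity behind the new *aarch64_cmlt_as_arith pattern can be checked in scalar C. The program below is hand-written and not part of the patch; it shows that the arithmetic form from the comment above yields the same per-lane all-ones mask as comparing each 16-bit lane with zero, which is what a per-lane cmlt #0 computes:

    #include <stdint.h>
    #include <assert.h>

    /* Arithmetic form matched by the pattern (SImode value, two 16-bit lanes).  */
    static uint32_t cmlt_as_arith (uint32_t a)
    {
      uint32_t m = (a >> 15) & 0x00010001;
      return (m << 16) - m;
    }

    /* Per-lane "less than zero" mask, i.e. the effect of cmlt #0 on 16-bit lanes.  */
    static uint32_t cmlt_per_lane (uint32_t a)
    {
      uint16_t lo = (int16_t) (a & 0xFFFF) < 0 ? 0xFFFF : 0;
      uint16_t hi = (int16_t) (a >> 16) < 0 ? 0xFFFF : 0;
      return ((uint32_t) hi << 16) | lo;
    }

    int main (void)
    {
      assert (cmlt_as_arith (0x80001234) == cmlt_per_lane (0x80001234));
      assert (cmlt_as_arith (0x7FFF8000) == cmlt_per_lane (0x7FFF8000));
      assert (cmlt_as_arith (0x80008000) == cmlt_per_lane (0x80008000));
      return 0;
    }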

@@ -0,0 +1,502 @@
From df68d120a049049671e44f6cda51e96a9a82c613 Mon Sep 17 00:00:00 2001
From: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com>
Date: Mon, 28 Nov 2022 14:16:48 +0300
Subject: [PATCH 10/13] Introduce RTL ifcvt enhancements
It is controlled by option -fifcvt-allow-complicated-cmps, allowing
ifcvt to deal with complicated cmps like
if (cmp)
X = reg1
else
X = reg2 + reg3
and
if (cmp)
X = reg1 + reg3
Y = reg2 + reg4
Z = reg3
Parameter -param=ifcvt-allow-register-renaming=[0,1,2] allows ifcvt to
aggressively rename registers in basic blocks.
* 0: does not allow ifcvt to rename registers
* 1: allows ifcvt to rename registers in then and else bb
* 2: allows ifcvt to rename registers in the condition and else/then bb
---
gcc/ifcvt.c | 298 ++++++++++++++++++++++++++++++++++++++-----------
gcc/params.opt | 8 ++
2 files changed, 240 insertions(+), 66 deletions(-)
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 2452f231c..50a73a7ca 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1,5 +1,5 @@
/* If-conversion support.
- Copyright (C) 2000-2020 Free Software Foundation, Inc.
+ Copyright (C) 2000-2022 Free Software Foundation, Inc.
This file is part of GCC.
@@ -876,7 +876,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+ if (!param_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1743,8 +1745,9 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
/* Don't even try if the comparison operands are weird
except that the target supports cbranchcc4. */
- if (! general_operand (cmp_a, GET_MODE (cmp_a))
- || ! general_operand (cmp_b, GET_MODE (cmp_b)))
+ if (! param_ifcvt_allow_complicated_cmps
+ && (! general_operand (cmp_a, GET_MODE (cmp_a))
+ || ! general_operand (cmp_b, GET_MODE (cmp_b))))
{
if (!have_cbranchcc4
|| GET_MODE_CLASS (GET_MODE (cmp_a)) != MODE_CC
@@ -1915,19 +1918,6 @@ noce_try_cmove (struct noce_if_info *if_info)
return FALSE;
}
-/* Return true if X contains a conditional code mode rtx. */
-
-static bool
-contains_ccmode_rtx_p (rtx x)
-{
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, x, ALL)
- if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
- return true;
-
- return false;
-}
-
/* Helper for bb_valid_for_noce_process_p. Validate that
the rtx insn INSN is a single set that does not set
the conditional register CC and is in general valid for
@@ -1946,7 +1936,6 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
- || contains_ccmode_rtx_p (SET_DEST (sset))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1960,13 +1949,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
in this function. */
static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+ basic_block bb_b,
+ rtx to_rename,
+ bitmap conflict_regs)
{
rtx_insn *a_insn;
bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+ bitmap intersections = BITMAP_ALLOC (&reg_obstack);
df_ref def;
df_ref use;
+ rtx_insn *last_a = last_active_insn (bb_a, FALSE);
FOR_BB_INSNS (bb_a, a_insn)
{
@@ -1976,30 +1969,25 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_a = single_set (a_insn);
if (!sset_a)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
+ if (a_insn == last_a)
+ continue;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
if (!(to_rename && DF_REF_REG (def) == to_rename))
bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
rtx_insn *b_insn;
-
FOR_BB_INSNS (bb_b, b_insn)
{
if (!active_insn_p (b_insn))
continue;
-
rtx sset_b = single_set (b_insn);
if (!sset_b)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Make sure this is a REG and not some instance
of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2011,25 +1999,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
if (MEM_P (SET_DEST (sset_b)))
gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
else if (!REG_P (SET_DEST (sset_b)))
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
- /* If the insn uses a reg set in BB_A return false. */
+ /* If the insn uses a reg set in BB_A return false
+ or try to collect register list for renaming. */
FOR_EACH_INSN_USE (use, b_insn)
{
- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
{
- BITMAP_FREE (bba_sets);
- return false;
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto end_cmove_arith_check_and_fail;
+
+ /* Those regs should be renamed. We can't rename CC reg, but
+ possibly we can provide combined comparison in the future. */
+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+ goto end_cmove_arith_check_and_fail;
+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
}
}
-
}
BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
return true;
+
+end_cmove_arith_check_and_fail:
+ BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
+ return false;
}
/* Emit copies of all the active instructions in BB except the last.
@@ -2084,6 +2081,134 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
+/* This function tries to rename regs used by the condition that are also set in the considered bbs. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+ bool success = true;
+ if (bitmap_empty_p (cond_rename_regs))
+ return true;
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+ rtx_insn* cmp_insn = if_info->cond_earliest;
+ /* A jump instruction as a condition is currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+ rtx_insn* before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+ FOR_EACH_INSN_USE (use, cmp_insn)
+ {
+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+ {
+ rtx use_reg = DF_REF_REG (use);
+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+ {
+ end_sequence ();
+ return false;
+ }
+ noce_emit_move_insn (tmp, use_reg);
+ }
+ }
+
+ emit_insn (PATTERN (copy_of_cmp));
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+ rtx_insn* insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+ if_info->cond = noce_get_condition (if_info->jump,
+ &copy_of_cmp,
+ if_info->then_else_reversed);
+ if_info->cond_earliest = copy_of_cmp;
+ if_info->rev_cond = NULL_RTX;
+
+ return success;
+}
+
+/* This function tries to rename regs that intersect with the considered bb. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+ if (bitmap_empty_p (rename_regs))
+ return true;
+ rtx_insn* insn;
+ rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+ bool res = true;
+ start_sequence ();
+ FOR_BB_INSNS (test_bb, insn)
+ {
+ if (!active_insn_p (insn))
+ continue;
+ /* Only ssets are supported for now. */
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
+ rtx x = SET_DEST (sset);
+ if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x)))
+ continue;
+
+ machine_mode mode = GET_MODE (x);
+ rtx tmp = gen_reg_rtx (mode);
+ if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+ {
+ gcc_assert (insn != last_insn);
+ /* We could generate an additional move for such a case,
+ but it would increase register pressure.
+ For now just stop the transformation. */
+ rtx result_rtx = SET_DEST (single_set (last_insn));
+ if (REG_P (result_rtx) && (x != result_rtx))
+ {
+ res = false;
+ break;
+ }
+ if (!validate_replace_rtx (x, tmp, insn))
+ gcc_unreachable ();
+ noce_emit_move_insn (tmp, x);
+ }
+ set_used_flags (insn);
+ rtx_insn* rename_candidate;
+ for (rename_candidate = NEXT_INSN (insn);
+ rename_candidate && rename_candidate != NEXT_INSN (BB_END (test_bb));
+ rename_candidate = NEXT_INSN (rename_candidate))
+ {
+ if (!reg_overlap_mentioned_p (x, rename_candidate))
+ continue;
+
+ int replace_res = TRUE;
+ if (rename_candidate == last_insn)
+ {
+ validate_replace_src_group (x, tmp, rename_candidate);
+ replace_res = apply_change_group ();
+ }
+ else
+ replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+ gcc_assert (replace_res);
+ set_used_flags (rename_candidate);
+
+ }
+ set_used_flags (x);
+ set_used_flags (tmp);
+
+ }
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
+ FOR_BB_INSNS (test_bb, insn)
+ df_insn_rescan (insn);
+ return res;
+}
+
/* Try more complex cases involving conditional_move. */
static int
@@ -2166,11 +2291,29 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
std::swap (then_bb, else_bb);
}
}
-
+ bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+ bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (then_bb && else_bb
- && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
- || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
- return FALSE;
+ && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
+ if_info->orig_x,
+ then_bb_rename_regs)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,
+ if_info->orig_x,
+ else_bb_rename_regs)))
+ {
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+ return FALSE;
+ }
+ bool prepass_renaming = true;
+ prepass_renaming |= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs);
+ prepass_renaming |= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs);
+
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+
+ if (!prepass_renaming)
+ return FALSE;
start_sequence ();
@@ -2178,7 +2321,6 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
came from the test block. The non-empty complex block that we will
emit might clobber the register used by B or A, so move it to a pseudo
first. */
-
rtx tmp_a = NULL_RTX;
rtx tmp_b = NULL_RTX;
@@ -3052,7 +3194,8 @@ noce_operand_ok (const_rtx op)
static bool
bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
- unsigned int *cost, bool *simple_p)
+ unsigned int *cost, bool *simple_p,
+ bitmap cond_rename_regs)
{
if (!test_bb)
return false;
@@ -3086,10 +3229,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *prev_last_insn = PREV_INSN (last_insn);
gcc_assert (prev_last_insn);
- /* For now, disallow setting x multiple times in test_bb. */
- if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+ if (REG_P (x)
+ && reg_set_between_p (x, first_insn, prev_last_insn)
+ && param_ifcvt_allow_register_renaming < 1)
return false;
-
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
/* The regs that are live out of test_bb. */
@@ -3099,25 +3242,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *insn;
FOR_BB_INSNS (test_bb, insn)
{
- if (insn != last_insn)
- {
- if (!active_insn_p (insn))
- continue;
+ if (insn == last_insn)
+ continue;
+ if (!active_insn_p (insn))
+ continue;
- if (!insn_valid_noce_process_p (insn, cc))
- goto free_bitmap_and_fail;
+ if (!insn_valid_noce_process_p (insn, cc))
+ goto free_bitmap_and_fail;
- rtx sset = single_set (insn);
- gcc_assert (sset);
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
- if (contains_mem_rtx_p (SET_SRC (sset))
- || !REG_P (SET_DEST (sset))
- || reg_overlap_mentioned_p (SET_DEST (sset), cond))
- goto free_bitmap_and_fail;
+ if (contains_mem_rtx_p (SET_SRC (sset))
+ || !REG_P (SET_DEST (sset)))
+ goto free_bitmap_and_fail;
- potential_cost += pattern_cost (sset, speed_p);
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+ if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+ {
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto free_bitmap_and_fail;
+ rtx sset_dest = SET_DEST (sset);
+ if (REG_P (sset_dest)
+ && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+ bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+ else
+ goto free_bitmap_and_fail;
}
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3475,14 +3628,27 @@ noce_process_if_block (struct noce_if_info *if_info)
bool speed_p = optimize_bb_for_speed_p (test_bb);
unsigned int then_cost = 0, else_cost = 0;
+ bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
- &if_info->then_simple))
- return false;
+ &if_info->then_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
if (else_bb
&& !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
- &if_info->else_simple))
+ &if_info->else_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+
+ if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
return false;
+ cond = if_info->cond;
+
+ BITMAP_FREE (cond_rename_regs);
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
@@ -5426,7 +5592,7 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
-
+ cleanup_cfg (CLEANUP_EXPENSIVE);
if (optimize == 1)
{
df_live_add_problem ();
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..345f9b3ff 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -574,6 +574,14 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+-param=ifcvt-allow-complicated-cmps=
+Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
-param=max-sched-extend-regions-iters=
Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
The maximum number of iterations through CFG to extend regions.
--
2.33.0
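
For context, the first shape from the commit message, written as compilable C (an assumed example; it is not taken from the patch or its tests):

    /* With the new flag and parameter the arm that needs extra arithmetic can
       still be if-converted into a conditional move instead of a branch.  */
    long pick (long cmp, long reg1, long reg2, long reg3)
    {
      long x;
      if (cmp)
        x = reg1;
      else
        x = reg2 + reg3;
      return x;
    }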

@@ -0,0 +1,239 @@
From f43bdfbdcfdeb425a0bd303f4787a13323fd2934 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Sep 2023 11:07:29 +0800
Subject: [PATCH 11/13] Add more flexible check for pointer aliasing during
vectorization
It takes the minimum of the iteration count and the segment length, which helps
to speed up loops with a small number of iterations when only the tail can be
vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.c | 68 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.c | 14 +++-
5 files changed, 95 insertions(+), 26 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..7f335a94b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -964,6 +964,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
Use the minimum of the different segment lengths. Currently the minimum of the
iteration count and the vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 2cb54def8..8c5f1048c 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -2071,31 +2071,14 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
- tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2112,9 +2095,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2221,6 +2201,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 771d20fbb..5903ce66a 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -208,12 +208,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e4466a4f3..1b8a03c9c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3498,6 +3498,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3598,6 +3599,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3606,8 +3609,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should remember scalar_loop_iters;
+ this helps to create a flexible aliasing check
+ for a small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (dr_info_a);
access_size_b = vect_vfa_access_size (dr_info_b);
@@ -3652,9 +3662,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
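
For context, a rough C sketch of the runtime guard this change arranges to emit. The names segments_disjoint and flexible_alias_check below are illustrative only, not GCC functions; the real logic in create_waw_or_war_checks also validates the DR_ALIAS_WAR/WAW flags, the equal steps, and the address precision shown above, and converts the segment lengths to sizetype with non-trapping overflow.

#include <stdbool.h>
#include <stdint.h>

/* Disjointness test for two equal-length segments starting at
   addr_a and addr_b (simplified sketch).  */
bool
segments_disjoint (uintptr_t addr_a, uintptr_t addr_b, uintptr_t seg_len)
{
  return addr_a + seg_len <= addr_b || addr_b + seg_len <= addr_a;
}

/* The flexible form ORs the usual guard (seg_len, derived from the
   vectorization factor) with a second guard (seg_len2, derived from
   scalar_loop_iters), mirroring the TRUTH_OR_EXPR built in the patch.  */
bool
flexible_alias_check (uintptr_t addr_a, uintptr_t addr_b,
                      uintptr_t seg_len, uintptr_t seg_len2)
{
  return segments_disjoint (addr_a, addr_b, seg_len)
         || segments_disjoint (addr_a, addr_b, seg_len2);
}

With this, a loop that runs for only a few scalar iterations can still pass the alias check through the smaller seg_len2 segment even when the vectorization-factor-based segments appear to overlap.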

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,381 @@
From 4bcb19923cdcb042d66057766d661ef68bf70e92 Mon Sep 17 00:00:00 2001
From: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com>
Date: Wed, 29 Mar 2023 05:22:17 +0300
Subject: [PATCH 13/13] Fix bugs and add tests for RTL ifcvt
1. Fix a bug in RTL ifcvt that ran the pass despite a renaming failure.
2. Fix a bug that prevented the final set register from being renamed.
3. Clean up dominance info before running cleanup_cfg, to avoid fixing up
invalid dominance info.
4. Remove duplicated cleanup_cfg.
5. Add tests.
---
gcc/common.opt | 4 +
gcc/ifcvt.c | 88 ++++++++++++-------
gcc/params.opt | 4 -
.../gcc.c-torture/execute/ifcvt-renaming-1.c | 38 ++++++++
gcc/testsuite/gcc.dg/ifcvt-6.c | 29 ++++++
5 files changed, 128 insertions(+), 35 deletions(-)
create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 6f0ed7cea..92d3a1986 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3534,4 +3534,8 @@ fipa-ra
Common Report Var(flag_ipa_ra) Optimization
Use caller save register across calls if possible.
+fifcvt-allow-complicated-cmps
+Common Report Var(flag_ifcvt_allow_complicated_cmps) Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 50a73a7ca..209987ebc 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -876,7 +876,7 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (!param_ifcvt_allow_complicated_cmps
+ if (!flag_ifcvt_allow_complicated_cmps
&& (cond_complex
|| !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
@@ -1745,7 +1745,7 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
/* Don't even try if the comparison operands are weird
except that the target supports cbranchcc4. */
- if (! param_ifcvt_allow_complicated_cmps
+ if (! flag_ifcvt_allow_complicated_cmps
&& (! general_operand (cmp_a, GET_MODE (cmp_a))
|| ! general_operand (cmp_b, GET_MODE (cmp_b))))
{
@@ -1918,6 +1918,19 @@ noce_try_cmove (struct noce_if_info *if_info)
return FALSE;
}
+/* Return true if X contains a conditional code mode rtx. */
+
+static bool
+contains_ccmode_rtx_p (rtx x)
+{
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, x, ALL)
+ if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
+ return true;
+
+ return false;
+}
+
/* Helper for bb_valid_for_noce_process_p. Validate that
the rtx insn INSN is a single set that does not set
the conditional register CC and is in general valid for
@@ -1936,6 +1949,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
+ || (!flag_ifcvt_allow_complicated_cmps
+ && contains_ccmode_rtx_p (SET_DEST (sset)))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1974,8 +1989,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a,
continue;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
- if (!(to_rename && DF_REF_REG (def) == to_rename))
- bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
+ bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
@@ -1984,6 +1998,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a,
{
if (!active_insn_p (b_insn))
continue;
+
rtx sset_b = single_set (b_insn);
if (!sset_b)
@@ -2081,7 +2096,12 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
-/* This function tries to rename regs that intersect with considered bb. */
+/* This function tries to rename regs that intersect with the considered bb
+ inside the condition expression. The condition expression will be moved
+ down if the optimization is applied, so it is essential to make sure that
+ all intersecting registers are renamed; otherwise the transformation
+ cannot be applied. Returns true if the renaming was successful
+ and the optimization can proceed further. */
static bool
noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
@@ -2092,11 +2112,11 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
if (param_ifcvt_allow_register_renaming < 2)
return false;
df_ref use;
- rtx_insn* cmp_insn = if_info->cond_earliest;
+ rtx_insn *cmp_insn = if_info->cond_earliest;
/* Jump instruction as a condition is currently unsupported. */
if (JUMP_P (cmp_insn))
return false;
- rtx_insn* before_cmp = PREV_INSN (cmp_insn);
+ rtx_insn *before_cmp = PREV_INSN (cmp_insn);
start_sequence ();
rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
@@ -2122,7 +2142,7 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
delete_insn_and_edges (cmp_insn);
- rtx_insn* insn;
+ rtx_insn *insn;
FOR_BB_INSNS (cmp_block, insn)
df_insn_rescan (insn);
@@ -2135,13 +2155,15 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
return success;
}
-/* This function tries to rename regs that intersect with considered bb. */
+/* This function tries to rename regs that intersect with the considered bb.
+ Returns true if the renaming was successful and the optimization can
+ proceed further, false otherwise. */
static bool
noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
{
if (bitmap_empty_p (rename_regs))
return true;
- rtx_insn* insn;
+ rtx_insn *insn;
rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
bool res = true;
start_sequence ();
@@ -2153,7 +2175,7 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
rtx sset = single_set (insn);
gcc_assert (sset);
rtx x = SET_DEST (sset);
- if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x)))
+ if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
continue;
machine_mode mode = GET_MODE (x);
@@ -2175,7 +2197,7 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
noce_emit_move_insn (tmp,x);
}
set_used_flags (insn);
- rtx_insn* rename_candidate;
+ rtx_insn *rename_candidate;
for (rename_candidate = NEXT_INSN (insn);
rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
rename_candidate = NEXT_INSN (rename_candidate))
@@ -2193,17 +2215,16 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
replace_res = validate_replace_rtx (x, tmp, rename_candidate);
gcc_assert (replace_res);
set_used_flags (rename_candidate);
-
}
set_used_flags (x);
set_used_flags (tmp);
-
}
- rtx_insn *seq = get_insns ();
- unshare_all_rtl_in_chain (seq);
- end_sequence ();
- emit_insn_before_setloc (seq, first_active_insn (test_bb),
- INSN_LOCATION (first_active_insn (test_bb)));
+
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
FOR_BB_INSNS (test_bb, insn)
df_insn_rescan (insn);
return res;
@@ -2305,9 +2326,10 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
BITMAP_FREE (else_bb_rename_regs);
return FALSE;
}
- bool prepass_renaming = true;
- prepass_renaming |= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs);
- prepass_renaming |= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs);
+ bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
+ then_bb_rename_regs)
+ && noce_rename_regs_in_bb (else_bb,
+ else_bb_rename_regs);
BITMAP_FREE (then_bb_rename_regs);
BITMAP_FREE (else_bb_rename_regs);
@@ -2321,6 +2343,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
came from the test block. The non-empty complex block that we will
emit might clobber the register used by B or A, so move it to a pseudo
first. */
+
rtx tmp_a = NULL_RTX;
rtx tmp_b = NULL_RTX;
@@ -3233,6 +3256,7 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
&& reg_set_between_p (x, first_insn, prev_last_insn)
&& param_ifcvt_allow_register_renaming < 1)
return false;
+
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
/* The regs that are live out of test_bb. */
@@ -3268,9 +3292,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
else
goto free_bitmap_and_fail;
}
- potential_cost += pattern_cost (sset, speed_p);
- if (SET_DEST (sset) != SET_DEST (last_set))
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3645,11 +3670,12 @@ noce_process_if_block (struct noce_if_info *if_info)
}
if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
- return false;
- cond = if_info->cond;
-
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
BITMAP_FREE (cond_rename_regs);
-
+ cond = if_info->cond;
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
find_edge (test_bb, then_bb));
@@ -5592,12 +5618,13 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
- cleanup_cfg (CLEANUP_EXPENSIVE);
+
if (optimize == 1)
{
df_live_add_problem ();
df_live_set_all_dirty ();
}
+ cleanup_cfg (CLEANUP_EXPENSIVE);
/* Record whether we are after combine pass. */
ifcvt_after_combine = after_combine;
@@ -5702,7 +5729,6 @@ rest_of_handle_if_conversion (void)
dump_reg_info (dump_file);
dump_flow_info (dump_file, dump_flags);
}
- cleanup_cfg (CLEANUP_EXPENSIVE);
if_convert (false);
if (num_updated_if_blocks)
/* Get rid of any dead CC-related instructions. */
diff --git a/gcc/params.opt b/gcc/params.opt
index 345f9b3ff..272a0eb2b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -574,10 +574,6 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
--param=ifcvt-allow-complicated-cmps=
-Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization
-Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
-
-param=ifcvt-allow-register-renaming=
Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
new file mode 100644
index 000000000..761c8ab7e
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
@@ -0,0 +1,38 @@
+
+extern void abort(void);
+
+__attribute__ ((noinline))
+int foo (int x, int y, int z, int a, int b)
+{
+ if (a < 2)
+ {
+ if (a == 0)
+ {
+ if (x - y < 0)
+ x = x - y + z;
+ else
+ x = x - y;
+ }
+ else
+ {
+ if (x + y >= z)
+ x = x + y - z;
+ else
+ x = x + y;
+ }
+ }
+ return x;
+}
+
+int main(void)
+{
+ if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
+ abort ();
+ if (foo (50,10,7,0,1) != 40) // x - y = 40
+ abort ();
+ if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
+ abort ();
+ if (foo (5,10,70,1,1) != 15) // x + y = 15
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
new file mode 100644
index 000000000..7d2a8d58b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-fdump-rtl-ce1 -O2 -fifcvt-allow-complicated-cmps --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 " } */
+
+typedef unsigned int uint16_t;
+
+uint16_t
+foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
+ uint16_t b, uint16_t c, uint16_t d)
+{
+ int i = 1;
+ int j = 1;
+ if (a > b)
+ {
+ j = x;
+ if (b > c)
+ i = y;
+ else
+ i = z;
+ }
+ else
+ {
+ j = y;
+ if (c > d)
+ i = z;
+ }
+ return i * j;
+}
+
+/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
--
2.33.0
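
The new torture test exercises exactly the situation the renaming logic targets: the guarded assignments overwrite a value (x) that the condition itself reads, so, roughly speaking, the pass first copies the conflicting value into a fresh pseudo and only then evaluates both arms and selects the result. A hand-written C sketch of the intended shape follows; the names branchy and branchless are illustrative only, and the real transformation runs on RTL, typically lowering to conditional-select instructions (e.g. csel on AArch64) when it succeeds.

int
branchy (int x, int y, int z)
{
  if (x - y < 0)      /* the condition reads x and y ...          */
    x = x - y + z;    /* ... and both guarded arms also write x.  */
  else
    x = x - y;
  return x;
}

int
branchless (int x, int y, int z)
{
  int t = x - y;              /* fresh temporary introduced by renaming */
  return t < 0 ? t + z : t;   /* result selected without a branch       */
}

The first bug fix above matters precisely here: if the renaming of x into t cannot be carried out, the pass must now bail out instead of emitting an incorrect conditional-move sequence.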

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -46,7 +46,7 @@
%else
%global build_libitm 0
%endif
%global build_isl 0
%global build_isl 1
%global build_libstdcxx_docs 0
%ifarch %{ix86} x86_64 ppc ppc64 ppc64le ppc64p7 s390 s390x %{arm} aarch64 %{mips}
%global attr_ifunc 1
@ -61,7 +61,7 @@
Summary: Various compilers (C, C++, Objective-C, ...)
Name: gcc
Version: %{gcc_version}
Release: 41
Release: 42
License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
URL: https://gcc.gnu.org
@ -249,6 +249,18 @@ Patch138: 0138-Fix-ICE-bugs-in-transpose-test-cases-with-vector-ind.patch
Patch139: 0139-Fix-errors-on-testsuite-c-c-tests-and-505.mcf_r.patch
Patch140: 0140-Fix-an-error-in-memory-allocation-deallocation.patch
Patch141: 0141-Fix-warnings-and-errors-with-debug-prints.patch
Patch142: 0142-crc-loop-optimization-initial.patch
Patch143: 0143-Perform-early-if-conversion-of-simple-arithmetic.patch
Patch144: 0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
Patch145: 0145-Match-double-sized-mul-pattern.patch
Patch146: 0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch
Patch147: 0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
Patch148: 0148-Introduce-RTL-ifcvt-enhancements.patch
Patch149: 0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Patch150: 0150-Implement-propagation-of-permutations-in-fwprop.patch
Patch151: 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
Patch152: 0152-Add-LLC-Allocation-Pass.patch
Patch153: 0153-LLC-add-extending-outer-loop.patch
%global gcc_target_platform %{_arch}-linux-gnu
@ -843,6 +855,18 @@ not stable, so plugins must be rebuilt any time GCC is updated.
%patch139 -p1
%patch140 -p1
%patch141 -p1
%patch142 -p1
%patch143 -p1
%patch144 -p1
%patch145 -p1
%patch146 -p1
%patch147 -p1
%patch148 -p1
%patch149 -p1
%patch150 -p1
%patch151 -p1
%patch152 -p1
%patch153 -p1
%build
@ -908,15 +932,10 @@ CC="$CC" CFLAGS="$OPT_FLAGS" \
--with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions \
--enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu \
--enable-languages=c,c++,fortran${enablelobjc}${enablelada}${enablelgo}${enableld},lto --enable-plugin \
--enable-initfini-array --disable-libgcj --without-cloog \
--enable-initfini-array --disable-libgcj --with-isl --without-cloog \
--enable-gnu-indirect-function --build=%{gcc_target_platform} \
--with-stage1-ldflags="$OPT_LDFLAGS" \
--with-boot-ldflags="$OPT_LDFLAGS" --disable-bootstrap \
%if %{build_isl}
--with-isl \
%else
--without-isl \
%endif
%ifarch x86_64
--with-tune=generic \
--with-arch_32=x86-64 \
@ -930,11 +949,6 @@ CC="$CC" CFLAGS="$OPT_FLAGS" \
--with-arch=rv64g --with-abi=lp64d \
--disable-libquadmath --disable-multilib
%endif
%ifarch ppc64le
--disable-multilib \
--enable-targets=powerpcle-linux \
--with-cpu-32=power8 --with-tune-32=power8 --with-cpu-64=power8 --with-tune-64=power8 \
%endif
%ifarch sparc sparcv9 sparc64
make %{?_smp_mflags} BOOT_CFLAGS="$OPT_FLAGS" bootstrap
@ -2877,6 +2891,12 @@ end
%doc rpm.doc/changelogs/libcc1/ChangeLog*
%changelog
* Wed May 29 2024 zhengchenhui <zhengchenhui1@huawei.com> - 10.3.1-42
- Type:Spec
- ID:NA
- SUG:NA
- DESC: Revert last two commits about isl and ppc64le, and sync patch from openeuler/gcc
* Mon Apr 15 2024 huyubiao <huyubiao@huawei.com> - 10.3.1-41
- Type:SPEC
- ID:NA