[Sync] Sync patch from openeuler/gcc
0143-Perform-early-if-conversion-of-simple-arithmetic.patch
0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
0145-Match-double-sized-mul-pattern.patch
parent 40d16c6c94
commit b10bad3541
0143-Perform-early-if-conversion-of-simple-arithmetic.patch | 109 (new file)
@@ -0,0 +1,109 @@
From 7acb88ae27eb3e1af0da866d433968143c7754bd Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 20/33] Perform early if-conversion of simple arithmetic

---
 gcc/common.opt                      |  4 ++++
 gcc/match.pd                        | 25 +++++++++++++++++++
 gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 6f0ed7cea..6950756fd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1721,6 +1721,10 @@ fif-conversion2
 Common Report Var(flag_if_conversion2) Optimization
 Perform conversion of conditional jumps to conditional execution.

+fif-conversion-gimple
+Common Report Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
 fstack-reuse=
 Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
 -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 01f81b063..e98cd02e0 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3402,6 +3402,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    )
   )
  )
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+  (simplify
+   (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+   (switch
+    /* a = cond ? a + 1 : a -> a = a + ((int) cond)  */
+    (if (integer_onep (@2))
+     (simple_op @1 (convert (convert:boolean_type_node @0))))
+    /* a = cond ? a + powerof2cst : a ->
+       a = a + ((int) cond) << log2 (powerof2cst)  */
+    (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+     (with
+      {
+	tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+      }
+      (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+			    { shift; })
+      )
+     )
+    )
+   )
+  )
+ )
+)
 #endif

 #if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+  if (optimizable_int > 5)
+    ++optimizable_int;
+  return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+  if (optimizable_int_pow2 <= 4)
+    optimizable_int_pow2 += 1024;
+  return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+  if (not_optimizable_int_non_pow2 == 1)
+    not_optimizable_int_non_pow2 += 513;
+  return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+  if (not_optimizable_float > 5)
+    not_optimizable_float += 1;
+  return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only.  */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2.  */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0
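For reference, a sketch (not part of the patch) of the rewrite the new match.pd rule performs, applied to test_int_pow2 from the testcase above; 1024 == 1 << 10, which is why the dump is expected to contain "<< 10;":

/* Branchless form of: if (x <= 4) x += 1024;  */
int test_int_pow2_branchless (int x)
{
  /* (int) (x <= 4) is 0 or 1; shifting it left by log2 (1024) = 10
     yields 0 or 1024, so no conditional jump is needed.  */
  x += ((int) (x <= 4)) << 10;
  return x;
}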
0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch | 236 (new file)
@@ -0,0 +1,236 @@
From f788555b23b0b676729bb695af96954fe083e354 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 21/33] Add option to allow matching uaddsub overflow for widen
 ops too.

---
 gcc/common.opt                 |   5 ++
 gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
 gcc/tree-ssa-math-opts.c       |  35 +++++++-
 3 files changed, 179 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 6950756fd..c2f01bbc0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2989,6 +2989,11 @@ freciprocal-math
 Common Report Var(flag_reciprocal_math) SetByCombined Optimization
 Same as -fassociative-math for expressions which include division.

+fuaddsub-overflow-match-all
+Common Report Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
 ; Nonzero means that unsafe floating-point math optimizations are allowed
 ; for the sake of speed.  IEEE compliance is not guaranteed, and operations
 ; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+  uint128_t lo;
+  uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a + b;
+  uint8_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a + b;
+  uint16_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a + b;
+  uint32_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a + b;
+  uint64_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a + b;
+  uint128_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a - b;
+  uint8_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a - b;
+  uint16_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a - b;
+  uint32_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a - b;
+  uint64_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a - b;
+  uint128_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 4c89fddcf..716bf9e35 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -3290,6 +3290,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
     }
 }

+/* Check if the corresponding operation has a wider equivalent on the target.  */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+  machine_mode wider_mode;
+  FOR_EACH_WIDER_MODE (wider_mode, mode)
+    {
+      machine_mode next_mode;
+      if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+	  || (op == smul_optab
+	      && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+	      && (find_widening_optab_handler ((unsignedp
+						? umul_widen_optab
+						: smul_widen_optab),
+					       next_mode, mode))))
+	return true;
+    }
+
+  return false;
+}

 /* Helper function of match_uaddsub_overflow.  Return 1
    if USE_STMT is unsigned overflow check ovf != 0 for
@@ -3390,12 +3411,18 @@ match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
   gimple *use_stmt;

   gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
+  optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+  machine_mode mode = TYPE_MODE (type);
+  int unsignedp = TYPE_UNSIGNED (type);
   if (!INTEGRAL_TYPE_P (type)
-      || !TYPE_UNSIGNED (type)
+      || !unsignedp
      || has_zero_uses (lhs)
-      || has_single_use (lhs)
-      || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
-			TYPE_MODE (type)) == CODE_FOR_nothing)
+      || has_single_use (lhs))
+    return false;
+
+  if (optab_handler (op, mode) == CODE_FOR_nothing
+      && (!flag_uaddsub_overflow_match_all
+	  || !wider_optab_check_p (op, mode, unsignedp)))
     return false;

   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
--
2.33.0
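For reference, a sketch (not part of the patch): the compare-based carry idiom in the functions above is the source-level equivalent of GCC's __builtin_add_overflow, and the pass rewrites it into the same .ADD_OVERFLOW internal call that the builtin expands to.  add128 restated with the builtin:

#include <stdint.h>

typedef unsigned __int128 uint128_t;

uint128_t add128_builtin (uint64_t a, uint64_t b)
{
  uint64_t tmp;
  /* Returns 1 on carry-out and stores the wrapped sum in tmp,
     which is exactly what the "tmp < a" check detects above.  */
  uint128_t res = __builtin_add_overflow (a, b, &tmp);
  res <<= 64;
  res += tmp;
  return res;
}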
0145-Match-double-sized-mul-pattern.patch | 488 (new file)
@@ -0,0 +1,488 @@
From 3be7a26a08772d014f54f7b1a0555ccca91115d6 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Wed, 25 Jan 2023 15:04:07 +0300
Subject: [PATCH 22/33] Match double sized mul pattern

---
 gcc/match.pd                              | 136 +++++++++++++++++++++
 gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
 gcc/testsuite/gcc.dg/double_sized_mul-2.c |  62 ++++++++++
 gcc/tree-ssa-math-opts.c                  |  80 ++++++++++++
 4 files changed, 419 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
 create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c

diff --git a/gcc/match.pd b/gcc/match.pd
index e98cd02e0..74f8ab999 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6390,3 +6390,139 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    to the number of trailing zeroes.  */
 (match (ctz_table_index @1 @2 @3)
   (rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3))
+
+/* Match multiplication with double sized result.
+
+   Consider the following calculations:
+   arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
+		 * (2^(bit_size/2) * arg1_hi + arg1_lo)
+   arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+		 + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
+		 + arg0_lo * arg1_lo
+
+   The products of the high and low parts fit in bit_size values, thus they
+   are placed in the high and low parts of the result respectively.
+
+   The sum of the mixed products may overflow, so we need a detection for
+   that.  It also has a bit_size/2 offset, thus it intersects with both the
+   high and low parts of the result.  The overflow detection constant is
+   bit_size/2 due to this.
+
+   With this info:
+   arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+		 + 2^(bit_size/2) * middle
+		 + 2^bit_size * possible_middle_overflow
+		 + arg0_lo * arg1_lo
+   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
+		 + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
+		 + arg0_lo * arg1_lo
+   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
+			       + possible_middle_overflow)
+		 + 2^(bit_size/2) * middle_lo
+		 + arg0_lo * arg1_lo
+
+   The last sum can produce overflow for the high result part.  With this:
+   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
+			       + possible_res_lo_overflow + middle_hi)
+		 + res_lo
+	       = res_hi + res_lo
+
+   This formula is too big to fit into one match pattern with all of the
+   combinations of terms inside it.  There are many helpers for better code
+   readability.
+
+   The simplification basis is res_hi: matching res_lo alone is not a real
+   practical case for such calculations.
+
+   Overflow handling is done via matching complex calculations:
+   the realpart and imagpart are quite handy here.  */
+/* Match low and high parts of the argument.  */
+(match (double_size_mul_arg_lo @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1)
+      == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
+(match (double_size_mul_arg_hi @0 @1)
+ (rshift @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
+
+/* Match various products of argument parts.  */
+(match (double_size_mul_lolo @0 @1)
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_hihi @0 @1)
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_lohi @0 @1)
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+
+/* Match the complex middle sum.  */
+(match (double_size_mul_middle_complex @0 @1)
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
+ (if (num_imm_uses (@2) == 2)))
+
+/* Match real middle results.  */
+(match (double_size_mul_middle @0 @1)
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
+ (if (num_imm_uses (@2) == 2)))
+(match (double_size_mul_middleres_lo @0 @1)
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+      && single_use (@3))))
+(match (double_size_mul_middleres_hi @0 @1)
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+      && single_use (@3))))
+
+/* Match the low result part.  */
+/* The number of uses may be < 2 in case we are interested in
+   the high part only.  */
+(match (double_size_mul_res_lo_complex @0 @1)
+ (IFN_ADD_OVERFLOW:c@2
+  (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
+ (if (num_imm_uses (@2) <= 2)))
+(match (double_size_mul_res_lo @0 @1)
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
+
+/* Match overflow terms.  */
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
+ (convert@4 (ne@3
+  (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
+(match (double_size_mul_overflow_check_hi @0 @1)
+ (lshift@6 (convert@5 (ne@4
+  (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
+  INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+      && single_use (@3) && single_use (@4) && single_use (@5)
+      && single_use (@6))))
+
+/* Match all possible permutations for the high result part calculations.  */
+(for op1 (double_size_mul_hihi
+	  double_size_mul_overflow_check_hi
+	  double_size_mul_middleres_hi)
+     op2 (double_size_mul_overflow_check_hi
+	  double_size_mul_middleres_hi
+	  double_size_mul_hihi)
+     op3 (double_size_mul_middleres_hi
+	  double_size_mul_hihi
+	  double_size_mul_overflow_check_hi)
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+  (plus:c@2
+   (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
+   (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
+  (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+  (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
+   (plus:c@4 (op1:c @0 @1)
+    (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+  (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+  (plus:c@2 (op1:c @0 @1)
+   (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
+    (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+  (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+  (plus:c@2 (op1:c @0 @1)
+   (plus:c@4 (op2:c @0 @1)
+    (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+  (if (single_use (@4) && single_use (@5)))))
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
new file mode 100644
index 000000000..4d475cc8a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
@@ -0,0 +1,141 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
+   proper overflow detection in some cases.  */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+
+uint16_t mul16 (uint8_t a, uint8_t b)
+{
+  uint8_t a_lo = a & 0xF;
+  uint8_t b_lo = b & 0xF;
+  uint8_t a_hi = a >> 4;
+  uint8_t b_hi = b >> 4;
+  uint8_t lolo = a_lo * b_lo;
+  uint8_t lohi = a_lo * b_hi;
+  uint8_t hilo = a_hi * b_lo;
+  uint8_t hihi = a_hi * b_hi;
+  uint8_t middle = hilo + lohi;
+  uint8_t middle_hi = middle >> 4;
+  uint8_t middle_lo = middle << 4;
+  uint8_t res_lo = lolo + middle_lo;
+  uint8_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  res_hi += (middle < hilo ? 0x10 : 0);
+  uint16_t res = ((uint16_t) res_hi) << 8;
+  res += res_lo;
+  return res;
+}
+
+uint32_t mul32 (uint16_t a, uint16_t b)
+{
+  uint16_t a_lo = a & 0xFF;
+  uint16_t b_lo = b & 0xFF;
+  uint16_t a_hi = a >> 8;
+  uint16_t b_hi = b >> 8;
+  uint16_t lolo = a_lo * b_lo;
+  uint16_t lohi = a_lo * b_hi;
+  uint16_t hilo = a_hi * b_lo;
+  uint16_t hihi = a_hi * b_hi;
+  uint16_t middle = hilo + lohi;
+  uint16_t middle_hi = middle >> 8;
+  uint16_t middle_lo = middle << 8;
+  uint16_t res_lo = lolo + middle_lo;
+  uint16_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  res_hi += (middle < hilo ? 0x100 : 0);
+  uint32_t res = ((uint32_t) res_hi) << 16;
+  res += res_lo;
+  return res;
+}
+
+uint64_t mul64 (uint32_t a, uint32_t b)
+{
+  uint32_t a_lo = a & 0xFFFF;
+  uint32_t b_lo = b & 0xFFFF;
+  uint32_t a_hi = a >> 16;
+  uint32_t b_hi = b >> 16;
+  uint32_t lolo = a_lo * b_lo;
+  uint32_t lohi = a_lo * b_hi;
+  uint32_t hilo = a_hi * b_lo;
+  uint32_t hihi = a_hi * b_hi;
+  uint32_t middle = hilo + lohi;
+  uint32_t middle_hi = middle >> 16;
+  uint32_t middle_lo = middle << 16;
+  uint32_t res_lo = lolo + middle_lo;
+  uint32_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  res_hi += (middle < hilo ? 0x10000 : 0);
+  uint64_t res = ((uint64_t) res_hi) << 32;
+  res += res_lo;
+  return res;
+}
+
+uint128_t mul128 (uint64_t a, uint64_t b)
+{
+  uint64_t a_lo = a & 0xFFFFFFFF;
+  uint64_t b_lo = b & 0xFFFFFFFF;
+  uint64_t a_hi = a >> 32;
+  uint64_t b_hi = b >> 32;
+  uint64_t lolo = a_lo * b_lo;
+  uint64_t lohi = a_lo * b_hi;
+  uint64_t hilo = a_hi * b_lo;
+  uint64_t hihi = a_hi * b_hi;
+  uint64_t middle = hilo + lohi;
+  uint64_t middle_hi = middle >> 32;
+  uint64_t middle_lo = middle << 32;
+  uint64_t res_lo = lolo + middle_lo;
+  uint64_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  res_hi += (middle < hilo ? 0x100000000 : 0);
+  uint128_t res = ((uint128_t) res_hi) << 64;
+  res += res_lo;
+  return res;
+}
+
+uint64_t mul64_perm (uint32_t a, uint32_t b)
+{
+  uint32_t a_lo = a & 0xFFFF;
+  uint32_t b_lo = b & 0xFFFF;
+  uint32_t a_hi = a >> 16;
+  uint32_t b_hi = b >> 16;
+  uint32_t lolo = a_lo * b_lo;
+  uint32_t lohi = a_lo * b_hi;
+  uint32_t hilo = a_hi * b_lo;
+  uint32_t hihi = a_hi * b_hi;
+  uint32_t middle = hilo + lohi;
+  uint32_t middle_hi = middle >> 16;
+  uint32_t middle_lo = middle << 16;
+  uint32_t res_lo = lolo + middle_lo;
+  uint32_t res_hi = hihi + middle_hi;
+  res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+  res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
+  uint64_t res = ((uint64_t) res_hi) << 32;
+  res += res_lo;
+  return res;
+}
+
+uint128_t mul128_perm (uint64_t a, uint64_t b)
+{
+  uint64_t a_lo = a & 0xFFFFFFFF;
+  uint64_t b_lo = b & 0xFFFFFFFF;
+  uint64_t a_hi = a >> 32;
+  uint64_t b_hi = b >> 32;
+  uint64_t lolo = a_lo * b_lo;
+  uint64_t lohi = a_lo * b_hi;
+  uint64_t hilo = a_hi * b_lo;
+  uint64_t hihi = a_hi * b_hi;
+  uint64_t middle = hilo + lohi;
+  uint64_t middle_hi = middle >> 32;
+  uint64_t middle_lo = middle << 32;
+  uint64_t res_lo = lolo + middle_lo;
+  uint64_t res_hi = hihi + middle_hi;
+  res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+  res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
+  uint128_t res = ((uint128_t) res_hi) << 64;
+  res += res_lo;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
new file mode 100644
index 000000000..cc6e5af25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple is required for proper overflow detection
+   in some cases.  */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+  uint128_t lo;
+  uint128_t hi;
+} uint256_t;
+
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
+{
+  uint32_t a_lo = a & 0xFFFF;
+  uint32_t b_lo = b & 0xFFFF;
+  uint32_t a_hi = a >> 16;
+  uint32_t b_hi = b >> 16;
+  uint32_t lolo = a_lo * b_lo;
+  uint32_t lohi = a_lo * b_hi;
+  uint32_t hilo = a_hi * b_lo;
+  uint32_t hihi = a_hi * b_hi;
+  uint32_t middle = hilo + lohi;
+  uint32_t middle_hi = middle >> 16;
+  uint32_t middle_lo = middle << 16;
+  uint32_t res_lo = lolo + middle_lo;
+  uint32_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  res_hi += (middle < hilo ? 0x10000 : 0);
+  uint64_t res = ((uint64_t) res_hi) << 32;
+  res += res_lo;
+  return res + lolo;
+}
+
+uint256_t mul256 (uint128_t a, uint128_t b)
+{
+  uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
+  uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
+  uint128_t a_hi = a >> 64;
+  uint128_t b_hi = b >> 64;
+  uint128_t lolo = a_lo * b_lo;
+  uint128_t lohi = a_lo * b_hi;
+  uint128_t hilo = a_hi * b_lo;
+  uint128_t hihi = a_hi * b_hi;
+  uint128_t middle = hilo + lohi;
+  uint128_t middle_hi = middle >> 64;
+  uint128_t middle_lo = middle << 64;
+  uint128_t res_lo = lolo + middle_lo;
+  uint128_t res_hi = hihi + middle_hi;
+  res_hi += (res_lo < middle_lo ? 1 : 0);
+  /* Workaround for the "constant too large" warning.  */
+  uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+  overflow_tmp <<= 64;
+  res_hi += overflow_tmp;
+  uint256_t res;
+  res.lo = res_lo;
+  res.hi = res_hi;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 716bf9e35..a81d7501c 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -182,6 +182,9 @@ static struct

   /* Number of divmod calls inserted.  */
   int divmod_calls_inserted;
+
+  /* Number of optimized double sized multiplications.  */
+  int double_sized_mul_optimized;
 } widen_mul_stats;

 /* The instance of "struct occurrence" representing the highest
@@ -3708,6 +3711,78 @@ convert_to_divmod (gassign *stmt)
   return true;
 }

+/* Pattern matcher for double sized multiplication defined in match.pd.  */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+  gimple *use_stmt, *complex_res_lo;
+  gimple_stmt_iterator insert_before;
+  imm_use_iterator use_iter;
+  tree match[4];  // arg0, arg1, res_hi, complex_res_lo
+  tree arg0, arg1, widen_mult, new_type, tmp;
+  tree lhs = gimple_assign_lhs (stmt);
+  location_t loc = UNKNOWN_LOCATION;
+  machine_mode mode;
+
+  if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+    return false;
+
+  new_type = build_nonstandard_integer_type (
+    TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+  mode = TYPE_MODE (new_type);
+
+  /* Early return if the double sized multiplication doesn't exist
+     on the target.  */
+  if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+      && !wider_optab_check_p (smul_optab, mode, 1))
+    return false;
+
+  /* Determine the point where the wide multiplication
+     should be inserted.  The complex low result is OK since it is required
+     by both the high and low part getters, thus it dominates both of them.  */
+  complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+  insert_before = gsi_for_stmt (complex_res_lo);
+  gsi_next (&insert_before);
+
+  /* Create the widen multiplication.  */
+  arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+  arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+  widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+				       MULT_EXPR, arg0, arg1);
+
+  /* Find the mult low part getter.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+    if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+      BREAK_FROM_IMM_USE_STMT (use_iter);
+
+  /* Create high and low (if needed) part extractors.  */
+  /* Low part.  */
+  if (use_stmt)
+    {
+      loc = gimple_location (use_stmt);
+      tmp = build_and_insert_cast (&insert_before, loc,
+				   TREE_TYPE (gimple_get_lhs (use_stmt)),
+				   widen_mult);
+      gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+					       NOP_EXPR, tmp);
+      gsi_replace (&insert_before, new_stmt, true);
+    }
+
+  /* High part.  */
+  loc = gimple_location (stmt);
+  tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+				RSHIFT_EXPR, widen_mult,
+				build_int_cst (new_type,
+					       TYPE_PRECISION (new_type) / 2));
+  tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+  gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+  gsi_replace (gsi, new_stmt, true);
+
+  widen_mul_stats.double_sized_mul_optimized++;
+  return true;
+}
+
 /* Find integer multiplications where the operands are extended from
    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
    where appropriate.  */
@@ -3801,6 +3876,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
 	  break;

 	case PLUS_EXPR:
+	  if (convert_double_size_mul (&gsi, stmt))
+	    break;
+	  __attribute__ ((fallthrough));
 	case MINUS_EXPR:
 	  if (!convert_plusminus_to_widen (&gsi, stmt, code))
 	    match_uaddsub_overflow (&gsi, stmt, code);
@@ -3892,6 +3970,8 @@ pass_optimize_widening_mul::execute (function *fun)
 			    widen_mul_stats.fmas_inserted);
   statistics_counter_event (fun, "divmod calls inserted",
 			    widen_mul_stats.divmod_calls_inserted);
+  statistics_counter_event (fun, "double sized mul optimized",
+			    widen_mul_stats.double_sized_mul_optimized);

   return cfg_changed ? TODO_cleanup_cfg : 0;
 }
--
2.33.0
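For reference, a sketch (not part of the patch) of what convert_double_size_mul replaces the matched sequence with: for mul64 above, the four partial products, the middle sum and both overflow corrections collapse into a single multiplication in the double sized type plus a shift for the high half.

#include <stdint.h>

uint64_t mul64_widened (uint32_t a, uint32_t b)
{
  /* One widening multiply replaces lolo/lohi/hilo/hihi and the carry fixups.  */
  uint64_t widen_mult = (uint64_t) a * b;
  uint64_t res_hi = widen_mult >> 32;       /* as res_hi in mul64 */
  uint32_t res_lo = (uint32_t) widen_mult;  /* as res_lo in mul64 */
  return ((uint64_t) res_hi << 32) + res_lo;
}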