185 lines
6.5 KiB
Diff
185 lines
6.5 KiB
Diff
From 6975b6e27ea6231d41c24afc51c77e0576b28b14 Mon Sep 17 00:00:00 2001
|
|
From: Marc Glisse <marc.glisse@inria.fr>
|
|
Date: Wed, 5 Aug 2020 16:45:33 +0200
|
|
Subject: [PATCH 15/26] VEC_COND_EXPR optimizations
|
|
|
|
When vector comparisons were forced to use vec_cond_expr, we lost a number of optimizations (my fault for not adding enough testcases to
|
|
prevent that). This patch tries to unwrap vec_cond_expr a bit so some optimizations can still happen.
|
|
|
|
I wasn't planning to add all those transformations together, but adding one caused a regression, whose fix introduced a second regression,
|
|
etc.
|
|
|
|
Restricting to constant folding would not be sufficient; we also need at least things like X|0 or X&X. The transformations are quite
|
|
conservative with :s and folding only if everything simplifies; we may want to relax this later. And of course we are going to miss things
|
|
like a?b:c + a?c:b -> b+c.
|
|
|
|
In terms of number of operations, some transformations turning 2 VEC_COND_EXPR into VEC_COND_EXPR + BIT_IOR_EXPR + BIT_NOT_EXPR might not look
|
|
like a gain... I expect the bit_not disappears in most cases, and VEC_COND_EXPR looks more costly than a simpler BIT_IOR_EXPR.
|
|
|
|
2020-08-05 Marc Glisse <marc.glisse@inria.fr>
|
|
|
|
PR tree-optimization/95906
|
|
PR target/70314
|
|
* match.pd ((c ? a : b) op d, (c ? a : b) op (c ? d : e),
|
|
(v ? w : 0) ? a : b, c1 ? c2 ? a : b : b): New transformations.
|
|
(op (c ? a : b)): Update to match the new transformations.
|
|
|
|
* gcc.dg/tree-ssa/andnot-2.c: New file.
|
|
* gcc.dg/tree-ssa/pr95906.c: Likewise.
|
|
* gcc.target/i386/pr70314.c: Likewise.
|
|
|
|
Reference commit in GCC: 229752afe3156a3990dacaedb94c76846cebf132
|
|
Signed-off-by: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
|
|
---
|
|
gcc/match.pd | 70 ++++++++++++++++++++----
|
|
gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c | 10 ++++
|
|
gcc/testsuite/gcc.dg/tree-ssa/pr95906.c | 13 +++++
|
|
gcc/testsuite/gcc.target/i386/pr70314.c | 15 +++++
|
|
4 files changed, 96 insertions(+), 12 deletions(-)
|
|
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c
|
|
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
|
|
create mode 100644 gcc/testsuite/gcc.target/i386/pr70314.c
|
|
|
|
diff --git a/gcc/match.pd b/gcc/match.pd
|
|
index 01f81b063..25575af21 100644
|
|
--- a/gcc/match.pd
|
|
+++ b/gcc/match.pd
|
|
@@ -3319,20 +3319,66 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
|
|
(if (integer_zerop (@0))
|
|
@2)))
|
|
|
|
-/* Sink unary operations to constant branches, but only if we do fold it to
|
|
- constants. */
|
|
+#if GIMPLE
|
|
+/* Sink unary operations to branches, but only if we do fold both. */
|
|
(for op (negate bit_not abs absu)
|
|
(simplify
|
|
- (op (vec_cond @0 VECTOR_CST@1 VECTOR_CST@2))
|
|
- (with
|
|
- {
|
|
- tree cst1, cst2;
|
|
- cst1 = const_unop (op, type, @1);
|
|
- if (cst1)
|
|
- cst2 = const_unop (op, type, @2);
|
|
- }
|
|
- (if (cst1 && cst2)
|
|
- (vec_cond @0 { cst1; } { cst2; })))))
|
|
+ (op (vec_cond:s @0 @1 @2))
|
|
+ (vec_cond @0 (op! @1) (op! @2))))
|
|
+
|
|
+/* Sink binary operation to branches, but only if we can fold it. */
|
|
+(for op (tcc_comparison plus minus mult bit_and bit_ior bit_xor
|
|
+ rdiv trunc_div ceil_div floor_div round_div
|
|
+ trunc_mod ceil_mod floor_mod round_mod min max)
|
|
+/* (c ? a : b) op (c ? d : e) --> c ? (a op d) : (b op e) */
|
|
+ (simplify
|
|
+ (op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
|
|
+ (vec_cond @0 (op! @1 @3) (op! @2 @4)))
|
|
+
|
|
+/* (c ? a : b) op d --> c ? (a op d) : (b op d) */
|
|
+ (simplify
|
|
+ (op (vec_cond:s @0 @1 @2) @3)
|
|
+ (vec_cond @0 (op! @1 @3) (op! @2 @3)))
|
|
+ (simplify
|
|
+ (op @3 (vec_cond:s @0 @1 @2))
|
|
+ (vec_cond @0 (op! @3 @1) (op! @3 @2))))
|
|
+#endif
|
|
+
|
|
+/* (v ? w : 0) ? a : b is just (v & w) ? a : b */
|
|
+(simplify
|
|
+ (vec_cond (vec_cond:s @0 @3 integer_zerop) @1 @2)
|
|
+ (if (types_match (@0, @3))
|
|
+ (vec_cond (bit_and @0 @3) @1 @2)))
|
|
+(simplify
|
|
+ (vec_cond (vec_cond:s @0 integer_all_onesp @3) @1 @2)
|
|
+ (if (types_match (@0, @3))
|
|
+ (vec_cond (bit_ior @0 @3) @1 @2)))
|
|
+(simplify
|
|
+ (vec_cond (vec_cond:s @0 integer_zerop @3) @1 @2)
|
|
+ (if (types_match (@0, @3))
|
|
+ (vec_cond (bit_ior @0 (bit_not @3)) @2 @1)))
|
|
+(simplify
|
|
+ (vec_cond (vec_cond:s @0 @3 integer_all_onesp) @1 @2)
|
|
+ (if (types_match (@0, @3))
|
|
+ (vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
|
|
+
|
|
+/* c1 ? c2 ? a : b : b --> (c1 & c2) ? a : b */
|
|
+(simplify
|
|
+ (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
|
|
+ (if (types_match (@0, @1))
|
|
+ (vec_cond (bit_and @0 @1) @2 @3)))
|
|
+(simplify
|
|
+ (vec_cond @0 @2 (vec_cond:s @1 @2 @3))
|
|
+ (if (types_match (@0, @1))
|
|
+ (vec_cond (bit_ior @0 @1) @2 @3)))
|
|
+(simplify
|
|
+ (vec_cond @0 (vec_cond:s @1 @2 @3) @2)
|
|
+ (if (types_match (@0, @1))
|
|
+ (vec_cond (bit_ior (bit_not @0) @1) @2 @3)))
|
|
+(simplify
|
|
+ (vec_cond @0 @3 (vec_cond:s @1 @2 @3))
|
|
+ (if (types_match (@0, @1))
|
|
+ (vec_cond (bit_and (bit_not @0) @1) @2 @3)))
|
|
|
|
/* A few simplifications of "a ? CST1 : CST2". */
|
|
/* NOTE: Only do this on gimple as the if-chain-to-switch
|
|
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c b/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c
|
|
new file mode 100644
|
|
index 000000000..e0955ce3f
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c
|
|
@@ -0,0 +1,10 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
|
|
+
|
|
+typedef long vec __attribute__((vector_size(16)));
|
|
+vec f(vec x){
|
|
+ vec y = x < 10;
|
|
+ return y & (y == 0);
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */
|
|
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
|
|
new file mode 100644
|
|
index 000000000..3d820a58e
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
|
|
@@ -0,0 +1,13 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
|
|
+
|
|
+// FIXME: this should further optimize to a MAX_EXPR
|
|
+typedef signed char v16i8 __attribute__((vector_size(16)));
|
|
+v16i8 f(v16i8 a, v16i8 b)
|
|
+{
|
|
+ v16i8 cmp = (a > b);
|
|
+ return (cmp & a) | (~cmp & b);
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-tree-dump-not "bit_(and|ior)_expr" "forwprop3" } } */
|
|
+/* { dg-final { scan-tree-dump-times "vec_cond_expr" 1 "forwprop3" } } */
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr70314.c b/gcc/testsuite/gcc.target/i386/pr70314.c
|
|
new file mode 100644
|
|
index 000000000..aad8dd9b5
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr70314.c
|
|
@@ -0,0 +1,15 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-march=skylake-avx512 -O2" } */
|
|
+/* { dg-final { scan-assembler-times "cmp" 2 } } */
|
|
+/* { dg-final { scan-assembler-not "and" } } */
|
|
+
|
|
+typedef long vec __attribute__((vector_size(16)));
|
|
+vec f(vec x, vec y){
|
|
+ return (x < 5) & (y < 8);
|
|
+}
|
|
+
|
|
+/* On x86_64, currently
|
|
+ vpcmpq $2, .LC1(%rip), %xmm1, %k1
|
|
+ vpcmpq $2, .LC0(%rip), %xmm0, %k0{%k1}
|
|
+ vpmovm2q %k0, %xmm0
|
|
+*/
|
|
--
|
|
2.27.0
|
|
|