gcc/0172-minmax-Move-minmax-pattern-to-gimple.patch
2023-12-22 17:41:25 +08:00

From df88d29c355c59e262397fdf3b22ee9099ce40c2 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 19 Dec 2023 12:19:14 +0300
Subject: [PATCH 1/5] [minmax] Move minmax pattern to gimple.
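
Replace the aarch64 combiner patterns for saturating clipping with generic
match.pd rules that fold the idiom to MIN/MAX on GIMPLE.  The rewrite is
gated by the new -fconvert-minmax option.

A minimal sketch of the clipping idiom the rules target and the form it is
folded into (the function names below are illustrative, not part of this
patch):

  #include <stdint.h>

  /* Saturating clip of x to [0, 255]: if any bit above the low 8 is set,
     the result is 0 for negative x and 255 for large positive x.  */
  int32_t clip (int32_t x)
  {
    return (x & ~((1 << 8) - 1)) ? ((-x) >> 31) & ((1 << 8) - 1) : x;
  }

  /* With -fconvert-minmax the conditional form above is rewritten to the
     equivalent of MIN (MAX (x, 0), 255).  */
  int32_t clip_minmax (int32_t x)
  {
    int32_t t = x > 0 ? x : 0;    /* max (x, 0)   */
    return t < 255 ? t : 255;     /* min (t, 255) */
  }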
---
gcc/common.opt | 4 +
gcc/config/aarch64/aarch64-simd.md | 72 ----------------
gcc/match.pd | 104 ++++++++++++++++++++++++
gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++
gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++
gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++--
6 files changed, 151 insertions(+), 77 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c
diff --git a/gcc/common.opt b/gcc/common.opt
index a8a2264ee..73234dcc3 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1750,6 +1750,10 @@ fif-conversion-gimple
Common Report Var(flag_if_conversion_gimple) Optimization
Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+fconvert-minmax
+Common Report Var(flag_convert_minmax) Optimization
+Convert saturating clipping sequences to min/max operations.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c7503561f..754343abc 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1535,78 +1535,6 @@
[(set_attr "type" "neon_minmax<q>")]
)
-;; Use sequential smax+smin to replace vector arithmetic operations like this:
-;; a = ((x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x);
-;; TODO: maybe extend to scalar operations.
-
-(define_insn_and_split "*aarch64_maxmin_arith<mode>"
- [(set (match_operand:VDQHSD 0 "register_operand" "=w")
- (xor:VDQHSD
- (and:VDQHSD
- (xor:VDQHSD
- (ashiftrt:VDQHSD
- (neg:VDQHSD
- (match_operand:VDQHSD 1 "register_operand"))
- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand"))
- (match_dup 1))
- (neg:VDQHSD
- (eq:VDQHSD
- (and:VDQHSD
- (match_dup 1)
- (match_operand:VDQHSD 3 "aarch64_bic_imm_for_maxmin"))
- (match_operand:VDQHSD 4 "aarch64_simd_or_scalar_imm_zero"))))
- (ashiftrt:VDQHSD
- (neg:VDQHSD
- (match_dup 1))
- (match_dup 2))))]
- "TARGET_SIMD && !reload_completed"
- "#"
- "&& true"
- [(set (match_operand:VDQHSD 5 "register_operand" "w") (match_dup 3))
- (set (match_operand:VDQHSD 6 "register_operand" "w") (match_dup 4))
- (set (match_operand:VDQHSD 0 "register_operand" "=w")
- (smax:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")
- (match_operand:VDQHSD 6 "register_operand" "w")))
- (set (match_operand:VDQHSD 0 "register_operand" "=w")
- (smin:VDQHSD (match_operand:VDQHSD 0 "register_operand" "w")
- (match_operand:VDQHSD 5 "register_operand" "w")))]
- {
- if (can_create_pseudo_p ())
- {
- int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[3], 0));
- operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
- ~val);
- operands[5] = gen_reg_rtx (<MODE>mode);
- operands[6] = gen_reg_rtx (<MODE>mode);
- }
- else
- FAIL;
- }
- [(set_attr "type" "neon_minmax<q>")]
-)
-
-;; The helper definition that allows combiner to use the previous pattern.
-
-(define_insn_and_split "*aarch64_maxmin_tmp<mode>"
- [(set (match_operand:VDQHSD 0 "register_operand" "=w")
- (ashiftrt:VDQHSD
- (neg:VDQHSD
- (match_operand:VDQHSD 1 "register_operand" "w"))
- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))]
- "TARGET_SIMD"
- "#"
- "&& reload_completed"
- [(set (match_operand:VDQHSD 0 "register_operand")
- (neg:VDQHSD
- (match_operand:VDQHSD 1 "register_operand" "w")))
- (set (match_dup 0)
- (ashiftrt:VDQHSD
- (match_dup 0)
- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))]
- ""
- [(set_attr "type" "neon_minmax<q>")]
-)
-
;; Pairwise FP Max/Min operations.
(define_insn "aarch64_<maxmin_uns>p<mode>"
[(set (match_operand:VHSDF 0 "register_operand" "=w")
diff --git a/gcc/match.pd b/gcc/match.pd
index 24ae157af..1097cd926 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6595,3 +6595,107 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(plus:c@4 (op2:c @0 @1)
(plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
(if (single_use (@4) && single_use (@5)))))
+
+/* MinMax pattern matching helpers. More info on the transformation below. */
+
+/* Match (a & 0b11..100..0) pattern. */
+(match (minmax_cmp_arg @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+
+/* Match the ((-x) >> sign_bit_pos) pattern, i.e. the sign bit of -x
+ broadcast to all bits.
+ This statement blocks the transformation for unsigned integers.
+ Do the type check here to avoid unnecessary duplication. */
+(match (minmax_sat_arg @0)
+ (rshift (negate @0) INTEGER_CST@1)
+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
+
+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
+ The matched pattern can be described as a saturating clip.
+
+ The pattern supports truncation via both casts and bit_and.
+ There are also variants for the inverted condition. */
+(if (flag_convert_minmax)
+/* Truncation via casts. Unfortunately convert? cannot be applied here
+ because convert and cond take different numbers of arguments. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0))))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ /* Truncation via bit_and with a mask. The same concern about convert? applies here. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; }))))))
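
For reference, a sketch of the two truncation shapes the duplicated rules in
match.pd are written to recognize (illustrative only; the helper functions
below are not part of the patch).  In the first form the narrowing
conversion truncates the saturated value, in the second an explicit mask
does; both are folded to the equivalent of min (max (x, 0), 255) when
-fconvert-minmax is enabled:

  #include <stdint.h>

  /* Truncation via a cast: the narrowing conversion truncates (-x) >> 31.  */
  static uint8_t clip_via_cast (int32_t x)
  {
    return (x & ~255) ? (-x) >> 31 : x;
  }

  /* Truncation via bit_and: the mask truncates the saturated value.  */
  static int32_t clip_via_mask (int32_t x)
  {
    return (x & ~255) ? ((-x) >> 31) & 255 : x;
  }
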
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
new file mode 100644
index 000000000..859ff7df8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int32_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 4; i++)
+ a[i] = ((((-x[i]) >> 31) ^ x[i])
+ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
+}
+
+/* { dg-final { scan-assembler-not {smax\t} } } */
+/* { dg-final { scan-assembler-not {smin\t} } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
new file mode 100644
index 000000000..63d4d85b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int8_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 8; i++)
+ a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index 06bce7029..a984fa560 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -1,5 +1,5 @@
/* { dg-do compile { target aarch64-*-* } } */
-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+/* { dg-options "-O3 -fconvert-minmax" } */
/* The test checks usage of smax/smin insns for clip evaluation and
* uzp1/uzp2 insns for vector element narrowing. It's inspired by
@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
{
const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
for( int y = 0; y < height; y++ ) {
+ /* This loop is currently not vectorized. */
for( int x = -2; x < width+3; x++ ) {
int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
dstv[x] = clip ( (v + 16) >> 5 );
buf[x+2] = v + pad;
}
+
+ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
- 32*pad + 512) >> 10);
+
+ /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ 16) >> 5);
+
dsth += stride;
dstv += stride;
dstc += stride;
@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
}
}
-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
+/* The max is taken against 0 on signed values, so match smax exactly. */
+/* { dg-final { scan-assembler-times {smax\t} 6 } } */
+/* The min is taken between a non-negative value and the mask, so the sign of min doesn't matter. */
+/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
+/* All of the vectorized patterns are expected to be matched. */
+/* { dg-final { scan-assembler-not {cmtst\t} } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
--
2.33.0