195 lines
6.6 KiB
Diff
195 lines
6.6 KiB
Diff
From 80b7de670da46d8921118799904cba4a0753bb72 Mon Sep 17 00:00:00 2001
|
|
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
|
|
Date: Wed, 23 Aug 2023 15:03:00 +0300
|
|
Subject: [PATCH 09/13] add insn defs and correct costs for cmlt generation
|
|
|
|
---
|
|
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
|
|
gcc/config/aarch64/aarch64.c | 15 +++++++++
|
|
gcc/config/aarch64/aarch64.opt | 4 +++
|
|
gcc/config/aarch64/iterators.md | 3 +-
|
|
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
|
|
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
|
|
6 files changed, 114 insertions(+), 1 deletion(-)
|
|
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
|
|
|
|
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
|
|
index 6049adc3f..f4213fd62 100644
|
|
--- a/gcc/config/aarch64/aarch64-simd.md
|
|
+++ b/gcc/config/aarch64/aarch64-simd.md
|
|
@@ -4719,6 +4719,54 @@
|
|
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
|
|
)
|
|
|
|
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
|
|
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
|
|
+;; TODO: maybe extend to scalar operations or other cm** instructions.
|
|
+
|
|
+(define_insn "*aarch64_cmlt_as_arith<mode>"
|
|
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
|
|
+ (minus:<V_INT_EQUIV>
|
|
+ (ashift:<V_INT_EQUIV>
|
|
+ (and:<V_INT_EQUIV>
|
|
+ (lshiftrt:<V_INT_EQUIV>
|
|
+ (match_operand:VDQHSD 1 "register_operand" "w")
|
|
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")) ;; shift count: half the element width minus one
|
|
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")) ;; mask: bit 0 and bit (half element width) set
|
|
+ (match_operand:VDQHSD 4 "half_size_operand")) ;; shift count: half the element width
|
|
+ (and:<V_INT_EQUIV>
|
|
+ (lshiftrt:<V_INT_EQUIV>
|
|
+ (match_dup 1)
|
|
+ (match_dup 2))
|
|
+ (match_dup 3))))] ;; subtrahend repeats the same shifted-and-masked value
|
|
+ "TARGET_SIMD && flag_cmlt_arith" ;; flag_cmlt_arith is the variable behind -mcmlt-arith
|
|
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0" ;; compare against #0 using the <V2ntype> register arrangement
|
|
+ [(set_attr "type" "neon_compare_zero")]
|
|
+)
|
|
+
|
|
+;; The helper definition that allows the combiner to use the previous pattern.
|
|
+
|
|
+(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
|
|
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
|
|
+ (and:<V_INT_EQUIV>
|
|
+ (lshiftrt:<V_INT_EQUIV>
|
|
+ (match_operand:VDQHSD 1 "register_operand" "w")
|
|
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
|
|
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))] ;; same shift-and-mask subexpression that the cmlt pattern uses twice
|
|
+ "TARGET_SIMD && flag_cmlt_arith"
|
|
+ "#" ;; "#" output template: the insn is meant to be split, not printed directly
|
|
+ "&& reload_completed" ;; leading "&&" adds this to the insn condition above
|
|
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand") ;; step 1: logical shift right into operand 0
|
|
+ (lshiftrt:<V_INT_EQUIV>
|
|
+ (match_operand:VDQHSD 1 "register_operand")
|
|
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
|
|
+ (set (match_dup 0) ;; step 2: AND operand 0 with the mask, in place
|
|
+ (and:<V_INT_EQUIV>
|
|
+ (match_dup 0)
|
|
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
|
|
+ "" ;; no preparation statements
|
|
+ [(set_attr "type" "neon_compare_zero")]
|
|
+)
|
|
+
|
|
(define_insn_and_split "aarch64_cm<optab>di"
|
|
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
|
|
(neg:DI
|
|
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
|
|
index cbdde11b0..7a00a0817 100644
|
|
--- a/gcc/config/aarch64/aarch64.c
|
|
+++ b/gcc/config/aarch64/aarch64.c
|
|
@@ -12659,6 +12659,21 @@ cost_minus:
|
|
return true;
|
|
}
|
|
|
|
+ /* Detect aarch64_cmlt_as_arith instruction. Currently only this pattern
|
|
+ matches the condition. The costs of cmlt and sub instructions
|
|
+ are comparable, so we are not increasing the cost here. */
|
|
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
|
|
+ && GET_CODE (op1) == AND)
|
|
+ {
|
|
+ rtx op0_subop0 = XEXP (op0, 0);
|
|
+ if (rtx_equal_p (op0_subop0, op1))
|
|
+ {
|
|
+ rtx lshrt_op = XEXP (op0_subop0, 0);
|
|
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+
|
|
/* Look for SUB (extended register). */
|
|
if (is_a <scalar_int_mode> (mode, &int_mode)
|
|
&& aarch64_rtx_arith_op_extract_p (op1, int_mode))
|
|
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
|
|
index bb888461a..c42494036 100644
|
|
--- a/gcc/config/aarch64/aarch64.opt
|
|
+++ b/gcc/config/aarch64/aarch64.opt
|
|
@@ -273,6 +273,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
|
|
This option is for use with fstack-protector-strong and not for use in
|
|
user-land code.
|
|
|
|
+mcmlt-arith
|
|
+Target Report Var(flag_cmlt_arith) Optimization Init(0)
|
|
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
|
|
+
|
|
TargetVariable
|
|
long aarch64_stack_protector_guard_offset = 0
|
|
|
|
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
|
|
index 0a7145281..d3be06c6f 100644
|
|
--- a/gcc/config/aarch64/iterators.md
|
|
+++ b/gcc/config/aarch64/iterators.md
|
|
@@ -1228,7 +1228,8 @@
|
|
(V2DI "2s")])
|
|
|
|
;; Register suffix narrowed modes for VQN.
|
|
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
|
|
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
|
|
+ (V8HI "16b") (V4SI "8h")
|
|
(V2DI "4s")])
|
|
|
|
;; Widened modes of vector modes.
|
|
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
|
|
index 1754b1eff..de58562a7 100644
|
|
--- a/gcc/config/aarch64/predicates.md
|
|
+++ b/gcc/config/aarch64/predicates.md
|
|
@@ -47,6 +47,31 @@
|
|
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
|
|
})
|
|
|
|
+(define_predicate "half_size_minus_one_operand"
|
|
+ (match_code "const_vector")
|
|
+{ /* Accept a constant vector whose unwrapped element is (element bits / 2) - 1. */
|
|
+ op = unwrap_const_vec_duplicate (op); /* presumably yields the repeated element of a duplicated vector; see rtl.h */
|
|
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; /* half the bit width of one vector element */
|
|
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1); /* e.g. 15 when elements are 32 bits wide */
|
|
+})
|
|
+
|
|
+(define_predicate "half_size_operand"
|
|
+ (match_code "const_vector")
|
|
+{ /* Accept a constant vector whose unwrapped element is element bits / 2. */
|
|
+ op = unwrap_const_vec_duplicate (op); /* presumably yields the repeated element of a duplicated vector; see rtl.h */
|
|
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; /* half the bit width of one vector element */
|
|
+ return CONST_INT_P (op) && (UINTVAL (op) == size); /* e.g. 16 when elements are 32 bits wide */
|
|
+})
|
|
+
|
|
+(define_predicate "cmlt_arith_mask_operand"
|
|
+ (match_code "const_vector")
|
|
+{ /* Accept a constant vector whose unwrapped element has only bit 0 and bit (element bits / 2) set. */
|
|
+ op = unwrap_const_vec_duplicate (op); /* presumably yields the repeated element of a duplicated vector; see rtl.h */
|
|
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; /* half the bit width of one vector element */
|
|
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1; /* e.g. 0x00010001 when elements are 32 bits wide */
|
|
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
|
|
+})
|
|
+
|
|
(define_predicate "subreg_lowpart_operator"
|
|
(ior (match_code "truncate")
|
|
(and (match_code "subreg")
|
|
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
|
|
new file mode 100755
|
|
index 000000000..b4c9a37ff
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
|
|
@@ -0,0 +1,20 @@
|
|
+/* { dg-do compile { target aarch64-*-* } } */
|
|
+/* { dg-options "-O3 -mcmlt-arith" } */
|
|
+
|
|
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
|
|
+ * in foo (). It's inspired by sources of x264 codec. */
|
|
+
|
|
+typedef unsigned short int uint16_t;
|
|
+typedef unsigned int uint32_t;
|
|
+
|
|
+void foo( uint32_t *a, uint32_t *b)
|
|
+{ /* Reads a[0..3] and writes b[0..3]; the final scan expects exactly one cmlt in the output. */
|
|
+ for (unsigned i = 0; i < 4; i++)
|
|
+ {
|
|
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1)) /* shift right by (bits in uint16_t) - 1 */
|
|
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1); /* keep bit 0 and bit (bits in uint16_t), then multiply by (uint16_t)-1; assuming a 16-bit unsigned short, x * 0xFFFF == (x << 16) - x */
|
|
+ b[i] = (a[i]+s)^s; /* NOTE(review): looks like a per-16-bit-half absolute value as in x264 - confirm */
|
|
+ }
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
|
|
--
|
|
2.33.0
|
|
|