From 1e886b98ff7ffdac023dcee8645717f2849d2eb7 Mon Sep 17 00:00:00 2001 From: Diachkov Ilia WX1215920 Date: Wed, 25 Oct 2023 18:12:28 +0300 Subject: [PATCH 1/6] Add maxmin and uzp1/uzp2 combining --- gcc/config/aarch64/aarch64-simd.md | 339 +++++++++++++++++++++++++- gcc/config/aarch64/predicates.md | 19 ++ gcc/testsuite/gcc.dg/combine-maxmin.c | 46 ++++ 3 files changed, 399 insertions(+), 5 deletions(-) create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 6049adc3f..7f707de57 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1034,6 +1034,82 @@ [(set_attr "type" "neon_shift_imm")] ) +;; Simplify the extension with following truncation for shift+neg operation. + +(define_insn_and_split "*aarch64_sshr_neg_v8hi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (truncate:V4HI + (ashiftrt:V4SI + (neg:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 3 "vect_par_cnst_lo_half")))) + (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) + (truncate:V4HI + (ashiftrt:V4SI + (neg:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (match_operand:V8HI 4 "vect_par_cnst_hi_half")))) + (match_dup 2)))))] + "TARGET_SIMD" + "#" + "&& true" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (ashiftrt:V8HI + (neg:V8HI + (match_operand:V8HI 1 "register_operand" "w")) + (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))] + { + /* Reduce the shift amount to smaller mode. */ + int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0)) + - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2); + operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val); + } + [(set_attr "type" "multiple")] +) + +;; The helper definition that allows combiner to use the previous pattern. + +(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (truncate:V4HI + (ashiftrt:V4SI + (neg:V4SI + (match_operand:V4SI 1 "register_operand" "w")) + (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) + (truncate:V4HI + (ashiftrt:V4SI + (neg:V4SI + (match_operand:V4SI 3 "register_operand" "w")) + (match_dup 2)))))] + "TARGET_SIMD" + "#" + "&& true" + [(set (match_operand:V4SI 1 "register_operand" "=w") + (ashiftrt:V4SI + (neg:V4SI + (match_dup 1)) + (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) + (set (match_operand:V4SI 3 "register_operand" "=w") + (ashiftrt:V4SI + (neg:V4SI + (match_dup 3)) + (match_dup 2))) + (set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (truncate:V4HI + (match_dup 1)) + (truncate:V4HI + (match_dup 3))))] + "" + [(set_attr "type" "multiple")] +) + (define_insn "*aarch64_simd_sra" [(set (match_operand:VDQ_I 0 "register_operand" "=w") (plus:VDQ_I @@ -1459,6 +1535,78 @@ [(set_attr "type" "neon_minmax")] ) +;; Use sequential smax+smin to replace vector arithmetic operations like this: +;; a = ((x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x); +;; TODO: maybe extend to scalar operations. 
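+;; As a rough sketch (assuming 16-bit lanes clipped to the 8-bit range
+;; [0, 255]; register numbers are illustrative, and the constant vectors
+;; may be materialised by e.g. MOVI), the split below turns the ternary
+;; above into a plain clamp:
+;;   movi  v1.8h, #0            // lower clip bound
+;;   movi  v2.8h, #0xff         // upper clip bound
+;;   smax  v0.8h, v0.8h, v1.8h
+;;   smin  v0.8h, v0.8h, v2.8h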
+ +(define_insn_and_split "*aarch64_maxmin_arith" + [(set (match_operand:VDQHSD 0 "register_operand" "=w") + (xor:VDQHSD + (and:VDQHSD + (xor:VDQHSD + (ashiftrt:VDQHSD + (neg:VDQHSD + (match_operand:VDQHSD 1 "register_operand")) + (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")) + (match_dup 1)) + (neg:VDQHSD + (eq:VDQHSD + (and:VDQHSD + (match_dup 1) + (match_operand:VDQHSD 3 "aarch64_bic_imm_for_maxmin")) + (match_operand:VDQHSD 4 "aarch64_simd_or_scalar_imm_zero")))) + (ashiftrt:VDQHSD + (neg:VDQHSD + (match_dup 1)) + (match_dup 2))))] + "TARGET_SIMD && !reload_completed" + "#" + "&& true" + [(set (match_operand:VDQHSD 5 "register_operand" "w") (match_dup 3)) + (set (match_operand:VDQHSD 6 "register_operand" "w") (match_dup 4)) + (set (match_operand:VDQHSD 0 "register_operand" "=w") + (smax:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w") + (match_operand:VDQHSD 6 "register_operand" "w"))) + (set (match_operand:VDQHSD 0 "register_operand" "=w") + (smin:VDQHSD (match_operand:VDQHSD 0 "register_operand" "w") + (match_operand:VDQHSD 5 "register_operand" "w")))] + { + if (can_create_pseudo_p ()) + { + int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[3], 0)); + operands[3] = aarch64_simd_gen_const_vector_dup (mode, + ~val); + operands[5] = gen_reg_rtx (mode); + operands[6] = gen_reg_rtx (mode); + } + else + FAIL; + } + [(set_attr "type" "neon_minmax")] +) + +;; The helper definition that allows combiner to use the previous pattern. + +(define_insn_and_split "*aarch64_maxmin_tmp" + [(set (match_operand:VDQHSD 0 "register_operand" "=w") + (ashiftrt:VDQHSD + (neg:VDQHSD + (match_operand:VDQHSD 1 "register_operand" "w")) + (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] + "TARGET_SIMD" + "#" + "&& reload_completed" + [(set (match_operand:VDQHSD 0 "register_operand") + (neg:VDQHSD + (match_operand:VDQHSD 1 "register_operand" "w"))) + (set (match_dup 0) + (ashiftrt:VDQHSD + (match_dup 0) + (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] + "" + [(set_attr "type" "neon_minmax")] +) + ;; Pairwise FP Max/Min operations. (define_insn "aarch64_p" [(set (match_operand:VHSDF 0 "register_operand" "=w") @@ -1599,7 +1747,8 @@ DONE; }) -;; For quads. +;; For quads. Use UZP1 on the narrower type, which discards the high part of +;; each wide element. (define_insn "vec_pack_trunc_" [(set (match_operand: 0 "register_operand" "=&w") @@ -1609,12 +1758,32 @@ "TARGET_SIMD" { if (BYTES_BIG_ENDIAN) - return "xtn\\t%0., %2.\;xtn2\\t%0., %1."; + return "uzp1\\t%0., %2., %1."; else - return "xtn\\t%0., %1.\;xtn2\\t%0., %2."; + return "uzp1\\t%0., %1., %2."; } - [(set_attr "type" "multiple") - (set_attr "length" "8")] + [(set_attr "type" "neon_permute") + (set_attr "length" "4")] +) + +(define_insn "vec_pack_trunc_shifted_" + [(set (match_operand: 0 "register_operand" "=&w") + (vec_concat: + (truncate: + (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "half_size_operand" "w"))) + (truncate: + (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w") + (match_operand:VQN 4 "half_size_operand" "w")))))] + "TARGET_SIMD" + { + if (BYTES_BIG_ENDIAN) + return "uzp2\\t%0., %3., %1."; + else + return "uzp2\\t%0., %1., %3."; + } + [(set_attr "type" "neon_permute") + (set_attr "length" "4")] ) ;; Widening operations. @@ -4852,6 +5021,166 @@ [(set_attr "type" "neon_tst")] ) +;; Simplify the extension with following truncation for cmtst-like operation. 
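+;; Roughly, instead of sign-extending each half to .4s, comparing against
+;; zero and narrowing the results back together, the combined form below
+;; is meant to be selected as a single test-bits compare on the full
+;; vector (register numbers are illustrative):
+;;   cmtst  v0.8h, v1.8h, v2.8h   // v2 holds the constant mask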
+ +(define_insn_and_split "*aarch64_cmtst_arith_v8hi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (plus:V4HI + (truncate:V4HI + (eq:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (and:V8HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) + (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) + (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))) + (match_operand:V4HI 5 "aarch64_simd_imm_minus_one")) + (plus:V4HI + (truncate:V4HI + (eq:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (and:V8HI + (match_dup 1) + (match_dup 2)) + (match_operand:V8HI 6 "vect_par_cnst_hi_half"))) + (match_dup 4))) + (match_dup 5))))] + "TARGET_SIMD && !reload_completed" + "#" + "&& true" + [(set (match_operand:V8HI 6 "register_operand" "=w") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) + (set (match_operand:V8HI 0 "register_operand" "=w") + (plus:V8HI + (eq:V8HI + (and:V8HI + (match_operand:V8HI 1 "register_operand" "w") + (match_dup 6)) + (match_operand:V8HI 4 "aarch64_simd_imm_zero")) + (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))] + { + if (can_create_pseudo_p ()) + { + int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0)); + operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val); + int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0)); + operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2); + + operands[6] = gen_reg_rtx (V8HImode); + } + else + FAIL; + } + [(set_attr "type" "neon_tst_q")] +) + +;; Three helper definitions that allow combiner to use the previous pattern. + +(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (neg:V4SI + (eq:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (and:V8HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) + (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) + (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] + "TARGET_SIMD && !reload_completed" + "#" + "&& true" + [(set (match_operand:V8HI 5 "register_operand" "=w") + (and:V8HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))) + (set (match_operand:V4SI 0 "register_operand" "=w") + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 5) + (match_operand:V8HI 3 "vect_par_cnst_lo_half")))) + (set (match_dup 0) + (neg:V4SI + (eq:V4SI + (match_dup 0) + (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] + { + if (can_create_pseudo_p ()) + operands[5] = gen_reg_rtx (V8HImode); + else + FAIL; + } + [(set_attr "type" "multiple")] +) + +(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (neg:V4SI + (eq:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (and:V8HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) + (match_operand:V8HI 3 "vect_par_cnst_hi_half"))) + (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] + "TARGET_SIMD && !reload_completed" + "#" + "&& true" + [(set (match_operand:V8HI 5 "register_operand" "=w") + (and:V8HI + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))) + (set (match_operand:V4SI 0 "register_operand" "=w") + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 5) + (match_operand:V8HI 3 "vect_par_cnst_hi_half")))) + (set (match_dup 0) + (neg:V4SI + (eq:V4SI + (match_dup 0) + (match_operand:V4SI 4 
"aarch64_simd_or_scalar_imm_zero"))))] + { + if (can_create_pseudo_p ()) + operands[5] = gen_reg_rtx (V8HImode); + else + FAIL; + } + [(set_attr "type" "multiple")] +) + +(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (truncate:V4HI + (not:V4SI + (match_operand:V4SI 1 "register_operand" "w"))) + (truncate:V4HI + (not:V4SI + (match_operand:V4SI 2 "register_operand" "w")))))] + "TARGET_SIMD" + "#" + "&& true" + [(set (match_operand:V4SI 1 "register_operand" "=w") + (not:V4SI + (match_dup 1))) + (set (match_operand:V4SI 2 "register_operand" "=w") + (not:V4SI + (match_dup 2))) + (set (match_operand:V8HI 0 "register_operand" "=w") + (vec_concat:V8HI + (truncate:V4HI + (match_dup 1)) + (truncate:V4HI + (match_dup 2))))] + "" + [(set_attr "type" "multiple")] +) + (define_insn_and_split "aarch64_cmtstdi" [(set (match_operand:DI 0 "register_operand" "=w,r") (neg:DI diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 1754b1eff..3cd83334b 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -91,6 +91,25 @@ (match_test "aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_ORR)")))) +(define_predicate "aarch64_bic_imm_for_maxmin" + (match_code "const_vector") +{ + if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC)) + return false; + op = unwrap_const_vec_duplicate (op); + unsigned int size = GET_MODE_UNIT_BITSIZE (mode); + return CONST_INT_P (op) + && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1)); +}) + +(define_predicate "maxmin_arith_shift_operand" + (match_code "const_vector") +{ + op = unwrap_const_vec_duplicate (op); + unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1; + return CONST_INT_P (op) && (UINTVAL (op) == size); +}) + (define_predicate "aarch64_reg_or_bic_imm" (ior (match_operand 0 "register_operand") (and (match_code "const_vector") diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c new file mode 100755 index 000000000..06bce7029 --- /dev/null +++ b/gcc/testsuite/gcc.dg/combine-maxmin.c @@ -0,0 +1,46 @@ +/* { dg-do compile { target aarch64-*-* } } */ +/* { dg-options "-O3 -fdump-rtl-combine-all" } */ + +/* The test checks usage of smax/smin insns for clip evaluation and + * uzp1/uzp2 insns for vector element narrowing. It's inspired by + * sources of x264 codec. */ + +typedef unsigned char uint8_t; +typedef long int intptr_t; +typedef signed short int int16_t; + +static __attribute__((always_inline)) inline uint8_t clip (int x ) +{ + return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x ); +} + +void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + intptr_t stride, int width, int height, int16_t *buf) +{ + const int pad = (8 > 9) ? 
(-10 * ((1 << 8)-1)) : 0; + for( int y = 0; y < height; y++ ) { + for( int x = -2; x < width+3; x++ ) { + int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride] + + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride])); + dstv[x] = clip ( (v + 16) >> 5 ); + buf[x+2] = v + pad; + } + for( int x = 0; x < width; x++ ) + dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1] + + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1])) + - 32*pad + 512) >> 10); + for( int x = 0; x < width; x++ ) + dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1] + + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1])) + + 16) >> 5); + dsth += stride; + dstv += stride; + dstc += stride; + src += stride; + } +} + +/* { dg-final { scan-assembler-times {smax\t} 4 } } */ +/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */ +/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */ -- 2.33.0