[Sync] Sync patches from openeuler/gcc
This commit is contained in:
parent
de52087262
commit
1003614af1
1772
0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch
Normal file
1772
0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch
Normal file
File diff suppressed because it is too large
Load Diff
477
0155-Add-maxmin-and-uzp1-uzp2-combining.patch
Normal file
477
0155-Add-maxmin-and-uzp1-uzp2-combining.patch
Normal file
@ -0,0 +1,477 @@
|
||||
From 1e886b98ff7ffdac023dcee8645717f2849d2eb7 Mon Sep 17 00:00:00 2001
|
||||
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
|
||||
Date: Wed, 25 Oct 2023 18:12:28 +0300
|
||||
Subject: [PATCH 1/6] Add maxmin and uzp1/uzp2 combining
|
||||
|
||||
---
|
||||
gcc/config/aarch64/aarch64-simd.md | 339 +++++++++++++++++++++++++-
|
||||
gcc/config/aarch64/predicates.md | 19 ++
|
||||
gcc/testsuite/gcc.dg/combine-maxmin.c | 46 ++++
|
||||
3 files changed, 399 insertions(+), 5 deletions(-)
|
||||
create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
|
||||
|
||||
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
|
||||
index 6049adc3f..7f707de57 100644
|
||||
--- a/gcc/config/aarch64/aarch64-simd.md
|
||||
+++ b/gcc/config/aarch64/aarch64-simd.md
|
||||
@@ -1034,6 +1034,82 @@
|
||||
[(set_attr "type" "neon_shift_imm<q>")]
|
||||
)
|
||||
|
||||
+;; Simplify an extension followed by a truncation for the shift+neg operation.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
|
||||
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (truncate:V4HI
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
|
||||
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
|
||||
+ (truncate:V4HI
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (match_dup 1)
|
||||
+ (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
|
||||
+ (match_dup 2)))))]
|
||||
+ "TARGET_SIMD"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (ashiftrt:V8HI
|
||||
+ (neg:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand" "w"))
|
||||
+ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
|
||||
+ {
|
||||
+ /* Reduce the shift amount to fit the narrower element mode. */
|
||||
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
|
||||
+ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
|
||||
+ operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
|
||||
+ }
|
||||
+ [(set_attr "type" "multiple")]
|
||||
+)
|
||||
+
|
||||
+;; A helper definition that allows the combiner to use the previous pattern.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
|
||||
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (truncate:V4HI
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (match_operand:V4SI 1 "register_operand" "w"))
|
||||
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
|
||||
+ (truncate:V4HI
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (match_operand:V4SI 3 "register_operand" "w"))
|
||||
+ (match_dup 2)))))]
|
||||
+ "TARGET_SIMD"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (match_dup 1))
|
||||
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
|
||||
+ (set (match_operand:V4SI 3 "register_operand" "=w")
|
||||
+ (ashiftrt:V4SI
|
||||
+ (neg:V4SI
|
||||
+ (match_dup 3))
|
||||
+ (match_dup 2)))
|
||||
+ (set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (truncate:V4HI
|
||||
+ (match_dup 1))
|
||||
+ (truncate:V4HI
|
||||
+ (match_dup 3))))]
|
||||
+ ""
|
||||
+ [(set_attr "type" "multiple")]
|
||||
+)
|
||||
+
|
||||
(define_insn "*aarch64_simd_sra<mode>"
|
||||
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
|
||||
(plus:VDQ_I
|
||||
@@ -1459,6 +1535,78 @@
|
||||
[(set_attr "type" "neon_minmax<q>")]
|
||||
)
|
||||
|
||||
+;; Use sequential smax+smin to replace vector arithmetic operations like this:
|
||||
+;; a = ((x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x);
|
||||
+;; TODO: maybe extend to scalar operations.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_maxmin_arith<mode>"
|
||||
+ [(set (match_operand:VDQHSD 0 "register_operand" "=w")
|
||||
+ (xor:VDQHSD
|
||||
+ (and:VDQHSD
|
||||
+ (xor:VDQHSD
|
||||
+ (ashiftrt:VDQHSD
|
||||
+ (neg:VDQHSD
|
||||
+ (match_operand:VDQHSD 1 "register_operand"))
|
||||
+ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand"))
|
||||
+ (match_dup 1))
|
||||
+ (neg:VDQHSD
|
||||
+ (eq:VDQHSD
|
||||
+ (and:VDQHSD
|
||||
+ (match_dup 1)
|
||||
+ (match_operand:VDQHSD 3 "aarch64_bic_imm_for_maxmin"))
|
||||
+ (match_operand:VDQHSD 4 "aarch64_simd_or_scalar_imm_zero"))))
|
||||
+ (ashiftrt:VDQHSD
|
||||
+ (neg:VDQHSD
|
||||
+ (match_dup 1))
|
||||
+ (match_dup 2))))]
|
||||
+ "TARGET_SIMD && !reload_completed"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:VDQHSD 5 "register_operand" "w") (match_dup 3))
|
||||
+ (set (match_operand:VDQHSD 6 "register_operand" "w") (match_dup 4))
|
||||
+ (set (match_operand:VDQHSD 0 "register_operand" "=w")
|
||||
+ (smax:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")
|
||||
+ (match_operand:VDQHSD 6 "register_operand" "w")))
|
||||
+ (set (match_operand:VDQHSD 0 "register_operand" "=w")
|
||||
+ (smin:VDQHSD (match_operand:VDQHSD 0 "register_operand" "w")
|
||||
+ (match_operand:VDQHSD 5 "register_operand" "w")))]
|
||||
+ {
|
||||
+ if (can_create_pseudo_p ())
|
||||
+ {
|
||||
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[3], 0));
|
||||
+ operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
|
||||
+ ~val);
|
||||
+ operands[5] = gen_reg_rtx (<MODE>mode);
|
||||
+ operands[6] = gen_reg_rtx (<MODE>mode);
|
||||
+ }
|
||||
+ else
|
||||
+ FAIL;
|
||||
+ }
|
||||
+ [(set_attr "type" "neon_minmax<q>")]
|
||||
+)
|
||||
+
|
||||
+;; A helper definition that allows the combiner to use the previous pattern.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_maxmin_tmp<mode>"
|
||||
+ [(set (match_operand:VDQHSD 0 "register_operand" "=w")
|
||||
+ (ashiftrt:VDQHSD
|
||||
+ (neg:VDQHSD
|
||||
+ (match_operand:VDQHSD 1 "register_operand" "w"))
|
||||
+ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))]
|
||||
+ "TARGET_SIMD"
|
||||
+ "#"
|
||||
+ "&& reload_completed"
|
||||
+ [(set (match_operand:VDQHSD 0 "register_operand")
|
||||
+ (neg:VDQHSD
|
||||
+ (match_operand:VDQHSD 1 "register_operand" "w")))
|
||||
+ (set (match_dup 0)
|
||||
+ (ashiftrt:VDQHSD
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))]
|
||||
+ ""
|
||||
+ [(set_attr "type" "neon_minmax<q>")]
|
||||
+)
|
||||
+
|
||||
;; Pairwise FP Max/Min operations.
|
||||
(define_insn "aarch64_<maxmin_uns>p<mode>"
|
||||
[(set (match_operand:VHSDF 0 "register_operand" "=w")
|
||||
@@ -1599,7 +1747,8 @@
|
||||
DONE;
|
||||
})
|
||||
|
||||
-;; For quads.
|
||||
+;; For quads. Use UZP1 on the narrower type, which discards the high part of
|
||||
+;; each wide element.
|
||||
|
||||
(define_insn "vec_pack_trunc_<mode>"
|
||||
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
|
||||
@@ -1609,12 +1758,32 @@
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
if (BYTES_BIG_ENDIAN)
|
||||
- return "xtn\\t%0.<Vntype>, %2.<Vtype>\;xtn2\\t%0.<V2ntype>, %1.<Vtype>";
|
||||
+ return "uzp1\\t%0.<V2ntype>, %2.<V2ntype>, %1.<V2ntype>";
|
||||
else
|
||||
- return "xtn\\t%0.<Vntype>, %1.<Vtype>\;xtn2\\t%0.<V2ntype>, %2.<Vtype>";
|
||||
+ return "uzp1\\t%0.<V2ntype>, %1.<V2ntype>, %2.<V2ntype>";
|
||||
}
|
||||
- [(set_attr "type" "multiple")
|
||||
- (set_attr "length" "8")]
|
||||
+ [(set_attr "type" "neon_permute<q>")
|
||||
+ (set_attr "length" "4")]
|
||||
+)
|
||||
+
|
||||
+(define_insn "vec_pack_trunc_shifted_<mode>"
|
||||
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
|
||||
+ (vec_concat:<VNARROWQ2>
|
||||
+ (truncate:<VNARROWQ>
|
||||
+ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
|
||||
+ (match_operand:VQN 2 "half_size_operand" "w")))
|
||||
+ (truncate:<VNARROWQ>
|
||||
+ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
|
||||
+ (match_operand:VQN 4 "half_size_operand" "w")))))]
|
||||
+ "TARGET_SIMD"
|
||||
+ {
|
||||
+ if (BYTES_BIG_ENDIAN)
|
||||
+ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
|
||||
+ else
|
||||
+ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
|
||||
+ }
|
||||
+ [(set_attr "type" "neon_permute<q>")
|
||||
+ (set_attr "length" "4")]
|
||||
)
|
||||
|
||||
;; Widening operations.
|
||||
@@ -4852,6 +5021,166 @@
|
||||
[(set_attr "type" "neon_tst<q>")]
|
||||
)
|
||||
|
||||
+;; Simplify an extension followed by a truncation for the cmtst-like operation.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
|
||||
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (plus:V4HI
|
||||
+ (truncate:V4HI
|
||||
+ (eq:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
|
||||
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
|
||||
+ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
|
||||
+ (plus:V4HI
|
||||
+ (truncate:V4HI
|
||||
+ (eq:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (and:V8HI
|
||||
+ (match_dup 1)
|
||||
+ (match_dup 2))
|
||||
+ (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
|
||||
+ (match_dup 4)))
|
||||
+ (match_dup 5))))]
|
||||
+ "TARGET_SIMD && !reload_completed"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V8HI 6 "register_operand" "=w")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
|
||||
+ (set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (plus:V8HI
|
||||
+ (eq:V8HI
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand" "w")
|
||||
+ (match_dup 6))
|
||||
+ (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
|
||||
+ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
|
||||
+ {
|
||||
+ if (can_create_pseudo_p ())
|
||||
+ {
|
||||
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
|
||||
+ operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
|
||||
+ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
|
||||
+ operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
|
||||
+
|
||||
+ operands[6] = gen_reg_rtx (V8HImode);
|
||||
+ }
|
||||
+ else
|
||||
+ FAIL;
|
||||
+ }
|
||||
+ [(set_attr "type" "neon_tst_q")]
|
||||
+)
|
||||
+
|
||||
+;; Three helper definitions that allow the combiner to use the previous pattern.
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
|
||||
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
|
||||
+ (neg:V4SI
|
||||
+ (eq:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
|
||||
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
|
||||
+ "TARGET_SIMD && !reload_completed"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
|
||||
+ (set (match_operand:V4SI 0 "register_operand" "=w")
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (match_dup 5)
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
|
||||
+ (set (match_dup 0)
|
||||
+ (neg:V4SI
|
||||
+ (eq:V4SI
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
|
||||
+ {
|
||||
+ if (can_create_pseudo_p ())
|
||||
+ operands[5] = gen_reg_rtx (V8HImode);
|
||||
+ else
|
||||
+ FAIL;
|
||||
+ }
|
||||
+ [(set_attr "type" "multiple")]
|
||||
+)
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
|
||||
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
|
||||
+ (neg:V4SI
|
||||
+ (eq:V4SI
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
|
||||
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
|
||||
+ "TARGET_SIMD && !reload_completed"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
|
||||
+ (and:V8HI
|
||||
+ (match_operand:V8HI 1 "register_operand")
|
||||
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
|
||||
+ (set (match_operand:V4SI 0 "register_operand" "=w")
|
||||
+ (sign_extend:V4SI
|
||||
+ (vec_select:V4HI
|
||||
+ (match_dup 5)
|
||||
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
|
||||
+ (set (match_dup 0)
|
||||
+ (neg:V4SI
|
||||
+ (eq:V4SI
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
|
||||
+ {
|
||||
+ if (can_create_pseudo_p ())
|
||||
+ operands[5] = gen_reg_rtx (V8HImode);
|
||||
+ else
|
||||
+ FAIL;
|
||||
+ }
|
||||
+ [(set_attr "type" "multiple")]
|
||||
+)
|
||||
+
|
||||
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
|
||||
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (truncate:V4HI
|
||||
+ (not:V4SI
|
||||
+ (match_operand:V4SI 1 "register_operand" "w")))
|
||||
+ (truncate:V4HI
|
||||
+ (not:V4SI
|
||||
+ (match_operand:V4SI 2 "register_operand" "w")))))]
|
||||
+ "TARGET_SIMD"
|
||||
+ "#"
|
||||
+ "&& true"
|
||||
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
|
||||
+ (not:V4SI
|
||||
+ (match_dup 1)))
|
||||
+ (set (match_operand:V4SI 2 "register_operand" "=w")
|
||||
+ (not:V4SI
|
||||
+ (match_dup 2)))
|
||||
+ (set (match_operand:V8HI 0 "register_operand" "=w")
|
||||
+ (vec_concat:V8HI
|
||||
+ (truncate:V4HI
|
||||
+ (match_dup 1))
|
||||
+ (truncate:V4HI
|
||||
+ (match_dup 2))))]
|
||||
+ ""
|
||||
+ [(set_attr "type" "multiple")]
|
||||
+)
|
||||
+
|
||||
(define_insn_and_split "aarch64_cmtstdi"
|
||||
[(set (match_operand:DI 0 "register_operand" "=w,r")
|
||||
(neg:DI
|
||||
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
|
||||
index 1754b1eff..3cd83334b 100644
|
||||
--- a/gcc/config/aarch64/predicates.md
|
||||
+++ b/gcc/config/aarch64/predicates.md
|
||||
@@ -91,6 +91,25 @@
|
||||
(match_test "aarch64_simd_valid_immediate (op, NULL,
|
||||
AARCH64_CHECK_ORR)"))))
|
||||
|
||||
+(define_predicate "aarch64_bic_imm_for_maxmin"
|
||||
+ (match_code "const_vector")
|
||||
+{
|
||||
+ if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
|
||||
+ return false;
|
||||
+ op = unwrap_const_vec_duplicate (op);
|
||||
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
|
||||
+ return CONST_INT_P (op)
|
||||
+ && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
|
||||
+})
|
||||
+
|
||||
+(define_predicate "maxmin_arith_shift_operand"
|
||||
+ (match_code "const_vector")
|
||||
+{
|
||||
+ op = unwrap_const_vec_duplicate (op);
|
||||
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
|
||||
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
|
||||
+})
|
||||
+
|
||||
(define_predicate "aarch64_reg_or_bic_imm"
|
||||
(ior (match_operand 0 "register_operand")
|
||||
(and (match_code "const_vector")
|
||||
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
|
||||
new file mode 100755
|
||||
index 000000000..06bce7029
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
|
||||
@@ -0,0 +1,46 @@
|
||||
+/* { dg-do compile { target aarch64-*-* } } */
|
||||
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
|
||||
+
|
||||
+/* The test checks usage of smax/smin insns for clip evaluation and
|
||||
+ * uzp1/uzp2 insns for vector element narrowing. It's inspired by
|
||||
+ * sources of x264 codec. */
|
||||
+
|
||||
+typedef unsigned char uint8_t;
|
||||
+typedef long int intptr_t;
|
||||
+typedef signed short int int16_t;
|
||||
+
|
||||
+static __attribute__((always_inline)) inline uint8_t clip (int x )
|
||||
+{
|
||||
+ return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
|
||||
+}
|
||||
+
|
||||
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
|
||||
+ intptr_t stride, int width, int height, int16_t *buf)
|
||||
+{
|
||||
+ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
|
||||
+ for( int y = 0; y < height; y++ ) {
|
||||
+ for( int x = -2; x < width+3; x++ ) {
|
||||
+ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
|
||||
+ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
|
||||
+ dstv[x] = clip ( (v + 16) >> 5 );
|
||||
+ buf[x+2] = v + pad;
|
||||
+ }
|
||||
+ for( int x = 0; x < width; x++ )
|
||||
+ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
|
||||
+ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
|
||||
+ - 32*pad + 512) >> 10);
|
||||
+ for( int x = 0; x < width; x++ )
|
||||
+ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
|
||||
+ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
|
||||
+ + 16) >> 5);
|
||||
+ dsth += stride;
|
||||
+ dstv += stride;
|
||||
+ dstc += stride;
|
||||
+ src += stride;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
|
||||
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
|
||||
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
|
||||
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
|
||||
--
|
||||
2.33.0
|
||||
|
||||
2387
0156-add-icp-optimization.patch
Normal file
2387
0156-add-icp-optimization.patch
Normal file
File diff suppressed because it is too large
Load Diff
1241
0157-Add-split-complex-instructions-pass.patch
Normal file
1241
0157-Add-split-complex-instructions-pass.patch
Normal file
File diff suppressed because it is too large
Load Diff
2072
0158-Implement-IPA-prefetch-optimization.patch
Normal file
2072
0158-Implement-IPA-prefetch-optimization.patch
Normal file
File diff suppressed because it is too large
Load Diff
233
0159-Implement-AES-pattern-matching.patch
Normal file
233
0159-Implement-AES-pattern-matching.patch
Normal file
@ -0,0 +1,233 @@
|
||||
From 3a48cd1be0915a0fabbfb3a30bd9b67ccd5c65d3 Mon Sep 17 00:00:00 2001
|
||||
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
|
||||
Date: Tue, 12 Dec 2023 10:41:12 +0800
|
||||
Subject: [PATCH 6/6] Implement AES pattern matching
|
||||
|
||||
---
|
||||
gcc/Makefile.in | 1 +
|
||||
gcc/common.opt | 4 ++++
|
||||
gcc/config/aarch64/aarch64.c | 24 +++++++++++++++++++++
|
||||
gcc/doc/tm.texi | 29 +++++++++++++++++++++++++
|
||||
gcc/doc/tm.texi.in | 12 +++++++++++
|
||||
gcc/passes.def | 1 +
|
||||
gcc/target.def | 41 ++++++++++++++++++++++++++++++++++++
|
||||
gcc/timevar.def | 1 +
|
||||
gcc/tree-pass.h | 1 +
|
||||
9 files changed, 114 insertions(+)
|
||||
|
||||
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
|
||||
index 31bf2cde2..75b28722e 100644
|
||||
--- a/gcc/Makefile.in
|
||||
+++ b/gcc/Makefile.in
|
||||
@@ -1288,6 +1288,7 @@ OBJS = \
|
||||
cgraphunit.o \
|
||||
cgraphclones.o \
|
||||
combine.o \
|
||||
+ crypto-accel.o \
|
||||
combine-stack-adj.o \
|
||||
compare-elim.o \
|
||||
context.o \
|
||||
diff --git a/gcc/common.opt b/gcc/common.opt
|
||||
index 36b016253..eb995f701 100644
|
||||
--- a/gcc/common.opt
|
||||
+++ b/gcc/common.opt
|
||||
@@ -1069,6 +1069,10 @@ floop-crc
|
||||
Common Report Var(flag_loop_crc) Optimization
|
||||
Do the loop crc conversion.
|
||||
|
||||
+fcrypto-accel-aes
|
||||
+Common Report Var(flag_crypto_accel_aes) Init(0) Optimization
|
||||
+Perform crypto acceleration AES pattern matching.
|
||||
+
|
||||
fauto-inc-dec
|
||||
Common Report Var(flag_auto_inc_dec) Init(1) Optimization
|
||||
Generate auto-inc/dec instructions.
|
||||
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
|
||||
index ae9e0802b..75efbcb97 100644
|
||||
--- a/gcc/config/aarch64/aarch64.c
|
||||
+++ b/gcc/config/aarch64/aarch64.c
|
||||
@@ -23894,6 +23894,30 @@ is_aarch64_stp_insn (int icode)
|
||||
return false;
|
||||
}
|
||||
|
||||
+machine_mode
|
||||
+aarch64_get_v16qi_mode ()
|
||||
+{
|
||||
+ return V16QImode;
|
||||
+}
|
||||
+
|
||||
+#undef TARGET_GET_V16QI_MODE
|
||||
+#define TARGET_GET_V16QI_MODE aarch64_get_v16qi_mode
|
||||
+
|
||||
+#undef TARGET_GEN_REV32V16QI
|
||||
+#define TARGET_GEN_REV32V16QI gen_aarch64_rev32v16qi
|
||||
+
|
||||
+#undef TARGET_GEN_AESEV16QI
|
||||
+#define TARGET_GEN_AESEV16QI gen_aarch64_crypto_aesev16qi
|
||||
+
|
||||
+#undef TARGET_GEN_AESDV16QI
|
||||
+#define TARGET_GEN_AESDV16QI gen_aarch64_crypto_aesdv16qi
|
||||
+
|
||||
+#undef TARGET_GEN_AESMCV16QI
|
||||
+#define TARGET_GEN_AESMCV16QI gen_aarch64_crypto_aesmcv16qi
|
||||
+
|
||||
+#undef TARGET_GEN_AESIMCV16QI
|
||||
+#define TARGET_GEN_AESIMCV16QI gen_aarch64_crypto_aesimcv16qi
|
||||
+
|
||||
#undef TARGET_IS_LDP_INSN
|
||||
#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn
|
||||
|
||||
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
|
||||
index ac1d665c5..4a998aa76 100644
|
||||
--- a/gcc/doc/tm.texi
|
||||
+++ b/gcc/doc/tm.texi
|
||||
@@ -11870,6 +11870,35 @@ object files that are not referenced from @code{main} and uses export
|
||||
lists.
|
||||
@end defmac
|
||||
|
||||
+@deftypefn {Target Hook} machine_mode TARGET_GET_V16QI_MODE ()
|
||||
+This function get the 16 byte elements vector mode if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
+@deftypefn {Target Hook} rtx TARGET_GEN_REV32V16QI (rtx @var{dest}, rtx @var{src})
|
||||
+This function generate the byte reverse instruction
|
||||
+ of 16 byte elements vector if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
+@deftypefn {Target Hook} rtx TARGET_GEN_AESEV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
|
||||
+This function generate the AES encryption instruction
|
||||
+ of 16 byte elements vector if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
+@deftypefn {Target Hook} rtx TARGET_GEN_AESDV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
|
||||
+This function generate the AES decryption instruction
|
||||
+ of 16 byte elements vector if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
+@deftypefn {Target Hook} rtx TARGET_GEN_AESMCV16QI (rtx @var{dest}, rtx @var{src})
|
||||
+This function generate the AES mix columns instruction
|
||||
+ of 16 byte elements vector if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
+@deftypefn {Target Hook} rtx TARGET_GEN_AESIMCV16QI (rtx @var{dest}, rtx @var{src})
|
||||
+This function generate the AES inversed mix columns instruction
|
||||
+ of 16 byte elements vector if target supports this.
|
||||
+@end deftypefn
|
||||
+
|
||||
@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode})
|
||||
Return true if icode is corresponding to any of the LDP instruction types.
|
||||
@end deftypefn
|
||||
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
|
||||
index 0cd70dda4..f7094d8c2 100644
|
||||
--- a/gcc/doc/tm.texi.in
|
||||
+++ b/gcc/doc/tm.texi.in
|
||||
@@ -8010,6 +8010,18 @@ object files that are not referenced from @code{main} and uses export
|
||||
lists.
|
||||
@end defmac
|
||||
|
||||
+@hook TARGET_GET_V16QI_MODE
|
||||
+
|
||||
+@hook TARGET_GEN_REV32V16QI
|
||||
+
|
||||
+@hook TARGET_GEN_AESEV16QI
|
||||
+
|
||||
+@hook TARGET_GEN_AESDV16QI
|
||||
+
|
||||
+@hook TARGET_GEN_AESMCV16QI
|
||||
+
|
||||
+@hook TARGET_GEN_AESIMCV16QI
|
||||
+
|
||||
@hook TARGET_IS_LDP_INSN
|
||||
|
||||
@hook TARGET_IS_STP_INSN
|
||||
diff --git a/gcc/passes.def b/gcc/passes.def
|
||||
index ba13d897c..da5d71646 100644
|
||||
--- a/gcc/passes.def
|
||||
+++ b/gcc/passes.def
|
||||
@@ -448,6 +448,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
NEXT_PASS (pass_rtl_fwprop_addr);
|
||||
NEXT_PASS (pass_inc_dec);
|
||||
NEXT_PASS (pass_initialize_regs);
|
||||
+ NEXT_PASS (pass_crypto_accel);
|
||||
NEXT_PASS (pass_ud_rtl_dce);
|
||||
NEXT_PASS (pass_combine);
|
||||
NEXT_PASS (pass_if_after_combine);
|
||||
diff --git a/gcc/target.def b/gcc/target.def
|
||||
index 48c8a8234..b4dff78ea 100644
|
||||
--- a/gcc/target.def
|
||||
+++ b/gcc/target.def
|
||||
@@ -2727,6 +2727,47 @@ modes and they have different conditional execution capability, such as ARM.",
|
||||
bool, (void),
|
||||
default_have_conditional_execution)
|
||||
|
||||
+DEFHOOK
|
||||
+(get_v16qi_mode,
|
||||
+ "This function get the 16 byte elements vector mode if target supports this.",
|
||||
+ machine_mode, (),
|
||||
+ NULL)
|
||||
+
|
||||
+DEFHOOK
|
||||
+(gen_rev32v16qi,
|
||||
+ "This function generate the byte reverse instruction\n\
|
||||
+ of 16 byte elements vector if target supports this.",
|
||||
+ rtx, (rtx dest, rtx src),
|
||||
+ NULL)
|
||||
+
|
||||
+DEFHOOK
|
||||
+(gen_aesev16qi,
|
||||
+ "This function generate the AES encryption instruction\n\
|
||||
+ of 16 byte elements vector if target supports this.",
|
||||
+ rtx, (rtx dest, rtx src1, rtx src2),
|
||||
+ NULL)
|
||||
+
|
||||
+DEFHOOK
|
||||
+(gen_aesdv16qi,
|
||||
+ "This function generate the AES decryption instruction\n\
|
||||
+ of 16 byte elements vector if target supports this.",
|
||||
+ rtx, (rtx dest, rtx src1, rtx src2),
|
||||
+ NULL)
|
||||
+
|
||||
+DEFHOOK
|
||||
+(gen_aesmcv16qi,
|
||||
+ "This function generate the AES mix columns instruction\n\
|
||||
+ of 16 byte elements vector if target supports this.",
|
||||
+ rtx, (rtx dest, rtx src),
|
||||
+ NULL)
|
||||
+
|
||||
+DEFHOOK
|
||||
+(gen_aesimcv16qi,
|
||||
+ "This function generate the AES inversed mix columns instruction\n\
|
||||
+ of 16 byte elements vector if target supports this.",
|
||||
+ rtx, (rtx dest, rtx src),
|
||||
+ NULL)
|
||||
+
|
||||
DEFHOOK
|
||||
(is_ldp_insn,
|
||||
"Return true if icode is corresponding to any of the LDP instruction types.",
|
||||
diff --git a/gcc/timevar.def b/gcc/timevar.def
|
||||
index 24caf1b5d..9ca74dffe 100644
|
||||
--- a/gcc/timevar.def
|
||||
+++ b/gcc/timevar.def
|
||||
@@ -258,6 +258,7 @@ DEFTIMEVAR (TV_AUTO_INC_DEC , "auto inc dec")
|
||||
DEFTIMEVAR (TV_CSE2 , "CSE 2")
|
||||
DEFTIMEVAR (TV_BRANCH_PROB , "branch prediction")
|
||||
DEFTIMEVAR (TV_COMBINE , "combiner")
|
||||
+DEFTIMEVAR (TV_CRYPTO_ACCEL , "crypto accel")
|
||||
DEFTIMEVAR (TV_IFCVT , "if-conversion")
|
||||
DEFTIMEVAR (TV_MODE_SWITCH , "mode switching")
|
||||
DEFTIMEVAR (TV_SMS , "sms modulo scheduling")
|
||||
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
|
||||
index 232a3fdf6..29dc7e34b 100644
|
||||
--- a/gcc/tree-pass.h
|
||||
+++ b/gcc/tree-pass.h
|
||||
@@ -570,6 +570,7 @@ extern rtl_opt_pass *make_pass_cse2 (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_df_initialize_opt (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_df_initialize_no_opt (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_reginfo_init (gcc::context *ctxt);
|
||||
+extern rtl_opt_pass *make_pass_crypto_accel (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_inc_dec (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_stack_ptr_mod (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_initialize_regs (gcc::context *ctxt);
|
||||
--
|
||||
2.33.0
|
||||
|
||||
3746
0160-AES-Add-lost-files.patch
Normal file
3746
0160-AES-Add-lost-files.patch
Normal file
File diff suppressed because it is too large
Load Diff
22
gcc.spec
22
gcc.spec
@ -61,7 +61,7 @@
|
||||
Summary: Various compilers (C, C++, Objective-C, ...)
|
||||
Name: gcc
|
||||
Version: %{gcc_version}
|
||||
Release: 42
|
||||
Release: 43
|
||||
License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
|
||||
URL: https://gcc.gnu.org
|
||||
|
||||
@ -261,6 +261,13 @@ Patch150: 0150-Implement-propagation-of-permutations-in-fwprop.patch
|
||||
Patch151: 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
|
||||
Patch152: 0152-Add-LLC-Allocation-Pass.patch
|
||||
Patch153: 0153-LLC-add-extending-outer-loop.patch
|
||||
Patch154: 0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch
|
||||
Patch155: 0155-Add-maxmin-and-uzp1-uzp2-combining.patch
|
||||
Patch156: 0156-add-icp-optimization.patch
|
||||
Patch157: 0157-Add-split-complex-instructions-pass.patch
|
||||
Patch158: 0158-Implement-IPA-prefetch-optimization.patch
|
||||
Patch159: 0159-Implement-AES-pattern-matching.patch
|
||||
Patch160: 0160-AES-Add-lost-files.patch
|
||||
|
||||
%global gcc_target_platform %{_arch}-linux-gnu
|
||||
|
||||
@ -867,6 +874,13 @@ not stable, so plugins must be rebuilt any time GCC is updated.
|
||||
%patch151 -p1
|
||||
%patch152 -p1
|
||||
%patch153 -p1
|
||||
%patch154 -p1
|
||||
%patch155 -p1
|
||||
%patch156 -p1
|
||||
%patch157 -p1
|
||||
%patch158 -p1
|
||||
%patch159 -p1
|
||||
%patch160 -p1
|
||||
|
||||
%build
|
||||
|
||||
@ -2891,6 +2905,12 @@ end
|
||||
%doc rpm.doc/changelogs/libcc1/ChangeLog*
|
||||
|
||||
%changelog
|
||||
* Tue Dec 12 2023 Xiong Zhou <xiongzhou4@huawei.com> - 10.3.1-43
|
||||
- Type:Spec
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Sync patches from openeuler/gcc
|
||||
|
||||
* Tue Dec 12 2023 Shujian Zhao <zhaoshujian@huawei.com> - 10.3.1-42
|
||||
- Type:Spec
|
||||
- ID:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user