[Sync] Sync patch from openeuler/gcc

0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
0148-Introduce-RTL-ifcvt-enhancements.patch
0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
wangding16 2023-12-06 11:52:14 +08:00
parent 8f8eb20266
commit c396b7ffab
3 changed files with 935 additions and 0 deletions

0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch

@@ -0,0 +1,194 @@
From 80b7de670da46d8921118799904cba4a0753bb72 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Wed, 23 Aug 2023 15:03:00 +0300
Subject: [PATCH 09/13] add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.c | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6049adc3f..f4213fd62 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4719,6 +4719,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cbdde11b0..7a00a0817 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12659,6 +12659,21 @@ cost_minus:
return true;
}
+ /* Detect the aarch64_cmlt_as_arith instruction. Currently only this pattern
+ matches the condition. The costs of cmlt and sub instructions
+ are comparable, so we are not increasing the cost here. */
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+ && GET_CODE (op1) == AND)
+ {
+ rtx op0_subop0 = XEXP (op0, 0);
+ if (rtx_equal_p (op0_subop0, op1))
+ {
+ rtx lshrt_op = XEXP (op0_subop0, 0);
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
+ return true;
+ }
+ }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode, &int_mode)
&& aarch64_rtx_arith_op_extract_p (op1, int_mode))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index bb888461a..c42494036 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -273,6 +273,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Report Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0a7145281..d3be06c6f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1228,7 +1228,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 1754b1eff..de58562a7 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -47,6 +47,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It is inspired by the x264 codec sources. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
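
For context, the arithmetic sequence described in the aarch64-simd.md comment above sets
every 16-bit lane to all-ones when its sign bit is set, which is exactly what
"cmlt ..., #0" computes. A minimal C sketch of that per-lane equivalence
(illustration only, not part of the patch; the function names are hypothetical):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the SImode example from the pattern comment:
   A holds two 16-bit lanes.  */
static uint32_t cmlt_as_arith (uint32_t a)
{
  uint32_t sign_bits = (a >> 15) & 0x00010001u;  /* sign bit of each lane */
  return (sign_bits << 16) - sign_bits;          /* 0xFFFF per negative lane */
}

/* Reference: lane = all-ones if the lane's sign bit is set, else zero.  */
static uint32_t cmlt_reference (uint32_t a)
{
  uint32_t lo = (a & 0x00008000u) ? 0x0000FFFFu : 0u;
  uint32_t hi = (a & 0x80000000u) ? 0xFFFF0000u : 0u;
  return hi | lo;
}

int main (void)
{
  /* Spot-check the equivalence on a few representative values.  */
  uint32_t tests[] = { 0u, 0x00008000u, 0x80000000u, 0x80008000u, 0x7FFF7FFFu };
  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
    assert (cmlt_as_arith (tests[i]) == cmlt_reference (tests[i]));
  return 0;
}

The testcase gcc/testsuite/gcc.dg/combine-cmlt.c added by the patch exercises the
same idiom at -O3 -mcmlt-arith and expects a single cmlt instruction in the output.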

0148-Introduce-RTL-ifcvt-enhancements.patch

@@ -0,0 +1,502 @@
From df68d120a049049671e44f6cda51e96a9a82c613 Mon Sep 17 00:00:00 2001
From: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com>
Date: Mon, 28 Nov 2022 14:16:48 +0300
Subject: [PATCH 10/13] Introduce RTL ifcvt enhancements
The enhancements are controlled by --param=ifcvt-allow-complicated-cmps,
which allows ifcvt to deal with complicated cmps such as
  if (cmp)
    X = reg1
  else
    X = reg2 + reg3
and
  if (cmp)
    X = reg1 + reg3
    Y = reg2 + reg4
    Z = reg3
The parameter --param=ifcvt-allow-register-renaming=[0,1,2] allows ifcvt to
aggressively rename registers in basic blocks:
* 0: does not allow ifcvt to rename registers
* 1: allows ifcvt to rename registers in the then and else bbs
* 2: additionally allows renaming registers in the condition bb
---
gcc/ifcvt.c | 298 ++++++++++++++++++++++++++++++++++++++-----------
gcc/params.opt | 8 ++
2 files changed, 240 insertions(+), 66 deletions(-)
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 2452f231c..50a73a7ca 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1,5 +1,5 @@
/* If-conversion support.
- Copyright (C) 2000-2020 Free Software Foundation, Inc.
+ Copyright (C) 2000-2022 Free Software Foundation, Inc.
This file is part of GCC.
@@ -876,7 +876,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+ if (!param_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1743,8 +1745,9 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
/* Don't even try if the comparison operands are weird
except that the target supports cbranchcc4. */
- if (! general_operand (cmp_a, GET_MODE (cmp_a))
- || ! general_operand (cmp_b, GET_MODE (cmp_b)))
+ if (! param_ifcvt_allow_complicated_cmps
+ && (! general_operand (cmp_a, GET_MODE (cmp_a))
+ || ! general_operand (cmp_b, GET_MODE (cmp_b))))
{
if (!have_cbranchcc4
|| GET_MODE_CLASS (GET_MODE (cmp_a)) != MODE_CC
@@ -1915,19 +1918,6 @@ noce_try_cmove (struct noce_if_info *if_info)
return FALSE;
}
-/* Return true if X contains a conditional code mode rtx. */
-
-static bool
-contains_ccmode_rtx_p (rtx x)
-{
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, x, ALL)
- if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
- return true;
-
- return false;
-}
-
/* Helper for bb_valid_for_noce_process_p. Validate that
the rtx insn INSN is a single set that does not set
the conditional register CC and is in general valid for
@@ -1946,7 +1936,6 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
- || contains_ccmode_rtx_p (SET_DEST (sset))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1960,13 +1949,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
in this function. */
static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+ basic_block bb_b,
+ rtx to_rename,
+ bitmap conflict_regs)
{
rtx_insn *a_insn;
bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+ bitmap intersections = BITMAP_ALLOC (&reg_obstack);
df_ref def;
df_ref use;
+ rtx_insn *last_a = last_active_insn (bb_a, FALSE);
FOR_BB_INSNS (bb_a, a_insn)
{
@@ -1976,30 +1969,25 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_a = single_set (a_insn);
if (!sset_a)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
+ if (a_insn == last_a)
+ continue;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
if (!(to_rename && DF_REF_REG (def) == to_rename))
bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
rtx_insn *b_insn;
-
FOR_BB_INSNS (bb_b, b_insn)
{
if (!active_insn_p (b_insn))
continue;
-
rtx sset_b = single_set (b_insn);
if (!sset_b)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Make sure this is a REG and not some instance
of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2011,25 +1999,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
if (MEM_P (SET_DEST (sset_b)))
gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
else if (!REG_P (SET_DEST (sset_b)))
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
- /* If the insn uses a reg set in BB_A return false. */
+ /* If the insn uses a reg set in BB_A return false
+ or try to collect register list for renaming. */
FOR_EACH_INSN_USE (use, b_insn)
{
- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
{
- BITMAP_FREE (bba_sets);
- return false;
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto end_cmove_arith_check_and_fail;
+
+ /* Those regs should be renamed. We can't rename CC reg, but
+ possibly we can provide combined comparison in the future. */
+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+ goto end_cmove_arith_check_and_fail;
+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
}
}
-
}
BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
return true;
+
+end_cmove_arith_check_and_fail:
+ BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
+ return false;
}
/* Emit copies of all the active instructions in BB except the last.
@@ -2084,6 +2081,134 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
+/* This function tries to rename regs that intersect with considered bb. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+ bool success = true;
+ if (bitmap_empty_p (cond_rename_regs))
+ return true;
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+ rtx_insn* cmp_insn = if_info->cond_earliest;
+ /* A jump instruction as the condition is currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+ rtx_insn* before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+ FOR_EACH_INSN_USE (use, cmp_insn)
+ {
+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+ {
+ rtx use_reg = DF_REF_REG (use);
+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+ {
+ end_sequence ();
+ return false;
+ }
+ noce_emit_move_insn (tmp, use_reg);
+ }
+ }
+
+ emit_insn (PATTERN (copy_of_cmp));
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+ rtx_insn* insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+ if_info->cond = noce_get_condition (if_info->jump,
+ &copy_of_cmp,
+ if_info->then_else_reversed);
+ if_info->cond_earliest = copy_of_cmp;
+ if_info->rev_cond = NULL_RTX;
+
+ return success;
+}
+
+/* This function tries to rename regs that intersect with considered bb. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+ if (bitmap_empty_p (rename_regs))
+ return true;
+ rtx_insn* insn;
+ rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+ bool res = true;
+ start_sequence ();
+ FOR_BB_INSNS (test_bb, insn)
+ {
+ if (!active_insn_p (insn))
+ continue;
+ /* Only ssets are supported for now. */
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
+ rtx x = SET_DEST (sset);
+ if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x)))
+ continue;
+
+ machine_mode mode = GET_MODE (x);
+ rtx tmp = gen_reg_rtx (mode);
+ if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+ {
+ gcc_assert (insn != last_insn);
+ /* We can generate an additional move for such a case,
+ but it will increase register pressure.
+ For now just stop the transformation. */
+ rtx result_rtx = SET_DEST (single_set (last_insn));
+ if (REG_P (result_rtx) && (x != result_rtx))
+ {
+ res = false;
+ break;
+ }
+ if (!validate_replace_rtx (x, tmp, insn))
+ gcc_unreachable ();
+ noce_emit_move_insn (tmp,x);
+ }
+ set_used_flags (insn);
+ rtx_insn* rename_candidate;
+ for (rename_candidate = NEXT_INSN (insn);
+ rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
+ rename_candidate = NEXT_INSN (rename_candidate))
+ {
+ if (!reg_overlap_mentioned_p (x, rename_candidate))
+ continue;
+
+ int replace_res = TRUE;
+ if (rename_candidate == last_insn)
+ {
+ validate_replace_src_group (x, tmp, rename_candidate);
+ replace_res = apply_change_group ();
+ }
+ else
+ replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+ gcc_assert (replace_res);
+ set_used_flags (rename_candidate);
+
+ }
+ set_used_flags (x);
+ set_used_flags (tmp);
+
+ }
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
+ FOR_BB_INSNS (test_bb, insn)
+ df_insn_rescan (insn);
+ return res;
+}
+
/* Try more complex cases involving conditional_move. */
static int
@@ -2166,11 +2291,29 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
std::swap (then_bb, else_bb);
}
}
-
+ bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+ bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (then_bb && else_bb
- && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
- || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
- return FALSE;
+ && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
+ if_info->orig_x,
+ then_bb_rename_regs)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,
+ if_info->orig_x,
+ else_bb_rename_regs)))
+ {
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+ return FALSE;
+ }
+ bool prepass_renaming = true;
+ prepass_renaming &= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs);
+ prepass_renaming &= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs);
+
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+
+ if (!prepass_renaming)
+ return FALSE;
start_sequence ();
@@ -2178,7 +2321,6 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
came from the test block. The non-empty complex block that we will
emit might clobber the register used by B or A, so move it to a pseudo
first. */
-
rtx tmp_a = NULL_RTX;
rtx tmp_b = NULL_RTX;
@@ -3052,7 +3194,8 @@ noce_operand_ok (const_rtx op)
static bool
bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
- unsigned int *cost, bool *simple_p)
+ unsigned int *cost, bool *simple_p,
+ bitmap cond_rename_regs)
{
if (!test_bb)
return false;
@@ -3086,10 +3229,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *prev_last_insn = PREV_INSN (last_insn);
gcc_assert (prev_last_insn);
- /* For now, disallow setting x multiple times in test_bb. */
- if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+ if (REG_P (x)
+ && reg_set_between_p (x, first_insn, prev_last_insn)
+ && param_ifcvt_allow_register_renaming < 1)
return false;
-
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
/* The regs that are live out of test_bb. */
@@ -3099,25 +3242,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *insn;
FOR_BB_INSNS (test_bb, insn)
{
- if (insn != last_insn)
- {
- if (!active_insn_p (insn))
- continue;
+ if (insn == last_insn)
+ continue;
+ if (!active_insn_p (insn))
+ continue;
- if (!insn_valid_noce_process_p (insn, cc))
- goto free_bitmap_and_fail;
+ if (!insn_valid_noce_process_p (insn, cc))
+ goto free_bitmap_and_fail;
- rtx sset = single_set (insn);
- gcc_assert (sset);
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
- if (contains_mem_rtx_p (SET_SRC (sset))
- || !REG_P (SET_DEST (sset))
- || reg_overlap_mentioned_p (SET_DEST (sset), cond))
- goto free_bitmap_and_fail;
+ if (contains_mem_rtx_p (SET_SRC (sset))
+ || !REG_P (SET_DEST (sset)))
+ goto free_bitmap_and_fail;
- potential_cost += pattern_cost (sset, speed_p);
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+ if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+ {
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto free_bitmap_and_fail;
+ rtx sset_dest = SET_DEST (sset);
+ if (REG_P (sset_dest)
+ && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+ bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+ else
+ goto free_bitmap_and_fail;
}
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3475,14 +3628,27 @@ noce_process_if_block (struct noce_if_info *if_info)
bool speed_p = optimize_bb_for_speed_p (test_bb);
unsigned int then_cost = 0, else_cost = 0;
+ bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
- &if_info->then_simple))
- return false;
+ &if_info->then_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
if (else_bb
&& !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
- &if_info->else_simple))
+ &if_info->else_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+
+ if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
return false;
+ cond = if_info->cond;
+
+ BITMAP_FREE (cond_rename_regs);
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
@@ -5426,7 +5592,7 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
-
+ cleanup_cfg (CLEANUP_EXPENSIVE);
if (optimize == 1)
{
df_live_add_problem ();
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..345f9b3ff 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -574,6 +574,14 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+-param=ifcvt-allow-complicated-cmps=
+Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
-param=max-sched-extend-regions-iters=
Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
The maximum number of iterations through CFG to extend regions.
--
2.33.0
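
In C terms, the branch shape described in the commit message above looks roughly
like the following (illustration only; whether the branch is actually if-converted
depends on the target, the cost model and the new --param values):

/* One arm is a plain register copy, the other an arithmetic expression,
   mirroring the commit message.  This is the kind of candidate the pass may
   turn into a conditional-move sequence when
   --param=ifcvt-allow-complicated-cmps=1 and
   --param=ifcvt-allow-register-renaming=1 (or 2) are given.  */
long
pick (long cmp, long reg1, long reg2, long reg3)
{
  long x;
  if (cmp)
    x = reg1;
  else
    x = reg2 + reg3;
  return x;
}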

0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch

@@ -0,0 +1,239 @@
From f43bdfbdcfdeb425a0bd303f4787a13323fd2934 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Sep 2023 11:07:29 +0800
Subject: [PATCH 11/13] Add more flexible check for pointer aliasing during
vectorization
The check takes the minimum of the iteration count and the segment length, which
helps to speed up loops with a small number of iterations where only the tail can
be vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.c | 68 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.c | 14 +++-
5 files changed, 95 insertions(+), 26 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..7f335a94b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -964,6 +964,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use the minimum of the candidate segment lengths. Currently the minimum of the
+iteration count and the vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 2cb54def8..8c5f1048c 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -2071,31 +2071,14 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
- tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2112,9 +2095,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2221,6 +2201,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 771d20fbb..5903ce66a 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -208,12 +208,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e4466a4f3..1b8a03c9c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3498,6 +3498,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3598,6 +3599,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3606,8 +3609,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should remember scalar_loop_iters;
+ this helps to create a flexible aliasing check
+ for a small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (dr_info_a);
access_size_b = vect_vfa_access_size (dr_info_b);
@@ -3652,9 +3662,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
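
Conceptually, with --param=vect-alias-flexible-segment-len=1 the runtime alias test
built by create_waw_or_war_checks becomes the OR of two checks: the original one,
whose segment length is derived from the vectorization factor, and a second one,
whose segment length is derived from the scalar iteration count. A much-simplified
C model of that condition (the helper names are hypothetical, not GCC internals):

#include <stdint.h>

/* Treat two accesses as independent when their SEG_LEN-byte ranges do not
   overlap.  This is a rough stand-in for the real WAR/WAW condition.  */
int
no_overlap_p (uintptr_t addr_a, uintptr_t addr_b, uintptr_t seg_len)
{
  return addr_b >= addr_a + seg_len || addr_a >= addr_b + seg_len;
}

/* The patch ORs the VF-based check with an iteration-count-based check, so a
   loop with few iterations can still take the vectorized path even when the
   VF-based segment length is pessimistic.  */
int
alias_check_ok (uintptr_t addr_a, uintptr_t addr_b,
                uintptr_t seg_len_vf, uintptr_t seg_len_iters)
{
  return no_overlap_p (addr_a, addr_b, seg_len_vf)
         || no_overlap_p (addr_a, addr_b, seg_len_iters);
}

The new SVE testcase var_stride_flexible_segment_len_1.c checks this effect: it
expects a WAR check that multiplies by (VF-2)*4 rather than an overlap check that
multiplies by (257-1)*4.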