[Sync] Sync patch from openeuler/gcc

0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
0148-Introduce-RTL-ifcvt-enhancements.patch
0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
wangding16 2023-12-06 11:52:14 +08:00
parent 8f8eb20266
commit c396b7ffab
3 changed files with 935 additions and 0 deletions

0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch

@@ -0,0 +1,194 @@
From 80b7de670da46d8921118799904cba4a0753bb72 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Wed, 23 Aug 2023 15:03:00 +0300
Subject: [PATCH 09/13] add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.c | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6049adc3f..f4213fd62 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4719,6 +4719,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cbdde11b0..7a00a0817 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12659,6 +12659,21 @@ cost_minus:
return true;
}
+ /* Detect the aarch64_cmlt_as_arith instruction. Currently only this pattern
+ matches the condition. The costs of cmlt and sub instructions
+ are comparable, so we are not increasing the cost here. */
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+ && GET_CODE (op1) == AND)
+ {
+ rtx op0_subop0 = XEXP (op0, 0);
+ if (rtx_equal_p (op0_subop0, op1))
+ {
+ rtx lshrt_op = XEXP (op0_subop0, 0);
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
+ return true;
+ }
+ }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode, &int_mode)
&& aarch64_rtx_arith_op_extract_p (op1, int_mode))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index bb888461a..c42494036 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -273,6 +273,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Report Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0a7145281..d3be06c6f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1228,7 +1228,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 1754b1eff..de58562a7 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -47,6 +47,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It is inspired by the x264 codec sources. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
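
For context, the arithmetic sequence described in the aarch64-simd.md comment above sets
every 16-bit lane to all-ones when its sign bit is set, which is exactly what
"cmlt ..., #0" computes. A minimal C sketch of that per-lane equivalence
(illustration only, not part of the patch; the function names are hypothetical):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the SImode example from the pattern comment:
   A holds two 16-bit lanes.  */
static uint32_t cmlt_as_arith (uint32_t a)
{
  uint32_t sign_bits = (a >> 15) & 0x00010001u;  /* sign bit of each lane */
  return (sign_bits << 16) - sign_bits;          /* 0xFFFF per negative lane */
}

/* Reference: lane = all-ones if the lane's sign bit is set, else zero.  */
static uint32_t cmlt_reference (uint32_t a)
{
  uint32_t lo = (a & 0x00008000u) ? 0x0000FFFFu : 0u;
  uint32_t hi = (a & 0x80000000u) ? 0xFFFF0000u : 0u;
  return hi | lo;
}

int main (void)
{
  /* Spot-check the equivalence on a few representative values.  */
  uint32_t tests[] = { 0u, 0x00008000u, 0x80000000u, 0x80008000u, 0x7FFF7FFFu };
  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
    assert (cmlt_as_arith (tests[i]) == cmlt_reference (tests[i]));
  return 0;
}

The testcase gcc/testsuite/gcc.dg/combine-cmlt.c added by the patch exercises the
same idiom at -O3 -mcmlt-arith and expects a single cmlt instruction in the output.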

0148-Introduce-RTL-ifcvt-enhancements.patch

@@ -0,0 +1,502 @@
From df68d120a049049671e44f6cda51e96a9a82c613 Mon Sep 17 00:00:00 2001
From: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com>
Date: Mon, 28 Nov 2022 14:16:48 +0300
Subject: [PATCH 10/13] Introduce RTL ifcvt enhancements
The enhancements are controlled by --param=ifcvt-allow-complicated-cmps,
which allows ifcvt to deal with complicated cmps such as
  if (cmp)
    X = reg1
  else
    X = reg2 + reg3
and
  if (cmp)
    X = reg1 + reg3
    Y = reg2 + reg4
    Z = reg3
The parameter --param=ifcvt-allow-register-renaming=[0,1,2] allows ifcvt to
aggressively rename registers in basic blocks:
* 0: does not allow ifcvt to rename registers
* 1: allows ifcvt to rename registers in the then and else bbs
* 2: additionally allows renaming registers in the condition bb
---
gcc/ifcvt.c | 298 ++++++++++++++++++++++++++++++++++++++-----------
gcc/params.opt | 8 ++
2 files changed, 240 insertions(+), 66 deletions(-)
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 2452f231c..50a73a7ca 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1,5 +1,5 @@
/* If-conversion support.
- Copyright (C) 2000-2020 Free Software Foundation, Inc.
+ Copyright (C) 2000-2022 Free Software Foundation, Inc.
This file is part of GCC.
@@ -876,7 +876,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+ if (!param_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1743,8 +1745,9 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
/* Don't even try if the comparison operands are weird
except that the target supports cbranchcc4. */
- if (! general_operand (cmp_a, GET_MODE (cmp_a))
- || ! general_operand (cmp_b, GET_MODE (cmp_b)))
+ if (! param_ifcvt_allow_complicated_cmps
+ && (! general_operand (cmp_a, GET_MODE (cmp_a))
+ || ! general_operand (cmp_b, GET_MODE (cmp_b))))
{
if (!have_cbranchcc4
|| GET_MODE_CLASS (GET_MODE (cmp_a)) != MODE_CC
@@ -1915,19 +1918,6 @@ noce_try_cmove (struct noce_if_info *if_info)
return FALSE;
}
-/* Return true if X contains a conditional code mode rtx. */
-
-static bool
-contains_ccmode_rtx_p (rtx x)
-{
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, x, ALL)
- if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
- return true;
-
- return false;
-}
-
/* Helper for bb_valid_for_noce_process_p. Validate that
the rtx insn INSN is a single set that does not set
the conditional register CC and is in general valid for
@@ -1946,7 +1936,6 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
- || contains_ccmode_rtx_p (SET_DEST (sset))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1960,13 +1949,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
in this function. */
static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+ basic_block bb_b,
+ rtx to_rename,
+ bitmap conflict_regs)
{
rtx_insn *a_insn;
bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+ bitmap intersections = BITMAP_ALLOC (&reg_obstack);
df_ref def;
df_ref use;
+ rtx_insn *last_a = last_active_insn (bb_a, FALSE);
FOR_BB_INSNS (bb_a, a_insn)
{
@@ -1976,30 +1969,25 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_a = single_set (a_insn);
if (!sset_a)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
+ if (a_insn == last_a)
+ continue;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
if (!(to_rename && DF_REF_REG (def) == to_rename))
bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
rtx_insn *b_insn;
-
FOR_BB_INSNS (bb_b, b_insn)
{
if (!active_insn_p (b_insn))
continue;
-
rtx sset_b = single_set (b_insn);
if (!sset_b)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Make sure this is a REG and not some instance
of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2011,25 +1999,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
if (MEM_P (SET_DEST (sset_b)))
gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
else if (!REG_P (SET_DEST (sset_b)))
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
- /* If the insn uses a reg set in BB_A return false. */
+ /* If the insn uses a reg set in BB_A return false
+ or try to collect register list for renaming. */
FOR_EACH_INSN_USE (use, b_insn)
{
- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
{
- BITMAP_FREE (bba_sets);
- return false;
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto end_cmove_arith_check_and_fail;
+
+ /* Those regs should be renamed. We can't rename CC reg, but
+ possibly we can provide combined comparison in the future. */
+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+ goto end_cmove_arith_check_and_fail;
+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
}
}
-
}
BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
return true;
+
+end_cmove_arith_check_and_fail:
+ BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
+ return false;
}
/* Emit copies of all the active instructions in BB except the last.
@@ -2084,6 +2081,134 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
+/* This function tries to rename regs that intersect with considered bb. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+ bool success = true;
+ if (bitmap_empty_p (cond_rename_regs))
+ return true;
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+ rtx_insn* cmp_insn = if_info->cond_earliest;
+ /* A jump instruction as the condition is currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+ rtx_insn* before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+ FOR_EACH_INSN_USE (use, cmp_insn)
+ {
+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+ {
+ rtx use_reg = DF_REF_REG (use);
+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+ {
+ end_sequence ();
+ return false;
+ }
+ noce_emit_move_insn (tmp, use_reg);
+ }
+ }
+
+ emit_insn (PATTERN (copy_of_cmp));
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+ rtx_insn* insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+ if_info->cond = noce_get_condition (if_info->jump,
+ &copy_of_cmp,
+ if_info->then_else_reversed);
+ if_info->cond_earliest = copy_of_cmp;
+ if_info->rev_cond = NULL_RTX;
+
+ return success;
+}
+
+/* This function tries to rename regs that intersect with considered bb. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+ if (bitmap_empty_p (rename_regs))
+ return true;
+ rtx_insn* insn;
+ rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+ bool res = true;
+ start_sequence ();
+ FOR_BB_INSNS (test_bb, insn)
+ {
+ if (!active_insn_p (insn))
+ continue;
+ /* Only ssets are supported for now. */
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
+ rtx x = SET_DEST (sset);
+ if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x)))
+ continue;
+
+ machine_mode mode = GET_MODE (x);
+ rtx tmp = gen_reg_rtx (mode);
+ if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+ {
+ gcc_assert (insn != last_insn);
+ /* We can generate an additional move for such a case,
+ but it will increase register pressure.
+ For now just stop the transformation. */
+ rtx result_rtx = SET_DEST (single_set (last_insn));
+ if (REG_P (result_rtx) && (x != result_rtx))
+ {
+ res = false;
+ break;
+ }
+ if (!validate_replace_rtx (x, tmp, insn))
+ gcc_unreachable ();
+ noce_emit_move_insn (tmp,x);
+ }
+ set_used_flags (insn);
+ rtx_insn* rename_candidate;
+ for (rename_candidate = NEXT_INSN (insn);
+ rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
+ rename_candidate = NEXT_INSN (rename_candidate))
+ {
+ if (!reg_overlap_mentioned_p (x, rename_candidate))
+ continue;
+
+ int replace_res = TRUE;
+ if (rename_candidate == last_insn)
+ {
+ validate_replace_src_group (x, tmp, rename_candidate);
+ replace_res = apply_change_group ();
+ }
+ else
+ replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+ gcc_assert (replace_res);
+ set_used_flags (rename_candidate);
+
+ }
+ set_used_flags (x);
+ set_used_flags (tmp);
+
+ }
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
+ FOR_BB_INSNS (test_bb, insn)
+ df_insn_rescan (insn);
+ return res;
+}
+
/* Try more complex cases involving conditional_move. */
static int
@@ -2166,11 +2291,29 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
std::swap (then_bb, else_bb);
}
}
-
+ bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+ bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (then_bb && else_bb
- && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
- || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
- return FALSE;
+ && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
+ if_info->orig_x,
+ then_bb_rename_regs)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,
+ if_info->orig_x,
+ else_bb_rename_regs)))
+ {
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+ return FALSE;
+ }
+ bool prepass_renaming = true;
+ prepass_renaming &= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs);
+ prepass_renaming &= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs);
+
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+
+ if (!prepass_renaming)
+ return FALSE;
start_sequence ();
@@ -2178,7 +2321,6 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
came from the test block. The non-empty complex block that we will
emit might clobber the register used by B or A, so move it to a pseudo
first. */
-
rtx tmp_a = NULL_RTX;
rtx tmp_b = NULL_RTX;
@@ -3052,7 +3194,8 @@ noce_operand_ok (const_rtx op)
static bool
bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
- unsigned int *cost, bool *simple_p)
+ unsigned int *cost, bool *simple_p,
+ bitmap cond_rename_regs)
{
if (!test_bb)
return false;
@@ -3086,10 +3229,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *prev_last_insn = PREV_INSN (last_insn);
gcc_assert (prev_last_insn);
- /* For now, disallow setting x multiple times in test_bb. */
- if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+ if (REG_P (x)
+ && reg_set_between_p (x, first_insn, prev_last_insn)
+ && param_ifcvt_allow_register_renaming < 1)
return false;
-
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
/* The regs that are live out of test_bb. */
@@ -3099,25 +3242,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *insn;
FOR_BB_INSNS (test_bb, insn)
{
- if (insn != last_insn)
- {
- if (!active_insn_p (insn))
- continue;
+ if (insn == last_insn)
+ continue;
+ if (!active_insn_p (insn))
+ continue;
- if (!insn_valid_noce_process_p (insn, cc))
- goto free_bitmap_and_fail;
+ if (!insn_valid_noce_process_p (insn, cc))
+ goto free_bitmap_and_fail;
- rtx sset = single_set (insn);
- gcc_assert (sset);
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
- if (contains_mem_rtx_p (SET_SRC (sset))
- || !REG_P (SET_DEST (sset))
- || reg_overlap_mentioned_p (SET_DEST (sset), cond))
- goto free_bitmap_and_fail;
+ if (contains_mem_rtx_p (SET_SRC (sset))
+ || !REG_P (SET_DEST (sset)))
+ goto free_bitmap_and_fail;
- potential_cost += pattern_cost (sset, speed_p);
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+ if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+ {
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto free_bitmap_and_fail;
+ rtx sset_dest = SET_DEST (sset);
+ if (REG_P (sset_dest)
+ && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+ bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+ else
+ goto free_bitmap_and_fail;
}
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3475,14 +3628,27 @@ noce_process_if_block (struct noce_if_info *if_info)
bool speed_p = optimize_bb_for_speed_p (test_bb);
unsigned int then_cost = 0, else_cost = 0;
+ bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
- &if_info->then_simple))
- return false;
+ &if_info->then_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
if (else_bb
&& !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
- &if_info->else_simple))
+ &if_info->else_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+
+ if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
return false;
+ cond = if_info->cond;
+
+ BITMAP_FREE (cond_rename_regs);
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
@@ -5426,7 +5592,7 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
-
+ cleanup_cfg (CLEANUP_EXPENSIVE);
if (optimize == 1)
{
df_live_add_problem ();
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..345f9b3ff 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -574,6 +574,14 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+-param=ifcvt-allow-complicated-cmps=
+Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
-param=max-sched-extend-regions-iters=
Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
The maximum number of iterations through CFG to extend regions.
--
2.33.0
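
In C terms, the branch shape described in the commit message above looks roughly
like the following (illustration only; whether the branch is actually if-converted
depends on the target, the cost model and the new --param values):

/* One arm is a plain register copy, the other an arithmetic expression,
   mirroring the commit message.  This is the kind of candidate the pass may
   turn into a conditional-move sequence when
   --param=ifcvt-allow-complicated-cmps=1 and
   --param=ifcvt-allow-register-renaming=1 (or 2) are given.  */
long
pick (long cmp, long reg1, long reg2, long reg3)
{
  long x;
  if (cmp)
    x = reg1;
  else
    x = reg2 + reg3;
  return x;
}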

0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch

@@ -0,0 +1,239 @@
From f43bdfbdcfdeb425a0bd303f4787a13323fd2934 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Sep 2023 11:07:29 +0800
Subject: [PATCH 11/13] Add more flexible check for pointer aliasing during
vectorization
The check takes the minimum of the iteration count and the segment length, which
helps to speed up loops with a small number of iterations where only the tail can
be vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.c | 68 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.c | 14 +++-
5 files changed, 95 insertions(+), 26 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..7f335a94b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -964,6 +964,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use the minimum of the candidate segment lengths. Currently the minimum of the
+iteration count and the vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 2cb54def8..8c5f1048c 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -2071,31 +2071,14 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
- tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2112,9 +2095,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2221,6 +2201,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 771d20fbb..5903ce66a 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -208,12 +208,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e4466a4f3..1b8a03c9c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3498,6 +3498,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3598,6 +3599,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3606,8 +3609,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should remember scalar_loop_iters;
+ this helps to create a flexible aliasing check
+ for a small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (dr_info_a);
access_size_b = vect_vfa_access_size (dr_info_b);
@@ -3652,9 +3662,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
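
Conceptually, with --param=vect-alias-flexible-segment-len=1 the runtime alias test
built by create_waw_or_war_checks becomes the OR of two checks: the original one,
whose segment length is derived from the vectorization factor, and a second one,
whose segment length is derived from the scalar iteration count. A much-simplified
C model of that condition (the helper names are hypothetical, not GCC internals):

#include <stdint.h>

/* Treat two accesses as independent when their SEG_LEN-byte ranges do not
   overlap.  This is a rough stand-in for the real WAR/WAW condition.  */
int
no_overlap_p (uintptr_t addr_a, uintptr_t addr_b, uintptr_t seg_len)
{
  return addr_b >= addr_a + seg_len || addr_a >= addr_b + seg_len;
}

/* The patch ORs the VF-based check with an iteration-count-based check, so a
   loop with few iterations can still take the vectorized path even when the
   VF-based segment length is pessimistic.  */
int
alias_check_ok (uintptr_t addr_a, uintptr_t addr_b,
                uintptr_t seg_len_vf, uintptr_t seg_len_iters)
{
  return no_overlap_p (addr_a, addr_b, seg_len_vf)
         || no_overlap_p (addr_a, addr_b, seg_len_iters);
}

The new SVE testcase var_stride_flexible_segment_len_1.c checks this effect: it
expects a WAR check that multiplies by (VF-2)*4 rather than an overlap check that
multiplies by (257-1)*4.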