gcc/0150-Implement-propagation-of-permutations-in-fwprop.patch

1051 lines
34 KiB
Diff

From 07aa5f889dc8bc3e642affe21dcfc197ad7d8b3b Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Sun, 3 Sep 2023 05:52:32 +0800
Subject: [PATCH 12/13] Implement propagation of permutations in fwprop
It is an implementation of permutation forward propagation, which is a
transformation designed to decrease the number of vector permutation
instructions in vectorized code, moving the permutations over arithmetic
operations.
---
gcc/config/aarch64/aarch64-simd.md | 26 +
gcc/params.opt | 4 +
gcc/testsuite/gcc.dg/vect/transpose-9.c | 56 ++
gcc/tree-ssa-forwprop.c | 891 ++++++++++++++++++++++++
4 files changed, 977 insertions(+)
create mode 100755 gcc/testsuite/gcc.dg/vect/transpose-9.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6049adc3f..af6d3ebf6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4615,6 +4615,19 @@
[(set_attr "type" "neon_shift_imm_long")]
)
+(define_insn "*aarch64_simd_vec_unpacks_lo_shiftsi" ; (sext (low half)) << imm
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(ashift:V4SI
+	  (sign_extend:V4SI
+	    (vec_select:V4HI
+	      (match_operand:V8HI 1 "register_operand" "w")
+	      (match_operand:V8HI 2 "vect_par_cnst_lo_half" "")))
+	  (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))]
+  "TARGET_SIMD"
+  "shll\t%0.4s, %1.4h, #%3" ; NOTE(review): SHLL encodes only #16 for .4h - confirm operand 3 cannot be anything else
+  [(set_attr "type" "neon_shift_imm_long")] ; same type as the other shll patterns
+)
+
;; vshll_high_n
(define_insn "aarch64_<sur>shll2_n<mode>"
@@ -4632,6 +4645,19 @@
[(set_attr "type" "neon_shift_imm_long")]
)
+(define_insn "*aarch64_simd_vec_unpacks_hi_shiftsi" ; (sext (high half)) << imm
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(ashift:V4SI
+	  (sign_extend:V4SI
+	    (vec_select:V4HI
+	      (match_operand:V8HI 1 "register_operand" "w")
+	      (match_operand:V8HI 2 "vect_par_cnst_hi_half" "")))
+	  (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))]
+  "TARGET_SIMD"
+  "shll2\t%0.4s, %1.8h, #%3" ; NOTE(review): SHLL2 encodes only #16 for .8h - confirm operand 3 cannot be anything else
+  [(set_attr "type" "neon_shift_imm_long")] ; same type as the other shll patterns
+)
+
;; vrshr_n
(define_insn "aarch64_<sur>shr_n<mode>"
diff --git a/gcc/params.opt b/gcc/params.opt
index 83fd705ee..a87f6f00a 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -852,6 +852,10 @@ Maximum size, in storage units, of an aggregate which should be considered for s
Common Joined UInteger Var(param_sra_max_propagations) Param Optimization Init(32)
Maximum number of artificial accesses to enable forward propagation that Scalar Replacement of Aggregates will keep for one local variable.
+-param=tree-forwprop-perm=
+Common Joined UInteger Var(param_tree_forwprop_perm) Param Optimization Init(0)
+Propagate permutations in vectorized code during tree forward propagation.
+
-param=ssa-name-def-chain-limit=
Common Joined UInteger Var(param_ssa_name_def_chain_limit) Init(512) Param Optimization
The maximum number of SSA_NAME assignments to follow in determining a value.
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-9.c b/gcc/testsuite/gcc.dg/vect/transpose-9.c
new file mode 100755
index 000000000..f20a67c6e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-9.c
@@ -0,0 +1,56 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-mtune=tsv110 --param=tree-forwprop-perm=1 -fdump-tree-forwprop-details" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+typedef unsigned short int sum_t;
+typedef unsigned int sum2_t;
+typedef long int intptr_t;
+typedef unsigned char data;
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+static sum2_t bar(sum2_t a ) /* A is treated as two BITS_PER_SUM-bit lanes. */
+{
+ sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1); /* (sum_t)-1 in every lane whose top bit is set */
+ return (a+s)^s; /* negates those lanes (ignoring carries between lanes) */
+}
+
+int foo(data *pix1, intptr_t i_pix1, data *pix2, intptr_t i_pix2 ) /* I_PIX1/I_PIX2 are added to PIX1/PIX2 after every row. */
+{
+ sum2_t tmp[4][4];
+ sum2_t a0, a1, a2, a3;
+ sum2_t sum = 0;
+ for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) /* first pass: one row of 8 pixels per iteration */
+ {
+ a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); /* two differences packed into one sum2_t */
+ a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
+ a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
+ a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
+ sum2_t t0 = a0 + a1;
+ sum2_t t1 = a0 - a1;
+ sum2_t t2 = a2 + a3;
+ sum2_t t3 = a2 - a3;
+ tmp[i][0] = t0 + t2; /* note the store order 0, 2, 1, 3 */
+ tmp[i][2] = t0 - t2;
+ tmp[i][1] = t1 + t3;
+ tmp[i][3] = t1 - t3;
+ }
+ for( int i = 0; i < 4; i++ ) /* second pass: the same sums/differences down column I of TMP */
+ {
+ sum2_t t0 = tmp[0][i] + tmp[1][i];
+ sum2_t t1 = tmp[0][i] - tmp[1][i];
+ sum2_t t2 = tmp[2][i] + tmp[3][i];
+ sum2_t t3 = tmp[2][i] - tmp[3][i];
+ a0 = t0 + t2;
+ a2 = t0 - t2;
+ a1 = t1 + t3;
+ a3 = t1 - t3;
+ sum += bar(a0) + bar(a1) + bar(a2) + bar(a3); /* accumulate bar() of every packed result */
+ }
+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; /* low lane plus high lane of SUM, halved */
+}
+/* { dg-final { scan-tree-dump "Initial permutations were reduced:" "forwprop4" } } */
+/* { dg-final { scan-tree-dump "Permutations were moved through binary operations:" "forwprop4" } } */
+
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index ba0b55f4a..92ef5d036 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2225,6 +2225,893 @@ simplify_permutation (gimple_stmt_iterator *gsi)
return 0;
}
+/* qsort callback: order two gimple statements by ascending UID.  */
+
+static int
+gimple_uid_cmp (const void *ptr0, const void *ptr1)
+{
+  unsigned lhs_uid = gimple_uid (*(gimple * const *) ptr0);
+  unsigned rhs_uid = gimple_uid (*(gimple * const *) ptr1);
+
+  if (lhs_uid == rhs_uid)
+    return 0;
+  if (lhs_uid < rhs_uid)
+    return -1;
+  return 1;
+}
+
+/* Walk backwards from OP through a chain of propagatable assignments looking
+ for a VEC_PERM_EXPR. Unary and single-rhs statements are followed through
+ rhs1, a binary statement only when exactly one operand is a constant. On
+ success push the permutation to perm_stmts and return true, else false. */
+
+static bool
+find_src_perm_stmt (tree op, auto_vec<gimple *> &perm_stmts)
+{
+ gimple *stmt;
+ while ((stmt = get_prop_source_stmt (op, false, NULL)))
+ {
+ if (!can_propagate_from (stmt))
+ return false;
+
+ if (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR)
+ {
+ perm_stmts.safe_push (stmt);
+ return true;
+ }
+
+ /* TODO: check vector length and element size. */
+ enum tree_code code = gimple_assign_rhs_code (stmt);
+ switch (get_gimple_rhs_class (code))
+ {
+ case GIMPLE_TERNARY_RHS:
+ return false; /* ternary operations end the search */
+ case GIMPLE_BINARY_RHS:
+ {
+ tree op1 = gimple_assign_rhs1 (stmt);
+ tree op2 = gimple_assign_rhs2 (stmt);
+ bool is_cst_op1 = is_gimple_constant (op1);
+ bool is_cst_op2 = is_gimple_constant (op2);
+ if ((is_cst_op1 && is_cst_op2) || (!is_cst_op1 && !is_cst_op2))
+ return false;
+ op = !is_cst_op1 && is_cst_op2 ? op1 : op2; /* continue through the non-constant operand */
+ break;
+ }
+ case GIMPLE_UNARY_RHS:
+ case GIMPLE_SINGLE_RHS:
+ op = gimple_assign_rhs1 (stmt);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ if (TREE_CODE (op) != SSA_NAME) /* only SSA names can be followed further */
+ return false;
+ }
+ return false;
+}
+
+/* Return true if the statement at GSI is a vector PLUS/MINUS whose two
+   distinct SSA operands both trace back to a permutation; the permutations
+   are appended to PERM_STMTS.  TYPE is set to the type of the statement's lhs.  */
+
+static bool
+find_initial_permutations (gimple_stmt_iterator *gsi, tree &type,
+			   auto_vec<gimple *> &perm_stmts)
+{
+  gimple *binop = gsi_stmt (*gsi);
+
+  // TODO: support other initial binary operations.
+  enum tree_code binop_code = gimple_assign_rhs_code (binop);
+  gcc_checking_assert (binop_code == PLUS_EXPR || binop_code == MINUS_EXPR);
+
+  type = TREE_TYPE (gimple_assign_lhs (binop));
+  if (!VECTOR_TYPE_P (type))
+    return false;
+  tree lhs_op = gimple_assign_rhs1 (binop);
+  tree rhs_op = gimple_assign_rhs2 (binop);
+  if (lhs_op == rhs_op
+      || TREE_CODE (lhs_op) != SSA_NAME || TREE_CODE (rhs_op) != SSA_NAME
+      || TREE_TYPE (lhs_op) != type || TREE_TYPE (rhs_op) != type)
+    return false;
+
+  return (find_src_perm_stmt (lhs_op, perm_stmts)
+	  && find_src_perm_stmt (rhs_op, perm_stmts));
+}
+
+/* Return true if STMT is a propagatable VEC_PERM_EXPR of two SSA names with
+   a constant selector, recording it in PERM_STMTS / SRC_VECTS when given.  */
+
+static bool
+check_perm_stmt (gimple *stmt, tree type, vec<gimple *> *perm_stmts,
+		 vec<tree> *src_vects)
+{
+  if (!stmt || !can_propagate_from (stmt)
+      || gimple_assign_rhs_code (stmt) != VEC_PERM_EXPR)
+    return false;
+
+  tree vec0 = gimple_assign_rhs1 (stmt);
+  tree vec1 = gimple_assign_rhs2 (stmt);
+  tree selector = gimple_assign_rhs3 (stmt);
+  if (TREE_CODE (vec0) != SSA_NAME || TREE_CODE (vec1) != SSA_NAME
+      || TREE_CODE (selector) != VECTOR_CST)
+    return false;
+
+  /* A null TYPE accepts operands of any type.  */
+  if (type != NULL_TREE
+      && (TREE_TYPE (vec0) != type || TREE_TYPE (vec1) != type))
+    return false;
+  if (perm_stmts)
+    perm_stmts->safe_push (stmt);
+  if (src_vects)
+    {
+      src_vects->safe_push (vec0);
+      src_vects->safe_push (vec1);
+    }
+  return true;
+}
+
+/* Collect into PERM_STMTS the permutations feeding both operands of the
+   permutation STMT, and their source vectors into SRC_VECTS.  */
+
+static bool
+find_perm_set (gimple *stmt, tree type, vec<gimple *> &perm_stmts,
+	       vec<tree> &src_vects)
+{
+  auto_vec<tree> ops;
+  if (!check_perm_stmt (stmt, NULL, NULL, &ops))
+    return false;
+
+  unsigned i;
+  tree op;
+  bool single_use_op = false;
+  FOR_EACH_VEC_ELT (ops, i, op)
+    {
+      /* Skip if we already processed the same operand.  */
+      if (i > 0 && ops[i] == ops[i - 1])
+	continue;
+      /* Find one permutation stmt.  */
+      gimple *def_stmt = get_prop_source_stmt (op, false, &single_use_op);
+      if (!check_perm_stmt (def_stmt, type, &perm_stmts, &src_vects))
+	return false;
+      if (single_use_op || src_vects.length () <= 1)
+	return false;
+      unsigned last_i = src_vects.length () - 1;
+      unsigned before_last_i = src_vects.length () - 2;
+      /* Find the other real (non-debug) user of the same source vector.  */
+      gimple *use_stmt;
+      imm_use_iterator iter;
+      FOR_EACH_IMM_USE_STMT (use_stmt, iter, src_vects[before_last_i])
+	if (use_stmt != def_stmt && !is_gimple_debug (use_stmt))
+	  BREAK_FROM_IMM_USE_STMT (iter);
+      if (!use_stmt || use_stmt == def_stmt || !is_gimple_assign (use_stmt)
+	  || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR
+	  || src_vects[before_last_i] != gimple_assign_rhs1 (use_stmt)
+	  || src_vects[last_i] != gimple_assign_rhs2 (use_stmt))
+	return false;
+      perm_stmts.safe_push (use_stmt);
+    }
+  return true;
+}
+
+/* Flatten the selector constants of PERM_PATTERN, in order, into
+   PERM_INDICES.  Return false for a selector of non-constant length.  */
+
+static bool
+make_vec_of_indices (vec<tree> &perm_pattern, vec<unsigned> &perm_indices)
+{
+  for (unsigned p = 0; p < perm_pattern.length (); p++)
+    {
+      tree selector = perm_pattern[p];
+      unsigned HOST_WIDE_INT count;
+      if (!VECTOR_CST_NELTS (selector).is_constant (&count))
+	return false;
+      for (unsigned e = 0; e < count; e++)
+	{
+	  tree index = VECTOR_CST_ELT (selector, e);
+	  gcc_checking_assert (TREE_CODE (index) == INTEGER_CST);
+	  perm_indices.safe_push (TREE_INT_CST_LOW (index));
+	}
+    }
+  return true;
+}
+
+/* Check or collect a permutation pattern in the provided perm_stmts.  If
+   collect_pattern is true, append previously unseen selectors to pattern;
+   otherwise fail on an unseen one.  Stmt I must use selector I % length.  */
+
+static bool
+check_or_collect_perm_pattern (vec<gimple *> &perm_stmts, vec<tree> &pattern,
+			       bool collect_pattern)
+{
+  unsigned i, j;
+  gimple *stmt_it;
+  tree tree_it;
+  FOR_EACH_VEC_ELT (perm_stmts, i, stmt_it)
+    {
+      gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR);
+      tree perm_vec = gimple_assign_rhs3 (stmt_it);
+      bool found = false;
+      FOR_EACH_VEC_ELT (pattern, j, tree_it)
+	if (operand_equal_p (tree_it, perm_vec))
+	  {
+	    found = true;
+	    break;
+	  }
+      if (!found && !collect_pattern)
+	return false;
+      if (!found)
+	pattern.safe_push (perm_vec);
+      if (i % pattern.length () != j)
+	return false;
+    }
+  return true;
+}
+
+/* Identify the permutation pattern and check it.  For now, we are checking
+   only transposition permutations with no more than 2 lines in their patterns.
+   Collect permutation const vectors and the second permutation stmts.  */
+
+static bool
+check_perm_pattern (vec<gimple *> &first_perm_stmts, vec<tree> &perm_pattern,
+		    vec<gimple *> &second_perm_stmts)
+{
+  unsigned i, j;
+  gimple *stmt_it;
+  if (!check_or_collect_perm_pattern (first_perm_stmts, perm_pattern, true))
+    return false;
+
+  if (perm_pattern.length () == 0 || perm_pattern.length () > 2)
+    return false;
+
+  /* Find the second permutation stmts, each once, ignoring debug uses.  */
+  hash_set<gimple *> visited;
+  FOR_EACH_VEC_ELT (first_perm_stmts, i, stmt_it)
+    {
+      tree dst = gimple_assign_lhs (stmt_it);
+      use_operand_p use_p;
+      imm_use_iterator iter;
+      FOR_EACH_IMM_USE_FAST (use_p, iter, dst)
+	{
+	  gimple *stmt_it2 = USE_STMT (use_p);
+	  if (is_gimple_debug (stmt_it2) || visited.add (stmt_it2))
+	    continue;
+	  second_perm_stmts.safe_push (stmt_it2);
+	}
+    }
+  second_perm_stmts.qsort (gimple_uid_cmp);
+
+  if (first_perm_stmts.length () != second_perm_stmts.length ())
+    return false;
+
+  /* Check that all second_perm_stmts are VEC_PERM_EXPR assignments.  */
+  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
+    if (!is_gimple_assign (stmt_it)
+	|| gimple_assign_rhs_code (stmt_it) != VEC_PERM_EXPR)
+      return false;
+
+  /* Check permutation pattern on the second permutation stmts.  */
+  if (!check_or_collect_perm_pattern (second_perm_stmts, perm_pattern, false))
+    return false;
+
+  /* Check values of permutation indices.  */
+  auto_vec<unsigned> perm_indices (vector_cst_encoded_nelts (perm_pattern[0])
+				   * perm_pattern.length ());
+  if (!make_vec_of_indices (perm_pattern, perm_indices))
+    return false;
+
+  unsigned val, half_len = perm_indices.length () / 2;
+  FOR_EACH_VEC_ELT (perm_indices, j, val)
+    if (val != (j % 2 ? half_len + j / 2 : j / 2))
+      return false;
+
+  /* Check the correspondence of defs in first_perm_stmts and uses in
+     second_perm_stmts.  The stmts must form whole blocks of LEN entries,
+     otherwise the block indexing below would run past the vectors.  */
+  tree type1 = TREE_TYPE (gimple_assign_lhs (first_perm_stmts[0]));
+  tree type2 = TREE_TYPE (gimple_assign_lhs (second_perm_stmts[0]));
+  unsigned HOST_WIDE_INT len = TYPE_VECTOR_SUBPARTS (type1).to_constant ();
+  if (type1 != type2 || second_perm_stmts.length () % len != 0)
+    return false;
+  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
+    {
+      /* Vectors of first/second perm stmts consist of blocks, each block
+	 transposes its own set of input vectors.  J corresponds to the number
+	 of such block in the vector.  */
+      unsigned j = (i / len) * len;
+      gimple *src_stmt1 = first_perm_stmts[j + (i - j) / 2];
+      gimple *src_stmt2 = first_perm_stmts[j + (i - j) / 2 + len / 2];
+      if (gimple_assign_rhs1 (stmt_it) != gimple_assign_lhs (src_stmt1)
+	  || gimple_assign_rhs2 (stmt_it) != gimple_assign_lhs (src_stmt2))
+	return false;
+    }
+  return true;
+}
+
+/* Gather in NEXT_STMTS the immediate SSA users (IS_FORWARD) or definitions of
+ STMTS, once each; SKIP_PERMS drops VEC_PERM_EXPRs. Memory is not followed. */
+
+static bool
+find_next_stmts (auto_vec<gimple *> &stmts, auto_vec<gimple *> &next_stmts,
+ bool is_forward, bool skip_perms)
+{
+ unsigned i;
+ gimple *stmt_it;
+ hash_set<gimple *> new_stmt_set;
+ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
+ {
+ if (is_forward)
+ {
+ tree lhs = gimple_assign_lhs (stmt_it);
+ if (!lhs || TREE_CODE (lhs) != SSA_NAME) /* e.g. a store: nothing to follow */
+ continue;
+ imm_use_iterator iter;
+ gimple *use_stmt;
+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
+ if (!new_stmt_set.contains (use_stmt))
+ {
+ new_stmt_set.add (use_stmt);
+ if (!skip_perms
+ || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR) /* NOTE(review): use_stmt is not checked to be an assignment here */
+ next_stmts.safe_push (use_stmt);
+ }
+ }
+ else
+ {
+ tree rhs;
+ auto_vec<tree> rhs_vec (3); /* at most rhs1, rhs2 and rhs3 */
+ if ((rhs = gimple_assign_rhs1 (stmt_it)))
+ rhs_vec.quick_push (rhs);
+ if ((rhs = gimple_assign_rhs2 (stmt_it)))
+ rhs_vec.quick_push (rhs);
+ if ((rhs = gimple_assign_rhs3 (stmt_it)))
+ rhs_vec.quick_push (rhs);
+ unsigned j;
+ FOR_EACH_VEC_ELT (rhs_vec, j, rhs)
+ {
+ if (TREE_CODE (rhs) == VIEW_CONVERT_EXPR) /* look through the view conversion */
+ rhs = TREE_OPERAND (rhs, 0);
+ if (TREE_CODE (rhs) != SSA_NAME)
+ continue;
+ gimple *def_stmt = get_prop_source_stmt (rhs, false, NULL);
+ if (!def_stmt)
+ return false;
+ if (new_stmt_set.contains (def_stmt))
+ continue;
+ new_stmt_set.add (def_stmt);
+ if (!skip_perms
+ || gimple_assign_rhs_code (def_stmt) != VEC_PERM_EXPR)
+ next_stmts.safe_push (def_stmt);
+ }
+ }
+ }
+ return true;
+}
+
+/* Return true if all stmts are assignments sharing lhs type and rhs code
+ (unpack lo/hi pairs are exempt). CODE gets the first non-NOP code seen. */
+
+static bool
+check_stmts_similarity (auto_vec<gimple *> &stmts, enum tree_code &code)
+{
+ code = NOP_EXPR; /* stays NOP_EXPR when stmts is empty */
+ tree type = NULL_TREE;
+ unsigned i;
+ gimple *stmt_it;
+ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
+ {
+ if (!is_gimple_assign (stmt_it))
+ return false;
+ tree lhs = gimple_assign_lhs (stmt_it);
+ enum tree_code code2 = gimple_assign_rhs_code (stmt_it);
+ if (type != NULL_TREE)
+ {
+ /* Unpack lo/hi are the same for the analysis. */
+ if (((code2 != VEC_UNPACK_LO_EXPR && code2 != VEC_UNPACK_HI_EXPR)
+ || (code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR))
+ && (!lhs || type != TREE_TYPE (lhs)
+ || (code != NOP_EXPR && code != code2)))
+ return false;
+ }
+ else if (lhs)
+ type = TREE_TYPE (lhs); /* the first lhs type is the reference */
+ if (code == NOP_EXPR)
+ code = code2;
+ }
+ return true;
+}
+
+/* Check that the defs in first_stmts are used by second_stmts in the same
+   order (both get sorted by UID).  Debug uses are not taken into account.  */
+
+static bool
+check_def_use_order (vec<gimple *> &first_stmts, vec<gimple *> &second_stmts)
+{
+  first_stmts.qsort (gimple_uid_cmp);
+  second_stmts.qsort (gimple_uid_cmp);
+  unsigned len1 = first_stmts.length ();
+  unsigned len2 = second_stmts.length ();
+
+  /* Skip if one of the blocks is empty or the second block is permutations.  */
+  if (!len1 || !len2
+      || gimple_assign_rhs_code (second_stmts[0]) == VEC_PERM_EXPR)
+    return true;
+
+  unsigned i;
+  gimple *stmt_it;
+  FOR_EACH_VEC_ELT (first_stmts, i, stmt_it)
+    {
+      imm_use_iterator iter;
+      gimple *stmt;
+      FOR_EACH_IMM_USE_STMT (stmt, iter, gimple_assign_lhs (stmt_it))
+	{
+	  if (!is_gimple_debug (stmt)
+	      && ((len1 == len2 && stmt != second_stmts[i])
+		  || (len1 == len2 * 2 && stmt != second_stmts[i % len2])))
+	    RETURN_FROM_IMM_USE_STMT (iter, false);
+	  if (len1 * 2 != len2 || !is_gimple_assign (stmt))
+	    continue;
+	  enum tree_code code = gimple_assign_rhs_code (stmt);
+	  bool lo = code == VEC_UNPACK_LO_EXPR, hi = code == VEC_UNPACK_HI_EXPR;
+	  if ((lo || hi) && stmt != second_stmts[2 * i + hi])
+	    RETURN_FROM_IMM_USE_STMT (iter, false);
+	}
+    }
+  return true;
+}
+
+/* Check similarity of stmts in the block of arithmetic operations: walk the
+   definitions backwards from INITIAL_PERM_STMTS, one layer at a time.  */
+
+static bool
+check_arithmetic_block (vec<gimple *> &initial_perm_stmts, unsigned nstmts)
+{
+  auto_vec<gimple *> next_stmts (nstmts);
+  auto_vec<gimple *> prev_stmts (nstmts);
+  enum tree_code code;
+  unsigned i;
+  gimple *stmt_it;
+  /* NSTMTS is only a capacity hint; the input may be longer, so grow.  */
+  FOR_EACH_VEC_ELT (initial_perm_stmts, i, stmt_it)
+    prev_stmts.safe_push (stmt_it);
+
+  do
+    {
+      next_stmts.block_remove (0, next_stmts.length ());
+      if (!find_next_stmts (prev_stmts, next_stmts, false, true))
+	return false;
+
+      /* Check that types and codes of all stmts in the list are the same.  */
+      if (!check_stmts_similarity (next_stmts, code))
+	return false;
+      /* Check that the order of all operands is the same.  */
+      if (!check_def_use_order (next_stmts, prev_stmts))
+	return false;
+      prev_stmts.block_remove (0, prev_stmts.length ());
+      FOR_EACH_VEC_ELT (next_stmts, i, stmt_it)
+	prev_stmts.safe_push (stmt_it);
+    }
+  while (code != NOP_EXPR);
+
+  return true;
+}
+
+/* Find two blocks of permutations on two sets of input vectors which are
+   used in the same vectorized arithmetic operations after the permutation:
+     Va1...VaN = PERM{P1} (Sa1...SaN)
+     Vb1...VbN = PERM{P1} (Sb1...SbN)
+     Vc1...VcN = binops (Va1...VaN, Vb1...VbN)
+   The goal of the transformation is to execute the block of permutations
+   only once on the result of the arithmetic operations:
+     Va1...VaN = binops (Sa1...SaN, Sb1...SbN)
+     Vc1...VcN = PERM{P1} (Va1...VaN)
+
+   Currently the analysis looks for transposition permutations that consist
+   of two layers of statements e.g.:
+     Vt1 = PERM { 0, 4, 1, 5 } Sa1, Sa2 // the first
+     Vt2 = PERM { 2, 6, 3, 7 } Sa1, Sa2
+     Vt3 = PERM { 0, 4, 1, 5 } Sa3, Sa4
+     Vt4 = PERM { 2, 6, 3, 7 } Sa3, Sa4
+     Va1 = PERM { 0, 4, 1, 5 } Vt1, Vt3 // the second
+     Va2 = PERM { 2, 6, 3, 7 } Vt1, Vt3
+     Va3 = PERM { 0, 4, 1, 5 } Vt2, Vt4
+     Va4 = PERM { 2, 6, 3, 7 } Vt2, Vt4
+   Permutation stmts are collected in first_perm_stmts and second_perm_stmts
+   vectors correspondingly.
+
+   Arithmetic operations may contain several stmts for one pair of input source
+   vectors e.g.:
+     Vtmp1 = unop (Va1)
+     Vtmp2 = binop (Vb1, const)
+     Vc1 = binop (Vtmp1, Vtmp2)
+   The last stmts of each sequence in the arithmetic block are collected
+   in final_arith_stmts.  */
+
+static bool
+analyze_perm_fwprop (tree type, unsigned HOST_WIDE_INT nelts,
+		     vec<gimple *> &stmts, auto_vec<tree> &src_vects,
+		     auto_vec<tree> &perm_pattern,
+		     auto_vec<gimple *> &final_arith_stmts,
+		     auto_vec<gimple *> &second_perm_stmts)
+{
+  gcc_checking_assert (stmts.length () == 2);
+  auto_vec<gimple *> first_perm_stmts (nelts * 2);
+  if (!find_perm_set (stmts[0], type, first_perm_stmts, src_vects)
+      || !find_perm_set (stmts[1], type, first_perm_stmts, src_vects))
+    return false;
+  first_perm_stmts.qsort (gimple_uid_cmp);
+
+  /* Determine permutation pattern.  */
+  if (!check_perm_pattern (first_perm_stmts, perm_pattern, second_perm_stmts))
+    return false;
+
+  /* Find all arithmetic stmts: the only user of each second permutation,
+     which must assign an SSA name.  The vectors may outgrow NELTS * 2.  */
+  unsigned i;
+  gimple *stmt_it;
+  auto_vec<gimple *> all_arith_stmts (nelts * 2);
+  hash_set<gimple *> visited;
+  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
+    {
+      use_operand_p use_p;
+      gimple *use_stmt;
+      if (!single_imm_use (gimple_assign_lhs (stmt_it), &use_p, &use_stmt)
+	  || !is_gimple_assign (use_stmt) || gimple_store_p (use_stmt))
+	return false;
+      all_arith_stmts.safe_push (use_stmt);
+      visited.add (use_stmt);
+    }
+
+  /* Select final arithmetic stmts.  */
+  FOR_EACH_VEC_ELT (all_arith_stmts, i, stmt_it)
+    {
+      tree dst = gimple_assign_lhs (stmt_it);
+      use_operand_p use_p;
+      imm_use_iterator iter;
+      bool use_only_outside_arith_stmts = true;
+      FOR_EACH_IMM_USE_FAST (use_p, iter, dst)
+	if (visited.contains (USE_STMT (use_p)))
+	  {
+	    use_only_outside_arith_stmts = false;
+	    break;
+	  }
+      if (use_only_outside_arith_stmts)
+	final_arith_stmts.safe_push (stmt_it);
+    }
+
+  /* Check that all results have the same arithmetic patterns.  */
+  if (!check_arithmetic_block (final_arith_stmts, nelts))
+    return false;
+  if (final_arith_stmts.length () < nelts)
+    return false;
+  return true;
+}
+
+/* Replace all uses of the lhs of STMTS[i] by NEW_USES[i], for every I. */
+
+static void
+substitute_uses (vec<gimple *> &stmts, vec<tree> &new_uses)
+{
+ gcc_checking_assert (stmts.length () == new_uses.length ());
+ unsigned i;
+ gimple *stmt_it;
+ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
+ {
+ tree op = gimple_assign_lhs (stmt_it);
+ imm_use_iterator iter;
+ gimple *use_stmt;
+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, op)
+ {
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_ON_STMT (use_p, iter) /* every operand of use_stmt that reads OP */
+ SET_USE (use_p, new_uses[i]);
+ update_stmt (use_stmt);
+ }
+ }
+}
+
+/* Propagate permutations through the block of arithmetic operations. */
+
+static void
+fwprop_perms (tree type, auto_vec<tree> &src_vects,
+ auto_vec<tree> &perm_pattern,
+ auto_vec<gimple *> &final_arith_stmts,
+ auto_vec<gimple *> &second_perm_stmts)
+{
+ /* Build two layers of new permutation stmts on the arithmetic results. */
+ gimple_seq new_stmts = NULL;
+ unsigned perm_block_size = final_arith_stmts.length ();
+ auto_vec<tree> new_first_perm_vals (perm_block_size);
+ hash_set<gimple *> new_stmts_set;
+ unsigned i, perm_pattern_size = perm_pattern.length ();
+ for (i = 0; i < perm_block_size; i++) /* first layer: permute pairs of arithmetic results */
+ {
+ tree op0 = gimple_assign_lhs (final_arith_stmts[i / 2]);
+ unsigned idx = i / 2 + perm_block_size / 2;
+ tree op1 = gimple_assign_lhs (final_arith_stmts[idx]);
+ tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1,
+ perm_pattern[i % perm_pattern_size]);
+ new_first_perm_vals.quick_push (res);
+ new_stmts_set.add (gimple_seq_last (new_stmts));
+ }
+ auto_vec<tree> new_second_perm_vals (perm_block_size);
+ for (i = 0; i < perm_block_size; i++) /* second layer: permute pairs of first-layer values */
+ {
+ tree op0 = new_first_perm_vals[i / 2];
+ tree op1 = new_first_perm_vals[i / 2 + perm_block_size/ 2];
+ tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1,
+ perm_pattern[i % perm_pattern_size]);
+ new_second_perm_vals.quick_push (res);
+ new_stmts_set.add (gimple_seq_last (new_stmts));
+ }
+
+ gimple_stmt_iterator g = gsi_for_stmt (final_arith_stmts.last ()); /* NOTE(review): assumes the last vector entry is also the last of these stmts in the block - confirm */
+ gsi_insert_seq_after (&g, new_stmts, GSI_SAME_STMT);
+
+ /* Replace old uses of the arithmetic block results by destinations of
+ the new permutation block. */
+ gimple *stmt_it;
+ FOR_EACH_VEC_ELT (final_arith_stmts, i, stmt_it)
+ {
+ tree op0 = gimple_assign_lhs (final_arith_stmts[i]);
+ imm_use_iterator iter;
+ gimple *use_stmt;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, op0)
+ {
+ if (new_stmts_set.contains (use_stmt)) /* keep the uses made by the new permutations */
+ continue;
+ FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
+ SET_USE (use_p, new_second_perm_vals[i]);
+ update_stmt (use_stmt);
+ }
+ }
+
+ /* Disconnect the old permutation stmts: their users read SRC_VECTS now. */
+ substitute_uses (second_perm_stmts, src_vects);
+}
+
+/* Find the permutation stmts in the forward or backward direction (in terms of
+   def/use graph) starting from the vector of initial stmts, and count in
+   NREDUCT the layers of binary operations met on the way.  NSTMTS is only an
+   initial capacity: INITIAL_STMTS may hold more entries than that.  */
+
+static bool
+find_perm_stmts (vec<gimple *> &initial_stmts, unsigned nstmts,
+		 vec<gimple *> &final_perm_stmts, bool is_forward,
+		 unsigned &nreduct)
+{
+  auto_vec<gimple *> next_stmts (nstmts);
+  auto_vec<gimple *> prev_stmts (nstmts);
+
+  nreduct = 0;
+  enum tree_code code;
+  unsigned i;
+  gimple *stmt_it;
+  FOR_EACH_VEC_ELT (initial_stmts, i, stmt_it)
+    prev_stmts.safe_push (stmt_it);
+
+  do
+    {
+      next_stmts.block_remove (0, next_stmts.length ());
+      if (!find_next_stmts (prev_stmts, next_stmts, is_forward, false))
+	return false;
+
+      /* Check that types and codes of all stmts in the list are the same.  */
+      if (!check_stmts_similarity (next_stmts, code))
+	return false;
+
+      /* TODO: don't take into account binary operations with constants.  */
+      if (TREE_CODE_CLASS (code) == tcc_binary)
+	nreduct += 1;
+
+      if (is_forward ? !check_def_use_order (prev_stmts, next_stmts)
+		     : !check_def_use_order (next_stmts, prev_stmts))
+	return false;
+
+      prev_stmts.block_remove (0, prev_stmts.length ());
+
+      FOR_EACH_VEC_ELT (next_stmts, i, stmt_it)
+	prev_stmts.safe_push (stmt_it);
+    }
+  while (code != NOP_EXPR && code != VEC_PERM_EXPR);
+
+  if (code != VEC_PERM_EXPR)
+    return false;
+
+  FOR_EACH_VEC_ELT (next_stmts, i, stmt_it)
+    final_perm_stmts.safe_push (stmt_it);
+  final_perm_stmts.qsort (gimple_uid_cmp);
+  return true;
+}
+
+/* Check if the initial permutation can be removed by modifying the final
+ one: each selector of INIT_PERM_STMTS must equal the flattened PERM_PATTERN
+ applied independently to consecutive blocks of that many elements. */
+
+static bool
+can_reduce_permutations (unsigned init_nelts, vec<tree> &perm_pattern,
+ vec<gimple *> &init_perm_stmts)
+{
+ auto_vec<unsigned> perm_indices (init_nelts);
+ if (!make_vec_of_indices (perm_pattern, perm_indices))
+ return false;
+ unsigned i, j;
+ gimple *stmt_it;
+ unsigned perm_vec_size = perm_indices.length ();
+ FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it)
+ {
+ gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR);
+ tree perm_vec2 = gimple_assign_rhs3 (stmt_it);
+ unsigned HOST_WIDE_INT mask_elts;
+ if (!VECTOR_CST_NELTS (perm_vec2).is_constant (&mask_elts))
+ return false;
+ for (j = 0; j < mask_elts; j++)
+ {
+ tree val = VECTOR_CST_ELT (perm_vec2, j);
+ gcc_assert (TREE_CODE (val) == INTEGER_CST);
+ unsigned HOST_WIDE_INT int_val = TREE_INT_CST_LOW (val);
+ if (int_val != perm_indices[j % perm_vec_size]
+ + (j / perm_vec_size) * perm_vec_size) /* pattern index shifted to the start of block j / size */
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Find permutation blocks before and after arithmetic operations and decide
+ if the number of permutations can be reduced, e.g:
+ Va1...VaN = PERM{P1} (Sa1...SaN)
+ Vb1...VbM = some operations (Va1...VaN)
+ Vb1...VbM = PERM{P2} (Sb1...SbM)
+ can be transformed to:
+ Vb1...VbM = some operations (Va1...VaN)
+ Vb1...VbM = PERM{P3} (Sb1...SbM)
+
+ Currently it supports initial permutations like this:
+ Va1 = PERM { 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15} Sa1
+ and transposition permutations with two layers of permutation stmts as
+ final permutations.
+
+ Operations between permutations can include unary and binary arithmetic,
+ element conversions and vector packing/unpacking. */
+
+static bool
+analyze_perm_reduction (unsigned HOST_WIDE_INT nelts,
+ vec<gimple *> &perm_stmts,
+ vec<gimple *> &init_perm_stmts,
+ vec<gimple *> &second_perm_stmts)
+{
+ auto_vec<gimple *> first_perm_stmts (nelts * 2);
+ if (!check_perm_stmt (perm_stmts[0], NULL_TREE, &first_perm_stmts, NULL)
+ || !check_perm_stmt (perm_stmts[1], NULL_TREE, &first_perm_stmts, NULL))
+ return false;
+
+ unsigned nreduct;
+ auto_vec<gimple *> final_perm_stmts (nelts * 2);
+ if (!find_perm_stmts (first_perm_stmts, nelts, final_perm_stmts, true,
+ nreduct)) /* forward: down to the final permutations */
+ return false;
+
+ if (!find_perm_stmts (final_perm_stmts, nelts, init_perm_stmts, false,
+ nreduct)) /* backward: up to the initial ones; this NREDUCT is the one used below */
+ return false;
+
+ /* Check number of elements in the initial and final data block. */
+ tree init_elem_type = TREE_TYPE (gimple_assign_lhs (init_perm_stmts[0]));
+ unsigned init_nelts = TYPE_VECTOR_SUBPARTS (init_elem_type).to_constant ()
+ * init_perm_stmts.length ();
+ tree final_elem_type = TREE_TYPE (gimple_assign_lhs (final_perm_stmts[0]));
+ unsigned final_nelts = TYPE_VECTOR_SUBPARTS (final_elem_type).to_constant ()
+ * final_perm_stmts.length ();
+ if (init_nelts != final_nelts * (1 + nreduct))
+ return false;
+
+ /* Check the final permutations and detect its pattern. */
+ auto_vec<tree> perm_pattern (nelts);
+ if (!check_perm_pattern (final_perm_stmts, perm_pattern, second_perm_stmts))
+ return false;
+
+ return can_reduce_permutations (init_nelts, perm_pattern, init_perm_stmts);
+}
+
+/* Do the optimization: bypass each initial permutation by forwarding its
+ first operand to its users, then reassign the destinations of the
+ second-layer stmts of the final block in the order 0, 2, ..., 1, 3, ... */
+
+static void
+reduce_perms (vec<gimple *> &init_perm_stmts, vec<gimple *> &second_perm_stmts)
+{
+ unsigned i;
+ gimple *stmt_it;
+ auto_vec<tree> new_srcs (init_perm_stmts.length ());
+ FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it)
+ new_srcs.quick_push (gimple_assign_rhs1 (stmt_it));
+ substitute_uses (init_perm_stmts, new_srcs);
+
+ unsigned half = second_perm_stmts.length () / 2;
+ auto_vec<tree> new_dsts (second_perm_stmts.length ());
+ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
+ {
+ unsigned idx = i < half ? i << 1 : ((i - half) << 1) + 1; /* even entries first, then odd ones */
+ new_dsts.quick_push (gimple_assign_lhs (second_perm_stmts[idx]));
+ }
+
+ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) /* all destinations were read above, before any is overwritten */
+ {
+ gimple_assign_set_lhs (stmt_it, new_dsts[i]);
+ update_stmt (stmt_it);
+ }
+}
+
+/* Optimize permutations in the following two cases:
+   1. Recognize the same permutations of two sets of vectors with subsequent
+      binary arithmetic operations on them:
+	V1 = PERM{1} (S1);
+	V2 = PERM{1} (S2);
+	V3 = V1 binop V2;
+      then move the permutation after the operations:
+	V0 = S1 binop S2;
+	V3 = PERM{1} V0;
+   2. Detect the first permutation before some operations on a set of vectors
+      and the second one after the operations:
+	V1 = PERM{1} (S1)
+	V2 = set of operations (V1)
+	V3 = PERM{2} (V2)
+      try to reduce them:
+	V2 = set of operations (S1)
+	V3 = PERM{3} (V2)
+   Return true on success; variable-length vectors are left alone.  */
+
+static bool
+propagate_permutations (gimple_stmt_iterator *gsi)
+{
+  tree type;
+  auto_vec<gimple *> perm_stmts (2);
+  unsigned HOST_WIDE_INT nelts;
+  if (!find_initial_permutations (gsi, type, perm_stmts)
+      || !TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts))
+    return false;
+
+  auto_vec<gimple *> final_arith_stmts (nelts * 2);
+  auto_vec<gimple *> second_perm_stmts (nelts * 2);
+  auto_vec<tree> src_vects (nelts * 2);
+  auto_vec<tree> perm_pattern (nelts);
+  if (analyze_perm_fwprop (type, nelts, perm_stmts, src_vects, perm_pattern,
+			   final_arith_stmts, second_perm_stmts))
+    {
+      fwprop_perms (type, src_vects, perm_pattern, final_arith_stmts,
+		    second_perm_stmts);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned i;
+	  gimple *stmt_it;
+	  fprintf (dump_file, "Permutations were moved through "
+			      "binary operations:\n");
+	  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
+	    print_gimple_stmt (dump_file, stmt_it, 0);
+	}
+      return true;
+    }
+
+  auto_vec<gimple *> init_perm_stmts (nelts * 2);
+  auto_vec<gimple *> final_perm_stmts (nelts * 2);
+  if (analyze_perm_reduction (nelts, perm_stmts, init_perm_stmts,
+			      final_perm_stmts))
+    {
+      reduce_perms (init_perm_stmts, final_perm_stmts);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned i;
+	  gimple *stmt_it;
+	  fprintf (dump_file, "Initial permutations were reduced:\n");
+	  FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it)
+	    print_gimple_stmt (dump_file, stmt_it, 0);
+	}
+      return true;
+    }
+  return false;
+}
+
/* Get the BIT_FIELD_REF definition of VAL, if any, looking through
conversions with code CONV_CODE or update it if still ERROR_MARK.
Return NULL_TREE if no such matching def was found. */
@@ -3155,6 +4042,10 @@ pass_forwprop::execute (function *fun)
|| code == BIT_XOR_EXPR)
&& simplify_rotate (&gsi))
changed = true;
+ else if ((code == PLUS_EXPR || code == MINUS_EXPR)
+ && param_tree_forwprop_perm
+ && propagate_permutations (&gsi))
+ changed = true;
else if (code == VEC_PERM_EXPR)
{
int did_something = simplify_permutation (&gsi);
--
2.33.0