From 07aa5f889dc8bc3e642affe21dcfc197ad7d8b3b Mon Sep 17 00:00:00 2001 From: Diachkov Ilia Date: Sun, 3 Sep 2023 05:52:32 +0800 Subject: [PATCH 12/13] Implement propagation of permutations in fwprop It is an implementation of permutation forward propagation, which is a transformation designed to decrease the number of vector permutation instructions in vectorized code, moving the permutations over arithmetic operations. --- gcc/config/aarch64/aarch64-simd.md | 26 + gcc/params.opt | 4 + gcc/testsuite/gcc.dg/vect/transpose-9.c | 56 ++ gcc/tree-ssa-forwprop.c | 891 ++++++++++++++++++++++++ 4 files changed, 977 insertions(+) create mode 100755 gcc/testsuite/gcc.dg/vect/transpose-9.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 6049adc3f..af6d3ebf6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4615,6 +4615,19 @@ [(set_attr "type" "neon_shift_imm_long")] ) +(define_insn "*aarch64_simd_vec_unpacks_lo_shiftsi" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (ashift:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "w") + (match_operand:V8HI 2 "vect_par_cnst_lo_half" ""))) + (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] + "TARGET_SIMD" + "shll\t%0.4s, %1.4h, #%3" + [(set_attr "type" "neon_compare_zero")] +) + ;; vshll_high_n (define_insn "aarch64_shll2_n" @@ -4632,6 +4645,19 @@ [(set_attr "type" "neon_shift_imm_long")] ) +(define_insn "*aarch64_simd_vec_unpacks_hi_shiftsi" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (ashift:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "w") + (match_operand:V8HI 2 "vect_par_cnst_hi_half" ""))) + (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] + "TARGET_SIMD" + "shll2\t%0.4s, %1.8h, #%3" + [(set_attr "type" "neon_compare_zero")] +) + ;; vrshr_n (define_insn "aarch64_shr_n" diff --git a/gcc/params.opt b/gcc/params.opt index 
83fd705ee..a87f6f00a 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -852,6 +852,10 @@ Maximum size, in storage units, of an aggregate which should be considered for s Common Joined UInteger Var(param_sra_max_propagations) Param Optimization Init(32) Maximum number of artificial accesses to enable forward propagation that Scalar Replacement of Aggregates will keep for one local variable. +-param=tree-forwprop-perm= +Common Joined UInteger Var(param_tree_forwprop_perm) Param Optimization Init(0) +Propagate permutations in vectorized code on tree forward propagation. + -param=ssa-name-def-chain-limit= Common Joined UInteger Var(param_ssa_name_def_chain_limit) Init(512) Param Optimization The maximum number of SSA_NAME assignments to follow in determining a value. diff --git a/gcc/testsuite/gcc.dg/vect/transpose-9.c b/gcc/testsuite/gcc.dg/vect/transpose-9.c new file mode 100755 index 000000000..f20a67c6e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-9.c @@ -0,0 +1,56 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-additional-options "-mtune=tsv110 --param=tree-forwprop-perm=1 -fdump-tree-forwprop-details" } */ +/* { dg-require-effective-target vect_int } */ +#include +#include +#include "tree-vect.h" + +typedef unsigned short int sum_t; +typedef unsigned int sum2_t; +typedef long int intptr_t; +typedef unsigned char data; +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +static sum2_t bar(sum2_t a ) +{ + sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<>BITS_PER_SUM)) >> 1; +} +/* { dg-final { scan-tree-dump "Initial permutations were reduced:" "forwprop4" } } */ +/* { dg-final { scan-tree-dump "Permutations were moved through binary operations:" "forwprop4" } } */ + diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index ba0b55f4a..92ef5d036 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2225,6 +2225,893 @@ simplify_permutation (gimple_stmt_iterator *gsi) return 0; } +/* Compare the UID of two gimple 
stmts for sorting in ascending order. */ + +static int +gimple_uid_cmp (const void *ptr0, const void *ptr1) +{ + const gimple *stmt0 = *(gimple * const *) ptr0; + const gimple *stmt1 = *(gimple * const *) ptr1; + + if (gimple_uid (stmt0) < gimple_uid (stmt1)) + return -1; + else if (gimple_uid (stmt0) > gimple_uid (stmt1)) + return 1; + return 0; +} + +/* Find a source permutation statement in backward direction through a chain of + unary, single or binary operations. In the last case only one variable + operand is allowed. If it's found, return true and save the statement in + perm_stmts, otherwise return false. */ + +static bool +find_src_perm_stmt (tree op, auto_vec &perm_stmts) +{ + gimple *stmt; + while ((stmt = get_prop_source_stmt (op, false, NULL))) + { + if (!can_propagate_from (stmt)) + return false; + + if (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR) + { + perm_stmts.safe_push (stmt); + return true; + } + + /* TODO: check vector length and element size. */ + enum tree_code code = gimple_assign_rhs_code (stmt); + switch (get_gimple_rhs_class (code)) + { + case GIMPLE_TERNARY_RHS: + return false; + case GIMPLE_BINARY_RHS: + { + tree op1 = gimple_assign_rhs1 (stmt); + tree op2 = gimple_assign_rhs2 (stmt); + bool is_cst_op1 = is_gimple_constant (op1); + bool is_cst_op2 = is_gimple_constant (op2); + if ((is_cst_op1 && is_cst_op2) || (!is_cst_op1 && !is_cst_op2)) + return false; + op = !is_cst_op1 && is_cst_op2 ? op1 : op2; + break; + } + case GIMPLE_UNARY_RHS: + case GIMPLE_SINGLE_RHS: + op = gimple_assign_rhs1 (stmt); + break; + default: + gcc_unreachable (); + } + if (TREE_CODE (op) != SSA_NAME) + return false; + } + return false; +} + +/* Check the stmt is binary operation and find initial permutations for both + of its sources. 
*/ + +static bool +find_initial_permutations (gimple_stmt_iterator *gsi, tree &type, + auto_vec &perm_stmts) +{ + gimple *stmt = gsi_stmt (*gsi); + enum tree_code code = gimple_assign_rhs_code (stmt); + + // TODO: support other initial binary operations. + gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR); + + type = TREE_TYPE (gimple_assign_lhs (stmt)); + if (!VECTOR_TYPE_P (type)) + return false; + tree op1 = gimple_assign_rhs1 (stmt); + tree op2 = gimple_assign_rhs2 (stmt); + if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME + || TREE_TYPE (op1) != type || TREE_TYPE (op2) != type || op1 == op2) + return false; + + if (find_src_perm_stmt (op1, perm_stmts) + && find_src_perm_stmt (op2, perm_stmts)) + return true; + return false; +} + +/* Check if the permutation statement is suitable for the transformation. */ + +static bool +check_perm_stmt (gimple *stmt, tree type, vec *perm_stmts, + vec *src_vects) +{ + if (!stmt || !can_propagate_from (stmt)) + return false; + + enum tree_code code = gimple_assign_rhs_code (stmt); + if (code != VEC_PERM_EXPR) + return false; + + tree op3 = gimple_assign_rhs3 (stmt); + tree op1 = gimple_assign_rhs1 (stmt); + tree op2 = gimple_assign_rhs2 (stmt); + if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME + || TREE_CODE (op3) != VECTOR_CST) + return false; + if (type != NULL_TREE && (TREE_TYPE (op1) != type + || TREE_TYPE (op2) != type)) + return false; + if (perm_stmts) + perm_stmts->safe_push (stmt); + if (src_vects) + { + src_vects->safe_push (op1); + src_vects->safe_push (op2); + } + return true; +} + +/* Collect permutation stmts preceding the given stmt. */ + +static bool +find_perm_set (gimple *stmt, tree type, vec &perm_stmts, + vec &src_vects) +{ + auto_vec ops; + if (!check_perm_stmt (stmt, NULL, NULL, &ops)) + return false; + + unsigned i; + tree op; + bool single_use_op = false; + FOR_EACH_VEC_ELT (ops, i, op) + { + /* Skip if we already processed the same operand. 
*/ + if (i > 0 && ops[i] == ops[i - 1]) + continue; + /* Find one permutation stmt. */ + gimple *def_stmt = get_prop_source_stmt (op, false, &single_use_op); + if (!check_perm_stmt (def_stmt, type, &perm_stmts, &src_vects)) + return false; + if (single_use_op || src_vects.length () <= 1) + return false; + unsigned last_i = src_vects.length () - 1; + unsigned before_last_i = src_vects.length () - 2; + + /* Find one more permutation stmt. */ + gimple *use_stmt; + imm_use_iterator iter; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, src_vects[before_last_i]) + if (use_stmt != def_stmt) + BREAK_FROM_IMM_USE_STMT (iter); + if (!use_stmt || use_stmt == def_stmt + || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR + || src_vects[before_last_i] != gimple_assign_rhs1 (use_stmt) + || src_vects[last_i] != gimple_assign_rhs2 (use_stmt)) + return false; + perm_stmts.safe_push (use_stmt); + } + return true; +} + +/* Walk permutation pattern and make a vector of permutation indices. */ + +static bool +make_vec_of_indices (vec &perm_pattern, vec &perm_indices) +{ + unsigned i, j; + tree tree_it; + FOR_EACH_VEC_ELT (perm_pattern, i, tree_it) + { + unsigned HOST_WIDE_INT nelts; + if (!VECTOR_CST_NELTS (tree_it).is_constant (&nelts)) + return false; + for (j = 0; j < nelts; j++) + { + tree val = VECTOR_CST_ELT (tree_it, j); + gcc_checking_assert (TREE_CODE (val) == INTEGER_CST); + perm_indices.safe_push (TREE_INT_CST_LOW (val)); + } + } + return true; +} + +/* Check or collect a permutation pattern in the provided perm_stmts depending + on the passed parameters. If collect_pattern is true, collect permutation + vectors to pattern. In other case, check the pattern suits perm_stmts. 
*/ + +static bool +check_or_collect_perm_pattern (vec &perm_stmts, vec &pattern, + bool collect_pattern) +{ + unsigned i, j; + gimple *stmt_it; + tree tree_it; + FOR_EACH_VEC_ELT (perm_stmts, i, stmt_it) + { + gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR); + tree perm_vec = gimple_assign_rhs3 (stmt_it); + bool found = false; + FOR_EACH_VEC_ELT (pattern, j, tree_it) + if (operand_equal_p (tree_it, perm_vec)) + { + found = true; + break; + } + if (collect_pattern && !found) + pattern.safe_push (perm_vec); + else + gcc_assert (found); + if (i % pattern.length () != j) + return false; + } + return true; +} + +/* Identify the permutation pattern and check it. For now, we are checking + only transposition permutations with no more than 2 lines in their patterns. + Collect permutation const vectors and the second permutation stmts. */ + +static bool +check_perm_pattern (vec &first_perm_stmts, vec &perm_pattern, + vec &second_perm_stmts) +{ + unsigned i, j; + gimple *stmt_it; + if (!check_or_collect_perm_pattern (first_perm_stmts, perm_pattern, true)) + return false; + + if (perm_pattern.length () == 0 || perm_pattern.length () > 2) + return false; + + /* Find the second permutation stmts. */ + hash_set visited; + FOR_EACH_VEC_ELT (first_perm_stmts, i, stmt_it) + { + tree dst = gimple_assign_lhs (stmt_it); + use_operand_p use_p; + imm_use_iterator iter; + FOR_EACH_IMM_USE_FAST (use_p, iter, dst) + { + gimple *stmt_it2 = USE_STMT (use_p); + if (visited.contains (stmt_it2)) + continue; + second_perm_stmts.safe_push (stmt_it2); + visited.add (stmt_it2); + } + } + second_perm_stmts.qsort (gimple_uid_cmp); + + if (first_perm_stmts.length () != second_perm_stmts.length ()) + return false; + + /* Check that all second_perm_stmts are VEC_PERM_EXPR. */ + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + if (gimple_assign_rhs_code (stmt_it) != VEC_PERM_EXPR) + return false; + + /* Check permutation pattern on the second permutation stmts. 
*/ + if (!check_or_collect_perm_pattern (second_perm_stmts, perm_pattern, false)) + return false; + + /* Check values of permutation indices. */ + auto_vec perm_indices (vector_cst_encoded_nelts (perm_pattern[0]) + * perm_pattern.length ()); + if (!make_vec_of_indices (perm_pattern, perm_indices)) + return false; + + unsigned val, half_len = perm_indices.length () / 2; + FOR_EACH_VEC_ELT (perm_indices, j, val) + if (val != (j % 2 ? half_len + j / 2 : j / 2)) + return false; + + /* Check the correspondence of defs in first_perm_stmts and uses in + second_perm_stmts. */ + tree type1 = TREE_TYPE (gimple_assign_lhs (first_perm_stmts[0])); + tree type2 = TREE_TYPE (gimple_assign_lhs (second_perm_stmts[0])); + if (type1 != type2) + return false; + + unsigned HOST_WIDE_INT len = TYPE_VECTOR_SUBPARTS (type1).to_constant (); + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + { + /* Vectors of first/second perm stmts consist of blocks, each block + transposes its own set of input vectors. J corresponds to the number + of such block in the vector. */ + unsigned j = (i / len) * len; + gimple *src_stmt1 = first_perm_stmts[j + (i - j) / 2]; + gimple *src_stmt2 = first_perm_stmts[j + (i - j) / 2 + len / 2]; + if (gimple_assign_rhs1 (stmt_it) != gimple_assign_lhs (src_stmt1) + || gimple_assign_rhs2 (stmt_it) != gimple_assign_lhs (src_stmt2)) + return false; + } + return true; +} + +/* For the given vector of stmts find all immediate def or use stmts. + It uses SSA and doesn't go through loads/stores.
*/ + +static bool +find_next_stmts (auto_vec &stmts, auto_vec &next_stmts, + bool is_forward, bool skip_perms) +{ + unsigned i; + gimple *stmt_it; + hash_set new_stmt_set; + FOR_EACH_VEC_ELT (stmts, i, stmt_it) + { + if (is_forward) + { + tree lhs = gimple_assign_lhs (stmt_it); + if (!lhs || TREE_CODE (lhs) != SSA_NAME) + continue; + imm_use_iterator iter; + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs) + if (!new_stmt_set.contains (use_stmt)) + { + new_stmt_set.add (use_stmt); + if (!skip_perms + || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR) + next_stmts.safe_push (use_stmt); + } + } + else + { + tree rhs; + auto_vec rhs_vec (3); + if ((rhs = gimple_assign_rhs1 (stmt_it))) + rhs_vec.quick_push (rhs); + if ((rhs = gimple_assign_rhs2 (stmt_it))) + rhs_vec.quick_push (rhs); + if ((rhs = gimple_assign_rhs3 (stmt_it))) + rhs_vec.quick_push (rhs); + unsigned j; + FOR_EACH_VEC_ELT (rhs_vec, j, rhs) + { + if (TREE_CODE (rhs) == VIEW_CONVERT_EXPR) + rhs = TREE_OPERAND (rhs, 0); + if (TREE_CODE (rhs) != SSA_NAME) + continue; + gimple *def_stmt = get_prop_source_stmt (rhs, false, NULL); + if (!def_stmt) + return false; + if (new_stmt_set.contains (def_stmt)) + continue; + new_stmt_set.add (def_stmt); + if (!skip_perms + || gimple_assign_rhs_code (def_stmt) != VEC_PERM_EXPR) + next_stmts.safe_push (def_stmt); + } + } + } + return true; +} + +/* Check if stmts in the vector have similar code and type. Process only + assign stmts. */ + +static bool +check_stmts_similarity (auto_vec &stmts, enum tree_code &code) +{ + code = NOP_EXPR; + tree type = NULL_TREE; + unsigned i; + gimple *stmt_it; + FOR_EACH_VEC_ELT (stmts, i, stmt_it) + { + if (!is_gimple_assign (stmt_it)) + return false; + tree lhs = gimple_assign_lhs (stmt_it); + enum tree_code code2 = gimple_assign_rhs_code (stmt_it); + if (type != NULL_TREE) + { + /* Unpack lo/hi are the same for the analysis. 
*/ + if (((code2 != VEC_UNPACK_LO_EXPR && code2 != VEC_UNPACK_HI_EXPR) + || (code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR)) + && (!lhs || type != TREE_TYPE (lhs) + || (code != NOP_EXPR && code != code2))) + return false; + } + else if (lhs) + type = TREE_TYPE (lhs); + if (code == NOP_EXPR) + code = code2; + } + return true; +} + +/* Check that the order of definitions of first_stmts and uses of second_stmts + is the same. */ + +static bool +check_def_use_order (vec &first_stmts, vec &second_stmts) +{ + first_stmts.qsort (gimple_uid_cmp); + second_stmts.qsort (gimple_uid_cmp); + unsigned len1 = first_stmts.length (); + unsigned len2 = second_stmts.length (); + + /* Skip if one of the blocks is empty or the second block is permutations. */ + if (!len1 || !len2 + || gimple_assign_rhs_code (second_stmts[0]) == VEC_PERM_EXPR) + return true; + + unsigned i; + gimple *stmt_it; + FOR_EACH_VEC_ELT (first_stmts, i, stmt_it) + { + tree op = gimple_assign_lhs (stmt_it); + imm_use_iterator iter; + gimple *stmt; + FOR_EACH_IMM_USE_STMT (stmt, iter, op) + { + if ((len1 == len2 && stmt != second_stmts[i]) + || (len1 == len2 * 2 && stmt != second_stmts[i % len2])) + RETURN_FROM_IMM_USE_STMT (iter, false); + enum tree_code code = gimple_assign_rhs_code (stmt); + if ((len1 * 2 == len2) + && ((code == VEC_UNPACK_LO_EXPR && stmt != second_stmts[2 * i]) + || (code == VEC_UNPACK_HI_EXPR + && stmt != second_stmts[2 * i + 1]))) + RETURN_FROM_IMM_USE_STMT (iter, false); + } + } + return true; +} + +/* Check similarity of stmts in the block of arithmetic operations.
*/ + +static bool +check_arithmetic_block (vec &initial_perm_stmts, unsigned nstmts) +{ + auto_vec next_stmts (nstmts); + auto_vec prev_stmts (nstmts); + + enum tree_code code; + unsigned i; + gimple *stmt_it; + FOR_EACH_VEC_ELT (initial_perm_stmts, i, stmt_it) + prev_stmts.quick_push (stmt_it); + + do + { + next_stmts.block_remove (0, next_stmts.length ()); + if (!find_next_stmts (prev_stmts, next_stmts, false, true)) + return false; + + /* Check that types and codes of all stmts in the list are the same. */ + if (!check_stmts_similarity (next_stmts, code)) + return false; + /* Check that the order of all operands is the same. */ + if (!check_def_use_order (next_stmts, prev_stmts)) + return false; + prev_stmts.block_remove (0, prev_stmts.length ()); + + FOR_EACH_VEC_ELT (next_stmts, i, stmt_it) + prev_stmts.safe_push (stmt_it); + } + while (code != NOP_EXPR); + + return true; +} + +/* Find two blocks of permutations on two sets of input vectors which are + used in the same vectorized arithmetic operations after the permutation: + Va1...VaN = PERM{P1} (Sa1...SaN) + Vb1...VbN = PERM{P1} (Sb1...SbN) + Vc1...VcN = binops (Va1...VaN, Vb1...VbN) + The goal of the transformation is to execute the block of permutations + only once on the result of the arithmetic operations: + Va1...VaN = binops (Sa1...SaN, Sb1...SbN) + Vc1...VcN = PERM{P1} (Va1...VaN) + + Currently the analysis looks for transposition permutations that consist + of two layers of statements e.g.: + Vt1 = PERM { 0, 4, 1, 5 } Sa1, Sa2 // the first + Vt2 = PERM { 2, 6, 3, 7 } Sa1, Sa2 + Vt3 = PERM { 0, 4, 1, 5 } Sa3, Sa4 + Vt4 = PERM { 2, 6, 3, 7 } Sa3, Sa4 + Va1 = PERM { 0, 4, 1, 5 } Vt1, Vt3 // the second + Va2 = PERM { 2, 6, 3, 7 } Vt1, Vt3 + Va3 = PERM { 0, 4, 1, 5 } Vt2, Vt4 + Va4 = PERM { 2, 6, 3, 7 } Vt2, Vt4 + Permutation stmts are collected in first_perm_stmts and second_perm_stmts + vectors respectively.
+ + Arithmetic operations may contain several stmts for one pair of input source + vectors e.g.: + Vtmp1 = unop (Va1) + Vtmp2 = binop (Vb1, const) + Vc1 = binop (Vtmp1, Vtmp2) + The last stmts of each sequence in the arithmetic block are collected + in final_arith_stmts. */ + +static bool +analyze_perm_fwprop (tree type, unsigned HOST_WIDE_INT nelts, + vec &stmts, auto_vec &src_vects, + auto_vec &perm_pattern, + auto_vec &final_arith_stmts, + auto_vec &second_perm_stmts) +{ + gcc_checking_assert (stmts.length () == 2); + auto_vec first_perm_stmts (nelts * 2); + if (!find_perm_set (stmts[0], type, first_perm_stmts, src_vects) + || !find_perm_set (stmts[1], type, first_perm_stmts, src_vects)) + return false; + first_perm_stmts.qsort (gimple_uid_cmp); + + /* Determine permutation pattern. */ + if (!check_perm_pattern (first_perm_stmts, perm_pattern, second_perm_stmts)) + return false; + + /* Find all arithmetic stmts. */ + unsigned i; + gimple *stmt_it; + auto_vec all_arith_stmts (nelts * 2); + hash_set visited; + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + { + tree dst = gimple_assign_lhs (stmt_it); + use_operand_p use_p; + gimple *use_stmt; + if (!single_imm_use (dst, &use_p, &use_stmt)) + return false; + all_arith_stmts.quick_push (use_stmt); + visited.add (use_stmt); + } + + /* Select final arithmetic stmts. */ + FOR_EACH_VEC_ELT (all_arith_stmts, i, stmt_it) + { + tree dst = gimple_assign_lhs (stmt_it); + use_operand_p use_p; + imm_use_iterator iter; + bool use_only_outside_arith_stmts = true; + FOR_EACH_IMM_USE_FAST (use_p, iter, dst) + if (visited.contains (USE_STMT (use_p))) + { + use_only_outside_arith_stmts = false; + break; + } + if (use_only_outside_arith_stmts) + final_arith_stmts.quick_push (stmt_it); + } + + /* Check that all results has the same arithmetic patterns. 
*/ + if (!check_arithmetic_block (final_arith_stmts, nelts)) + return false; + + if (final_arith_stmts.length () < nelts) + return false; + return true; +} + +/* Substitute uses of stmts' results by new_uses. */ + +static void +substitute_uses (vec &stmts, vec &new_uses) +{ + gcc_checking_assert (stmts.length () == new_uses.length ()); + unsigned i; + gimple *stmt_it; + FOR_EACH_VEC_ELT (stmts, i, stmt_it) + { + tree op = gimple_assign_lhs (stmt_it); + imm_use_iterator iter; + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, op) + { + use_operand_p use_p; + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, new_uses[i]); + update_stmt (use_stmt); + } + } +} + +/* Propagate permutations through the block of arithmetic operations. */ + +static void +fwprop_perms (tree type, auto_vec &src_vects, + auto_vec &perm_pattern, + auto_vec &final_arith_stmts, + auto_vec &second_perm_stmts) +{ + /* Build new permutation stmts after the block of arithmetic stmts. */ + gimple_seq new_stmts = NULL; + unsigned perm_block_size = final_arith_stmts.length (); + auto_vec new_first_perm_vals (perm_block_size); + hash_set new_stmts_set; + unsigned i, perm_pattern_size = perm_pattern.length (); + for (i = 0; i < perm_block_size; i++) + { + tree op0 = gimple_assign_lhs (final_arith_stmts[i / 2]); + unsigned idx = i / 2 + perm_block_size / 2; + tree op1 = gimple_assign_lhs (final_arith_stmts[idx]); + tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1, + perm_pattern[i % perm_pattern_size]); + new_first_perm_vals.quick_push (res); + new_stmts_set.add (gimple_seq_last (new_stmts)); + } + auto_vec new_second_perm_vals (perm_block_size); + for (i = 0; i < perm_block_size; i++) + { + tree op0 = new_first_perm_vals[i / 2]; + tree op1 = new_first_perm_vals[i / 2 + perm_block_size/ 2]; + tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1, + perm_pattern[i % perm_pattern_size]); + new_second_perm_vals.quick_push (res); + new_stmts_set.add 
(gimple_seq_last (new_stmts)); + } + + gimple_stmt_iterator g = gsi_for_stmt (final_arith_stmts.last ()); + gsi_insert_seq_after (&g, new_stmts, GSI_SAME_STMT); + + /* Replace old uses of the arithmetic block results by destinations of + the new permutation block. */ + gimple *stmt_it; + FOR_EACH_VEC_ELT (final_arith_stmts, i, stmt_it) + { + tree op0 = gimple_assign_lhs (final_arith_stmts[i]); + imm_use_iterator iter; + gimple *use_stmt; + use_operand_p use_p; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, op0) + { + if (new_stmts_set.contains (use_stmt)) + continue; + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, new_second_perm_vals[i]); + update_stmt (use_stmt); + } + } + + /* Disconnect the old permutation stmts. */ + substitute_uses (second_perm_stmts, src_vects); +} + +/* Find the permutation stmts in the forward or backward direction (in terms of + def/use graph) starting from the vector of initial stmts. Count reduction + stmts (i.e. binary operations) if they can change the number of processed + elements. */ + +static bool +find_perm_stmts (vec &initial_stmts, unsigned nstmts, + vec &final_perm_stmts, bool is_forward, + unsigned &nreduct) +{ + auto_vec next_stmts (nstmts); + auto_vec prev_stmts (nstmts); + + nreduct = 0; + enum tree_code code; + unsigned i; + gimple *stmt_it; + FOR_EACH_VEC_ELT (initial_stmts, i, stmt_it) + prev_stmts.quick_push (stmt_it); + + do + { + next_stmts.block_remove (0, next_stmts.length ()); + if (!find_next_stmts (prev_stmts, next_stmts, is_forward, false)) + return false; + + /* Check that types and codes of all stmts in the list are the same. */ + if (!check_stmts_similarity (next_stmts, code)) + return false; + + /* TODO: don't take into account binary operations with constants. */ + if (TREE_CODE_CLASS (code) == tcc_binary) + nreduct += 1; + + if (is_forward ? 
!check_def_use_order (prev_stmts, next_stmts) + : !check_def_use_order (next_stmts, prev_stmts)) + return false; + + prev_stmts.block_remove (0, prev_stmts.length ()); + + FOR_EACH_VEC_ELT (next_stmts, i, stmt_it) + prev_stmts.safe_push (stmt_it); + } + while (code != NOP_EXPR && code != VEC_PERM_EXPR); + + if (code != VEC_PERM_EXPR) + return false; + + FOR_EACH_VEC_ELT (next_stmts, i, stmt_it) + final_perm_stmts.safe_push (stmt_it); + final_perm_stmts.qsort (gimple_uid_cmp); + return true; +} + +/* Check if the initial and the final permutations can be optimized i.e. + the initial permutation can be removed with the modification of + the final one. */ + +static bool +can_reduce_permutations (unsigned init_nelts, vec &perm_pattern, + vec &init_perm_stmts) +{ + auto_vec perm_indices (init_nelts); + if (!make_vec_of_indices (perm_pattern, perm_indices)) + return false; + unsigned i, j; + gimple *stmt_it; + unsigned perm_vec_size = perm_indices.length (); + FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it) + { + gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR); + tree perm_vec2 = gimple_assign_rhs3 (stmt_it); + unsigned HOST_WIDE_INT mask_elts; + if (!VECTOR_CST_NELTS (perm_vec2).is_constant (&mask_elts)) + return false; + for (j = 0; j < mask_elts; j++) + { + tree val = VECTOR_CST_ELT (perm_vec2, j); + gcc_assert (TREE_CODE (val) == INTEGER_CST); + unsigned HOST_WIDE_INT int_val = TREE_INT_CST_LOW (val); + if (int_val != perm_indices[j % perm_vec_size] + + (j / perm_vec_size) * perm_vec_size) + return false; + } + } + return true; +} + +/* Find permutation blocks before and after arithmetic operations and decide + if the number of permutations can be reduced, e.g: + Va1...VaN = PERM{P1} (Sa1...SaN) + Vb1...VbM = some operations (Va1...VaN) + Vb1...VbM = PERM{P2} (Sb1...SbM) + can be transformed to: + Vb1...VbM = some operations (Va1...VaN) + Vb1...VbM = PERM{P3} (Sb1...SbM) + + Currently it supports initial permutations like this: + Va1 = PERM { 0, 4, 1, 
5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15} Sa1 + and transposition permutations with two layers of permutation stmts as + final permutations. + + Operations between permutations can include unary and binary arithmetic, + element conversions and vector packing/unpacking. */ + +static bool +analyze_perm_reduction (unsigned HOST_WIDE_INT nelts, + vec &perm_stmts, + vec &init_perm_stmts, + vec &second_perm_stmts) +{ + auto_vec first_perm_stmts (nelts * 2); + if (!check_perm_stmt (perm_stmts[0], NULL_TREE, &first_perm_stmts, NULL) + || !check_perm_stmt (perm_stmts[1], NULL_TREE, &first_perm_stmts, NULL)) + return false; + + unsigned nreduct; + auto_vec final_perm_stmts (nelts * 2); + if (!find_perm_stmts (first_perm_stmts, nelts, final_perm_stmts, true, + nreduct)) + return false; + + if (!find_perm_stmts (final_perm_stmts, nelts, init_perm_stmts, false, + nreduct)) + return false; + + /* Check number of elements in the initial and final data block. */ + tree init_elem_type = TREE_TYPE (gimple_assign_lhs (init_perm_stmts[0])); + unsigned init_nelts = TYPE_VECTOR_SUBPARTS (init_elem_type).to_constant () + * init_perm_stmts.length (); + tree final_elem_type = TREE_TYPE (gimple_assign_lhs (final_perm_stmts[0])); + unsigned final_nelts = TYPE_VECTOR_SUBPARTS (final_elem_type).to_constant () + * final_perm_stmts.length (); + if (init_nelts != final_nelts * (1 + nreduct)) + return false; + + /* Check the final permutations and detect its pattern. */ + auto_vec perm_pattern (nelts); + if (!check_perm_pattern (final_perm_stmts, perm_pattern, second_perm_stmts)) + return false; + + return can_reduce_permutations (init_nelts, perm_pattern, init_perm_stmts); +} + +/* Do the optimization: skip the initial permutation and change the order + of destinations after the second layer of permutation statements in + the final permutation block.
*/ + +static void +reduce_perms (vec &init_perm_stmts, vec &second_perm_stmts) +{ + unsigned i; + gimple *stmt_it; + auto_vec new_srcs (init_perm_stmts.length ()); + FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it) + new_srcs.quick_push (gimple_assign_rhs1 (stmt_it)); + substitute_uses (init_perm_stmts, new_srcs); + + unsigned half = second_perm_stmts.length () / 2; + auto_vec new_dsts (second_perm_stmts.length ()); + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + { + unsigned idx = i < half ? i << 1 : ((i - half) << 1) + 1; + new_dsts.quick_push (gimple_assign_lhs (second_perm_stmts[idx])); + } + + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + { + gimple_assign_set_lhs (stmt_it, new_dsts[i]); + update_stmt (stmt_it); + } +} + +/* Optimize permutations in the following two cases: + 1. Recognize the same permutations of two sets of vectors with subsequent + binary arithmetic operations on them: + V1 = PERM{1} (S1); + V2 = PERM{1} (S2); + V3 = V1 binop V2; + then move the permutation after the operations: + V0 = S1 binop S2; + V3 = PERM{1} V0; + 2. Detect the first permutation before some operations on a set of vectors + and the second one after the operations: + V1 = PERM{1} (S1) + V2 = set of operations (V1) + V3 = PERM{2} (V2) + try to reduce them: + V2 = set of operations (S1) + V3 = PERM{3} (V2) + Return true if the optimization is successful. 
*/ + +static bool +propagate_permutations (gimple_stmt_iterator *gsi) +{ + tree type; + auto_vec perm_stmts (2); + + if (!find_initial_permutations (gsi, type, perm_stmts)) + return false; + + unsigned HOST_WIDE_INT nelts = TYPE_VECTOR_SUBPARTS (type).to_constant (); + auto_vec final_arith_stmts (nelts * 2); + auto_vec second_perm_stmts (nelts * 2); + auto_vec src_vects (nelts * 2); + auto_vec perm_pattern (nelts); + if (analyze_perm_fwprop (type, nelts, perm_stmts, src_vects, perm_pattern, + final_arith_stmts, second_perm_stmts)) + { + fwprop_perms (type, src_vects, perm_pattern, final_arith_stmts, + second_perm_stmts); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + unsigned i; + gimple *stmt_it; + fprintf (dump_file, "Permutations were moved through " + "binary operations:\n"); + FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) + print_gimple_stmt (dump_file, stmt_it, 0); + } + return true; + } + + auto_vec init_perm_stmts (nelts * 2); + auto_vec final_perm_stmts (nelts * 2); + if (analyze_perm_reduction (nelts, perm_stmts, init_perm_stmts, + final_perm_stmts)) + { + reduce_perms (init_perm_stmts, final_perm_stmts); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + unsigned i; + gimple *stmt_it; + fprintf (dump_file, "Initial permutations were reduced:\n"); + FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it) + print_gimple_stmt (dump_file, stmt_it, 0); + } + return true; + } + return false; +} + /* Get the BIT_FIELD_REF definition of VAL, if any, looking through conversions with code CONV_CODE or update it if still ERROR_MARK. Return NULL_TREE if no such matching def was found. */ @@ -3155,6 +4042,10 @@ pass_forwprop::execute (function *fun) || code == BIT_XOR_EXPR) && simplify_rotate (&gsi)) changed = true; + else if ((code == PLUS_EXPR || code == MINUS_EXPR) + && param_tree_forwprop_perm + && propagate_permutations (&gsi)) + changed = true; else if (code == VEC_PERM_EXPR) { int did_something = simplify_permutation (&gsi); -- 2.33.0