From 25014ea924bfe3659e88195636ec08f87dd72c07 Mon Sep 17 00:00:00 2001 From: Mingchuan Wu Date: Fri, 21 Jun 2024 20:26:04 +0800 Subject: [PATCH] Try to use AI model to guide optimization. --- gcc/Makefile.in | 1 + gcc/common.opt | 7 ++ gcc/config/aarch64/aarch64.c | 130 ++++++++++++++++++++ gcc/ipa-hardware-detection.c | 228 +++++++++++++++++++++++++++++++++++ gcc/opts-common.c | 154 +++++++++++++++++++++++ gcc/opts.c | 11 ++ gcc/passes.def | 1 + gcc/timevar.def | 1 + gcc/tree-pass.h | 2 + 9 files changed, 535 insertions(+) create mode 100644 gcc/ipa-hardware-detection.c diff --git a/gcc/Makefile.in b/gcc/Makefile.in index aed321d27..f21bc5f9a 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1398,6 +1398,7 @@ OBJS = \ inchash.o \ incpath.o \ init-regs.o \ + ipa-hardware-detection.o \ internal-fn.o \ ipa-struct-reorg/ipa-struct-reorg.o \ ipa-cp.o \ diff --git a/gcc/common.opt b/gcc/common.opt index aad6fb281..9b32ea50a 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -188,6 +188,9 @@ const char *main_input_basename Variable int main_input_baselength +Variable +bool optimize_machine + ; Which options have been printed by --help. Variable char *help_printed @@ -467,6 +470,10 @@ Ofast Common Optimization Optimize for speed disregarding exact standards compliance. +Om +Common Optimization +Optimize for radical optimization for machines. + Og Common Optimization Optimize for debugging experience rather than speed or size. diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2117326ba..e67e77e6a 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14416,6 +14416,135 @@ aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, return stmt_cost; } +/* Check whether in C language or LTO with only C language. 
*/
+extern bool lang_c_p (void);
+
+/* Apply the option overrides selected for C (or C-only LTO) input.  */
+static void
+override_C_optimize_options (struct gcc_options *opts)
+{
+  opts->x_flag_ipa_reorder_fields = 1;
+  opts->x_flag_ipa_struct_reorg = 6;
+  opts->x_struct_layout_optimize_level = 6;
+  opts->x_flag_gnu89_inline = 1;
+  opts->x_flag_ccmp2 = 1;
+  opts->x_flag_array_widen_compare = 1;
+  opts->x_flag_convert_minmax = 1;
+  opts->x_flag_tree_slp_transpose_vectorize = 1;
+  opts->x_param_max_inline_insns_auto = 64;
+  opts->x_param_inline_unit_growth = 96;
+  opts->x_flag_cmlt_arith = 1;
+}
+
+/* Check whether in CPP language or LTO with only CPP language.
+   In LTO (lang_hooks.name is "GNU GIMPLE") every entry of
+   all_translation_units must have a language starting with "GNU C++".  */
+static bool
+lang_cpp_p (void)
+{
+  const char *language_string = lang_hooks.name;
+  if (!language_string)
+    return false;
+  if (lang_GNU_CXX ())
+    return true;
+  if (strcmp (language_string, "GNU GIMPLE") != 0) /* Not LTO.  */
+    return false;
+
+  /* LTO: every translation unit has to be C++.  */
+  unsigned i = 0;
+  tree t = NULL_TREE;
+  FOR_EACH_VEC_SAFE_ELT (all_translation_units, i, t)
+    {
+      language_string = TRANSLATION_UNIT_LANGUAGE (t);
+      /* Compare the unit's own language; lang_hooks.name is
+         "GNU GIMPLE" on this path and can never match "GNU C++".  */
+      if (language_string == NULL
+          || strncmp (language_string, "GNU C++", 7) != 0)
+        return false;
+    }
+  return true;
+}
+
+/* Apply the option overrides selected for C++ (or C++-only LTO) input.  */
+static void
+override_CPP_optimize_options (struct gcc_options *opts)
+{
+  opts->x_flag_finite_loops = 1;
+  opts->x_flag_omit_frame_pointer = 1;
+  opts->x_flag_sized_deallocation = 0;
+  opts->x_flag_loop_elim = 1;
+  opts->x_flag_convert_minmax = 1;
+  opts->x_param_early_inlining_insns = 256;
+  opts->x_param_max_inline_insns_auto = 128;
+  opts->x_param_inline_unit_growth = 256;
+  opts->x_flag_cmlt_arith = 1;
+}
+
+/* Apply the overrides shared by every language; reset_machine_option
+   calls this before the language-specific helper.  */
+static void
+override_optimize_options_1 (struct gcc_options *opts)
+{
+  opts->x_flag_split_ldp_stp = 1;
+  opts->x_flag_if_conversion_gimple = 1;
+  opts->x_param_tree_forwprop_perm = 1;
+  opts->x_flag_ifcvt_allow_complicated_cmps = 1;
+  opts->x_param_ifcvt_allow_register_renaming = 2;
+  opts->x_param_max_rtl_if_conversion_unpredictable_cost = 48;
+  opts->x_param_max_rtl_if_conversion_predictable_cost = 48;
+}
+
+/* Apply the option overrides selected for Fortran input.  */
+static void
+override_Fortran_optimize_options (struct gcc_options *opts)
+{
+  opts->x_flag_unroll_loops = 1;
+  opts->x_flag_unconstrained_commons = 1;
+  opts->x_param_ipa_cp_eval_threshold = 1;
+  opts->x_param_ipa_cp_unit_growth = 80;
+  opts->x_param_ipa_cp_max_recursive_depth = 8;
+  opts->x_param_large_unit_insns = 30000;
+  opts->x_flag_ira_loop_pressure = 1;
+  opts->x_flag_inline_functions_called_once = 0;
+  opts->x_flag_ira_algorithm = IRA_ALGORITHM_PRIORITY;
+  opts->x_flag_delayed_branch = 1;
+  opts->x_flag_gcse_las = 1;
+  opts->x_flag_gcse_sm = 1;
+  opts->x_flag_ipa_pta = 1;
+  opts->x_flag_reorder_blocks_and_partition = 1;
+  opts->x_flag_reorder_blocks = 1;
+  opts->x_flag_crypto_accel_aes = 1;
+  opts->x_param_flexible_seg_len = 1;
+}
+
+/* Reset the optimize option.
+   After checking the model result, this function can
+   reset the more appropriate options.
+   Nothing is changed unless -Om is active (x_optimize_machine), the
+   tune string contains "hip09" and AI_INFER_LEVEL is set in the
+   environment.  */
+static void
+reset_machine_option (struct gcc_options *opts)
+{
+  if (!(opts->x_optimize_machine)
+      || strstr (opts->x_aarch64_tune_string, "hip09") == NULL)
+    return;
+
+  /* Only the presence of the variable matters; its value is unused.  */
+  if (getenv ("AI_INFER_LEVEL") == NULL)
+    return;
+
+  /* Common overrides first, then those of the first matching language;
+     any other language keeps only the common ones.  */
+  override_optimize_options_1 (opts);
+  if (lang_c_p ())
+    override_C_optimize_options (opts);
+  else if (lang_cpp_p ())
+    override_CPP_optimize_options (opts);
+  else if (lang_GNU_Fortran ())
+    override_Fortran_optimize_options (opts);
+}
+
 /* Implement targetm.vectorize.add_stmt_cost. 
*/
 static unsigned
 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
@@ -15060,6 +15189,7 @@ aarch64_override_options_internal (struct gcc_options *opts)
   if (opts->x_aarch64_tune_string == NULL)
     opts->x_aarch64_tune_string = selected_tune->name;
 
+  reset_machine_option (opts);
   aarch64_override_options_after_change_1 (opts);
 }
 
diff --git a/gcc/ipa-hardware-detection.c b/gcc/ipa-hardware-detection.c
new file mode 100644
index 000000000..f127ebe2c
--- /dev/null
+++ b/gcc/ipa-hardware-detection.c
@@ -0,0 +1,254 @@
+/* Hardware Detection.
+   Copyright (C) 2022-2022 Free Software Foundation, Inc.
+This file is part of GCC.
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "gimple-ssa.h"
+#include "tree-pretty-print.h"
+#include "fold-const.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop.h"
+#include "ssa.h"
+#include "tree-into-ssa.h"
+#include "cfganal.h"
+#include "cfgloop.h"
+#include "gimple-pretty-print.h"
+#include "tree-cfg.h"
+#include "cgraph.h"
+#include "print-tree.h"
+#include "cfghooks.h"
+#include "gimple-fold.h"
+
+namespace {
+
+/* Create an empty basic block after LAST_BB, add it to LAST_BB's loop
+   when there is one, fill it with a call to the abort builtin
+   (BUILT_IN_ABORT) and return it.  No edges are created here.  */
+static basic_block
+create_abort_bb (basic_block last_bb)
+{
+  basic_block bb = create_empty_bb (last_bb);
+  if (last_bb->loop_father != NULL)
+    {
+      add_bb_to_loop (bb, last_bb->loop_father);
+      loops_state_set (LOOPS_NEED_FIXUP);
+    }
+  gimple_stmt_iterator gsi = gsi_last_bb (bb);
+  tree fn = builtin_decl_implicit (BUILT_IN_ABORT);
+  gimple *g = gimple_build_call (fn, 0);
+  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+  return bb;
+}
+
+/* Create an empty basic block after LAST_BB, add it to LAST_BB's loop
+   when there is one, and end it with the condition
+   PART_BASE + 4294963967 <= 2 in unsigned_type_node.  Return the block;
+   no edges are created here.  */
+static basic_block
+create_part_bb (basic_block last_bb, tree part_base)
+{
+  basic_block bb = create_empty_bb (last_bb);
+  if (last_bb->loop_father != NULL)
+    {
+      add_bb_to_loop (bb, last_bb->loop_father);
+      loops_state_set (LOOPS_NEED_FIXUP);
+    }
+  gimple_stmt_iterator gsi = gsi_last_bb (bb);
+  /* Placeholder: the condition is inserted before this nop, which is
+     then removed.  */
+  gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
+  /* This number is used to efficiently identify the supported part range.
+     4294963967 is 2^32 - 3329, so with a 32-bit unsigned type the
+     wrapped sum is <= 2 exactly when PART_BASE is in 3329..3331
+     (0xd01..0xd03).  */
+  tree part_cond = gimplify_build2 (
+    &gsi, PLUS_EXPR, unsigned_type_node, part_base,
+    build_int_cst (unsigned_type_node, 4294963967));
+  gcond *cond = gimple_build_cond (LE_EXPR, part_cond,
+                                   build_int_cst (unsigned_type_node, 2),
+                                   NULL_TREE, NULL_TREE);
+  gimple_set_location (cond, input_location);
+  gsi_insert_before (&gsi, cond, GSI_SAME_STMT);
+  gsi_remove (&gsi, true);
+  return bb;
+}
+
+/* Prepend a hardware check to the current function (cfun).  A new block
+   reads MIDR_EL1 into a local "cpuid" and tests (cpuid >> 24) == 72; on
+   success a second block applies create_part_bb's range test to
+   (cpuid >> 4) & 4095.  Passing both leads to the original first block;
+   failing either leads to the abort block, which is also given a
+   fallthru edge to the original first block.
+   NOTE(review): 72 (0x48) and 0xd01..0xd03 are presumably a vendor's
+   MIDR implementer code and part numbers -- confirm against Arm's
+   MIDR_EL1 documentation.  */
+static void
+create_detection_bb ()
+{
+  edge old_e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+  basic_block ret_bb = old_e->dest;
+
+  basic_block detection_bb = create_empty_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+  if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father != NULL)
+    {
+      add_bb_to_loop (detection_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father);
+      loops_state_set (LOOPS_NEED_FIXUP);
+    }
+  tree cpuid_decl = build_decl (input_location, VAR_DECL,
+                                get_identifier ("cpuid"), unsigned_type_node);
+  add_local_decl (cfun, cpuid_decl);
+
+  /* cpuid = MIDR_EL1, read by an asm with a single "=r" output.  */
+  gimple_stmt_iterator gsi = gsi_last_bb (detection_bb);
+  vec<tree, va_gc> *outputs = NULL;
+  tree purpose = build_string (strlen ("=r"), "=r");
+  tree output = build_tree_list (
+    build_tree_list (NULL_TREE, purpose), cpuid_decl);
+  vec_safe_push (outputs, output);
+  gasm *asm_stmt = gimple_build_asm_vec (
+    "mrs %0, MIDR_EL1", NULL, outputs, NULL, NULL);
+  gsi_insert_after (&gsi, asm_stmt, GSI_NEW_STMT);
+  gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
+
+  tree implementer = gimplify_build2 (
+    &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
+    build_int_cst (unsigned_type_node, 24));
+  tree part_base = gimplify_build2 (
+    &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
+    build_int_cst (unsigned_type_node, 4));
+  tree part = gimplify_build2 (
+    &gsi, BIT_AND_EXPR, unsigned_type_node, part_base,
+    build_int_cst (unsigned_type_node, 4095));
+  gcond *implementer_cond = gimple_build_cond (
+    EQ_EXPR, implementer,
+    build_int_cst (unsigned_type_node, 72),
+    NULL_TREE, NULL_TREE);
+  gimple_set_location (implementer_cond, input_location);
+  gsi_insert_before (&gsi, implementer_cond, GSI_SAME_STMT);
+  gsi_remove (&gsi, true);
+
+  basic_block part_bb = create_part_bb (detection_bb, part);
+  basic_block abort_bb = create_abort_bb (part_bb);
+
+  /* Replace the old entry edge and wire up the three new blocks.  */
+  remove_edge_raw (old_e);
+  make_single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun),
+                         detection_bb, EDGE_FALLTHRU);
+  edge etrue = make_edge (detection_bb, part_bb, EDGE_TRUE_VALUE);
+  etrue->probability = profile_probability::likely ();
+  edge efalse = make_edge (detection_bb, abort_bb, EDGE_FALSE_VALUE);
+  efalse->probability = profile_probability::unlikely ();
+  edge part_true = make_edge (part_bb, ret_bb, EDGE_TRUE_VALUE);
+  part_true->probability = profile_probability::likely ();
+  edge part_false = make_edge (part_bb, abort_bb, EDGE_FALSE_VALUE);
+  part_false->probability = profile_probability::unlikely ();
+  make_single_succ_edge (abort_bb, ret_bb, EDGE_FALLTHRU);
+  if (dom_info_available_p (CDI_DOMINATORS))
+    {
+      set_immediate_dominator (CDI_DOMINATORS, part_bb, detection_bb);
+      set_immediate_dominator (CDI_DOMINATORS, ret_bb, detection_bb);
+      set_immediate_dominator (CDI_DOMINATORS, abort_bb, detection_bb);
+    }
+}
+
+/* Descriptor of the hardware-detection pass.  */
+const pass_data pass_data_ipa_hardware_detection =
+{
+  SIMPLE_IPA_PASS,
+  "hardware_detection",
+  OPTGROUP_NONE,
+  TV_IPA_HARDWARE_DETECTION,
+  (PROP_cfg | PROP_ssa),
+  0,
+  0,
+  0,
+  (TODO_update_ssa | TODO_verify_all)
+};
+
+class pass_ipa_hardware_detection : public simple_ipa_opt_pass
+{
+public:
+  pass_ipa_hardware_detection (gcc::context *ctxt)
+    : simple_ipa_opt_pass (pass_data_ipa_hardware_detection, ctxt)
+  {}
+
+  virtual bool gate (function *);
+  virtual unsigned int execute (function *);
+}; // class pass_ipa_hardware_detection
+
+/* Run only when AI_INFER_LEVEL is set in the environment, -Om is active
+   and the compilation is in LTO or whole-program mode.  */
+bool
+pass_ipa_hardware_detection::gate (function *)
+{
+  const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
+  return (ai_infer_level
+          && optimize_machine > 0
+          /* Only enable in lto or whole_program. 
*/
+          && (in_lto_p || flag_whole_program));
+}
+
+/* Walk every function in the call graph and call create_detection_bb on
+   each defined, non-inlined function that has a GIMPLE body and whose
+   name satisfies MAIN_NAME_P.  Always returns 0.  */
+unsigned int
+pass_ipa_hardware_detection::execute (function *)
+{
+  cgraph_node *cnode;
+  FOR_EACH_FUNCTION (cnode)
+    {
+      if (!cnode->real_symbol_p () || !cnode->definition)
+        continue;
+      if (!cnode->has_gimple_body_p () || cnode->inlined_to)
+        continue;
+
+      cnode->get_body ();
+      function *fn = DECL_STRUCT_FUNCTION (cnode->decl);
+      if (!fn)
+        continue;
+
+      /* Only a function whose name passes MAIN_NAME_P is instrumented.  */
+      if (!DECL_NAME (cnode->decl)
+          || !MAIN_NAME_P (DECL_NAME (cnode->decl)))
+        continue;
+
+      push_cfun (fn);
+      calculate_dominance_info (CDI_DOMINATORS);
+
+      create_detection_bb ();
+
+      cgraph_edge::rebuild_edges ();
+      free_dominance_info (CDI_DOMINATORS);
+      pop_cfun ();
+    }
+  return 0;
+}
+} // anon namespace
+
+/* Create an instance of the hardware-detection pass; passes.def
+   schedules it through NEXT_PASS (pass_ipa_hardware_detection).  */
+simple_ipa_opt_pass *
+make_pass_ipa_hardware_detection (gcc::context *ctxt)
+{
+  return new pass_ipa_hardware_detection (ctxt);
+}
diff --git a/gcc/opts-common.c b/gcc/opts-common.c
index bf82b05c8..52e28e2dc 100644
--- a/gcc/opts-common.c
+++ b/gcc/opts-common.c
@@ -926,6 +926,158 @@ opts_concat (const char *first, ...)
   return newstr;
 }
 
+/* Type of the "runONNXModelOptimizer" entry point of libONNXRunner.so.  */
+typedef int64_t (*run_ai_model_func)(int, const char **,
+                                     const char *, int, int64_t *);
+/* Union used to turn the void * returned by dlsym into a function
+   pointer without casting between the two pointer kinds.  */
+#define PTR_UNION_TYPE(TOTYPE) union { void *_q; TOTYPE _nq; }
+#define PTR_UNION_AS_VOID_PTR(NAME) (NAME._q)
+#define PTR_UNION_AS_CAST_PTR(NAME) (NAME._nq)
+
+/* Load libonnxruntime.so and libONNXRunner.so, call runONNXModelOptimizer
+   with the driver arguments ARGC/ARGV, the CPU name MCPU_OPTION and the
+   ARGC_HW hardware parameters in ARGV_HW, and return its prediction.
+   Return -1 when a library or the symbol cannot be loaded.  When the
+   prediction is 1, AI_INFER_LEVEL=1 is exported to the environment.  */
+static int64_t
+ai_infer_optimization (int argc, const char **argv,
+                       const char *mcpu_option,
+                       int argc_hw, int64_t *argv_hw)
+{
+  /* Load dependent AI-framework libraries.  */
+  void *onnxruntime_lib_handle = dlopen ("libonnxruntime.so",
+                                         RTLD_LAZY | RTLD_GLOBAL);
+  if (!onnxruntime_lib_handle)
+    return -1;
+
+  void *ai4c_lib_handle = dlopen ("libONNXRunner.so",
+                                  RTLD_LAZY | RTLD_GLOBAL);
+  if (!ai4c_lib_handle)
+    {
+      /* Release the first library instead of leaking its handle.  */
+      dlclose (onnxruntime_lib_handle);
+      return -1;
+    }
+
+  /* Clear any existing error.  */
+  dlerror ();
+
+  /* Run AI4Compiler model.  */
+  PTR_UNION_TYPE (run_ai_model_func) run_ai_model_func_union;
+  PTR_UNION_AS_VOID_PTR (run_ai_model_func_union)
+    = dlsym (ai4c_lib_handle, "runONNXModelOptimizer");
+  run_ai_model_func run_ai_model
+    = PTR_UNION_AS_CAST_PTR (run_ai_model_func_union);
+
+  /* A missing symbol yields -1, the same result as a missing library.  */
+  int64_t model_pred = -1;
+  if (run_ai_model)
+    model_pred = (*run_ai_model) (argc, argv,
+                                  mcpu_option, argc_hw, argv_hw);
+
+  /* Both libraries are unloaded again whether or not the call ran.  */
+  dlclose (ai4c_lib_handle);
+  dlclose (onnxruntime_lib_handle);
+
+  /* setenv copies its arguments, unlike putenv, which would keep a
+     pointer to a string literal in the environment.  */
+  if (model_pred == 1)
+    setenv ("AI_INFER_LEVEL", "1", 1);
+  return model_pred;
+}
+
+/* Append LTO options to OPT_ARRAY, chosen by the driver name in ARGV[0]
+   (directory part ignored): a name containing "gcc" gets -flto=8 and
+   -flto-partition=one; otherwise one containing "g++" or "gfortran" gets
+   -flto=8 only.  The new entries are decoded with LANG_MASK and stored
+   from index NUM_DECODED_OPTIONS on; OPT_ARRAY is reallocated to
+   ARGC + 2 or ARGC + 1 elements.  Return the number of options added.  */
+static int
+handle_lto_option (unsigned int lang_mask,
+                   unsigned int num_decoded_options,
+                   unsigned int argc,
+                   const char **argv,
+                   struct cl_decoded_option *&opt_array)
+{
+  int ret = 0;
+  /* Look at the driver name only; no copy of ARGV[0] is needed for
+     that.  */
+  const char *lan = strrchr (argv[0], '/');
+  lan = lan != NULL ? lan + 1 : argv[0];
+
+  /* Substring tests: "gcc" is tried first, then "g++" or "gfortran".  */
+  const char *lto_flag = "-flto=8";
+  if (strstr (lan, "gcc") != NULL)
+    {
+      opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 2);
+      decode_cmdline_option (&lto_flag, lang_mask,
+                             &opt_array[num_decoded_options]);
+      ret++;
+      const char *ltopartition_flag = "-flto-partition=one";
+      decode_cmdline_option (&ltopartition_flag, lang_mask,
+                             &opt_array[num_decoded_options + 1]);
+      ret++;
+    }
+  else if (strstr (lan, "g++") != NULL
+           || strstr (lan, "gfortran") != NULL)
+    {
+      opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 1);
+      decode_cmdline_option (&lto_flag, lang_mask,
+                             &opt_array[num_decoded_options]);
+      ret++;
+    }
+  return ret;
+}
+
+/* If the command line contains both -Om and an argument containing
+   "mcpu=hip09", and AI_INFER_LEVEL is not already set, run the AI model
+   on ARGC/ARGV and six hardware parameters from global_options.  When it
+   predicts 1, add the LTO options via handle_lto_option and return how
+   many were added; in every other case return 0.  */
+static int
+handle_machine_option (unsigned int lang_mask,
+                       unsigned int num_decoded_options,
+                       unsigned int argc,
+                       const char **argv,
+                       struct cl_decoded_option *&opt_array)
+{
+  bool flag_Om = false;
+  bool flag_hip09 = false;
+  /* Scan the arguments after ARGV[0] for the two triggering options.  */
+  for (unsigned i = 1; i < argc; i++)
+    {
+      if (strcmp (argv[i], "-Om") == 0)
+        flag_Om = true;
+      if (strstr (argv[i], "mcpu=hip09") != NULL)
+        flag_hip09 = true;
+    }
+  if (!flag_hip09 || !flag_Om)
+    return 0;
+
+  /* The variable is set by ai_infer_optimization after a positive
+     prediction, so once it is set the model is not run again.  */
+  if (getenv ("AI_INFER_LEVEL"))
+    return 0;
+
+  /* A constant bound keeps ARGV_HW an ordinary array rather than a
+     variable-length one with an initializer.  */
+  const int argc_hw = 6;
+  int64_t argv_hw[argc_hw] = {
+    global_options.x_param_simultaneous_prefetches,
+    global_options.x_param_l1_cache_size,
+    global_options.x_param_l1_cache_line_size,
+    global_options.x_param_l2_cache_size,
+    global_options.x_param_llc_capacity_per_core,
+    global_options.x_param_ipa_prefetch_distance_factor};
+  if (ai_infer_optimization (argc, argv, "hip09", argc_hw, argv_hw) != 1)
+    return 0;
+
+  /* Prediction is 1: append the LTO options.  */
+  return handle_lto_option (lang_mask, num_decoded_options,
+                            argc, argv, opt_array);
+}
+
 /* Decode command-line options (ARGC and ARGV being the arguments of
    main) into an array, setting *DECODED_OPTIONS to a pointer to that
    array and *DECODED_OPTIONS_COUNT to the number of entries in the
@@ -987,6 +1139,8 @@ decode_cmdline_options_to_array (unsigned int argc, const char **argv,
       num_decoded_options++;
     }
 
+  num_decoded_options += handle_machine_option (lang_mask, num_decoded_options,
+                                                argc, argv, opt_array);
   *decoded_options = opt_array;
   *decoded_options_count = num_decoded_options;
   prune_options (decoded_options, decoded_options_count, lang_mask);
diff --git a/gcc/opts.c b/gcc/opts.c
index c0ccd0853..dc61216c0 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -632,6 +632,15 @@ default_options_optimization (struct gcc_options *opts,
           opts->x_optimize_debug = 1;
           break;
 
+        case OPT_Om:
+          /* -Om adds flags to -O3 & -Ofast. 
*/ + opts->x_optimize_size = 0; + opts->x_optimize = 3; + opts->x_optimize_fast = 1; + opts->x_optimize_machine = true; + opts->x_optimize_debug = 0; + break; + case OPT_fopenacc: if (opt->value) openacc_mode = true; @@ -2378,6 +2387,8 @@ common_handle_option (struct gcc_options *opts, opts->x_flag_sanitize_coverage, value, true); break; + case OPT_Om: + break; case OPT_O: case OPT_Os: case OPT_Ofast: diff --git a/gcc/passes.def b/gcc/passes.def index b6006de22..8898b72fc 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -177,6 +177,7 @@ along with GCC; see the file COPYING3. If not see compiled unit. */ INSERT_PASSES_AFTER (all_late_ipa_passes) NEXT_PASS (pass_materialize_all_clones); + NEXT_PASS (pass_ipa_hardware_detection); NEXT_PASS (pass_ipa_pta); /* FIXME: this should a normal IP pass */ NEXT_PASS (pass_ipa_struct_reorg); diff --git a/gcc/timevar.def b/gcc/timevar.def index 929e9e1d3..66b21f166 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -81,6 +81,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP , "ipa cp") DEFTIMEVAR (TV_IPA_INLINING , "ipa inlining heuristics") DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") +DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 4d952884d..d3a41d0d5 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -513,6 +513,8 @@ extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); +extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * + ctxt); extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); 
extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context -- 2.33.0