Tree - rpms/gcc-toolset-11-gcc

rpms / gcc-toolset-11-gcc

Blame SOURCES/gcc11-tremont2.patch

Blob History Raw

		f49307	`From 80c2ed8228817fb6438120997227811a746272ba Mon Sep 17 00:00:00 2001`
		f49307	`From: "H.J. Lu" <hjl.tools@gmail.com>`
		f49307	`Date: Wed, 15 Sep 2021 14:17:08 +0800`
		f49307	`Subject: [PATCH 2/3] x86: Update memcpy/memset inline strategies for`
		f49307	`-mtune=tremont`
		f49307
		f49307	`Simplify memcpy and memset inline strategies to avoid branches for`
		f49307	`-mtune=tremont:`
		f49307
		f49307	`1. Create Tremont cost model from generic cost model.`
		f49307	`2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector`
		f49307	`load and store for up to 16 * 16 (256) bytes when the data size is`
		f49307	`fixed and known.`
		f49307	`3. Inline only if data size is known to be <= 256.`
		f49307	`a. Use "rep movsb/stosb" with simple code sequence if the data size`
		f49307	`is a constant.`
		f49307	`b. Use loop if data size is not a constant.`
		f49307	`4. Use memcpy/memset library function if data size is unknown or > 256.`
		f49307
		f49307	`* config/i386/i386-options.c (processor_cost_table): Use`
		f49307	`tremont_cost for Tremont.`
		f49307	`* config/i386/x86-tune-costs.h (tremont_memcpy): New.`
		f49307	`(tremont_memset): Likewise.`
		f49307	`(tremont_cost): Likewise.`
		f49307	`* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):`
		f49307	`Enable for Tremont.`
		f49307	`---`
		f49307	`gcc/config/i386/i386-options.c \| 2 +-`
		f49307	`gcc/config/i386/x86-tune-costs.h \| 124 +++++++++++++++++++++++++++++++`
		f49307	`gcc/config/i386/x86-tune.def \| 2 +-`
		f49307	`3 files changed, 126 insertions(+), 2 deletions(-)`
		f49307
		f49307	`diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c`
		f49307	`index 19632b5fd6b..4b77d62926f 100644`
		f49307	`--- a/gcc/config/i386/i386-options.c`
		f49307	`+++ b/gcc/config/i386/i386-options.c`
		f49307	`@@ -719,7 +719,7 @@ static const struct processor_costs *processor_cost_table[] =`
		f49307	`&slm_cost,`
		f49307	`&slm_cost,`
		f49307	`&slm_cost,`
		f49307	`- &slm_cost,`
		f49307	`+ &tremont_cost,`
		f49307	`&slm_cost,`
		f49307	`&slm_cost,`
		f49307	`&skylake_cost,`
		f49307	`diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h`
		f49307	`index ffe810f2bcb..93644be9cb3 100644`
		f49307	`--- a/gcc/config/i386/x86-tune-costs.h`
		f49307	`+++ b/gcc/config/i386/x86-tune-costs.h`
		f49307	`@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {`
		f49307	`"16", /* Func alignment. */`
		f49307	`};`
		f49307
		f49307	`+static stringop_algs tremont_memcpy[2] = {`
		f49307	`+ {libcall,`
		f49307	`+ {{256, rep_prefix_1_byte, true},`
		f49307	`+ {256, loop, false},`
		f49307	`+ {-1, libcall, false}}},`
		f49307	`+ {libcall,`
		f49307	`+ {{256, rep_prefix_1_byte, true},`
		f49307	`+ {256, loop, false},`
		f49307	`+ {-1, libcall, false}}}};`
		f49307	`+static stringop_algs tremont_memset[2] = {`
		f49307	`+ {libcall,`
		f49307	`+ {{256, rep_prefix_1_byte, true},`
		f49307	`+ {256, loop, false},`
		f49307	`+ {-1, libcall, false}}},`
		f49307	`+ {libcall,`
		f49307	`+ {{256, rep_prefix_1_byte, true},`
		f49307	`+ {256, loop, false},`
		f49307	`+ {-1, libcall, false}}}};`
		f49307	`+static const`
		f49307	`+struct processor_costs tremont_cost = {`
		f49307	`+ {`
		f49307	`+ /* Start of register allocator costs. integer->integer move cost is 2. */`
		f49307	`+ 6, /* cost for loading QImode using movzbl */`
		f49307	`+ {6, 6, 6}, /* cost of loading integer registers`
		f49307	`+ in QImode, HImode and SImode.`
		f49307	`+ Relative to reg-reg move (2). */`
		f49307	`+ {6, 6, 6}, /* cost of storing integer registers */`
		f49307	`+ 4, /* cost of reg,reg fld/fst */`
		f49307	`+ {6, 6, 12}, /* cost of loading fp registers`
		f49307	`+ in SFmode, DFmode and XFmode */`
		f49307	`+ {6, 6, 12}, /* cost of storing fp registers`
		f49307	`+ in SFmode, DFmode and XFmode */`
		f49307	`+ 2, /* cost of moving MMX register */`
		f49307	`+ {6, 6}, /* cost of loading MMX registers`
		f49307	`+ in SImode and DImode */`
		f49307	`+ {6, 6}, /* cost of storing MMX registers`
		f49307	`+ in SImode and DImode */`
		f49307	`+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of loading SSE registers`
		f49307	`+ in 32,64,128,256 and 512-bit */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of storing SSE registers`
		f49307	`+ in 32,64,128,256 and 512-bit */`
		f49307	`+ 6, 6, /* SSE->integer and integer->SSE moves */`
		f49307	`+ 6, 6, /* mask->integer and integer->mask moves */`
		f49307	`+ {6, 6, 6}, /* cost of loading mask register`
		f49307	`+ in QImode, HImode, SImode. */`
		f49307	`+ {6, 6, 6}, /* cost if storing mask register`
		f49307	`+ in QImode, HImode, SImode. */`
		f49307	`+ 2, /* cost of moving mask register. */`
		f49307	`+ /* End of register allocator costs. */`
		f49307	`+ },`
		f49307	`+`
		f49307	`+ COSTS_N_INSNS (1), /* cost of an add instruction */`
		f49307	`+ /* Setting cost to 2 makes our current implementation of synth_mult result in`
		f49307	`+ use of unnecessary temporary registers causing regression on several`
		f49307	`+ SPECfp benchmarks. */`
		f49307	`+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */`
		f49307	`+ COSTS_N_INSNS (1), /* variable shift costs */`
		f49307	`+ COSTS_N_INSNS (1), /* constant shift costs */`
		f49307	`+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */`
		f49307	`+ COSTS_N_INSNS (4), /* HI */`
		f49307	`+ COSTS_N_INSNS (3), /* SI */`
		f49307	`+ COSTS_N_INSNS (4), /* DI */`
		f49307	`+ COSTS_N_INSNS (4)}, /* other */`
		f49307	`+ 0, /* cost of multiply per each bit set */`
		f49307	`+ {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */`
		f49307	`+ COSTS_N_INSNS (22), /* HI */`
		f49307	`+ COSTS_N_INSNS (30), /* SI */`
		f49307	`+ COSTS_N_INSNS (74), /* DI */`
		f49307	`+ COSTS_N_INSNS (74)}, /* other */`
		f49307	`+ COSTS_N_INSNS (1), /* cost of movsx */`
		f49307	`+ COSTS_N_INSNS (1), /* cost of movzx */`
		f49307	`+ 8, /* "large" insn */`
		f49307	`+ 17, /* MOVE_RATIO */`
		f49307	`+ 17, /* CLEAR_RATIO */`
		f49307	`+ {6, 6, 6}, /* cost of loading integer registers`
		f49307	`+ in QImode, HImode and SImode.`
		f49307	`+ Relative to reg-reg move (2). */`
		f49307	`+ {6, 6, 6}, /* cost of storing integer registers */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of loading SSE register`
		f49307	`+ in 32bit, 64bit, 128bit, 256bit and 512bit */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of storing SSE register`
		f49307	`+ in 32bit, 64bit, 128bit, 256bit and 512bit */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of unaligned loads. */`
		f49307	`+ {6, 6, 6, 10, 15}, /* cost of unaligned storess. */`
		f49307	`+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */`
		f49307	`+ 6, /* cost of moving SSE register to integer. */`
		f49307	`+ 18, 6, /* Gather load static, per_elt. */`
		f49307	`+ 18, 6, /* Gather store static, per_elt. */`
		f49307	`+ 32, /* size of l1 cache. */`
		f49307	`+ 512, /* size of l2 cache. */`
		f49307	`+ 64, /* size of prefetch block */`
		f49307	`+ 6, /* number of parallel prefetches */`
		f49307	`+ /* Benchmarks shows large regressions on K8 sixtrack benchmark when this`
		f49307	`+ value is increased to perhaps more appropriate value of 5. */`
		f49307	`+ 3, /* Branch cost */`
		f49307	`+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */`
		f49307	`+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */`
		f49307	`+ COSTS_N_INSNS (17), /* cost of FDIV instruction. */`
		f49307	`+ COSTS_N_INSNS (1), /* cost of FABS instruction. */`
		f49307	`+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */`
		f49307	`+ COSTS_N_INSNS (14), /* cost of FSQRT instruction. */`
		f49307	`+`
		f49307	`+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */`
		f49307	`+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */`
		f49307	`+ COSTS_N_INSNS (4), /* cost of MULSS instruction. */`
		f49307	`+ COSTS_N_INSNS (5), /* cost of MULSD instruction. */`
		f49307	`+ COSTS_N_INSNS (5), /* cost of FMA SS instruction. */`
		f49307	`+ COSTS_N_INSNS (5), /* cost of FMA SD instruction. */`
		f49307	`+ COSTS_N_INSNS (13), /* cost of DIVSS instruction. */`
		f49307	`+ COSTS_N_INSNS (17), /* cost of DIVSD instruction. */`
		f49307	`+ COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */`
		f49307	`+ COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */`
		f49307	`+ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */`
		f49307	`+ tremont_memcpy,`
		f49307	`+ tremont_memset,`
		f49307	`+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */`
		f49307	`+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */`
		f49307	`+ "16:11:8", /* Loop alignment. */`
		f49307	`+ "16:11:8", /* Jump alignment. */`
		f49307	`+ "0:0:8", /* Label alignment. */`
		f49307	`+ "16", /* Func alignment. */`
		f49307	`+};`
		f49307	`+`
		f49307	`static stringop_algs intel_memcpy[2] = {`
		f49307	`{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},`
		f49307	`{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},`
		f49307	`diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def`
		f49307	`index 6bd7087a03f..636e0c788bf 100644`
		f49307	`--- a/gcc/config/i386/x86-tune.def`
		f49307	`+++ b/gcc/config/i386/x86-tune.def`
		f49307	`@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 \| m_P4_NOCONA)`
		f49307	`move/set sequences of bytes with known size. */`
		f49307	`DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,`
		f49307	`"prefer_known_rep_movsb_stosb",`
		f49307	`- m_SKYLAKE \| m_ALDERLAKE \| m_CORE_AVX512)`
		f49307	`+ m_SKYLAKE \| m_ALDERLAKE \| m_TREMONT \| m_CORE_AVX512)`
		f49307
		f49307	`/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of`
		f49307	`compact prologues and epilogues by issuing a misaligned moves. This`
		f49307	`--`
		f49307	`2.18.2`
		f49307

rpms / gcc-toolset-11-gcc

Source Code

Blame SOURCES/gcc11-tremont2.patch