Tree - rpms/devtoolset-11-gcc - CentOS Git server

rpms / devtoolset-11-gcc

Blame SOURCES/gcc11-tremont2.patch

Blob History Raw

		a46658	`From 80c2ed8228817fb6438120997227811a746272ba Mon Sep 17 00:00:00 2001`
		a46658	`From: "H.J. Lu" <hjl.tools@gmail.com>`
		a46658	`Date: Wed, 15 Sep 2021 14:17:08 +0800`
		a46658	`Subject: [PATCH 2/3] x86: Update memcpy/memset inline strategies for`
		a46658	`-mtune=tremont`
		a46658
		a46658	`Simplify memcpy and memset inline strategies to avoid branches for`
		a46658	`-mtune=tremont:`
		a46658
		a46658	`1. Create Tremont cost model from generic cost model.`
		a46658	`2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector`
		a46658	`load and store for up to 16 * 16 (256) bytes when the data size is`
		a46658	`fixed and known.`
		a46658	`3. Inline only if data size is known to be <= 256.`
		a46658	`a. Use "rep movsb/stosb" with simple code sequence if the data size`
		a46658	`is a constant.`
		a46658	`b. Use loop if data size is not a constant.`
		a46658	`4. Use memcpy/memset library function if data size is unknown or > 256.`
		a46658
		a46658	`* config/i386/i386-options.c (processor_cost_table): Use`
		a46658	`tremont_cost for Tremont.`
		a46658	`* config/i386/x86-tune-costs.h (tremont_memcpy): New.`
		a46658	`(tremont_memset): Likewise.`
		a46658	`(tremont_cost): Likewise.`
		a46658	`* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):`
		a46658	`Enable for Tremont.`
		a46658	`---`
		a46658	`gcc/config/i386/i386-options.c \| 2 +-`
		a46658	`gcc/config/i386/x86-tune-costs.h \| 124 +++++++++++++++++++++++++++++++`
		a46658	`gcc/config/i386/x86-tune.def \| 2 +-`
		a46658	`3 files changed, 126 insertions(+), 2 deletions(-)`
		a46658
		a46658	`diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c`
		a46658	`index 19632b5fd6b..4b77d62926f 100644`
		a46658	`--- a/gcc/config/i386/i386-options.c`
		a46658	`+++ b/gcc/config/i386/i386-options.c`
		a46658	`@@ -719,7 +719,7 @@ static const struct processor_costs *processor_cost_table[] =`
		a46658	`&slm_cost,`
		a46658	`&slm_cost,`
		a46658	`&slm_cost,`
		a46658	`- &slm_cost,`
		a46658	`+ &tremont_cost,`
		a46658	`&slm_cost,`
		a46658	`&slm_cost,`
		a46658	`&skylake_cost,`
		a46658	`diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h`
		a46658	`index ffe810f2bcb..93644be9cb3 100644`
		a46658	`--- a/gcc/config/i386/x86-tune-costs.h`
		a46658	`+++ b/gcc/config/i386/x86-tune-costs.h`
		a46658	`@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {`
		a46658	`"16", /* Func alignment. */`
		a46658	`};`
		a46658
		a46658	`+static stringop_algs tremont_memcpy[2] = {`
		a46658	`+ {libcall,`
		a46658	`+ {{256, rep_prefix_1_byte, true},`
		a46658	`+ {256, loop, false},`
		a46658	`+ {-1, libcall, false}}},`
		a46658	`+ {libcall,`
		a46658	`+ {{256, rep_prefix_1_byte, true},`
		a46658	`+ {256, loop, false},`
		a46658	`+ {-1, libcall, false}}}};`
		a46658	`+static stringop_algs tremont_memset[2] = {`
		a46658	`+ {libcall,`
		a46658	`+ {{256, rep_prefix_1_byte, true},`
		a46658	`+ {256, loop, false},`
		a46658	`+ {-1, libcall, false}}},`
		a46658	`+ {libcall,`
		a46658	`+ {{256, rep_prefix_1_byte, true},`
		a46658	`+ {256, loop, false},`
		a46658	`+ {-1, libcall, false}}}};`
		a46658	`+static const`
		a46658	`+struct processor_costs tremont_cost = {`
		a46658	`+ {`
		a46658	`+ /* Start of register allocator costs. integer->integer move cost is 2. */`
		a46658	`+ 6, /* cost for loading QImode using movzbl */`
		a46658	`+ {6, 6, 6}, /* cost of loading integer registers`
		a46658	`+ in QImode, HImode and SImode.`
		a46658	`+ Relative to reg-reg move (2). */`
		a46658	`+ {6, 6, 6}, /* cost of storing integer registers */`
		a46658	`+ 4, /* cost of reg,reg fld/fst */`
		a46658	`+ {6, 6, 12}, /* cost of loading fp registers`
		a46658	`+ in SFmode, DFmode and XFmode */`
		a46658	`+ {6, 6, 12}, /* cost of storing fp registers`
		a46658	`+ in SFmode, DFmode and XFmode */`
		a46658	`+ 2, /* cost of moving MMX register */`
		a46658	`+ {6, 6}, /* cost of loading MMX registers`
		a46658	`+ in SImode and DImode */`
		a46658	`+ {6, 6}, /* cost of storing MMX registers`
		a46658	`+ in SImode and DImode */`
		a46658	`+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of loading SSE registers`
		a46658	`+ in 32,64,128,256 and 512-bit */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of storing SSE registers`
		a46658	`+ in 32,64,128,256 and 512-bit */`
		a46658	`+ 6, 6, /* SSE->integer and integer->SSE moves */`
		a46658	`+ 6, 6, /* mask->integer and integer->mask moves */`
		a46658	`+ {6, 6, 6}, /* cost of loading mask register`
		a46658	`+ in QImode, HImode, SImode. */`
		a46658	`+ {6, 6, 6}, /* cost of storing mask register`
		a46658	`+ in QImode, HImode, SImode. */`
		a46658	`+ 2, /* cost of moving mask register. */`
		a46658	`+ /* End of register allocator costs. */`
		a46658	`+ },`
		a46658	`+`
		a46658	`+ COSTS_N_INSNS (1), /* cost of an add instruction */`
		a46658	`+ /* Setting cost to 2 makes our current implementation of synth_mult result in`
		a46658	`+ use of unnecessary temporary registers causing regression on several`
		a46658	`+ SPECfp benchmarks. */`
		a46658	`+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */`
		a46658	`+ COSTS_N_INSNS (1), /* variable shift costs */`
		a46658	`+ COSTS_N_INSNS (1), /* constant shift costs */`
		a46658	`+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */`
		a46658	`+ COSTS_N_INSNS (4), /* HI */`
		a46658	`+ COSTS_N_INSNS (3), /* SI */`
		a46658	`+ COSTS_N_INSNS (4), /* DI */`
		a46658	`+ COSTS_N_INSNS (4)}, /* other */`
		a46658	`+ 0, /* cost of multiply per each bit set */`
		a46658	`+ {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */`
		a46658	`+ COSTS_N_INSNS (22), /* HI */`
		a46658	`+ COSTS_N_INSNS (30), /* SI */`
		a46658	`+ COSTS_N_INSNS (74), /* DI */`
		a46658	`+ COSTS_N_INSNS (74)}, /* other */`
		a46658	`+ COSTS_N_INSNS (1), /* cost of movsx */`
		a46658	`+ COSTS_N_INSNS (1), /* cost of movzx */`
		a46658	`+ 8, /* "large" insn */`
		a46658	`+ 17, /* MOVE_RATIO */`
		a46658	`+ 17, /* CLEAR_RATIO */`
		a46658	`+ {6, 6, 6}, /* cost of loading integer registers`
		a46658	`+ in QImode, HImode and SImode.`
		a46658	`+ Relative to reg-reg move (2). */`
		a46658	`+ {6, 6, 6}, /* cost of storing integer registers */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of loading SSE register`
		a46658	`+ in 32bit, 64bit, 128bit, 256bit and 512bit */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of storing SSE register`
		a46658	`+ in 32bit, 64bit, 128bit, 256bit and 512bit */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of unaligned loads. */`
		a46658	`+ {6, 6, 6, 10, 15}, /* cost of unaligned stores. */`
		a46658	`+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */`
		a46658	`+ 6, /* cost of moving SSE register to integer. */`
		a46658	`+ 18, 6, /* Gather load static, per_elt. */`
		a46658	`+ 18, 6, /* Gather store static, per_elt. */`
		a46658	`+ 32, /* size of l1 cache. */`
		a46658	`+ 512, /* size of l2 cache. */`
		a46658	`+ 64, /* size of prefetch block */`
		a46658	`+ 6, /* number of parallel prefetches */`
		a46658	`+ /* Benchmarks show large regressions on K8 sixtrack benchmark when this`
		a46658	`+ value is increased to perhaps more appropriate value of 5. */`
		a46658	`+ 3, /* Branch cost */`
		a46658	`+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */`
		a46658	`+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */`
		a46658	`+ COSTS_N_INSNS (17), /* cost of FDIV instruction. */`
		a46658	`+ COSTS_N_INSNS (1), /* cost of FABS instruction. */`
		a46658	`+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */`
		a46658	`+ COSTS_N_INSNS (14), /* cost of FSQRT instruction. */`
		a46658	`+`
		a46658	`+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */`
		a46658	`+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */`
		a46658	`+ COSTS_N_INSNS (4), /* cost of MULSS instruction. */`
		a46658	`+ COSTS_N_INSNS (5), /* cost of MULSD instruction. */`
		a46658	`+ COSTS_N_INSNS (5), /* cost of FMA SS instruction. */`
		a46658	`+ COSTS_N_INSNS (5), /* cost of FMA SD instruction. */`
		a46658	`+ COSTS_N_INSNS (13), /* cost of DIVSS instruction. */`
		a46658	`+ COSTS_N_INSNS (17), /* cost of DIVSD instruction. */`
		a46658	`+ COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */`
		a46658	`+ COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */`
		a46658	`+ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */`
		a46658	`+ tremont_memcpy,`
		a46658	`+ tremont_memset,`
		a46658	`+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */`
		a46658	`+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */`
		a46658	`+ "16:11:8", /* Loop alignment. */`
		a46658	`+ "16:11:8", /* Jump alignment. */`
		a46658	`+ "0:0:8", /* Label alignment. */`
		a46658	`+ "16", /* Func alignment. */`
		a46658	`+};`
		a46658	`+`
		a46658	`static stringop_algs intel_memcpy[2] = {`
		a46658	`{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},`
		a46658	`{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},`
		a46658	`diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def`
		a46658	`index 6bd7087a03f..636e0c788bf 100644`
		a46658	`--- a/gcc/config/i386/x86-tune.def`
		a46658	`+++ b/gcc/config/i386/x86-tune.def`
		a46658	`@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 \| m_P4_NOCONA)`
		a46658	`move/set sequences of bytes with known size. */`
		a46658	`DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,`
		a46658	`"prefer_known_rep_movsb_stosb",`
		a46658	`- m_SKYLAKE \| m_ALDERLAKE \| m_CORE_AVX512)`
		a46658	`+ m_SKYLAKE \| m_ALDERLAKE \| m_TREMONT \| m_CORE_AVX512)`
		a46658
		a46658	`/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of`
		a46658	`compact prologues and epilogues by issuing a misaligned moves. This`
		a46658	`--`
		a46658	`2.18.2`
		a46658

rpms / devtoolset-11-gcc

Source Code

Blame SOURCES/gcc11-tremont2.patch