Blame SOURCES/gcc11-tremont2.patch

f49307
From 80c2ed8228817fb6438120997227811a746272ba Mon Sep 17 00:00:00 2001
f49307
From: "H.J. Lu" <hjl.tools@gmail.com>
f49307
Date: Wed, 15 Sep 2021 14:17:08 +0800
f49307
Subject: [PATCH 2/3] x86: Update memcpy/memset inline strategies for
f49307
 -mtune=tremont
f49307
f49307
Simplify memcpy and memset inline strategies to avoid branches for
f49307
-mtune=tremont:
f49307
f49307
1. Create Tremont cost model from generic cost model.
f49307
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
f49307
   load and store for up to 16 * 16 (256) bytes when the data size is
f49307
   fixed and known.
f49307
3. Inline only if data size is known to be <= 256.
f49307
   a. Use "rep movsb/stosb" with simple code sequence if the data size
f49307
      is a constant.
f49307
   b. Use loop if data size is not a constant.
f49307
4. Use memcpy/memset library function if data size is unknown or > 256.
f49307
f49307
	* config/i386/i386-options.c (processor_cost_table): Use
f49307
	tremont_cost for Tremont.
f49307
	* config/i386/x86-tune-costs.h (tremont_memcpy): New.
f49307
	(tremont_memset): Likewise.
f49307
	(tremont_cost): Likewise.
f49307
	* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
f49307
	Enable for Tremont.
f49307
---
f49307
 gcc/config/i386/i386-options.c   |   2 +-
f49307
 gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
f49307
 gcc/config/i386/x86-tune.def     |   2 +-
f49307
 3 files changed, 126 insertions(+), 2 deletions(-)
f49307
f49307
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
f49307
index 19632b5fd6b..4b77d62926f 100644
f49307
--- a/gcc/config/i386/i386-options.c
f49307
+++ b/gcc/config/i386/i386-options.c
f49307
@@ -719,7 +719,7 @@ static const struct processor_costs *processor_cost_table[] =
f49307
   &slm_cost,
f49307
   &slm_cost,
f49307
   &slm_cost,
f49307
-  &slm_cost,
f49307
+  &tremont_cost,
f49307
   &slm_cost,
f49307
   &slm_cost,
f49307
   &skylake_cost,
f49307
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
f49307
index ffe810f2bcb..93644be9cb3 100644
f49307
--- a/gcc/config/i386/x86-tune-costs.h
f49307
+++ b/gcc/config/i386/x86-tune-costs.h
f49307
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
f49307
   "16",					/* Func alignment.  */
f49307
 };
f49307
 
f49307
+static stringop_algs tremont_memcpy[2] = {
f49307
+  {libcall,
f49307
+   {{256, rep_prefix_1_byte, true},
f49307
+    {256, loop, false},
f49307
+    {-1, libcall, false}}},
f49307
+  {libcall,
f49307
+   {{256, rep_prefix_1_byte, true},
f49307
+    {256, loop, false},
f49307
+    {-1, libcall, false}}}};
f49307
+static stringop_algs tremont_memset[2] = {
f49307
+  {libcall,
f49307
+   {{256, rep_prefix_1_byte, true},
f49307
+    {256, loop, false},
f49307
+    {-1, libcall, false}}},
f49307
+  {libcall,
f49307
+   {{256, rep_prefix_1_byte, true},
f49307
+    {256, loop, false},
f49307
+    {-1, libcall, false}}}};
f49307
+static const
f49307
+struct processor_costs tremont_cost = {
f49307
+  {
f49307
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
f49307
+  6,				     /* cost for loading QImode using movzbl */
f49307
+  {6, 6, 6},				/* cost of loading integer registers
f49307
+					   in QImode, HImode and SImode.
f49307
+					   Relative to reg-reg move (2).  */
f49307
+  {6, 6, 6},				/* cost of storing integer registers */
f49307
+  4,					/* cost of reg,reg fld/fst */
f49307
+  {6, 6, 12},				/* cost of loading fp registers
f49307
+					   in SFmode, DFmode and XFmode */
f49307
+  {6, 6, 12},				/* cost of storing fp registers
f49307
+					   in SFmode, DFmode and XFmode */
f49307
+  2,					/* cost of moving MMX register */
f49307
+  {6, 6},				/* cost of loading MMX registers
f49307
+					   in SImode and DImode */
f49307
+  {6, 6},				/* cost of storing MMX registers
f49307
+					   in SImode and DImode */
f49307
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
f49307
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
f49307
+					   in 32,64,128,256 and 512-bit */
f49307
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
f49307
+					   in 32,64,128,256 and 512-bit */
f49307
+  6, 6,				/* SSE->integer and integer->SSE moves */
f49307
+  6, 6,				/* mask->integer and integer->mask moves */
f49307
+  {6, 6, 6},				/* cost of loading mask register
f49307
+					   in QImode, HImode, SImode.  */
f49307
+  {6, 6, 6},			/* cost if storing mask register
f49307
+					   in QImode, HImode, SImode.  */
f49307
+  2,					/* cost of moving mask register.  */
f49307
+  /* End of register allocator costs.  */
f49307
+  },
f49307
+
f49307
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
f49307
+  /* Setting cost to 2 makes our current implementation of synth_mult result in
f49307
+     use of unnecessary temporary registers causing regression on several
f49307
+     SPECfp benchmarks.  */
f49307
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
f49307
+  COSTS_N_INSNS (1),			/* variable shift costs */
f49307
+  COSTS_N_INSNS (1),			/* constant shift costs */
f49307
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
f49307
+   COSTS_N_INSNS (4),			/*				 HI */
f49307
+   COSTS_N_INSNS (3),			/*				 SI */
f49307
+   COSTS_N_INSNS (4),			/*				 DI */
f49307
+   COSTS_N_INSNS (4)},			/*			      other */
f49307
+  0,					/* cost of multiply per each bit set */
f49307
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
f49307
+   COSTS_N_INSNS (22),			/*			    HI */
f49307
+   COSTS_N_INSNS (30),			/*			    SI */
f49307
+   COSTS_N_INSNS (74),			/*			    DI */
f49307
+   COSTS_N_INSNS (74)},			/*			    other */
f49307
+  COSTS_N_INSNS (1),			/* cost of movsx */
f49307
+  COSTS_N_INSNS (1),			/* cost of movzx */
f49307
+  8,					/* "large" insn */
f49307
+  17,					/* MOVE_RATIO */
f49307
+  17,					/* CLEAR_RATIO */
f49307
+  {6, 6, 6},				/* cost of loading integer registers
f49307
+					   in QImode, HImode and SImode.
f49307
+					   Relative to reg-reg move (2).  */
f49307
+  {6, 6, 6},				/* cost of storing integer registers */
f49307
+  {6, 6, 6, 10, 15},			/* cost of loading SSE register
f49307
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
f49307
+  {6, 6, 6, 10, 15},			/* cost of storing SSE register
f49307
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
f49307
+  {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
f49307
+  {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
f49307
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
f49307
+  6,					/* cost of moving SSE register to integer.  */
f49307
+  18, 6,				/* Gather load static, per_elt.  */
f49307
+  18, 6,				/* Gather store static, per_elt.  */
f49307
+  32,					/* size of l1 cache.  */
f49307
+  512,					/* size of l2 cache.  */
f49307
+  64,					/* size of prefetch block */
f49307
+  6,					/* number of parallel prefetches */
f49307
+  /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
f49307
+     value is increased to perhaps more appropriate value of 5.  */
f49307
+  3,					/* Branch cost */
f49307
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
f49307
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
f49307
+  COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
f49307
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
f49307
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
f49307
+  COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
f49307
+
f49307
+  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
f49307
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
f49307
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
f49307
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
f49307
+  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
f49307
+  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
f49307
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
f49307
+  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
f49307
+  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
f49307
+  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
f49307
+  1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
f49307
+  tremont_memcpy,
f49307
+  tremont_memset,
f49307
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
f49307
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
f49307
+  "16:11:8",				/* Loop alignment.  */
f49307
+  "16:11:8",				/* Jump alignment.  */
f49307
+  "0:0:8",				/* Label alignment.  */
f49307
+  "16",					/* Func alignment.  */
f49307
+};
f49307
+
f49307
 static stringop_algs intel_memcpy[2] = {
f49307
   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
f49307
   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
f49307
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
f49307
index 6bd7087a03f..636e0c788bf 100644
f49307
--- a/gcc/config/i386/x86-tune.def
f49307
+++ b/gcc/config/i386/x86-tune.def
f49307
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
f49307
    move/set sequences of bytes with known size.  */
f49307
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
f49307
 	  "prefer_known_rep_movsb_stosb",
f49307
-	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
f49307
+	  m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
f49307
 
f49307
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
f49307
    compact prologues and epilogues by issuing a misaligned moves.  This
f49307
-- 
f49307
2.18.2
f49307