Blame SOURCES/gcc11-tremont2.patch

e60d6e
From 80c2ed8228817fb6438120997227811a746272ba Mon Sep 17 00:00:00 2001
e60d6e
From: "H.J. Lu" <hjl.tools@gmail.com>
e60d6e
Date: Wed, 15 Sep 2021 14:17:08 +0800
e60d6e
Subject: [PATCH 2/3] x86: Update memcpy/memset inline strategies for
e60d6e
 -mtune=tremont
e60d6e
e60d6e
Simplify memcpy and memset inline strategies to avoid branches for
e60d6e
-mtune=tremont:
e60d6e
e60d6e
1. Create Tremont cost model from generic cost model.
e60d6e
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
e60d6e
   load and store for up to 16 * 16 (256) bytes when the data size is
e60d6e
   fixed and known.
e60d6e
3. Inline only if data size is known to be <= 256.
e60d6e
   a. Use "rep movsb/stosb" with simple code sequence if the data size
e60d6e
      is a constant.
e60d6e
   b. Use loop if data size is not a constant.
e60d6e
4. Use memcpy/memset library function if data size is unknown or > 256.
e60d6e
e60d6e
	* config/i386/i386-options.c (processor_cost_table): Use
e60d6e
	tremont_cost for Tremont.
e60d6e
	* config/i386/x86-tune-costs.h (tremont_memcpy): New.
e60d6e
	(tremont_memset): Likewise.
e60d6e
	(tremont_cost): Likewise.
e60d6e
	* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
e60d6e
	Enable for Tremont.
e60d6e
---
e60d6e
 gcc/config/i386/i386-options.c   |   2 +-
e60d6e
 gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
e60d6e
 gcc/config/i386/x86-tune.def     |   2 +-
e60d6e
 3 files changed, 126 insertions(+), 2 deletions(-)
e60d6e
e60d6e
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
e60d6e
index 19632b5fd6b..4b77d62926f 100644
e60d6e
--- a/gcc/config/i386/i386-options.c
e60d6e
+++ b/gcc/config/i386/i386-options.c
e60d6e
@@ -719,7 +719,7 @@ static const struct processor_costs *processor_cost_table[] =
e60d6e
   &slm_cost,
e60d6e
   &slm_cost,
e60d6e
   &slm_cost,
e60d6e
-  &slm_cost,
e60d6e
+  &tremont_cost,
e60d6e
   &slm_cost,
e60d6e
   &slm_cost,
e60d6e
   &skylake_cost,
e60d6e
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
e60d6e
index ffe810f2bcb..93644be9cb3 100644
e60d6e
--- a/gcc/config/i386/x86-tune-costs.h
e60d6e
+++ b/gcc/config/i386/x86-tune-costs.h
e60d6e
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
e60d6e
   "16",					/* Func alignment.  */
e60d6e
 };
e60d6e
 
e60d6e
+static stringop_algs tremont_memcpy[2] = {
e60d6e
+  {libcall,
e60d6e
+   {{256, rep_prefix_1_byte, true},
e60d6e
+    {256, loop, false},
e60d6e
+    {-1, libcall, false}}},
e60d6e
+  {libcall,
e60d6e
+   {{256, rep_prefix_1_byte, true},
e60d6e
+    {256, loop, false},
e60d6e
+    {-1, libcall, false}}}};
e60d6e
+static stringop_algs tremont_memset[2] = {
e60d6e
+  {libcall,
e60d6e
+   {{256, rep_prefix_1_byte, true},
e60d6e
+    {256, loop, false},
e60d6e
+    {-1, libcall, false}}},
e60d6e
+  {libcall,
e60d6e
+   {{256, rep_prefix_1_byte, true},
e60d6e
+    {256, loop, false},
e60d6e
+    {-1, libcall, false}}}};
e60d6e
+static const
e60d6e
+struct processor_costs tremont_cost = {
e60d6e
+  {
e60d6e
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
e60d6e
+  6,				     /* cost for loading QImode using movzbl */
e60d6e
+  {6, 6, 6},				/* cost of loading integer registers
e60d6e
+					   in QImode, HImode and SImode.
e60d6e
+					   Relative to reg-reg move (2).  */
e60d6e
+  {6, 6, 6},				/* cost of storing integer registers */
e60d6e
+  4,					/* cost of reg,reg fld/fst */
e60d6e
+  {6, 6, 12},				/* cost of loading fp registers
e60d6e
+					   in SFmode, DFmode and XFmode */
e60d6e
+  {6, 6, 12},				/* cost of storing fp registers
e60d6e
+					   in SFmode, DFmode and XFmode */
e60d6e
+  2,					/* cost of moving MMX register */
e60d6e
+  {6, 6},				/* cost of loading MMX registers
e60d6e
+					   in SImode and DImode */
e60d6e
+  {6, 6},				/* cost of storing MMX registers
e60d6e
+					   in SImode and DImode */
e60d6e
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
e60d6e
+					   in 32,64,128,256 and 512-bit */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
e60d6e
+					   in 32,64,128,256 and 512-bit */
e60d6e
+  6, 6,				/* SSE->integer and integer->SSE moves */
e60d6e
+  6, 6,				/* mask->integer and integer->mask moves */
e60d6e
+  {6, 6, 6},				/* cost of loading mask register
e60d6e
+					   in QImode, HImode, SImode.  */
e60d6e
+  {6, 6, 6},			/* cost of storing mask register
e60d6e
+					   in QImode, HImode, SImode.  */
e60d6e
+  2,					/* cost of moving mask register.  */
e60d6e
+  /* End of register allocator costs.  */
e60d6e
+  },
e60d6e
+
e60d6e
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
e60d6e
+  /* Setting cost to 2 makes our current implementation of synth_mult result in
e60d6e
+     use of unnecessary temporary registers causing regression on several
e60d6e
+     SPECfp benchmarks.  */
e60d6e
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
e60d6e
+  COSTS_N_INSNS (1),			/* variable shift costs */
e60d6e
+  COSTS_N_INSNS (1),			/* constant shift costs */
e60d6e
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
e60d6e
+   COSTS_N_INSNS (4),			/*				 HI */
e60d6e
+   COSTS_N_INSNS (3),			/*				 SI */
e60d6e
+   COSTS_N_INSNS (4),			/*				 DI */
e60d6e
+   COSTS_N_INSNS (4)},			/*			      other */
e60d6e
+  0,					/* cost of multiply per each bit set */
e60d6e
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
e60d6e
+   COSTS_N_INSNS (22),			/*			    HI */
e60d6e
+   COSTS_N_INSNS (30),			/*			    SI */
e60d6e
+   COSTS_N_INSNS (74),			/*			    DI */
e60d6e
+   COSTS_N_INSNS (74)},			/*			    other */
e60d6e
+  COSTS_N_INSNS (1),			/* cost of movsx */
e60d6e
+  COSTS_N_INSNS (1),			/* cost of movzx */
e60d6e
+  8,					/* "large" insn */
e60d6e
+  17,					/* MOVE_RATIO */
e60d6e
+  17,					/* CLEAR_RATIO */
e60d6e
+  {6, 6, 6},				/* cost of loading integer registers
e60d6e
+					   in QImode, HImode and SImode.
e60d6e
+					   Relative to reg-reg move (2).  */
e60d6e
+  {6, 6, 6},				/* cost of storing integer registers */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of loading SSE register
e60d6e
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of storing SSE register
e60d6e
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
e60d6e
+  {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
e60d6e
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
e60d6e
+  6,					/* cost of moving SSE register to integer.  */
e60d6e
+  18, 6,				/* Gather load static, per_elt.  */
e60d6e
+  18, 6,				/* Gather store static, per_elt.  */
e60d6e
+  32,					/* size of l1 cache.  */
e60d6e
+  512,					/* size of l2 cache.  */
e60d6e
+  64,					/* size of prefetch block */
e60d6e
+  6,					/* number of parallel prefetches */
e60d6e
+  /* Benchmarks show large regressions on K8 sixtrack benchmark when this
e60d6e
+     value is increased to perhaps more appropriate value of 5.  */
e60d6e
+  3,					/* Branch cost */
e60d6e
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
e60d6e
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
e60d6e
+  COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
e60d6e
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
e60d6e
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
e60d6e
+  COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
e60d6e
+
e60d6e
+  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
e60d6e
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
e60d6e
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
e60d6e
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
e60d6e
+  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
e60d6e
+  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
e60d6e
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
e60d6e
+  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
e60d6e
+  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
e60d6e
+  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
e60d6e
+  1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
e60d6e
+  tremont_memcpy,
e60d6e
+  tremont_memset,
e60d6e
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
e60d6e
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
e60d6e
+  "16:11:8",				/* Loop alignment.  */
e60d6e
+  "16:11:8",				/* Jump alignment.  */
e60d6e
+  "0:0:8",				/* Label alignment.  */
e60d6e
+  "16",					/* Func alignment.  */
e60d6e
+};
e60d6e
+
e60d6e
 static stringop_algs intel_memcpy[2] = {
e60d6e
   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
e60d6e
   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
e60d6e
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
e60d6e
index 6bd7087a03f..636e0c788bf 100644
e60d6e
--- a/gcc/config/i386/x86-tune.def
e60d6e
+++ b/gcc/config/i386/x86-tune.def
e60d6e
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
e60d6e
    move/set sequences of bytes with known size.  */
e60d6e
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
e60d6e
 	  "prefer_known_rep_movsb_stosb",
e60d6e
-	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
e60d6e
+	  m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
e60d6e
 
e60d6e
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
e60d6e
    compact prologues and epilogues by issuing a misaligned moves.  This
e60d6e
-- 
e60d6e
2.18.2
e60d6e