33fa5a
From dce732e9fe47b44d1a985d10a0eb97aac6afa28e Mon Sep 17 00:00:00 2001
33fa5a
From: Andreas Arnez <arnez@linux.ibm.com>
33fa5a
Date: Wed, 25 Mar 2020 20:11:19 +0100
33fa5a
Subject: [PATCH 6/8] Add IBM z14 support
33fa5a
33fa5a
Add general support for IBM z14.  Also detect and handle the vector
33fa5a
enhancements facility 1, which specifically adds single-precision FP
33fa5a
arithmetic for vectors.
33fa5a
---
33fa5a
 CONFIG/include/atlconf.h            | 14 ++++----
33fa5a
 CONFIG/src/Makefile                 |  6 ++++
33fa5a
 CONFIG/src/atlcomp.txt              |  4 +++
33fa5a
 CONFIG/src/backend/Make.ext         |  4 ++-
33fa5a
 CONFIG/src/backend/archinfo_linux.c |  3 +-
33fa5a
 CONFIG/src/backend/probe_vxz2.c     | 12 +++++++
33fa5a
 CONFIG/src/probe_comp.c             |  3 +-
33fa5a
 include/atlas_prefetch.h            |  3 +-
33fa5a
 include/atlas_simd.h                | 53 +++++++++++++++++++++++++++++
33fa5a
 9 files changed, 91 insertions(+), 11 deletions(-)
33fa5a
 create mode 100644 CONFIG/src/backend/probe_vxz2.c
33fa5a
33fa5a
diff --git a/CONFIG/include/atlconf.h b/CONFIG/include/atlconf.h
33fa5a
index e51d56d..3828fdb 100644
33fa5a
--- a/CONFIG/include/atlconf.h
33fa5a
+++ b/CONFIG/include/atlconf.h
33fa5a
@@ -25,11 +25,11 @@ enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS,
33fa5a
  * Corei3EP: v3 Haswell, E5-26XX
33fa5a
  * Corei4: skylake
33fa5a
  */
33fa5a
-#define NMACH 62
33fa5a
+#define NMACH 63
33fa5a
 static char *machnam[NMACH] =
33fa5a
    {"UNKNOWN", "PPCG4", "PPCG5", "POWER3", "POWER4", "POWER5",
33fa5a
     "POWER6", "POWER7", "POWER8", "POWERe6500",
33fa5a
-    "IBMz9", "IBMz10", "IBMz196", "IBMz12", "IBMz13",
33fa5a
+    "IBMz9", "IBMz10", "IBMz196", "IBMz12", "IBMz13", "IBMz14",
33fa5a
     "x86x87", "x86SSE1", "x86SSE2", "x86SSE3",
33fa5a
     "P5", "P5MMX", "PPRO", "PII", "PIII", "PM", "CoreSolo",
33fa5a
     "CoreDuo", "Core2Solo", "Core2", "Corei1", "Corei2", "Corei3",
33fa5a
@@ -42,7 +42,7 @@ static char *machnam[NMACH] =
33fa5a
     "ARM64xgene1", "ARM64a53", "ARM64a57"};
33fa5a
 enum MACHTYPE {MACHOther, PPCG4, PPCG5, IbmPwr3, IbmPwr4, IbmPwr5,
33fa5a
                IbmPwr6, IbmPwr7, IbmPwr8, Pwre6500,
33fa5a
-               IbmZ9, IbmZ10, IbmZ196, IbmZ12, IbmZ13, /* s390(x) in Linux */
33fa5a
+               IbmZ9, IbmZ10, IbmZ196, IbmZ12, IbmZ13, IbmZ14, /* s390(x) */
33fa5a
                x86x87, x86SSE1, x86SSE2, x86SSE3, /* generic targets */
33fa5a
                IntP5, IntP5MMX, IntPPRO, IntPII, IntPIII, IntPM, IntCoreS,
33fa5a
                IntCoreDuo, IntCore2Solo, IntCore2, IntCorei1, IntCorei2,
33fa5a
@@ -82,7 +82,7 @@ enum MACHTYPE {MACHOther, PPCG4, PPCG5, IbmPwr3, IbmPwr4, IbmPwr5,
33fa5a
 #define MachIsARM64(mach_) \
33fa5a
    ( (mach_) >= ARM64xg && || (mach_) <= ARM64a57)
33fa5a
 #define MachIsS390(mach_) \
33fa5a
-   ( (mach_) >= IbmZ9 && (mach_) <= IbmZ13 )
33fa5a
+   ( (mach_) >= IbmZ9 && (mach_) <= IbmZ14 )
33fa5a
 
33fa5a
 
33fa5a
 static char *f2c_namestr[5] = {"UNKNOWN","Add_", "Add__", "NoChange", "UpCase"};
33fa5a
@@ -96,13 +96,13 @@ enum F2CNAME {f2c_NamErr=0, f2c_Add_, f2c_Add__, f2c_NoChange, f2c_UpCase};
33fa5a
 enum F2CINT {f2c_IntErr=0, FintCint, FintClong, FintClonglong, FintCshort};
33fa5a
 enum F2CSTRING {f2c_StrErr=0, fstrSun, fstrCray, fstrStructVal, fstrStructPtr};
33fa5a
 
33fa5a
-#define NISA 15
33fa5a
+#define NISA 16
33fa5a
 static char *ISAXNAM[NISA] =
33fa5a
-   {"", "VSX", "VXZ", "AltiVec",
33fa5a
+   {"", "VSX", "VXZ2", "VXZ", "AltiVec",
33fa5a
     "AVXMAC", "AVXFMA4", "AVX", "SSE3", "SSE2", "SSE1", "3DNow",
33fa5a
     "FPV3D2MACNEON", "FPV3D16MACNEON", "FPV3D32MAC", "FPV3D16MAC"};
33fa5a
 enum ISAEXT
33fa5a
-   {ISA_None=0, ISA_VSX, ISA_VXZ, ISA_AV,
33fa5a
+   {ISA_None=0, ISA_VSX, ISA_VXZ2, ISA_VXZ, ISA_AV,
33fa5a
     ISA_AVXMAC, ISA_AVXFMA4, ISA_AVX, ISA_SSE3, ISA_SSE2, ISA_SSE1, ISA_3DNow,
33fa5a
     ISA_NEON, ISA_NEON16, ISA_VFP3D32MAC, ISA_VFP3D16MAC};
33fa5a
 
33fa5a
diff --git a/CONFIG/src/Makefile b/CONFIG/src/Makefile
33fa5a
index 212b9d7..782a4cf 100644
33fa5a
--- a/CONFIG/src/Makefile
33fa5a
+++ b/CONFIG/src/Makefile
33fa5a
@@ -158,6 +158,12 @@ IRun_NEON :
33fa5a
 	$(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_neon args="$(args)" \
33fa5a
                 redir=config0.out
33fa5a
 	- cat config0.out
33fa5a
+IRun_VXZ2 :
33fa5a
+	$(CC) $(CCFLAGS) -march=native -mvx -mzvector -o xprobe_vxz2 \
33fa5a
+           $(SRCdir)/backend/probe_svec.c $(SRCdir)/backend/probe_vxz2.c
33fa5a
+	$(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_vxz2 args="$(args)" \
33fa5a
+                redir=config0.out
33fa5a
+	- cat config0.out
33fa5a
 IRun_VXZ :
33fa5a
 	$(CC) $(CCFLAGS) -march=native -mvx -mzvector -o xprobe_vxz \
33fa5a
            $(SRCdir)/backend/probe_dvec.c $(SRCdir)/backend/probe_vxz.c
33fa5a
diff --git a/CONFIG/src/atlcomp.txt b/CONFIG/src/atlcomp.txt
33fa5a
index 2ac71cf..2cfacc2 100644
33fa5a
--- a/CONFIG/src/atlcomp.txt
33fa5a
+++ b/CONFIG/src/atlcomp.txt
33fa5a
@@ -250,6 +250,10 @@ MACH=IBMz13 OS=ALL LVL=1000 COMPS=smc,dmc,skc,dkc,icc,xcc,gcc
33fa5a
    'gcc' '-march=z13 -mtune=z13 -O2'
33fa5a
 MACH=IBMz13 OS=ALL LVL=1000 COMPS=f77
33fa5a
    'gfortran' '-march=z13 -mtune=z13 -O2'
33fa5a
+MACH=IBMz14 OS=ALL LVL=1000 COMPS=smc,dmc,skc,dkc,icc,xcc,gcc
33fa5a
+   'gcc' '-march=z14 -mtune=z14 -O2'
33fa5a
+MACH=IBMz14 OS=ALL LVL=1000 COMPS=f77
33fa5a
+   'gfortran' '-march=z14 -mtune=z14 -O2'
33fa5a
 #
33fa5a
 # Windows defaults ; need to make SSE/SSE2 arch dep.
33fa5a
 #
33fa5a
diff --git a/CONFIG/src/backend/Make.ext b/CONFIG/src/backend/Make.ext
33fa5a
index 4743353..794babf 100644
33fa5a
--- a/CONFIG/src/backend/Make.ext
33fa5a
+++ b/CONFIG/src/backend/Make.ext
33fa5a
@@ -39,7 +39,7 @@ files = archinfo_aix.c archinfo_freebsd.c archinfo_irix.c archinfo_linux.c \
33fa5a
         probe_gas_mips.S probe_gas_parisc.S probe_gas_ppc.S probe_gas_s390.S \
33fa5a
         probe_gas_sparc.S probe_gas_wow64.S probe_gas_x8632.S \
33fa5a
         probe_gas_x8664.S probe_smac.c probe_svec.c probe_this_asm.c \
33fa5a
-        probe_vxz.c
33fa5a
+        probe_vxz2.c probe_vxz.c
33fa5a
 
33fa5a
 all : $(files)
33fa5a
 
33fa5a
@@ -107,6 +107,8 @@ flibchkF.f : $(basf)
33fa5a
 	$(extF) -b $(basf) -o flibchkF.f rout=flibchkF.f
33fa5a
 probe_arm32_FPABI.c : $(basf)
33fa5a
 	$(extC) -b $(basf) -o probe_arm32_FPABI.c rout=probe_arm32_FPABI
33fa5a
+probe_vxz2.c : $(basf)
33fa5a
+	$(extC) -b $(basf) -o probe_vxz2.c rout=probe_vxz2
33fa5a
 probe_vxz.c : $(basf)
33fa5a
 	$(extC) -b $(basf) -o probe_vxz.c rout=probe_vxz
33fa5a
 probe_aff_SETAFFNP.c : $(basf)
33fa5a
diff --git a/CONFIG/src/backend/archinfo_linux.c b/CONFIG/src/backend/archinfo_linux.c
33fa5a
index cdcee92..ed6f476 100644
33fa5a
--- a/CONFIG/src/backend/archinfo_linux.c
33fa5a
+++ b/CONFIG/src/backend/archinfo_linux.c
33fa5a
@@ -336,7 +336,8 @@ enum MACHTYPE ProbeArch()
33fa5a
          else if (strstr(res, "2817") || strstr(res, "2818")) mach = IbmZ196;
33fa5a
          else if (strstr(res, "2827") || strstr(res, "2828")) mach = IbmZ12;
33fa5a
          else if (strstr(res, "2964") || strstr(res, "2965")) mach = IbmZ13;
33fa5a
-         else mach = IbmZ13;  /* looks risky to me, but IBM folks did it */
33fa5a
+         else if (strstr(res, "3906") || strstr(res, "3907")) mach = IbmZ14;
33fa5a
+         else mach = IbmZ14;  /* looks risky to me, but IBM folks did it */
33fa5a
          free(res);
33fa5a
       }
33fa5a
       break;
33fa5a
diff --git a/CONFIG/src/backend/probe_vxz2.c b/CONFIG/src/backend/probe_vxz2.c
33fa5a
new file mode 100644
33fa5a
index 0000000..a69d92d
33fa5a
--- /dev/null
33fa5a
+++ b/CONFIG/src/backend/probe_vxz2.c
33fa5a
@@ -0,0 +1,12 @@
33fa5a
+#include <vecintrin.h>
33fa5a
+void do_vsum(float *z, float *x, float *y) // RETURNS: z = x + y (4 floats)
33fa5a
+{
33fa5a
+   vector float vx, vy;
33fa5a
+   vx = (vector float) {x[0], x[1], x[2], x[3]};  /* load x[0..3] */
33fa5a
+   vy = (vector float) {y[0], y[1], y[2], y[3]};  /* load y[0..3] */
33fa5a
+   vy += vx;  /* the single-precision vector add being probed */
33fa5a
+   z[0] = vy[0];  /* store the four sums element by element */
33fa5a
+   z[1] = vy[1];
33fa5a
+   z[2] = vy[2];
33fa5a
+   z[3] = vy[3];
33fa5a
+}
33fa5a
diff --git a/CONFIG/src/probe_comp.c b/CONFIG/src/probe_comp.c
33fa5a
index 1652e24..857ea82 100644
33fa5a
--- a/CONFIG/src/probe_comp.c
33fa5a
+++ b/CONFIG/src/probe_comp.c
33fa5a
@@ -452,7 +452,7 @@ COMPNODE **GetDefaultComps(enum OSTYPE OS, enum MACHTYPE arch, int verb,
33fa5a
       vp = "-mavx2 -mfma";
33fa5a
   else if (vecexts & (1<<ISA_VSX))
33fa5a
       vp = "-mvsx";
33fa5a
-   else if (vecexts & (1<<ISA_VXZ))
33fa5a
+   else if ((vecexts & (1<<ISA_VXZ2)) || (vecexts & (1<<ISA_VXZ)))
33fa5a
       vp = "-mvx -mzvector";
33fa5a
   else if (vecexts & (1<<ISA_AV))
33fa5a
       vp = "-maltivec";
33fa5a
@@ -1207,6 +1207,7 @@ void GetBestGccVers(enum OSTYPE OS, enum MACHTYPE arch,
33fa5a
    {
33fa5a
    case IbmZ12:
33fa5a
    case IbmZ13:
33fa5a
+   case IbmZ14:
33fa5a
    case IntCorei3:
33fa5a
    case IntCorei4:
33fa5a
    case IntCorei2:
33fa5a
diff --git a/include/atlas_prefetch.h b/include/atlas_prefetch.h
33fa5a
index e7988a7..fa426ac 100644
33fa5a
--- a/include/atlas_prefetch.h
33fa5a
+++ b/include/atlas_prefetch.h
33fa5a
@@ -155,7 +155,8 @@
33fa5a
    #define ATL_L1LS 32
33fa5a
    #define ATL_L2LS 64
33fa5a
 #elif defined(ATL_ARCH_IBMz196) || defined(ATL_ARCH_IBMz10) || \
33fa5a
-      defined(ATL_ARCH_IBMzEC12) || defined(ATL_ARCH_IBMz13)
33fa5a
+      defined(ATL_ARCH_IBMzEC12) || defined(ATL_ARCH_IBMz13) || \
33fa5a
+      defined(ATL_ARCH_IBMz14) /* same spelling as machnam[] "IBMz14" */
33fa5a
    #define ATL_pfl1R(mem) __builtin_prefetch(mem, 0, 3)
33fa5a
    #define ATL_pfl1W(mem) __builtin_prefetch(mem, 1, 3)
33fa5a
    #define ATL_GOT_L1PREFETCH
33fa5a
diff --git a/include/atlas_simd.h b/include/atlas_simd.h
33fa5a
index f171933..eb75577 100644
33fa5a
--- a/include/atlas_simd.h
33fa5a
+++ b/include/atlas_simd.h
33fa5a
@@ -68,6 +68,11 @@
33fa5a
           ((defined(DREAL) || defined(DCPLX)) && ATL_VLEN != 2)
33fa5a
          #define ATL_FRCGNUVEC
33fa5a
       #endif
33fa5a
+   #elif defined(ATL_VXZ2)
33fa5a
+      #if ((defined(SREAL) || defined(SCPLX)) && ATL_VLEN != 4) || \
33fa5a
+          ((defined(DREAL) || defined(DCPLX)) && ATL_VLEN != 2)
33fa5a
+         #define ATL_FRCGNUVEC
33fa5a
+      #endif
33fa5a
    #elif defined(ATL_VXZ)
33fa5a
       #if ATL_VLEN != 2
33fa5a
          #define ATL_FRCGNUVEC
33fa5a
@@ -113,6 +118,12 @@
33fa5a
       #else
33fa5a
          #define ATL_VLEN 2
33fa5a
       #endif
33fa5a
+   #elif defined(ATL_VXZ2)
33fa5a
+      #if defined(SREAL) || defined(SCPLX)
33fa5a
+         #define ATL_VLEN 4
33fa5a
+      #else
33fa5a
+         #define ATL_VLEN 2
33fa5a
+      #endif
33fa5a
    #elif defined(ATL_VXZ)
33fa5a
       #define ATL_VLEN 2
33fa5a
    #elif defined(ATL_NEON)
33fa5a
@@ -376,6 +387,48 @@
33fa5a
       #define ATL_vsplat0(d_, s_) d_ = vec_splat(s_, 0)
33fa5a
       #define ATL_vsplat1(d_, s_) d_ = vec_splat(s_, 1)
33fa5a
    #endif
33fa5a
+#elif defined(ATL_VXZ2)
33fa5a
+   #include <vecintrin.h>
33fa5a
+
33fa5a
+   #define ATL_VPERMI(s_, t_, i_) \
33fa5a
+     ((ATL_VTYPE) vec_permi((vector double) s_, (vector double) t_, i_))
33fa5a
+
33fa5a
+   #if defined(SREAL) || defined(SCPLX)
33fa5a
+      #define ATL_VTYPE vector float
33fa5a
+      #if ATL_VLEN != 4
33fa5a
+         #error "VSXZ2 supports only VLEN = 4 for floats!"
33fa5a
+      #endif
33fa5a
+      #define ATL_vvrsum4(s0_, s1_, s2_, s3_) \
33fa5a
+      {	 ATL_VTYPE t0_, t1_; \
33fa5a
+         t0_ = vec_mergeh(s0_, s1_) + vec_mergel(s0_, s1_); \
33fa5a
+         t1_ = vec_mergeh(s2_, s3_) + vec_mergel(s2_, s3_); \
33fa5a
+         s0_ = ATL_VPERMI(t0_, t1_, 0) + ATL_VPERMI(t0_, t1_, 3); \
33fa5a
+      }
33fa5a
+      #define ATL_vsplat2(d_, s_) d_ = vec_splat(s_, 2)
33fa5a
+      #define ATL_vsplat3(d_, s_) d_ = vec_splat(s_, 3)
33fa5a
+   #else        /* double precision */
33fa5a
+      #define ATL_VTYPE vector double
33fa5a
+      #if ATL_VLEN != 2
33fa5a
+         #error "VSXZ2 supports only VLEN = 2 for doubles!"
33fa5a
+      #endif
33fa5a
+      #define ATL_vvrsum1(s0_) \
33fa5a
+      {  s0_ = vec_mergeh(s0_, s0_) + vec_mergel(s0_, s0_); }
33fa5a
+      #define ATL_vvrsum2(s0_, s1_) \
33fa5a
+      {  s0_ = vec_mergeh(s0_, s1_) + vec_mergel(s0_, s1_); }
33fa5a
+   #endif
33fa5a
+   #define ATL_vld(v_, p_) v_ = *(ATL_VTYPE *)(p_)
33fa5a
+   #define ATL_vst(p_, v_) *(ATL_VTYPE *)(p_) = v_
33fa5a
+   #define ATL_vzero(v_) v_ = vec_splats((TYPE)0.0)
33fa5a
+   #define ATL_vcopy(d_, s_) d_ = s_
33fa5a
+   #define ATL_vbcast(v_, p_) v_ = vec_splats(*((TYPE*)(p_)))
33fa5a
+   #define ATL_vuld(v_, p_) v_ = vec_xl(0, (TYPE *)(p_))
33fa5a
+   #define ATL_vust(p_, v_) vec_xst(v_, 0, (TYPE *)(p_))
33fa5a
+   #define ATL_vadd(d_, s1_, s2_) d_ =  s1_ + s2_
33fa5a
+   #define ATL_vsub(d_, s1_, s2_) d_ =  s1_ - s2_
33fa5a
+   #define ATL_vmul(d_, s1_, s2_) d_ =  s1_ * s2_
33fa5a
+   #define ATL_vmac(d_, s1_, s2_) d_ = __builtin_s390_vec_madd(s1_, s2_, d_)
33fa5a
+   #define ATL_vsplat0(d_, s_) d_ = vec_splat(s_, 0)
33fa5a
+   #define ATL_vsplat1(d_, s_) d_ = vec_splat(s_, 1)
33fa5a
 #elif defined(ATL_VXZ)
33fa5a
    #include <vecintrin.h>
33fa5a
 
33fa5a
-- 
33fa5a
2.23.0
33fa5a