|
|
33fa5a |
From 999efd5370b33e8b02d9370eda3d454e08fc9d15 Mon Sep 17 00:00:00 2001
|
|
|
33fa5a |
From: Andreas Arnez <arnez@linux.ibm.com>
|
|
|
33fa5a |
Date: Wed, 5 Dec 2018 18:59:15 +0100
|
|
|
33fa5a |
Subject: [PATCH 3/8] Fix SIMD support on IBM z13
|
|
|
33fa5a |
|
|
|
33fa5a |
The header file atlas_simd.h contained a syntax error and a few functional
|
|
|
33fa5a |
errors that affected IBM z13. These errors prevented any SIMD kernels from being
|
|
|
33fa5a |
compiled successfully for that platform. This is fixed. The macro
|
|
|
33fa5a |
vec_madd is avoided, because some GCC versions don't implement it
|
|
|
33fa5a |
correctly; the equivalent GCC builtin __builtin_s390_vec_madd is used
|
|
|
33fa5a |
instead.
|
|
|
33fa5a |
---
|
|
|
33fa5a |
include/atlas_simd.h | 10 +++++-----
|
|
|
33fa5a |
1 file changed, 5 insertions(+), 5 deletions(-)
|
|
|
33fa5a |
|
|
|
33fa5a |
diff --git a/include/atlas_simd.h b/include/atlas_simd.h
|
|
|
33fa5a |
index baee6b1..68daf79 100644
|
|
|
33fa5a |
--- a/include/atlas_simd.h
|
|
|
33fa5a |
+++ b/include/atlas_simd.h
|
|
|
33fa5a |
@@ -69,7 +69,7 @@
|
|
|
33fa5a |
#define ATL_FRCGNUVEC
|
|
|
33fa5a |
#endif
|
|
|
33fa5a |
#elif defined(ATL_VXZ)
|
|
|
33fa5a |
- #if ATL_VLEN != 2;
|
|
|
33fa5a |
+ #if ATL_VLEN != 2
|
|
|
33fa5a |
#define ATL_FRCGNUVEC
|
|
|
33fa5a |
#endif
|
|
|
33fa5a |
#elif defined(ATL_NEON)
|
|
|
33fa5a |
@@ -390,19 +390,19 @@
|
|
|
33fa5a |
#define ATL_vld(v_, p_) v_ = vec_ld2f(p_);
|
|
|
33fa5a |
#define ATL_vst(p_, v_) vec_st2f(v_, p_);
|
|
|
33fa5a |
#endif
|
|
|
33fa5a |
- #define ATL_vzero(v_) v_ = vec_splats((TYPE)0.0)
|
|
|
33fa5a |
+ #define ATL_vzero(v_) v_ = vec_splats((double)0.0)
|
|
|
33fa5a |
#define ATL_vcopy(d_, s_) d_ = s_
|
|
|
33fa5a |
- #define ATL_vbcast(v_, p_) v_ = vec_splats(*((TYPE*)(p_)))
|
|
|
33fa5a |
+ #define ATL_vbcast(v_, p_) v_ = vec_splats((double)*((TYPE*)(p_)))
|
|
|
33fa5a |
#define ATL_vuld(v_, p_) ATL_vld(v_, p_)
|
|
|
33fa5a |
#define ATL_vust(p_, v_) ATL_vst(p_, v_)
|
|
|
33fa5a |
#define ATL_vadd(d_, s1_, s2_) d_ = s1_ + s2_
|
|
|
33fa5a |
#define ATL_vsub(d_, s1_, s2_) d_ = s1_ - s2_
|
|
|
33fa5a |
#define ATL_vmul(d_, s1_, s2_) d_ = s1_ * s2_
|
|
|
33fa5a |
- #define ATL_vmac(d_, s1_, s2_) d_ = vec_madd(s1_, s2_, d_)
|
|
|
33fa5a |
+ #define ATL_vmac(d_, s1_, s2_) d_ = __builtin_s390_vec_madd(s1_, s2_, d_)
|
|
|
33fa5a |
#define ATL_vvrsum1(s0_) \
|
|
|
33fa5a |
{ ATL_VTYPE t_;\
|
|
|
33fa5a |
t_ = vec_splat(s0_, 1); \
|
|
|
33fa5a |
- s0 += t_; \
|
|
|
33fa5a |
+ s0_ += t_; \
|
|
|
33fa5a |
}
|
|
|
33fa5a |
#define ATL_vsplat0(d_, s_) d_ = vec_splat(s_, 0)
|
|
|
33fa5a |
#define ATL_vsplat1(d_, s_) d_ = vec_splat(s_, 1)
|
|
|
33fa5a |
--
|
|
|
33fa5a |
2.23.0
|
|
|
33fa5a |
|