10 #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
11 #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
18 #if EIGEN_GNUC_AT_LEAST(5, 3)
// Declares a const Packet16f named p16f_<NAME>, initialised with
// pset1<Packet16f>(X) (Eigen's set-all-lanes primitive applied to the value X).
20 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
21 const Packet16f p16f_##NAME = pset1<Packet16f>(X)
// Declares a const Packet16f named p16f_<NAME> from an integer value X:
// X is placed in a Packet16i via pset1 and the result is cast to __m512.
// NOTE(review): the C-style vector cast is presumably a bit reinterpretation
// (compiler vector-cast semantics), which is what the IEEE bit-pattern
// arguments used with this macro rely on - confirm for the target compiler.
23 #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
24 const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
// Declares a const Packet8d named p8d_<NAME>, initialised with
// pset1<Packet8d>(X) (Eigen's set-all-lanes primitive applied to the value X).
26 #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
27 const Packet8d p8d_##NAME = pset1<Packet8d>(X)
// Declares a const Packet8d named p8d_<NAME> from a 64-bit integer bit pattern:
// X is broadcast with _mm512_set1_epi64 and the register is then reinterpreted
// as packed doubles with _mm512_castsi512_pd (no numeric conversion).
29 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
30 const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
37 #if defined(EIGEN_VECTORIZE_AVX512DQ)
// Packet16f specialisation of plog: logarithm of 16 packed floats, built from an
// exponent/mantissa split followed by a polynomial in the mantissa.
// NOTE(review): this listing omits several original lines (see the gaps in the
// embedded line numbers): the declarations of `x`, `y`, `y1`, `y2` and the tail
// of the final return expression are not visible here. `x` is presumably a
// mutable copy of `_x` - confirm against the full file.
39 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
40 plog<Packet16f>(
const Packet16f& _x) {
// Broadcast scalar constants used below.
42 _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
43 _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
44 _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
// Bit mask with every bit set except the 8 float exponent bits (0x7f800000).
46 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
// IEEE-754 single-precision bit patterns: smallest positive normalised value,
// -infinity, +infinity and a quiet NaN.
49 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
50 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
51 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
52 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// sqrt(1/2), the polynomial coefficients p0..p8, and q1/q2, whose sum is
// approximately ln(2) (0.693359375 - 2.12194440e-4).
55 _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
56 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
57 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
58 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
59 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
60 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
61 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
62 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
63 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
64 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
65 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
66 _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask: lanes where x is NOT >= 0; the unordered predicate also flags NaN.
69 __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
// iszero_mask: lanes where x compares equal to zero.
70 __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
// Raise every lane to at least the smallest positive normalised float.
73 x = pmax(x, p16f_min_norm_pos);
// Shift the raw float bits right by 23 so the biased exponent field lands in the
// low bits, convert that integer to float, then subtract 126.
76 Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
77 Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
// Clear the exponent bits and OR in the bit pattern of 0.5: the original
// mantissa is kept but the exponent becomes that of 0.5, so x lies in [0.5, 1).
80 x = _mm512_and_ps(x, p16f_inv_mant_mask);
81 x = _mm512_or_ps(x, p16f_half);
// mask: lanes whose mantissa value is below sqrt(1/2).
90 __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
// tmp holds x in the masked lanes and 0 elsewhere (mask_blend picks its last
// argument where the mask bit is set).
91 Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
// Subtract 1 from the exponent in the masked lanes only.
93 e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
// Powers of x reused by the polynomial below.
96 Packet16f x2 = pmul(x, x);
97 Packet16f x3 = pmul(x2, x);
// Degree-8 polynomial in x evaluated as three independent Horner chains
// (coefficients p0..p2, p3..p5, p6..p8) that are then combined using x3.
// pmadd(a, b, c) is Eigen's multiply-add, a * b + c.
102 y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
103 y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
104 y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
105 y = pmadd(y, x, p16f_cephes_log_p2);
106 y1 = pmadd(y1, x, p16f_cephes_log_p5);
107 y2 = pmadd(y2, x, p16f_cephes_log_p8);
108 y = pmadd(y, x3, y1);
109 y = pmadd(y, x3, y2);
// Exponent contribution split in two parts (e * q1 and e * q2), plus the
// 0.5 * x^2 term; how these are combined is on lines missing from this listing.
113 y1 = pmul(e, p16f_cephes_log_q1);
114 tmp = pmul(x2, p16f_half);
117 y2 = pmul(e, p16f_cephes_log_q2);
// pos_inf_mask: lanes where the ORIGINAL input equals +infinity.
121 __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
// Nested blends pick the final value per lane: the innermost one replaces x by
// +inf where the input was +inf. The values selected for the invalid and zero
// lanes are on lines not shown here (presumably NaN and -inf - confirm).
126 return _mm512_mask_blend_ps(iszero_mask,
127 _mm512_mask_blend_ps(invalid_mask,
128 _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
// Packet16f specialisation of pexp: exponential of 16 packed floats, computed as
// 2^m * exp(r) with m = floor(x * log2(e) + 0.5) and r = x - m * ln(2).
// NOTE(review): the statements between the polynomial (original line 177) and
// the exponent reconstruction (original line 182) are missing from this
// listing, as is any use of `r2`.
139 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
140 pexp<Packet16f>(
const Packet16f& _x) {
// Broadcast scalar constants.
141 _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
142 _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
143 _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
// Bounds the argument is clamped to before evaluation.
145 _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
146 _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
// log2(e).
148 _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
// Polynomial coefficients, consumed highest-order first by the Horner chain.
150 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
151 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
152 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
153 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
154 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
155 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
// Clamp the input into [exp_lo, exp_hi].
158 Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
// m = floor(x * log2(e) + 0.5): the power of two to factor out.
162 Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
// r = m * (-ln 2) + x, i.e. the remainder after removing m * ln(2) from x.
166 _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
167 Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
168 Packet16f r2 = pmul(r, r);
// Horner evaluation in r of the polynomial with coefficients p0..p5.
172 Packet16f y = p16f_cephes_exp_p0;
173 y = pmadd(y, r, p16f_cephes_exp_p1);
174 y = pmadd(y, r, p16f_cephes_exp_p2);
175 y = pmadd(y, r, p16f_cephes_exp_p3);
176 y = pmadd(y, r, p16f_cephes_exp_p4);
177 y = pmadd(y, r, p16f_cephes_exp_p5);
// Build 2^m as a float: convert (m + 127) to int32 with truncation, then shift
// it into the exponent field (bit 23 upwards).
182 Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
183 emm0 = _mm512_slli_epi32(emm0, 23);
// Scale y by 2^m, then take the lane-wise max with the unclamped input _x.
// NOTE(review): presumably the max lets inputs beyond exp_hi (e.g. +inf)
// come back as themselves rather than as the clamped finite result - confirm.
186 return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
// Packet16f specialisation of psqrt built on the approximate reciprocal square
// root: sqrt(_x) is formed as _x * rsqrt(_x) after one Newton-Raphson step.
// NOTE(review): the preprocessor condition selecting this variant over the
// _mm512_sqrt_ps one is on a line missing from this listing.
266 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
267 psqrt<Packet16f>(
const Packet16f& _x) {
// -0.5 * _x, reused by the Newton-Raphson update.
268 Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
// Mask = (comparison of _x against the smallest normalised float) AND (_x >= 0).
// The predicate of the first comparison is on a line missing from this
// listing; presumably "less than", selecting non-negative denormals and zero.
269 __mmask16 denormal_mask = _mm512_kand(
270 _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
272 _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
// Initial approximation of 1/sqrt(_x) from the rsqrt14 instruction.
274 Packet16f x = _mm512_rsqrt14_ps(_x);
// One Newton-Raphson step: x <- x * (1.5 - 0.5 * _x * x^2).
277 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
// Lanes in denormal_mask return 0; all others return _x * x.
280 return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
// Packet8d specialisation of psqrt built on the approximate reciprocal square
// root: sqrt(_x) is formed as _x * rsqrt(_x) after two Newton-Raphson steps.
284 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
285 psqrt<Packet8d>(
const Packet8d& _x) {
// -0.5 * _x, reused by the Newton-Raphson updates.
286 Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
// Mask = (comparison of _x against the smallest normalised double) AND (_x >= 0).
// The predicate of the first comparison is on a line missing from this
// listing; presumably "less than", selecting non-negative denormals and zero.
// NOTE(review): _mm512_cmp_pd_mask yields an 8-lane __mmask8; it is widened to
// __mmask16 here for _mm512_kand and narrowed again by the blend below.
287 __mmask16 denormal_mask = _mm512_kand(
288 _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
290 _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
// Initial approximation of 1/sqrt(_x) from the rsqrt14 instruction.
292 Packet8d x = _mm512_rsqrt14_pd(_x);
// First Newton-Raphson step: x <- x * (1.5 - 0.5 * _x * x^2).
295 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Second, identical Newton-Raphson step.
298 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Lanes in denormal_mask return 0; all others return _x * x.
300 return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
// Packet16f psqrt variant that forwards directly to the _mm512_sqrt_ps
// intrinsic (packed single-precision square root).
// NOTE(review): the preprocessor branch that selects this variant is on a line
// missing from this listing.
304 EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(
const Packet16f& x) {
305 return _mm512_sqrt_ps(x);
// Packet8d psqrt variant that forwards directly to the _mm512_sqrt_pd
// intrinsic (packed double-precision square root).
308 EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(
const Packet8d& x) {
309 return _mm512_sqrt_pd(x);
318 #ifdef EIGEN_FAST_MATH
// Packet16f specialisation of prsqrt (compiled under EIGEN_FAST_MATH):
// reciprocal square root from the rsqrt14 approximation plus one Newton-Raphson
// step, with explicit results for inputs below the smallest normalised float.
320 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
321 prsqrt<Packet16f>(
const Packet16f& _x) {
// IEEE-754 single-precision bit patterns for +inf and a quiet NaN, the scalars
// 1.5 and -0.5, and the bit pattern of the smallest positive normalised float.
322 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
323 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
324 _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
325 _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
326 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
// -0.5 * _x, reused by the Newton-Raphson update.
328 Packet16f neg_half = pmul(_x, p16f_minus_half);
// Despite its name, le_zero_mask flags every lane with _x < flt_min, i.e.
// negatives, zero and positive denormals (ordered compare: NaN is not flagged).
332 __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
// Approximate 1/sqrt(_x); flagged lanes are zeroed so the refinement below
// works on a benign value.
333 Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Special-case results: NaN where _x < 0, otherwise +inf where _x < flt_min,
// otherwise 0 (that last value is never selected by the final blend).
336 __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
337 Packet16f infs_and_nans = _mm512_mask_blend_ps(
338 neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// One Newton-Raphson step: x <- x * (1.5 - 0.5 * _x * x^2).
341 x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Flagged lanes take the special-case value; the rest take the refined estimate.
344 return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
// Packet8d specialisation of prsqrt (compiled under EIGEN_FAST_MATH):
// reciprocal square root from the rsqrt14 approximation plus two Newton-Raphson
// steps, with explicit results for inputs below the smallest normalised double.
348 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
349 prsqrt<Packet8d>(
const Packet8d& _x) {
// IEEE-754 double-precision bit patterns for +inf and a NaN, the scalars 1.5
// and -0.5, and the bit pattern of the smallest positive normalised double.
// NOTE(review): 0x7ff1000000000000 has the quiet bit (bit 51) clear, so it
// encodes a signalling NaN; 0x7ff8000000000000 is the usual quiet-NaN
// pattern - confirm which is intended.
350 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
351 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
352 _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
353 _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
354 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
// -0.5 * _x, reused by the Newton-Raphson updates.
356 Packet8d neg_half = pmul(_x, p8d_minus_half);
// Despite its name, le_zero_mask flags every lane with _x < dbl_min, i.e.
// negatives, zero and positive denormals (ordered compare: NaN is not flagged).
360 __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
// Approximate 1/sqrt(_x); flagged lanes are zeroed so the refinement below
// works on a benign value.
361 Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Special-case results: NaN where _x < 0, otherwise +inf where _x < dbl_min,
// otherwise 0 (that last value is never selected by the final blend).
364 __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
365 Packet8d infs_and_nans = _mm512_mask_blend_pd(
366 neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// First Newton-Raphson step: x <- x * (1.5 - 0.5 * _x * x^2).
369 x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Second, identical Newton-Raphson step.
372 x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Flagged lanes take the special-case value; the rest take the refined estimate.
375 return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
377 #elif defined(EIGEN_VECTORIZE_AVX512ER)
// Packet16f prsqrt variant compiled when EIGEN_VECTORIZE_AVX512ER is defined:
// forwards directly to the _mm512_rsqrt28_ps intrinsic, with no refinement step.
379 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(
const Packet16f& x) {
380 return _mm512_rsqrt28_ps(x);
Namespace containing all symbols from the Eigen library.
Definition: Core:309