author     Bruce Evans <bde@FreeBSD.org>  2018-07-20 12:42:24 +0000
committer  Bruce Evans <bde@FreeBSD.org>  2018-07-20 12:42:24 +0000
commit     27aa844253c86af777298be505c7b2b01bb69ed3 (patch)
tree       4b24e3058b5660cc433bc466ac44d2aa1a5c0c92 /lib/msun
parent     1a59bccc425fc8af01aee3086ab09616d1bb8936 (diff)
Centralize the complications for special efficient rounding to integers.
This was open-coded in range reduction for trig and exp functions. Now there
are 3 static inline functions rnint[fl]() that replace open-coded expressions,
and type-generic irint() and i64rint() macros that hide the complications for
efficiently using non-generic irint() and irintl() functions and casts.

Special details: ld128/e_rem_pio2l.h needs to use i64rint() since it needs a
46-bit integer result. Everything else only needs a (less than) 32-bit integer
result so uses irint(). Float and double cases now use float_t and double_t
locally instead of STRICT_ASSIGN() to avoid bugs in extra precision. On amd64,
inline asm is now only used for irint() on long doubles. The SSE asm for
irint() on amd64 only existed because the ifdef tangles made the correct
method of simply casting to int for this case non-obvious.
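As background for the diff below, here is a minimal standalone sketch of the
magic-number rounding that rnint() centralizes (the rnint_demo name is
illustrative, not the committed code): adding and then subtracting
0x1.8p52 = 1.5*2**52 forces the low bits to be rounded away under the assumed
FE_TONEAREST mode, and the intermediate cast to double kills x87 extra
precision just as the new helper does.

#include <math.h>	/* for double_t */
#include <stdio.h>

/*
 * Round x to the nearest integer, for |x| much less than 2**52 and
 * assuming the rounding mode is FE_TONEAREST (hypothetical demo of
 * the technique; msun's version is rnint() in math_private.h below).
 */
static inline double
rnint_demo(double_t x)
{
	/* The cast to double kills any extra (x87) precision. */
	return ((double)(x + 0x1.8p52) - 0x1.8p52);
}

int
main(void)
{
	/* Prints "2 -1 7": 2.5 ties to even, -1.3 and 7.49 round. */
	printf("%g %g %g\n", rnint_demo(2.5), rnint_demo(-1.3),
	    rnint_demo(7.49));
	return (0);
}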
Notes: svn path=/head/; revision=336545
Diffstat (limited to 'lib/msun')
-rw-r--r--  lib/msun/ld128/e_rem_pio2l.h  |    9
-rw-r--r--  lib/msun/ld128/k_expl.h       |   10
-rw-r--r--  lib/msun/ld128/s_expl.c       |    7
-rw-r--r--  lib/msun/ld80/e_rem_pio2l.h   |    8
-rw-r--r--  lib/msun/ld80/k_expl.h        |    9
-rw-r--r--  lib/msun/ld80/s_expl.c        |    9
-rw-r--r--  lib/msun/src/e_rem_pio2.c     |    8
-rw-r--r--  lib/msun/src/e_rem_pio2f.c    |    8
-rw-r--r--  lib/msun/src/math_private.h   |  100
9 files changed, 93 insertions(+), 75 deletions(-)
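The type-generic irint() macro added in src/math_private.h (last hunk below)
selects among irintf(), irintd(), irintl() and a plain cast by comparing
sizeofs; since sizeof is a compile-time constant, only the selected branch
survives. A hedged two-branch sketch of the same dispatch technique, with
illustrative *_demo names:

#include <stdio.h>

static inline int irintf_demo(float x)  { return ((int)x); }
static inline int irintd_demo(double x) { return ((int)x); }

/*
 * All branches must type-check, but the sizeof comparisons fold to
 * constants so the compiler emits only the matching call.  Like the
 * real irint(), this is only meant for args that are already
 * floating point integers, where the cast is exact.
 */
#define	irint_demo(x)						\
    (sizeof(x) == sizeof(float) ? irintf_demo(x) :		\
    sizeof(x) == sizeof(double) ? irintd_demo(x) : (int)(x))

int
main(void)
{
	printf("%d %d\n", irint_demo(2.0F), irint_demo(-3.0));	/* 2 -3 */
	return (0);
}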
diff --git a/lib/msun/ld128/e_rem_pio2l.h b/lib/msun/ld128/e_rem_pio2l.h
index 5d78c4de0905..fcef399a83a6 100644
--- a/lib/msun/ld128/e_rem_pio2l.h
+++ b/lib/msun/ld128/e_rem_pio2l.h
@@ -74,14 +74,9 @@ __ieee754_rem_pio2l(long double x, long double *y)
if (ex < BIAS + 45 || ex == BIAS + 45 &&
u.bits.manh < 0x921fb54442d1LL) {
/* |x| ~< 2^45*(pi/2), medium size */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- fn = x*invpio2+0x1.8p112;
- fn = fn-0x1.8p112;
-#ifdef HAVE_EFFICIENT_I64RINT
+ /* TODO: use only double precision for fn, as in expl(). */
+ fn = rnintl(x * invpio2);
n = i64rint(fn);
-#else
- n = fn;
-#endif
r = x-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 180 bit */
{
diff --git a/lib/msun/ld128/k_expl.h b/lib/msun/ld128/k_expl.h
index 38a9cb98b33d..913107652272 100644
--- a/lib/msun/ld128/k_expl.h
+++ b/lib/msun/ld128/k_expl.h
@@ -244,16 +244,8 @@ __k_expl(long double x, long double *hip, long double *lop, int *kp)
int n, n2;
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- /* XXX assume no extra precision for the additions, as for trig fns. */
- /* XXX this set of comments is now quadruplicated. */
- /* XXX but see ../src/e_exp.c for a fix using double_t. */
- fn = (double)x * INV_L + 0x1.8p52 - 0x1.8p52;
-#if defined(HAVE_EFFICIENT_IRINT)
+ fn = rnint((double)x * INV_L);
n = irint(fn);
-#else
- n = (int)fn;
-#endif
n2 = (unsigned)n % INTERVALS;
/* Depend on the sign bit being propagated: */
*kp = n >> LOG2_INTERVALS;
diff --git a/lib/msun/ld128/s_expl.c b/lib/msun/ld128/s_expl.c
index f624d5143902..b47402b0fb4e 100644
--- a/lib/msun/ld128/s_expl.c
+++ b/lib/msun/ld128/s_expl.c
@@ -268,13 +268,8 @@ expm1l(long double x)
}
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- fn = (double)x * INV_L + 0x1.8p52 - 0x1.8p52;
-#if defined(HAVE_EFFICIENT_IRINT)
+ fn = rnint((double)x * INV_L);
n = irint(fn);
-#else
- n = (int)fn;
-#endif
n2 = (unsigned)n % INTERVALS;
k = n >> LOG2_INTERVALS;
r1 = x - fn * L1;
diff --git a/lib/msun/ld80/e_rem_pio2l.h b/lib/msun/ld80/e_rem_pio2l.h
index 75e4e4bc1abc..9a5c64ca9943 100644
--- a/lib/msun/ld80/e_rem_pio2l.h
+++ b/lib/msun/ld80/e_rem_pio2l.h
@@ -84,14 +84,8 @@ __ieee754_rem_pio2l(long double x, long double *y)
ex = expsign & 0x7fff;
if (ex < BIAS + 25 || (ex == BIAS + 25 && u.bits.manh < 0xc90fdaa2)) {
/* |x| ~< 2^25*(pi/2), medium size */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- fn = x*invpio2+0x1.8p63;
- fn = fn-0x1.8p63;
-#ifdef HAVE_EFFICIENT_IRINT
+ fn = rnintl(x*invpio2);
n = irint(fn);
-#else
- n = fn;
-#endif
r = x-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 102 bit */
{
diff --git a/lib/msun/ld80/k_expl.h b/lib/msun/ld80/k_expl.h
index 2d803ea647ed..63c65b65cf6e 100644
--- a/lib/msun/ld80/k_expl.h
+++ b/lib/msun/ld80/k_expl.h
@@ -225,16 +225,9 @@ __k_expl(long double x, long double *hip, long double *lop, int *kp)
int n, n2;
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- fn = x * INV_L + 0x1.8p63 - 0x1.8p63;
+ fn = rnintl(x * INV_L);
r = x - fn * L1 - fn * L2; /* r = r1 + r2 done independently. */
-#if defined(HAVE_EFFICIENT_IRINTL)
- n = irintl(fn);
-#elif defined(HAVE_EFFICIENT_IRINT)
n = irint(fn);
-#else
- n = (int)fn;
-#endif
n2 = (unsigned)n % INTERVALS;
/* Depend on the sign bit being propagated: */
*kp = n >> LOG2_INTERVALS;
diff --git a/lib/msun/ld80/s_expl.c b/lib/msun/ld80/s_expl.c
index d28b7dd0317c..e46e73f0c1fa 100644
--- a/lib/msun/ld80/s_expl.c
+++ b/lib/msun/ld80/s_expl.c
@@ -225,15 +225,8 @@ expm1l(long double x)
}
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- fn = x * INV_L + 0x1.8p63 - 0x1.8p63;
-#if defined(HAVE_EFFICIENT_IRINTL)
- n = irintl(fn);
-#elif defined(HAVE_EFFICIENT_IRINT)
+ fn = rnintl(x * INV_L);
n = irint(fn);
-#else
- n = (int)fn;
-#endif
n2 = (unsigned)n % INTERVALS;
k = n >> LOG2_INTERVALS;
r1 = x - fn * L1;
diff --git a/lib/msun/src/e_rem_pio2.c b/lib/msun/src/e_rem_pio2.c
index be2630b26fba..47b651351a8c 100644
--- a/lib/msun/src/e_rem_pio2.c
+++ b/lib/msun/src/e_rem_pio2.c
@@ -127,14 +127,8 @@ __ieee754_rem_pio2(double x, double *y)
}
if(ix<0x413921fb) { /* |x| ~< 2^20*(pi/2), medium size */
medium:
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- STRICT_ASSIGN(double,fn,x*invpio2+0x1.8p52);
- fn = fn-0x1.8p52;
-#ifdef HAVE_EFFICIENT_IRINT
+ fn = rnint((double_t)x*invpio2);
n = irint(fn);
-#else
- n = (int32_t)fn;
-#endif
r = x-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 85 bit */
{
diff --git a/lib/msun/src/e_rem_pio2f.c b/lib/msun/src/e_rem_pio2f.c
index f1ee7a042b7c..597f6139f248 100644
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -55,14 +55,8 @@ __ieee754_rem_pio2f(float x, double *y)
ix = hx&0x7fffffff;
/* 33+53 bit pi is good enough for medium size */
if(ix<0x4dc90fdb) { /* |x| ~< 2^28*(pi/2), medium size */
- /* Use a specialized rint() to get fn. Assume round-to-nearest. */
- STRICT_ASSIGN(double,fn,x*invpio2+0x1.8p52);
- fn = fn-0x1.8p52;
-#ifdef HAVE_EFFICIENT_IRINT
+ fn = rnint((float_t)x*invpio2);
n = irint(fn);
-#else
- n = (int32_t)fn;
-#endif
r = x-fn*pio2_1;
w = fn*pio2_1t;
*y = r-w;
diff --git a/lib/msun/src/math_private.h b/lib/msun/src/math_private.h
index 1b1105e1c3ac..91c1b92969cd 100644
--- a/lib/msun/src/math_private.h
+++ b/lib/msun/src/math_private.h
@@ -571,48 +571,116 @@ CMPLXL(long double x, long double y)
#endif /* _COMPLEX_H */
-#ifdef __GNUCLIKE_ASM
+/*
+ * The rnint() family rounds to the nearest integer for a restricted
+ * range of args (up to about 2**MANT_DIG). We assume that the current
+ * rounding mode is FE_TONEAREST so that this can be done efficiently.
+ * Extra precision causes more problems in practice, and we only centralize
+ * this here to reduce those problems, and have not solved the efficiency
+ * problems. The exp2() family uses a more delicate version of this that
+ * requires extracting bits from the intermediate value, so it is not
+ * centralized here and should copy any solution of the efficiency problems.
+ */
+
+static inline double
+rnint(__double_t x)
+{
+ /*
+ * This casts to double to kill any extra precision. This depends
+ * on the cast being applied to a double_t to avoid compiler bugs
+ * (this is a cleaner version of STRICT_ASSIGN()). This is
+ * inefficient if there actually is extra precision, but is hard
+ * to improve on. We use double_t in the API to minimise conversions
+ * for just calling here. Note that we cannot easily change the
+ * magic number to the one that works directly with double_t, since
+ * the rounding precision is variable at runtime on x86 so the
+ * magic number would need to be variable. Assuming that the
+ * rounding precision is always the default is too fragile. This
+ * and many other complications will move when the default is
+ * changed to FP_PE.
+ */
+ return ((double)(x + 0x1.8p52) - 0x1.8p52);
+}
+
+static inline float
+rnintf(__float_t x)
+{
+ /*
+ * As for rnint(), except we could just call that to handle the
+ * extra precision case, usually without losing efficiency.
+ */
+ return ((float)(x + 0x1.8p23F) - 0x1.8p23F);
+}
+
+#ifdef LDBL_MANT_DIG
+/*
+ * The complications for extra precision are smaller for rnintl() since it
+ * can safely assume that the rounding precision has been increased from
+ * its default to FP_PE on x86. We don't exploit that here to get small
+ * optimizations from limiting the rangle to double. We just need it for
+ * the magic number to work with long doubles. ld128 callers should use
+ * rnint() instead of this if possible. ld80 callers should prefer
+ * rnintl() since for amd64 this avoids swapping the register set, while
+ * for i386 it makes no difference (assuming FP_PE), and for other arches
+ * it makes little difference.
+ */
+static inline long double
+rnintl(long double x)
+{
+ return (x + __CONCAT(0x1.8p, LDBL_MANT_DIG) / 2 -
+ __CONCAT(0x1.8p, LDBL_MANT_DIG) / 2);
+}
+#endif /* LDBL_MANT_DIG */
-/* Asm versions of some functions. */
+/*
+ * irint() and i64rint() give the same result as casting to their integer
+ * return type provided their arg is a floating point integer. They can
+ * sometimes be more efficient because no rounding is required.
+ */
+#if (defined(__amd64__) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
+#define irint(x) \
+ (sizeof(x) == sizeof(float) && \
+ sizeof(__float_t) == sizeof(long double) ? irintf(x) : \
+ sizeof(x) == sizeof(double) && \
+ sizeof(__double_t) == sizeof(long double) ? irintd(x) : \
+ sizeof(x) == sizeof(long double) ? irintl(x) : (int)(x))
+#else
+#define irint(x) ((int)(x))
+#endif
-#ifdef __amd64__
+#define i64rint(x) ((int64_t)(x)) /* only needed for ld128 so not opt. */
+
+#if defined(__i386__) && defined(__GNUCLIKE_ASM)
static __inline int
-irint(double x)
+irintf(float x)
{
int n;
- asm("cvtsd2si %1,%0" : "=r" (n) : "x" (x));
+ __asm("fistl %0" : "=m" (n) : "t" (x));
return (n);
}
-#define HAVE_EFFICIENT_IRINT
-#endif
-#ifdef __i386__
static __inline int
-irint(double x)
+irintd(double x)
{
int n;
- asm("fistl %0" : "=m" (n) : "t" (x));
+ __asm("fistl %0" : "=m" (n) : "t" (x));
return (n);
}
-#define HAVE_EFFICIENT_IRINT
#endif
-#if defined(__amd64__) || defined(__i386__)
+#if (defined(__amd64__) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
static __inline int
irintl(long double x)
{
int n;
- asm("fistl %0" : "=m" (n) : "t" (x));
+ __asm("fistl %0" : "=m" (n) : "t" (x));
return (n);
}
-#define HAVE_EFFICIENT_IRINTL
#endif
-#endif /* __GNUCLIKE_ASM */
-
#ifdef DEBUG
#if defined(__amd64__) || defined(__i386__)
#define breakpoint() asm("int $3")