Simplify and improve accuracy for subnormals in hypot() (GH-102785)

This commit is contained in:
Raymond Hettinger 2023-03-17 14:06:52 -05:00 committed by GitHub
parent 174c4bfd0f
commit 72186aa637
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -2498,7 +2498,7 @@ References:
static inline double static inline double
vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
{ {
double x, h, scale, oldcsum, csum = 1.0, frac1 = 0.0, frac2 = 0.0; double x, h, scale, csum = 1.0, frac1 = 0.0, frac2 = 0.0;
DoubleLength pr, sm; DoubleLength pr, sm;
int max_e; int max_e;
Py_ssize_t i; Py_ssize_t i;
@ -2513,49 +2513,42 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
return max; return max;
} }
frexp(max, &max_e); frexp(max, &max_e);
if (max_e >= -1023) { if (max_e < -1023) {
scale = ldexp(1.0, -max_e); /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
assert(max * scale >= 0.5); So we first perform lossless scaling from subnormals back to normals,
assert(max * scale < 1.0); then recurse back to vector_norm(), and then finally undo the scaling.
*/
for (i=0 ; i < n ; i++) { for (i=0 ; i < n ; i++) {
x = vec[i]; vec[i] /= DBL_MIN;
assert(Py_IS_FINITE(x) && fabs(x) <= max);
x *= scale;
assert(fabs(x) < 1.0);
pr = dl_mul(x, x);
assert(pr.hi <= 1.0);
sm = dl_fast_sum(csum, pr.hi);
csum = sm.hi;
frac1 += pr.lo;
frac2 += sm.lo;
} }
h = sqrt(csum - 1.0 + (frac1 + frac2)); return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan);
pr = dl_mul(-h, h); }
scale = ldexp(1.0, -max_e);
assert(max * scale >= 0.5);
assert(max * scale < 1.0);
for (i=0 ; i < n ; i++) {
x = vec[i];
assert(Py_IS_FINITE(x) && fabs(x) <= max);
x *= scale;
assert(fabs(x) < 1.0);
pr = dl_mul(x, x);
assert(pr.hi <= 1.0);
sm = dl_fast_sum(csum, pr.hi); sm = dl_fast_sum(csum, pr.hi);
csum = sm.hi; csum = sm.hi;
frac1 += pr.lo; frac1 += pr.lo;
frac2 += sm.lo; frac2 += sm.lo;
x = csum - 1.0 + (frac1 + frac2);
return (h + x / (2.0 * h)) / scale;
} }
/* When max_e < -1023, ldexp(1.0, -max_e) overflows. h = sqrt(csum - 1.0 + (frac1 + frac2));
So instead of multiplying by a scale, we just divide by *max*. pr = dl_mul(-h, h);
*/ sm = dl_fast_sum(csum, pr.hi);
for (i=0 ; i < n ; i++) { csum = sm.hi;
x = vec[i]; frac1 += pr.lo;
assert(Py_IS_FINITE(x) && fabs(x) <= max); frac2 += sm.lo;
x /= max; x = csum - 1.0 + (frac1 + frac2);
x = x*x; return (h + x / (2.0 * h)) / scale;
assert(x <= 1.0);
assert(fabs(csum) >= fabs(x));
oldcsum = csum;
csum += x;
frac1 += (oldcsum - csum) + x;
}
return max * sqrt(csum - 1.0 + frac1);
} }
#define NUM_STACK_ELEMS 16 #define NUM_STACK_ELEMS 16