Factor-out common code. Also, optimize common cases by preallocating space on the stack. GH-8738

Improves speed by 9 to 10ns per call.
2024-12-13 03:45:42 +08:00 · 2018-08-11 18:39:05 -07:00 · 2018-08-11 18:39:05 -07:00 · c630e10440
commit c630e10440
parent 1399074535
1 changed files with 56 additions and 41 deletions
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@ -2032,10 +2032,10 @@ math_fmod_impl(PyObject *module, double x, double y)
 }

 /*
-Given an *n* length *vec* of non-negative, non-nan, non-inf values
+Given an *n* length *vec* of non-negative values
 where *max* is the largest value in the vector, compute:

-    sum((x / max) ** 2 for x in vec)
+    max * sqrt(sum((x / max) ** 2 for x in vec))

 When a maximum value is found, it is swapped to the end.  This
 lets us skip one loop iteration and just add 1.0 at the end.
@ -2045,19 +2045,31 @@ Kahan summation is used to improve accuracy.  The *csum*
 variable tracks the cumulative sum and *frac* tracks
 fractional round-off error for the most recent addition.

+The value of the *max* variable must be present in *vec*
+or should equal to 0.0 when n==0.  Likewise, *max* will
+be INF if an infinity is present in the vec.
+
+The *found_nan* variable indicates whether some member of
+the *vec* is a NaN.
 */

 static inline double
-scaled_vector_squared(Py_ssize_t n, double *vec, double max)
+vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
    double x, csum = 0.0, oldcsum, frac = 0.0;
    Py_ssize_t i;

+    if (Py_IS_INFINITY(max)) {
+        return max;
+    }
+    if (found_nan) {
+        return Py_NAN;
+    }
    if (max == 0.0) {
        return 0.0;
    }
    assert(n > 0);
-    for (i=0 ; i<n-1 ; i++) {
+    for (i=0 ; i < n-1 ; i++) {
        x = vec[i];
        if (x == max) {
            x = vec[n-1];
@ -2071,9 +2083,11 @@ scaled_vector_squared(Py_ssize_t n, double *vec, double max)
    }
    assert(vec[n-1] == max);
    csum += 1.0 - frac;
-    return csum;
+    return max * sqrt(csum);
 }

+#define NUM_STACK_ELEMS 16
+
 /*[clinic input]
 math.dist

@ -2095,11 +2109,12 @@ math_dist_impl(PyObject *module, PyObject *p, PyObject *q)
 /*[clinic end generated code: output=56bd9538d06bbcfe input=937122eaa5f19272]*/
 {
    PyObject *item;
-    double *diffs;
    double max = 0.0;
    double x, px, qx, result;
    Py_ssize_t i, m, n;
    int found_nan = 0;
+    double diffs_on_stack[NUM_STACK_ELEMS];
+    double *diffs = diffs_on_stack;

    m = PyTuple_GET_SIZE(p);
    n = PyTuple_GET_SIZE(q);
@ -2109,22 +2124,22 @@ math_dist_impl(PyObject *module, PyObject *p, PyObject *q)
        return NULL;

    }
-    diffs = (double *) PyObject_Malloc(n * sizeof(double));
-    if (diffs == NULL) {
-        return NULL;
+    if (n > NUM_STACK_ELEMS) {
+        diffs = (double *) PyObject_Malloc(n * sizeof(double));
+        if (diffs == NULL) {
+            return NULL;
+        }
    }
    for (i=0 ; i<n ; i++) {
        item = PyTuple_GET_ITEM(p, i);
        px = PyFloat_AsDouble(item);
        if (px == -1.0 && PyErr_Occurred()) {
-            PyObject_Free(diffs);
-            return NULL;
+            goto error_exit;
        }
        item = PyTuple_GET_ITEM(q, i);
        qx = PyFloat_AsDouble(item);
        if (qx == -1.0 && PyErr_Occurred()) {
-            PyObject_Free(diffs);
-            return NULL;
+            goto error_exit;
        }
        x = fabs(px - qx);
        diffs[i] = x;
@ -2133,19 +2148,17 @@ math_dist_impl(PyObject *module, PyObject *p, PyObject *q)
            max = x;
        }
    }
-    if (Py_IS_INFINITY(max)) {
-        result = max;
-        goto done;
+    result = vector_norm(n, diffs, max, found_nan);
+    if (diffs != diffs_on_stack) {
+        PyObject_Free(diffs);
    }
-    if (found_nan) {
-        result = Py_NAN;
-        goto done;
-    }
-    result = max * sqrt(scaled_vector_squared(n, diffs, max));
-
-  done:
-    PyObject_Free(diffs);
    return PyFloat_FromDouble(result);
+
+  error_exit:
+    if (diffs != diffs_on_stack) {
+        PyObject_Free(diffs);
+    }
+    return NULL;
 }

 /* AC: cannot convert yet, waiting for *args support */
@ -2154,21 +2167,23 @@ math_hypot(PyObject *self, PyObject *args)
 {
    Py_ssize_t i, n;
    PyObject *item;
-    double *coordinates;
    double max = 0.0;
    double x, result;
    int found_nan = 0;
+    double coord_on_stack[NUM_STACK_ELEMS];
+    double *coordinates = coord_on_stack;

    n = PyTuple_GET_SIZE(args);
-    coordinates = (double *) PyObject_Malloc(n * sizeof(double));
-    if (coordinates == NULL)
-        return NULL;
+    if (n > NUM_STACK_ELEMS) {
+        coordinates = (double *) PyObject_Malloc(n * sizeof(double));
+        if (coordinates == NULL)
+            return NULL;
+    }
    for (i=0 ; i<n ; i++) {
        item = PyTuple_GET_ITEM(args, i);
        x = PyFloat_AsDouble(item);
        if (x == -1.0 && PyErr_Occurred()) {
-            PyObject_Free(coordinates);
-            return NULL;
+            goto error_exit;
        }
        x = fabs(x);
        coordinates[i] = x;
@ -2177,21 +2192,21 @@ math_hypot(PyObject *self, PyObject *args)
            max = x;
        }
    }
-    if (Py_IS_INFINITY(max)) {
-        result = max;
-        goto done;
+    result = vector_norm(n, coordinates, max, found_nan);
+    if (coordinates != coord_on_stack) {
+        PyObject_Free(coordinates);
    }
-    if (found_nan) {
-        result = Py_NAN;
-        goto done;
-    }
-    result = max * sqrt(scaled_vector_squared(n, coordinates, max));
-
-  done:
-    PyObject_Free(coordinates);
    return PyFloat_FromDouble(result);
+
+  error_exit:
+    if (coordinates != coord_on_stack) {
+        PyObject_Free(coordinates);
+    }
+    return NULL;
 }

+#undef NUM_STACK_ELEMS
+
 PyDoc_STRVAR(math_hypot_doc,
             "hypot(*coordinates) -> value\n\n\
 Multidimensional Euclidean distance from the origin to a point.\n\