mirror of
https://github.com/python/cpython.git
synced 2024-11-24 18:34:43 +08:00
bpo-38490: statistics: Add covariance, Pearson's correlation, and simple linear regression (#16813)
Co-authored-by: Tymoteusz Wołodźko <twolodzko+gitkraken@gmail.com
This commit is contained in:
parent
172c0f2752
commit
09aa6f914d
@ -68,6 +68,17 @@ tends to deviate from the typical or average values.
|
||||
:func:`variance` Sample variance of data.
|
||||
======================= =============================================
|
||||
|
||||
Statistics for relations between two inputs
|
||||
-------------------------------------------
|
||||
|
||||
These functions calculate statistics regarding relations between two inputs.
|
||||
|
||||
========================= =====================================================
|
||||
:func:`covariance` Sample covariance for two variables.
|
||||
:func:`correlation` Pearson's correlation coefficient for two variables.
|
||||
:func:`linear_regression` Intercept and slope for simple linear regression.
|
||||
========================= =====================================================
|
||||
|
||||
|
||||
Function details
|
||||
----------------
|
||||
@ -566,6 +577,98 @@ However, for reading convenience, most of the examples show sorted sequences.
|
||||
|
||||
.. versionadded:: 3.8
|
||||
|
||||
.. function:: covariance(x, y, /)
|
||||
|
||||
Return the sample covariance of two inputs *x* and *y*. Covariance
|
||||
is a measure of the joint variability of two inputs.
|
||||
|
||||
Both inputs must be of the same length (no less than two), otherwise
|
||||
:exc:`StatisticsError` is raised.
|
||||
|
||||
Examples:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
|
||||
>>> covariance(x, y)
|
||||
0.75
|
||||
>>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> covariance(x, z)
|
||||
-7.5
|
||||
>>> covariance(z, x)
|
||||
-7.5
|
||||
|
||||
.. versionadded:: 3.10
|
||||
|
||||
.. function:: correlation(x, y, /)
|
||||
|
||||
Return the `Pearson's correlation coefficient
|
||||
<https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
|
||||
for two inputs. Pearson's correlation coefficient *r* takes values
|
||||
between -1 and +1. It measures the strength and direction of the linear
|
||||
relationship, where +1 means very strong, positive linear relationship,
|
||||
-1 very strong, negative linear relationship, and 0 no linear relationship.
|
||||
|
||||
Both inputs must be of the same length (no less than two), and need
|
||||
not to be constant, otherwise :exc:`StatisticsError` is raised.
|
||||
|
||||
Examples:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> correlation(x, x)
|
||||
1.0
|
||||
>>> correlation(x, y)
|
||||
-1.0
|
||||
|
||||
.. versionadded:: 3.10
|
||||
|
||||
.. function:: linear_regression(regressor, dependent_variable)
|
||||
|
||||
Return the intercept and slope of `simple linear regression
|
||||
<https://en.wikipedia.org/wiki/Simple_linear_regression>`_
|
||||
parameters estimated using ordinary least squares. Simple linear
|
||||
regression describes relationship between *regressor* and
|
||||
*dependent variable* in terms of linear function:
|
||||
|
||||
*dependent_variable = intercept + slope \* regressor + noise*
|
||||
|
||||
where ``intercept`` and ``slope`` are the regression parameters that are
|
||||
estimated, and noise term is an unobserved random variable, for the
|
||||
variability of the data that was not explained by the linear regression
|
||||
(it is equal to the difference between prediction and the actual values
|
||||
of dependent variable).
|
||||
|
||||
Both inputs must be of the same length (no less than two), and regressor
|
||||
needs not to be constant, otherwise :exc:`StatisticsError` is raised.
|
||||
|
||||
For example, if we took the data on the data on `release dates of the Monty
|
||||
Python films <https://en.wikipedia.org/wiki/Monty_Python#Films>`_, and used
|
||||
it to predict the cumulative number of Monty Python films produced, we could
|
||||
predict what would be the number of films they could have made till year
|
||||
2019, assuming that they kept the pace.
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> year = [1971, 1975, 1979, 1982, 1983]
|
||||
>>> films_total = [1, 2, 3, 4, 5]
|
||||
>>> intercept, slope = linear_regression(year, films_total)
|
||||
>>> round(intercept + slope * 2019)
|
||||
16
|
||||
|
||||
We could also use it to "predict" how many Monty Python films existed when
|
||||
Brian Cohen was born.
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> round(intercept + slope * 1)
|
||||
-610
|
||||
|
||||
.. versionadded:: 3.10
|
||||
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
@ -1051,6 +1051,14 @@ The :mod:`shelve` module now uses :data:`pickle.DEFAULT_PROTOCOL` by default
|
||||
instead of :mod:`pickle` protocol ``3`` when creating shelves.
|
||||
(Contributed by Zackery Spytz in :issue:`34204`.)
|
||||
|
||||
statistics
|
||||
----------
|
||||
|
||||
Added :func:`~statistics.covariance`, Pearson's
|
||||
:func:`~statistics.correlation`, and simple
|
||||
:func:`~statistics.linear_regression` functions.
|
||||
(Contributed by Tymoteusz Wołodźko in :issue:`38490`.)
|
||||
|
||||
site
|
||||
----
|
||||
|
||||
|
@ -73,6 +73,30 @@ second argument to the four "spread" functions to avoid recalculating it:
|
||||
2.5
|
||||
|
||||
|
||||
Statistics for relations between two inputs
|
||||
-------------------------------------------
|
||||
|
||||
================== ====================================================
|
||||
Function Description
|
||||
================== ====================================================
|
||||
covariance Sample covariance for two variables.
|
||||
correlation Pearson's correlation coefficient for two variables.
|
||||
linear_regression Intercept and slope for simple linear regression.
|
||||
================== ====================================================
|
||||
|
||||
Calculate covariance, Pearson's correlation, and simple linear regression
|
||||
for two inputs:
|
||||
|
||||
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
|
||||
>>> covariance(x, y)
|
||||
0.75
|
||||
>>> correlation(x, y) #doctest: +ELLIPSIS
|
||||
0.31622776601...
|
||||
>>> linear_regression(x, y) #doctest:
|
||||
LinearRegression(intercept=1.5, slope=0.1)
|
||||
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
@ -98,6 +122,9 @@ __all__ = [
|
||||
'quantiles',
|
||||
'stdev',
|
||||
'variance',
|
||||
'correlation',
|
||||
'covariance',
|
||||
'linear_regression',
|
||||
]
|
||||
|
||||
import math
|
||||
@ -110,7 +137,7 @@ from itertools import groupby, repeat
|
||||
from bisect import bisect_left, bisect_right
|
||||
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
|
||||
from operator import itemgetter
|
||||
from collections import Counter
|
||||
from collections import Counter, namedtuple
|
||||
|
||||
# === Exceptions ===
|
||||
|
||||
@ -826,6 +853,113 @@ def pstdev(data, mu=None):
|
||||
return math.sqrt(var)
|
||||
|
||||
|
||||
# === Statistics for relations between two inputs ===
|
||||
|
||||
# See https://en.wikipedia.org/wiki/Covariance
|
||||
# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
|
||||
# https://en.wikipedia.org/wiki/Simple_linear_regression
|
||||
|
||||
|
||||
def covariance(x, y, /):
|
||||
"""Covariance
|
||||
|
||||
Return the sample covariance of two inputs *x* and *y*. Covariance
|
||||
is a measure of the joint variability of two inputs.
|
||||
|
||||
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
|
||||
>>> covariance(x, y)
|
||||
0.75
|
||||
>>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> covariance(x, z)
|
||||
-7.5
|
||||
>>> covariance(z, x)
|
||||
-7.5
|
||||
|
||||
"""
|
||||
n = len(x)
|
||||
if len(y) != n:
|
||||
raise StatisticsError('covariance requires that both inputs have same number of data points')
|
||||
if n < 2:
|
||||
raise StatisticsError('covariance requires at least two data points')
|
||||
xbar = mean(x)
|
||||
ybar = mean(y)
|
||||
total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
|
||||
return total / (n - 1)
|
||||
|
||||
|
||||
def correlation(x, y, /):
|
||||
"""Pearson's correlation coefficient
|
||||
|
||||
Return the Pearson's correlation coefficient for two inputs. Pearson's
|
||||
correlation coefficient *r* takes values between -1 and +1. It measures the
|
||||
strength and direction of the linear relationship, where +1 means very
|
||||
strong, positive linear relationship, -1 very strong, negative linear
|
||||
relationship, and 0 no linear relationship.
|
||||
|
||||
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> correlation(x, x)
|
||||
1.0
|
||||
>>> correlation(x, y)
|
||||
-1.0
|
||||
|
||||
"""
|
||||
n = len(x)
|
||||
if len(y) != n:
|
||||
raise StatisticsError('correlation requires that both inputs have same number of data points')
|
||||
if n < 2:
|
||||
raise StatisticsError('correlation requires at least two data points')
|
||||
cov = covariance(x, y)
|
||||
stdx = stdev(x)
|
||||
stdy = stdev(y)
|
||||
try:
|
||||
return cov / (stdx * stdy)
|
||||
except ZeroDivisionError:
|
||||
raise StatisticsError('at least one of the inputs is constant')
|
||||
|
||||
|
||||
LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope'])
|
||||
|
||||
|
||||
def linear_regression(regressor, dependent_variable, /):
|
||||
"""Intercept and slope for simple linear regression
|
||||
|
||||
Return the intercept and slope of simple linear regression
|
||||
parameters estimated using ordinary least squares. Simple linear
|
||||
regression describes relationship between *regressor* and
|
||||
*dependent variable* in terms of linear function::
|
||||
|
||||
dependent_variable = intercept + slope * regressor + noise
|
||||
|
||||
where ``intercept`` and ``slope`` are the regression parameters that are
|
||||
estimated, and noise term is an unobserved random variable, for the
|
||||
variability of the data that was not explained by the linear regression
|
||||
(it is equal to the difference between prediction and the actual values
|
||||
of dependent variable).
|
||||
|
||||
The parameters are returned as a named tuple.
|
||||
|
||||
>>> regressor = [1, 2, 3, 4, 5]
|
||||
>>> noise = NormalDist().samples(5, seed=42)
|
||||
>>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)]
|
||||
>>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS
|
||||
LinearRegression(intercept=1.75684970486..., slope=3.09078914170...)
|
||||
|
||||
"""
|
||||
n = len(regressor)
|
||||
if len(dependent_variable) != n:
|
||||
raise StatisticsError('linear regression requires that both inputs have same number of data points')
|
||||
if n < 2:
|
||||
raise StatisticsError('linear regression requires at least two data points')
|
||||
try:
|
||||
slope = covariance(regressor, dependent_variable) / variance(regressor)
|
||||
except ZeroDivisionError:
|
||||
raise StatisticsError('regressor is constant')
|
||||
intercept = mean(dependent_variable) - slope * mean(regressor)
|
||||
return LinearRegression(intercept=intercept, slope=slope)
|
||||
|
||||
|
||||
## Normal Distribution #####################################################
|
||||
|
||||
|
||||
|
@ -2407,6 +2407,84 @@ class TestQuantiles(unittest.TestCase):
|
||||
quantiles([10, None, 30], n=4) # data is non-numeric
|
||||
|
||||
|
||||
class TestBivariateStatistics(unittest.TestCase):
|
||||
|
||||
def test_unequal_size_error(self):
|
||||
for x, y in [
|
||||
([1, 2, 3], [1, 2]),
|
||||
([1, 2], [1, 2, 3]),
|
||||
]:
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.covariance(x, y)
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.correlation(x, y)
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.linear_regression(x, y)
|
||||
|
||||
def test_small_sample_error(self):
|
||||
for x, y in [
|
||||
([], []),
|
||||
([], [1, 2,]),
|
||||
([1, 2,], []),
|
||||
([1,], [1,]),
|
||||
([1,], [1, 2,]),
|
||||
([1, 2,], [1,]),
|
||||
]:
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.covariance(x, y)
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.correlation(x, y)
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.linear_regression(x, y)
|
||||
|
||||
|
||||
class TestCorrelationAndCovariance(unittest.TestCase):
|
||||
|
||||
def test_results(self):
|
||||
for x, y, result in [
|
||||
([1, 2, 3], [1, 2, 3], 1),
|
||||
([1, 2, 3], [-1, -2, -3], -1),
|
||||
([1, 2, 3], [3, 2, 1], -1),
|
||||
([1, 2, 3], [1, 2, 1], 0),
|
||||
([1, 2, 3], [1, 3, 2], 0.5),
|
||||
]:
|
||||
self.assertAlmostEqual(statistics.correlation(x, y), result)
|
||||
self.assertAlmostEqual(statistics.covariance(x, y), result)
|
||||
|
||||
def test_different_scales(self):
|
||||
x = [1, 2, 3]
|
||||
y = [10, 30, 20]
|
||||
self.assertAlmostEqual(statistics.correlation(x, y), 0.5)
|
||||
self.assertAlmostEqual(statistics.covariance(x, y), 5)
|
||||
|
||||
y = [.1, .2, .3]
|
||||
self.assertAlmostEqual(statistics.correlation(x, y), 1)
|
||||
self.assertAlmostEqual(statistics.covariance(x, y), 0.1)
|
||||
|
||||
|
||||
class TestLinearRegression(unittest.TestCase):
|
||||
|
||||
def test_constant_input_error(self):
|
||||
x = [1, 1, 1,]
|
||||
y = [1, 2, 3,]
|
||||
with self.assertRaises(statistics.StatisticsError):
|
||||
statistics.linear_regression(x, y)
|
||||
|
||||
def test_results(self):
|
||||
for x, y, true_intercept, true_slope in [
|
||||
([1, 2, 3], [0, 0, 0], 0, 0),
|
||||
([1, 2, 3], [1, 2, 3], 0, 1),
|
||||
([1, 2, 3], [100, 100, 100], 100, 0),
|
||||
([1, 2, 3], [12, 14, 16], 10, 2),
|
||||
([1, 2, 3], [-1, -2, -3], 0, -1),
|
||||
([1, 2, 3], [21, 22, 23], 20, 1),
|
||||
([1, 2, 3], [5.1, 5.2, 5.3], 5, 0.1),
|
||||
]:
|
||||
intercept, slope = statistics.linear_regression(x, y)
|
||||
self.assertAlmostEqual(intercept, true_intercept)
|
||||
self.assertAlmostEqual(slope, true_slope)
|
||||
|
||||
|
||||
class TestNormalDist:
|
||||
|
||||
# General note on precision: The pdf(), cdf(), and overlap() methods
|
||||
|
@ -1927,6 +1927,7 @@ David Wolever
|
||||
Klaus-Juergen Wolf
|
||||
Dan Wolfe
|
||||
Richard Wolff
|
||||
Tymoteusz Wołodźko
|
||||
Adam Woodbeck
|
||||
William Woodruff
|
||||
Steven Work
|
||||
|
@ -0,0 +1 @@
|
||||
Covariance, Pearson's correlation, and simple linear regression functionality was added to statistics module. Patch by Tymoteusz Wołodźko.
|
Loading…
Reference in New Issue
Block a user