mirror of
https://github.com/python/cpython.git
synced 2025-09-03 15:31:08 +00:00
gh-115532: Add kernel density estimation to the statistics module (gh-115863)
This commit is contained in:
parent
6a3236fe2e
commit
6d34eb0e36
5 changed files with 285 additions and 41 deletions
|
@ -76,6 +76,7 @@ or sample.
|
||||||
:func:`fmean` Fast, floating point arithmetic mean, with optional weighting.
|
:func:`fmean` Fast, floating point arithmetic mean, with optional weighting.
|
||||||
:func:`geometric_mean` Geometric mean of data.
|
:func:`geometric_mean` Geometric mean of data.
|
||||||
:func:`harmonic_mean` Harmonic mean of data.
|
:func:`harmonic_mean` Harmonic mean of data.
|
||||||
|
:func:`kde` Estimate the probability density distribution of the data.
|
||||||
:func:`median` Median (middle value) of data.
|
:func:`median` Median (middle value) of data.
|
||||||
:func:`median_low` Low median of data.
|
:func:`median_low` Low median of data.
|
||||||
:func:`median_high` High median of data.
|
:func:`median_high` High median of data.
|
||||||
|
@ -259,6 +260,54 @@ However, for reading convenience, most of the examples show sorted sequences.
|
||||||
.. versionchanged:: 3.10
|
.. versionchanged:: 3.10
|
||||||
Added support for *weights*.
|
Added support for *weights*.
|
||||||
|
|
||||||
|
|
||||||
|
.. function:: kde(data, h, kernel='normal')
|
||||||
|
|
||||||
|
`Kernel Density Estimation (KDE)
|
||||||
|
<https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
|
||||||
|
Create a continuous probability density function from discrete samples.
|
||||||
|
|
||||||
|
The basic idea is to smooth the data using `a kernel function
|
||||||
|
<https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
|
||||||
|
to help draw inferences about a population from a sample.
|
||||||
|
|
||||||
|
The degree of smoothing is controlled by the scaling parameter *h*
|
||||||
|
which is called the bandwidth. Smaller values emphasize local
|
||||||
|
features while larger values give smoother results.
|
||||||
|
|
||||||
|
The *kernel* determines the relative weights of the sample data
|
||||||
|
points. Generally, the choice of kernel shape does not matter
|
||||||
|
as much as the more influential bandwidth smoothing parameter.
|
||||||
|
|
||||||
|
Kernels that give some weight to every sample point include
|
||||||
|
*normal* or *gauss*, *logistic*, and *sigmoid*.
|
||||||
|
|
||||||
|
Kernels that only give weight to sample points within the bandwidth
|
||||||
|
include *rectangular* or *uniform*, *triangular*, *parabolic* or
|
||||||
|
*epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
|
||||||
|
|
||||||
|
A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
|
||||||
|
|
||||||
|
`Wikipedia has an example
|
||||||
|
<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
|
||||||
|
where we can use :func:`kde` to generate and plot a probability
|
||||||
|
density function estimated from a small sample:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
|
>>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
|
||||||
|
>>> f_hat = kde(sample, h=1.5)
|
||||||
|
>>> xarr = [i/100 for i in range(-750, 1100)]
|
||||||
|
>>> yarr = [f_hat(x) for x in xarr]
|
||||||
|
|
||||||
|
The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
|
||||||
|
|
||||||
|
.. image:: kde_example.png
|
||||||
|
:alt: Scatter plot of the estimated probability density function.
|
||||||
|
|
||||||
|
.. versionadded:: 3.13
|
||||||
|
|
||||||
|
|
||||||
.. function:: median(data)
|
.. function:: median(data)
|
||||||
|
|
||||||
Return the median (middle value) of numeric data, using the common "mean of
|
Return the median (middle value) of numeric data, using the common "mean of
|
||||||
|
@ -1095,46 +1144,6 @@ The final prediction goes to the largest posterior. This is known as the
|
||||||
'female'
|
'female'
|
||||||
|
|
||||||
|
|
||||||
Kernel density estimation
|
|
||||||
*************************
|
|
||||||
|
|
||||||
It is possible to estimate a continuous probability density function
|
|
||||||
from a fixed number of discrete samples.
|
|
||||||
|
|
||||||
The basic idea is to smooth the data using `a kernel function such as a
|
|
||||||
normal distribution, triangular distribution, or uniform distribution
|
|
||||||
<https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
|
|
||||||
The degree of smoothing is controlled by a scaling parameter, ``h``,
|
|
||||||
which is called the *bandwidth*.
|
|
||||||
|
|
||||||
.. testcode::
|
|
||||||
|
|
||||||
def kde_normal(sample, h):
|
|
||||||
"Create a continuous probability density function from a sample."
|
|
||||||
# Smooth the sample with a normal distribution kernel scaled by h.
|
|
||||||
kernel_h = NormalDist(0.0, h).pdf
|
|
||||||
n = len(sample)
|
|
||||||
def pdf(x):
|
|
||||||
return sum(kernel_h(x - x_i) for x_i in sample) / n
|
|
||||||
return pdf
|
|
||||||
|
|
||||||
`Wikipedia has an example
|
|
||||||
<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
|
|
||||||
where we can use the ``kde_normal()`` recipe to generate and plot
|
|
||||||
a probability density function estimated from a small sample:
|
|
||||||
|
|
||||||
.. doctest::
|
|
||||||
|
|
||||||
>>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
|
|
||||||
>>> f_hat = kde_normal(sample, h=1.5)
|
|
||||||
>>> xarr = [i/100 for i in range(-750, 1100)]
|
|
||||||
>>> yarr = [f_hat(x) for x in xarr]
|
|
||||||
|
|
||||||
The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
|
|
||||||
|
|
||||||
.. image:: kde_example.png
|
|
||||||
:alt: Scatter plot of the estimated probability density function.
|
|
||||||
|
|
||||||
..
|
..
|
||||||
# This modelines must appear within the last ten lines of the file.
|
# This modelines must appear within the last ten lines of the file.
|
||||||
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;
|
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;
|
||||||
|
|
|
@ -476,6 +476,14 @@ sqlite3
|
||||||
for filtering database objects to dump.
|
for filtering database objects to dump.
|
||||||
(Contributed by Mariusz Felisiak in :gh:`91602`.)
|
(Contributed by Mariusz Felisiak in :gh:`91602`.)
|
||||||
|
|
||||||
|
statistics
|
||||||
|
----------
|
||||||
|
|
||||||
|
* Add :func:`statistics.kde` for kernel density estimation.
|
||||||
|
This makes it possible to estimate a continuous probability density function
|
||||||
|
from a fixed number of discrete samples.
|
||||||
|
(Contributed by Raymond Hettinger in :gh:`115863`.)
|
||||||
|
|
||||||
subprocess
|
subprocess
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
|
|
@ -112,6 +112,7 @@ __all__ = [
|
||||||
'fmean',
|
'fmean',
|
||||||
'geometric_mean',
|
'geometric_mean',
|
||||||
'harmonic_mean',
|
'harmonic_mean',
|
||||||
|
'kde',
|
||||||
'linear_regression',
|
'linear_regression',
|
||||||
'mean',
|
'mean',
|
||||||
'median',
|
'median',
|
||||||
|
@ -137,7 +138,7 @@ from decimal import Decimal
|
||||||
from itertools import count, groupby, repeat
|
from itertools import count, groupby, repeat
|
||||||
from bisect import bisect_left, bisect_right
|
from bisect import bisect_left, bisect_right
|
||||||
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
|
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
|
||||||
from math import isfinite, isinf
|
from math import isfinite, isinf, pi, cos, cosh
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from collections import Counter, namedtuple, defaultdict
|
from collections import Counter, namedtuple, defaultdict
|
||||||
|
@ -802,6 +803,171 @@ def multimode(data):
|
||||||
return [value for value, count in counts.items() if count == maxcount]
|
return [value for value, count in counts.items() if count == maxcount]
|
||||||
|
|
||||||
|
|
||||||
|
def kde(data, h, kernel='normal'):
|
||||||
|
"""Kernel Density Estimation: Create a continuous probability
|
||||||
|
density function from discrete samples.
|
||||||
|
|
||||||
|
The basic idea is to smooth the data using a kernel function
|
||||||
|
to help draw inferences about a population from a sample.
|
||||||
|
|
||||||
|
The degree of smoothing is controlled by the scaling parameter h
|
||||||
|
which is called the bandwidth. Smaller values emphasize local
|
||||||
|
features while larger values give smoother results.
|
||||||
|
|
||||||
|
The kernel determines the relative weights of the sample data
|
||||||
|
points. Generally, the choice of kernel shape does not matter
|
||||||
|
as much as the more influential bandwidth smoothing parameter.
|
||||||
|
|
||||||
|
Kernels that give some weight to every sample point:
|
||||||
|
|
||||||
|
normal or gauss
|
||||||
|
logistic
|
||||||
|
sigmoid
|
||||||
|
|
||||||
|
Kernels that only give weight to sample points within
|
||||||
|
the bandwidth:
|
||||||
|
|
||||||
|
rectangular or uniform
|
||||||
|
triangular
|
||||||
|
parabolic or epanechnikov
|
||||||
|
quartic or biweight
|
||||||
|
triweight
|
||||||
|
cosine
|
||||||
|
|
||||||
|
A StatisticsError will be raised if the data sequence is empty.
|
||||||
|
|
||||||
|
Example
|
||||||
|
-------
|
||||||
|
|
||||||
|
Given a sample of six data points, construct a continuous
|
||||||
|
function that estimates the underlying probability density:
|
||||||
|
|
||||||
|
>>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
|
||||||
|
>>> f_hat = kde(sample, h=1.5)
|
||||||
|
|
||||||
|
Compute the area under the curve:
|
||||||
|
|
||||||
|
>>> sum(f_hat(x) for x in range(-20, 20))
|
||||||
|
1.0
|
||||||
|
|
||||||
|
Plot the estimated probability density function at
|
||||||
|
evenly spaced points from -6 to 10:
|
||||||
|
|
||||||
|
>>> for x in range(-6, 11):
|
||||||
|
... density = f_hat(x)
|
||||||
|
... plot = ' ' * int(density * 400) + 'x'
|
||||||
|
... print(f'{x:2}: {density:.3f} {plot}')
|
||||||
|
...
|
||||||
|
-6: 0.002 x
|
||||||
|
-5: 0.009 x
|
||||||
|
-4: 0.031 x
|
||||||
|
-3: 0.070 x
|
||||||
|
-2: 0.111 x
|
||||||
|
-1: 0.125 x
|
||||||
|
0: 0.110 x
|
||||||
|
1: 0.086 x
|
||||||
|
2: 0.068 x
|
||||||
|
3: 0.059 x
|
||||||
|
4: 0.066 x
|
||||||
|
5: 0.082 x
|
||||||
|
6: 0.082 x
|
||||||
|
7: 0.058 x
|
||||||
|
8: 0.028 x
|
||||||
|
9: 0.009 x
|
||||||
|
10: 0.002 x
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
|
||||||
|
Kernel density estimation and its application:
|
||||||
|
https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf
|
||||||
|
|
||||||
|
Kernel functions in common use:
|
||||||
|
https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use
|
||||||
|
|
||||||
|
Interactive graphical demonstration and exploration:
|
||||||
|
https://demonstrations.wolfram.com/KernelDensityEstimation/
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
n = len(data)
|
||||||
|
if not n:
|
||||||
|
raise StatisticsError('Empty data sequence')
|
||||||
|
|
||||||
|
if not isinstance(data[0], (int, float)):
|
||||||
|
raise TypeError('Data sequence must contain ints or floats')
|
||||||
|
|
||||||
|
if h <= 0.0:
|
||||||
|
raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')
|
||||||
|
|
||||||
|
match kernel:
|
||||||
|
|
||||||
|
case 'normal' | 'gauss':
|
||||||
|
c = 1 / sqrt(2 * pi)
|
||||||
|
K = lambda t: c * exp(-1/2 * t * t)
|
||||||
|
support = None
|
||||||
|
|
||||||
|
case 'logistic':
|
||||||
|
# 1.0 / (exp(t) + 2.0 + exp(-t))
|
||||||
|
K = lambda t: 1/2 / (1.0 + cosh(t))
|
||||||
|
support = None
|
||||||
|
|
||||||
|
case 'sigmoid':
|
||||||
|
# (2/pi) / (exp(t) + exp(-t))
|
||||||
|
c = 1 / pi
|
||||||
|
K = lambda t: c / cosh(t)
|
||||||
|
support = None
|
||||||
|
|
||||||
|
case 'rectangular' | 'uniform':
|
||||||
|
K = lambda t: 1/2
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case 'triangular':
|
||||||
|
K = lambda t: 1.0 - abs(t)
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case 'parabolic' | 'epanechnikov':
|
||||||
|
K = lambda t: 3/4 * (1.0 - t * t)
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case 'quartic' | 'biweight':
|
||||||
|
K = lambda t: 15/16 * (1.0 - t * t) ** 2
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case 'triweight':
|
||||||
|
K = lambda t: 35/32 * (1.0 - t * t) ** 3
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case 'cosine':
|
||||||
|
c1 = pi / 4
|
||||||
|
c2 = pi / 2
|
||||||
|
K = lambda t: c1 * cos(c2 * t)
|
||||||
|
support = 1.0
|
||||||
|
|
||||||
|
case _:
|
||||||
|
raise StatisticsError(f'Unknown kernel name: {kernel!r}')
|
||||||
|
|
||||||
|
if support is None:
|
||||||
|
|
||||||
|
def pdf(x):
|
||||||
|
return sum(K((x - x_i) / h) for x_i in data) / (n * h)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
sample = sorted(data)
|
||||||
|
bandwidth = h * support
|
||||||
|
|
||||||
|
def pdf(x):
|
||||||
|
i = bisect_left(sample, x - bandwidth)
|
||||||
|
j = bisect_right(sample, x + bandwidth)
|
||||||
|
supported = sample[i : j]
|
||||||
|
return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
|
||||||
|
|
||||||
|
pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}'
|
||||||
|
|
||||||
|
return pdf
|
||||||
|
|
||||||
|
|
||||||
# Notes on methods for computing quantiles
|
# Notes on methods for computing quantiles
|
||||||
# ----------------------------------------
|
# ----------------------------------------
|
||||||
#
|
#
|
||||||
|
|
|
@ -2353,6 +2353,66 @@ class TestGeometricMean(unittest.TestCase):
|
||||||
self.assertAlmostEqual(actual_mean, expected_mean, places=5)
|
self.assertAlmostEqual(actual_mean, expected_mean, places=5)
|
||||||
|
|
||||||
|
|
||||||
|
class TestKDE(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_kde(self):
|
||||||
|
kde = statistics.kde
|
||||||
|
StatisticsError = statistics.StatisticsError
|
||||||
|
|
||||||
|
kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
|
||||||
|
'uniform', 'triangular', 'parabolic', 'epanechnikov',
|
||||||
|
'quartic', 'biweight', 'triweight', 'cosine']
|
||||||
|
|
||||||
|
sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
|
||||||
|
|
||||||
|
# The approximate integral of a PDF should be close to 1.0
|
||||||
|
|
||||||
|
def integrate(func, low, high, steps=10_000):
|
||||||
|
"Numeric approximation of a definite function integral."
|
||||||
|
dx = (high - low) / steps
|
||||||
|
midpoints = (low + (i + 1/2) * dx for i in range(steps))
|
||||||
|
return sum(map(func, midpoints)) * dx
|
||||||
|
|
||||||
|
for kernel in kernels:
|
||||||
|
with self.subTest(kernel=kernel):
|
||||||
|
f_hat = kde(sample, h=1.5, kernel=kernel)
|
||||||
|
area = integrate(f_hat, -20, 20)
|
||||||
|
self.assertAlmostEqual(area, 1.0, places=4)
|
||||||
|
|
||||||
|
# Check error cases
|
||||||
|
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
kde([], h=1.0) # Empty dataset
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
kde(['abc', 'def'], 1.5) # Non-numeric data
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
kde(iter(sample), 1.5) # Data is not a sequence
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
kde(sample, h=0.0) # Zero bandwidth
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
kde(sample, h=0.0) # Negative bandwidth
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
kde(sample, h='str') # Wrong bandwidth type
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
kde(sample, h=1.0, kernel='bogus') # Invalid kernel
|
||||||
|
|
||||||
|
# Test name and docstring of the generated function
|
||||||
|
|
||||||
|
h = 1.5
|
||||||
|
kernel = 'cosine'
|
||||||
|
f_hat = kde(sample, h, kernel)
|
||||||
|
self.assertEqual(f_hat.__name__, 'pdf')
|
||||||
|
self.assertIn(kernel, f_hat.__doc__)
|
||||||
|
self.assertIn(str(h), f_hat.__doc__)
|
||||||
|
|
||||||
|
# Test closed interval for the support boundaries.
|
||||||
|
# In particular, 'uniform' should non-zero at the boundaries.
|
||||||
|
|
||||||
|
f_hat = kde([0], 1.0, 'uniform')
|
||||||
|
self.assertEqual(f_hat(-1.0), 1/2)
|
||||||
|
self.assertEqual(f_hat(1.0), 1/2)
|
||||||
|
|
||||||
|
|
||||||
class TestQuantiles(unittest.TestCase):
|
class TestQuantiles(unittest.TestCase):
|
||||||
|
|
||||||
def test_specific_cases(self):
|
def test_specific_cases(self):
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Add kernel density estimation to the statistics module.
|
Loading…
Add table
Add a link
Reference in a new issue