1.11. Visualization of the Central Limit Theorem#
The general statement of the central limit theory (CLT) is that the sum of
# set up for plots in this notebook using matplotlib
%matplotlib inline
import scipy.stats as stats # We'll use stats as our source of pdfs
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() # nicer plots!
Limit of Poisson distribution is Gaussian#
One consequence of the CLT is that distributions such as the Binomial and Poisson distributions all tend to look like Gaussian distributions in the limit of a large number of drawings.
Here we visualize how the Poisson distribution in the limit
with
Note: this limiting functional form is not obviously connected to the general statement of the CLT up above. What is the connection of the Gaussian limit to a sum of random variables as prescribed by the CLT? Answered below with a test.
This visualization is adapted from the amplitude_in_presence_of_background_RECAP.ipynb
notebook.
from math import factorial
def poisson(N, D):
"""
Returns a Poisson distribution value with mean D for integer N.
We require N to be an integer greater than equal to zero.
"""
assert (isinstance(N, int) and N >= 0), \
"N must be a non-negative integer!"
return D**N * np.exp(-D) / factorial(N)
def poisson_plot(ax, D, max_N):
"""
Make a bar plot on the axis ax of the Poisson distribution for mu = D
and out to a maximum integer max_N.
UPDATE: we eplaced our own poisson function with poisson.pmf from
scipy.stats so that we can go to larger N.
"""
N_pts = np.arange(0, max_N, 1, dtype=int)
poisson_pts = [stats.poisson.pmf(N, D) for N in N_pts] # poisson_pts = [poisson(int(N), D) for N in N_pts]
ax.bar(N_pts, poisson_pts, width=0.8, bottom=None, align='center')
ax.set_xlabel(r'Number of counts $N$', fontsize=18)
ax.set_ylabel(fr'$\mathrm{{p}}(N \mid D={D:.1f})$', fontsize=18)
ax.set_title(rf'$D = {D:.1f}$', fontsize=20)
ax.tick_params(labelsize=16)
return N_pts
Make successive histogram plots of the Poisson distribution, increasing the value of
Note that the arguments of stats.norm.pdf are the # of points, the mean, and the standard deviation (not the variance) of the distribution.
D_list = np.array([0.5, 1.2, 1.7, 12.5, 20., 50., 100., 150.])
num_rows = int(D_list.size / 2)
fig, axes = plt.subplots(num_rows, 2, figsize=(15, 5 * num_rows))
for D, ax in zip(D_list, axes.flatten()):
max_N = int(D + 6.*np.sqrt(D)) # mean + multiple of standard deviation
N_pts = np.arange(0, max_N, 0.01)
y_pts = stats.norm.pdf(N_pts, D, np.sqrt(D))
poisson_plot(ax, D, max_N)
ax.plot(N_pts, y_pts, color='red')
fig.tight_layout()
Compare a sum of D Poisson draws with mean 1 to a single Poisson distribution with mean D#
Now we answer the question asked above: What is the connection of the Gaussian limit to a sum of random variables as prescribed by the CLT?
The answer is that a sum of Poisson distributed random variables is itself Poisson distributed. For example, if
Here we test this answer by plotting the histogram of num_draws samples, each the sum of
D = 5
num_draws = 1000
# set upper range of N
max_N = int(D + 6.*np.sqrt(D)) # mean + multiple of standard deviation
# make an array of the Poisson pmf with mean D
N_pts = np.arange(0, max_N, 1, dtype=int)
poisson_pts = [stats.poisson.pmf(N, D) for N in N_pts]
# (num_draws) samples of the sum of D mean-1 poisson draws
test_poisson_pts = np.array([np.sum([stats.poisson.rvs(1) for index in range(D)]) \
for draws in range(num_draws)])
my_bins = np.arange(0, test_poisson_pts.max() + 1.5) - 0.5
# Gaussian limit (mean D and standard deviation D)
norm_x_pts = np.arange(0, max_N, 0.01)
norm_y_pts = stats.norm.pdf(norm_x_pts, D, np.sqrt(D))
# Make the plot
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
# normalized histogram of draws from summed Poissons with mean 1
ax.hist(test_poisson_pts, my_bins, density=True, color='green', alpha=0.5,
label='sum of Poisson')
# plot the Poisson pmf with mean D
ax.bar(N_pts, poisson_pts, width=0.8, bottom=None, align='center',
color='blue', alpha=.5, label=fr'$\mathrm{{p}}(N \mid D={D})$')
# plot normal distribution
ax.plot(norm_x_pts, norm_y_pts, color='red', label='normal')
ax.set_xlabel(r'Number of counts $N$', fontsize=18)
ax.set_ylabel(fr'$\mathrm{{p}}(N)$', fontsize=18)
ax.set_title(rf'Comparing sum of {D} mean 1 Poissons vs. mean $D = {D}$ Poisson', fontsize=20)
ax.tick_params(labelsize=16)
ax.legend(fontsize=18)
fig.tight_layout()
Things to try:
Change the number of draws
Change the value of D
Behavior of the mean of a fixed-size sample#
Here we take the mean of a fixed-size sample drawn from any pdf, and then repeat a number of times and look at the distribution of the means. According to the CLT, this distribution should approach a Gaussian pdf. For our test pdf we’ll use a uniform distribution, although this is easy to switch to another distribution.
import numpy as np
import matplotlib.pyplot as plt
# Sample the distribution we will use (by default a uniform distribution)
total_draws = 100000
uni_min, uni_max = 0., 1.
ran_uniform_array = np.random.uniform(uni_min, uni_max, total_draws)
bins = 15 # bins for the historam
# Find the mean and standard deviation of our distribution, to use for
# plotting a comparison Gaussian distribution.
mu = ran_uniform_array.mean()
sigma = ran_uniform_array.std()
print(f' mean of uniform array = {mu:.3f}')
print(f' standard deviation of uniform array = {sigma:.3f}')
def histogram_ax(ax, means_array, N_means, sample_size, bins, CLT_pdf=False):
"""
Plot a histogram on axis ax that shows the means. Add the expected
limiting normal distribution if CLT_pdf is True.
"""
sigma_tilde = sigma / np.sqrt(sample_size)
x_min = mu - 4.*sigma_tilde
x_max = mu + 4.*sigma_tilde
(_, bin_array, _) = ax.hist(means_array[:N_means], bins = bins,
align='mid')
bin_width = bin_array[1] - bin_array[0]
title_string = fr'Sampling size n={sample_size} of uniform pdf ' + \
fr'drawn {N_means:d} times'
ax.set_title(title_string)
ax.set_xlabel('Value of Mean')
ax.set_ylabel('Count in bin')
ax.set_xlim(x_min, x_max)
if (CLT_pdf): # if true, plot a normal pdf with the same mu and sigma
# divided by the sqrt of the sample size.
x_pts = np.linspace(x_min, x_max, 200)
y_pts = stats.norm.pdf(x_pts, mu, sigma_tilde)
ax.plot(x_pts, y_pts * (bin_width*N_means), color='red')
def plot_sample_result(sample_size, bins):
"""Plot a series of histograms to show the approach to a Gaussian."""
sample_means_fixed_sample_size = []
for i in range(10000):
samples = ran_uniform_array[np.random.randint(ran_uniform_array.size,
size = sample_size)]
sample_means_fixed_sample_size.append(np.mean(samples))
N_means_array = np.array([1, 5, 10, 20, 50, 100, 500, 1000, 10000])
fig3 = plt.figure(figsize=(15,10))
axes = fig3.subplots(nrows=3, ncols=3)
fig3.suptitle(f'Sample size n={sample_size:d} sampled by various times',
fontsize=16, va='baseline')
for index, ax in enumerate(axes.flatten()):
histogram_ax(ax, sample_means_fixed_sample_size, N_means_array[index],
sample_size, bins, CLT_pdf=True)
fig3.tight_layout(w_pad=1.8)
fig3.subplots_adjust(top = 0.92)
First example: each sample is only one point#
Since the sample size is one, the mean is just the point, so the distributions here will look like the original distribution (no approach to Gaussian). Note that we bin the trivial means (the individual draws) into 20 bins.
plot_sample_result(1, 20) # n=1 so just the original distribution
As expected, we just get a better and better reproduction of the original distribution, which is uniform in this case. We don’t expect any connection to a Gaussian from the CLT in this case. That is, just taking a lot of samples from a distribution doesn’t mean in general that we will get a normal distribution.
Second example: each sample is two points#
So the mean is the average of the two points. We divide the means into 20 bins.
plot_sample_result(2, 20)
As the number of means gets larger, we see that a peak develops at 1/2 and the shape becomes triangular. If we think of the average of two uniformly drawn points from [0,1], it is most likely that their average will be close to 1/2 and unlikely to be at the ends (e.g., to get near 0, one would need both numbers to be close to 0).
Third example: each sample is 10 points#
Now we see the trends from
plot_sample_result(10, 20)
Fourth example: each sample is 50 points#
By now we are taking the mean of 50 points with each sample. This will increasingly be close to 1/2 and very rarely far from 1/2. The width decreases as
plot_sample_result(50, 20)
Adding variables drawn from a distribution#
We’ll use the results derived for combining
So the idea will be to take a distribution, do an FT with the frequency divided by
Doing Fourier transforms by numerical integration (rather than FFT)#
We’ll try first with the scipy.integrate
function quad.
from scipy.integrate import quad
def cosineFT(omega, fun):
"""Calculate the cosine Fourier transform (CFT) using quad from
scipy.integrate with the cosine weight.
"""
ans, error = quad(fun, 0., np.inf, weight='cos', wvar=omega)
return 2.*ans, error # note the 2 because integration from [0, inf]
def cosineFT2(omega, fun):
"""Calculate the cosine Fourier transform (CFT) using quad from
scipy.integrate with the cosine term added explicitly.
"""
integrand = lambda x: np.cos(omega * x) * fun(x)
ans, error = quad(integrand, -np.inf, np.inf)
return ans, error
def invCFT(t, omega_pts, FT_pts):
"""Calculate the inverse cosine Fourier transform (invCFT) using numpy's
trapz function. Includes 1/2\pi factor
"""
integrand_pts = np.array( [np.cos(omega * t) * FT_value \
for omega, FT_value in zip(omega_pts, FT_pts)]
)
return np.trapz(integrand_pts, omega_pts) / (2. * np.pi)
def gaussian(x, sigma=1.):
"""One-dimensional normalized Gaussian."""
return 1./np.sqrt(2. * np.pi * sigma**2) * np.exp(-x**2/(2.*sigma**2))
We’ll use a uniform distribution again.
# FT of uniform distribution
x_pts = np.linspace(-4., 4., 401) # x range
# uniform distribution from -1 to +1
uni_dist = stats.uniform(loc=-1., scale=2.)
uni_dist_pts = uni_dist.pdf(x_pts)
uni_gauss_pts = np.array([gaussian(x, sigma=uni_dist.std()) for x in x_pts])
omega_pts = np.linspace(-40., 40., 401)
#FT_uniform = np.array([cosineFT(omega, uni_dist.pdf)[0] for omega in omega_pts])
def CFT_n(fun, omega_pts, n=1):
"""Take the Fourier transform of fun wrt omega/\sqrt{n} and then
raise it to the n'th power.
"""
CFT_n_pts = np.array([cosineFT(omega / np.sqrt(n), fun)[0] \
for omega in omega_pts])
return CFT_n_pts**n
n_vals = np.array([1, 2, 3, 4, 5, 8])
FT_uniform_pts = np.array([CFT_n(uni_dist.pdf, omega_pts, n) \
for n in n_vals])
invFT_uniform_pts = np.array([[invCFT(x, omega_pts, FT_uniform_pts[i])
for x in x_pts] for i in range(n_vals.size)])
The plots below on the right are the
fig, ax = plt.subplots(n_vals.size, 2, figsize=(15, 5 * n_vals.size))
ax[0, 0].plot(x_pts, uni_dist_pts, color='blue')
for index, n in enumerate(n_vals):
ax[index, 0].plot(x_pts, invFT_uniform_pts[index],
color='blue', label=rf'invFT[uniform] $n={n}$')
ax[index, 0].plot(x_pts, uni_gauss_pts, color='red',
label='CLT gaussian')
ax[index, 0].legend()
ax[index, 0].set_title(rf'$n = {n}$')
ax[index, 1].plot(omega_pts, FT_uniform_pts[index],
color='blue', label=rf'FT[uniform] $n={n}$')
ax[index, 1].legend()
ax[index, 1].set_title(rf'$n = {n}$')
fig.tight_layout()
So we see that multiplying the original Fourier transform will rapidly kill off the tails and highlight the central part. The