Histograms¶
Faisal Qureshi
Professor
Faculty of Science
Ontario Tech University
Oshawa ON Canada
http://vclab.science.ontariotechu.ca
Copyright information¶
© Faisal Qureshi
License¶
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import make_blobs
from matplotlib.image import NonUniformImage
Lesson Plan¶
- Histograms in 1D and 2D
- Construction
- Visualization
- Non-uniform bins
Histogram¶
- A plot that lets us discover the underlying frequency distribution of some data.
Example 1D¶
# Fix the seed so every run draws the same sample.
np.random.seed(0)
# Two populations of 1000 points each: uniform draws scaled to [0, 4) and
# normal draws with mean 9 and standard deviation 2.
uniform_part = np.random.rand(1000) * 4
normal_part = np.random.randn(1000) * 2 + 9
x = np.concatenate((uniform_part, normal_part))
# Shuffle in place so the two populations are interleaved.
np.random.shuffle(x)
Plotting raw data¶
Let's plot this data using matplotlib's plot method and see if we can make any sense of it.
# Plot each sample against its position in the array.
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title("This data plot doesn't provide any insight")
ax.plot(x, '.');
Histogram construction¶
That didn't work out. Let's construct a histogram. We will use the following steps:
- Find the minimum and maximum value in x
- Use these values to define bins
- Count how many items in x fall into each of these bins
- Plot bin counts
# Smallest and largest sample; the next cell uses them as the outermost bin edges.
minimum = x.min()
maximum = x.max()
print(f'Minimum value in x is {minimum}')
print(f'Maximum value in x is {maximum}')
Minimum value in x is 0.0021838595879826173 Maximum value in x is 15.34194954658036
nbin = 40
# nbin bins are delimited by nbin + 1 equally spaced boundaries that span
# [minimum, maximum].
n_edges = nbin + 1
bins = np.linspace(minimum, maximum, num=n_edges)
print(f'Bins are defined by these boundary values:\n{bins.reshape(nbin+1,-1)}')
Bins are defined by these boundary values: [[2.18385959e-03] [3.85678002e-01] [7.69172144e-01] [1.15266629e+00] [1.53616043e+00] [1.91965457e+00] [2.30314871e+00] [2.68664285e+00] [3.07013700e+00] [3.45363114e+00] [3.83712528e+00] [4.22061942e+00] [4.60411357e+00] [4.98760771e+00] [5.37110185e+00] [5.75459599e+00] [6.13809013e+00] [6.52158428e+00] [6.90507842e+00] [7.28857256e+00] [7.67206670e+00] [8.05556085e+00] [8.43905499e+00] [8.82254913e+00] [9.20604327e+00] [9.58953741e+00] [9.97303156e+00] [1.03565257e+01] [1.07400198e+01] [1.11235140e+01] [1.15070081e+01] [1.18905023e+01] [1.22739964e+01] [1.26574906e+01] [1.30409847e+01] [1.34244788e+01] [1.38079730e+01] [1.41914671e+01] [1.45749613e+01] [1.49584554e+01] [1.53419495e+01]]
# np.histogram returns the number of samples per bin together with the
# bin edges it used (here, the edges passed in).
counts, edges = np.histogram(x, bins=bins)
# Print both as column vectors for readability.
print(f'Counts for different bins are ({len(counts)}):\n{counts.reshape(nbin,-1)}')
print(f'Counts are for these edges ({len(edges)}):\n{edges.reshape(nbin+1,-1)}')
Counts for different bins are (40): [[ 89] [106] [ 95] [111] [ 98] [ 87] [ 88] [ 95] [ 81] [109] [ 47] [ 0] [ 5] [ 10] [ 20] [ 23] [ 31] [ 33] [ 60] [ 56] [ 59] [ 74] [ 66] [ 91] [ 84] [ 77] [ 65] [ 47] [ 59] [ 20] [ 39] [ 23] [ 18] [ 10] [ 8] [ 7] [ 5] [ 3] [ 0] [ 1]] Counts are for these edges (41): [[2.18385959e-03] [3.85678002e-01] [7.69172144e-01] [1.15266629e+00] [1.53616043e+00] [1.91965457e+00] [2.30314871e+00] [2.68664285e+00] [3.07013700e+00] [3.45363114e+00] [3.83712528e+00] [4.22061942e+00] [4.60411357e+00] [4.98760771e+00] [5.37110185e+00] [5.75459599e+00] [6.13809013e+00] [6.52158428e+00] [6.90507842e+00] [7.28857256e+00] [7.67206670e+00] [8.05556085e+00] [8.43905499e+00] [8.82254913e+00] [9.20604327e+00] [9.58953741e+00] [9.97303156e+00] [1.03565257e+01] [1.07400198e+01] [1.11235140e+01] [1.15070081e+01] [1.18905023e+01] [1.22739964e+01] [1.26574906e+01] [1.30409847e+01] [1.34244788e+01] [1.38079730e+01] [1.41914671e+01] [1.45749613e+01] [1.49584554e+01] [1.53419495e+01]]
Let's find the bin centers using the edges.
# A bin's center is the midpoint of its left and right edge.
left_edges, right_edges = edges[:-1], edges[1:]
centers = (left_edges + right_edges) / 2
print(f'Bin centers are({len(centers)}):\n{centers.reshape(nbin,-1)}')
Bin centers are(40): [[ 0.19393093] [ 0.57742507] [ 0.96091922] [ 1.34441336] [ 1.7279075 ] [ 2.11140164] [ 2.49489578] [ 2.87838993] [ 3.26188407] [ 3.64537821] [ 4.02887235] [ 4.41236649] [ 4.79586064] [ 5.17935478] [ 5.56284892] [ 5.94634306] [ 6.32983721] [ 6.71333135] [ 7.09682549] [ 7.48031963] [ 7.86381377] [ 8.24730792] [ 8.63080206] [ 9.0142962 ] [ 9.39779034] [ 9.78128449] [10.16477863] [10.54827277] [10.93176691] [11.31526105] [11.6987552 ] [12.08224934] [12.46574348] [12.84923762] [13.23273176] [13.61622591] [13.99972005] [14.38321419] [14.76670833] [15.15020248]]
Plotting counts¶
# One marker per bin: bin center on the x-axis, bin count on the y-axis.
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('Plotting counts vs bin centers')
ax.set_xlabel('Bin centers')
ax.set_ylabel('Counts')
ax.plot(centers, counts, '.');
Using Bar plot to visualize a histogram¶
Let's plot this as a bar graph.
# plt.get_cmap replaces cm.get_cmap, which was deprecated in Matplotlib 3.7.
cmap = plt.get_cmap('Spectral')
# Color each bar by its height relative to the tallest bar (values in [0, 1]).
max_count = np.max(counts)
colors = cmap(counts / max_count)
plt.figure(figsize=(15,7))
plt.title('Histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Counts')
# Use the actual bin widths so adjacent bars touch instead of overlapping
# (a fixed width of .5 is wider than the bins).
plt.bar(centers, counts, width=np.diff(edges), color=colors);
/var/folders/__/7_6vnl8x6tn6gq0nhwd6j4tc0000gn/T/ipykernel_71906/64945571.py:1: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. cmap = cm.get_cmap('Spectral')
Histogram normalization¶
Converting a histogram into a probability distribution
# Normalization: divide by the total number of samples so the bin values
# sum to 1, i.e. form a discrete probability distribution. (Dividing by
# the L2 norm would give a unit-length vector, not a distribution.)
counts = counts / np.sum(counts)
# Color each bar by its height relative to the tallest bar.
max_count = np.max(counts)
colors = cmap(counts / max_count)
plt.figure(figsize=(15,7))
plt.title('Histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Probability')
# Use the actual bin widths so adjacent bars touch instead of overlapping.
plt.bar(centers, counts, width=np.diff(edges), color=colors);
Visualizing histogram bins¶
# Draw the raw samples and overlay one horizontal grid line per bin edge,
# so each horizontal band corresponds to one histogram bin.
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title('Visualizing bins')
ax.set_yticks(bins, minor=False)
ax.yaxis.grid(True, which='major')
ax.plot(x, '.');
Example in 2D¶
# Three point sets; the generated labels are discarded.
#   d1: 50000 points around two centers, standard deviation 1.5 each
#   d2: 10000 points around one center, standard deviation 1.5
#   d3: 10000 points in one tight cluster, standard deviation 0.3
d1, _ = make_blobs(
    n_samples=50000,
    centers=[[-4, -6], [2, -4]],
    cluster_std=[1.5, 1.5],
)
d2, _ = make_blobs(
    n_samples=10000,
    centers=[[2, 1]],
    cluster_std=[1.5],
)
d3, _ = make_blobs(
    n_samples=10000,
    centers=[[-1, -4.5]],
    cluster_std=[0.3],
)

# data 1
data = np.vstack((d1, d2))
# Alternative that also includes the tight cluster d3 (presumably the
# "data 2" that later cells refer to):
#data = np.vstack([d1, d2, d3])
print(f'Shape of data = {data.shape}')
Shape of data = (60000, 2)
Plotting raw data¶
# Scatter the 2D points inside a fixed [-10, 8] x [-10, 8] window.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Raw data')
ax.set_xlim(-10, 8)
ax.set_ylim(-10, 8)
ax.plot(data[:, 0], data[:, 1], '.');
Setting up 2D grid¶
Let's define a grid over this 2D space. We will then count the number of points in each cell.
# Number of cells along each axis of the grid.
nbins_x = 100
nbins_y = 100
# n bins are delimited by n + 1 edges, so ask linspace for one more point
# than the number of bins.
edges_x = np.linspace(-10, 8, nbins_x + 1)
edges_y = np.linspace(-10, 8, nbins_y + 1)
# Overlay the bin edges on the scatter plot as grid lines.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
ax.set_title('Visualizing 2D grid')

# x direction: fixed window, bin edges registered as minor ticks.
ax.set_xlim(-10, 8)
ax.set_xticks(edges_x, minor=True)
ax.xaxis.grid(True, which='both')

# y direction: same treatment.
ax.set_ylim(-10, 8)
ax.set_yticks(edges_y, minor=True)
ax.yaxis.grid(True, which='both')

ax.plot(data[:, 0], data[:, 1], '.');
Constructing 2D histogram¶
Use numpy.histogram2d to compute the histogram. We can visualize this histogram as an image, for example.
# Count the points falling in each grid cell; only the count matrix is
# kept, the returned edge arrays are dropped.
xs, ys = data[:, 0], data[:, 1]
H = np.histogram2d(xs, ys, bins=(edges_x, edges_y))[0]
Visualizing 2D histogram as an image¶
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, title='Visualizing histogram as an image')
# Map the image onto the data coordinates covered by the bin edges.
extent = [edges_x[0], edges_x[-1], edges_y[0], edges_y[-1]]
# H is transposed for display and origin='lower' puts the first row at
# the bottom of the image.
plt.imshow(H.T, interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
# Uncomment the following if using data 2 above. Notice the +1. This is to ensure that np.log doesn't
# complain when it receives a 0 count
#plt.imshow(np.log(H.T+1), interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
plt.colorbar();
Visualizing 2D histogram as a bar plot¶
# Bin centers along each axis (midpoints of consecutive edges).
centers_x = (edges_x[:-1] + edges_x[1:]) / 2
centers_y = (edges_y[:-1] + edges_y[1:]) / 2
# With the default 'xy' indexing both grids have one row per y bin and
# one column per x bin.
cxx, cyy = np.meshgrid(centers_x, centers_y)
cxx_, cyy_ = cxx.ravel(), cyy.ravel()
# plt.get_cmap replaces cm.get_cmap, which was deprecated in Matplotlib 3.7.
cmap = plt.get_cmap('Spectral')
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d', title='2D histogram visualized as a bar plot')
# np.histogram2d indexes H as [x_bin, y_bin], whereas the grids above are
# [y_bin, x_bin]; transpose before flattening so each count is paired
# with its own (x, y) position.
H_ = H.T.ravel()
bottom_ = np.zeros_like(H_)
width = .1
depth = .1
# Color each bar by its height relative to the tallest bar.
colors = H_ / np.max(H_)
# bar3d anchors every bar at its (x, y) corner; shift by half the
# footprint so the bar is centered on the bin center.
ax.bar3d(cxx_ - width / 2, cyy_ - depth / 2, bottom_, width, depth, H_, color=cmap(colors));
/var/folders/__/7_6vnl8x6tn6gq0nhwd6j4tc0000gn/T/ipykernel_71906/8972550.py:1: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. cmap = cm.get_cmap('Spectral')
Non-uniform bins¶
The bins don't all have to be the same size, or square (in the case of 2D bins). Oftentimes it is advantageous to choose bins that are not uniform. E.g., it is possible to pick a higher number of bins around regions of interest, or regions that contain a lot of data.
# Hand-picked, unevenly spaced bin edges along each axis.
edges_x = np.array([-10,-7,-6,-5,-4,-3,-2,8])
edges_y = np.array([-10,-9,-8,-7,-6,-5,-4,-3,-2,0,8])
plt.figure(figsize=(10,10))
ax = plt.subplot(111, title='Visualizing non-uniform 2D grid')
# Semi-transparent markers keep the red grid lines visible.
plt.plot(data[:,0], data[:,1], '.', alpha=.3);
# Place the major ticks on the bin edges so the grid lines trace the bin boundaries.
ax.set_xlim(-10, 8)
ax.set_xticks(edges_x)
ax.xaxis.grid(True, color='red', linewidth=2)
ax.set_ylim(-10, 8)
ax.set_yticks(edges_y)
ax.yaxis.grid(True, color='red', linewidth=2)
We can compute the histogram as before
# Count the points per cell of the non-uniform grid, keeping only the
# count matrix, then transpose it so rows correspond to the y-axis.
H = np.histogram2d(data[:, 0], data[:, 1], bins=(edges_x, edges_y))[0].T
Histogram visualization¶
However, if we try to visualize this histogram as an image as before, it doesn't make any sense. Recall that images are shown on a uniform grid, whereas the bin centers for the histogram seen above do not align with a uniform grid.
First attempt¶
Visualizing a histogram computed over a non-uniform grid as a regular image doesn't work
# Deliberately naive: imshow is given only the outer extent of the grid,
# not the individual bin edges.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, title='Visualizing a non-uniform histogram as an image')
extent = [edges_x[0], edges_x[-1], edges_y[0], edges_y[-1]]
plt.imshow(H, interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
# Uncomment the following if using data 2 above. Notice the +1. This is to ensure that np.log doesn't
# complain when it receives a 0 count. H was already transposed when it was computed, so it is used as is.
#plt.imshow(np.log(H+1), interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
plt.colorbar();
Second attempt¶
We will visualize a histogram computed on a non-uniform grid as a non-uniform image.
# Bin centers and their grids are kept because the 3D bar plot in the
# following cell uses them.
centers_x = (edges_x[:-1] + edges_x[1:])/2
centers_y = (edges_y[:-1] + edges_y[1:])/2
cxx, cyy = np.meshgrid(centers_x, centers_y)
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, title='Visualizing 2D histogram on a non-uniform grid.')
# Hand pcolormesh the bin edges rather than the centers, so every cell is
# drawn with its true, non-uniform extent. With shading='flat' the edge
# arrays must be one longer than H along each axis; H has rows along y
# (it was transposed when it was computed), which is the layout
# pcolormesh expects.
ax.pcolormesh(edges_x, edges_y, H, cmap='Spectral', shading='flat');
# plt.get_cmap replaces cm.get_cmap, which was deprecated in Matplotlib 3.7.
cmap = plt.get_cmap('Spectral')
# Flatten the center grids and the counts in the same order; H already
# has rows along y, matching the grids.
cxx_, cyy_ = cxx.ravel(), cyy.ravel()
H_ = H.ravel()
bottom_ = np.zeros_like(H_)
# Every bar gets the same fixed footprint, regardless of its bin's size.
width = .5
depth = .5
# Color each bar by its height relative to the tallest bar.
colors = H_ / np.max(H_)
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d', title='2D histogram visualized as a bar plot')
ax.set_xlabel('x')
ax.set_ylabel('y')
# bar3d anchors every bar at its (x, y) corner; shift by half the
# footprint so the bar is centered on the bin center.
ax.bar3d(cxx_ - width / 2, cyy_ - depth / 2, bottom_, width, depth, H_, color=cmap(colors));
/var/folders/__/7_6vnl8x6tn6gq0nhwd6j4tc0000gn/T/ipykernel_71906/4200530748.py:1: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. cmap = cm.get_cmap('Spectral')
Histograms in higher dimensions¶
Check out numpy.histogramdd to construct histograms of higher-dimensional data.