Histograms

Faisal Qureshi
Professor
Faculty of Science
Ontario Tech University
Oshawa ON Canada
http://vclab.science.ontariotechu.ca

© Faisal Qureshi

In [384]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import make_blobs
from matplotlib.image import NonUniformImage

Lesson Plan

  • Histograms in 1D and 2D

Histogram

  • A plot that lets us discover the underlying frequency distribution of some data.

Example 1D

In [385]:
# Seed the legacy global generator so the sample below is reproducible.
np.random.seed(0)

# Two populations of 1000 values each: uniform draws scaled to [0, 4),
# followed by normal draws with standard deviation 2 centred on 9.
uniform_part = 4 * np.random.rand(1000)
normal_part = 9 + 2 * np.random.randn(1000)
x = np.concatenate((uniform_part, normal_part))

# Shuffle in place so the two populations are interleaved.
np.random.shuffle(x)

Plotting raw data

Let's plot this data using matplotlib's plot method and see if we can make any sense of it.

In [386]:
# Plot each value of x against its position in the array, one dot per sample.
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title("This data plot doesn't provide any insight")
ax.plot(x, '.');

Histogram construction

That didn't work out. Let's construct a histogram. We will use the following steps:

  • Find the minimum and maximum value in x
  • Use these values to define bins
  • Count how many items in x fall into each of these bins
  • Plot bin counts
In [387]:
# Smallest and largest values in x; together they give the range the bins must cover.
minimum, maximum = x.min(), x.max()

print(f'Minimum value in x is {minimum}')
print(f'Maximum value in x is {maximum}')
Minimum value in x is 0.0021838595879826173
Maximum value in x is 15.34194954658036
In [388]:
# Number of bins in the histogram.
nbin = 40

# nbin bins are delimited by nbin + 1 evenly spaced boundaries from minimum to maximum.
bins = np.linspace(start=minimum, stop=maximum, num=nbin + 1)

# The reshape prints the boundaries as a single column, one value per row.
print(f'Bins are defined by these boundary values:\n{bins.reshape(nbin+1,-1)}')
Bins are defined by these boundary values:
[[2.18385959e-03]
 [3.85678002e-01]
 [7.69172144e-01]
 [1.15266629e+00]
 [1.53616043e+00]
 [1.91965457e+00]
 [2.30314871e+00]
 [2.68664285e+00]
 [3.07013700e+00]
 [3.45363114e+00]
 [3.83712528e+00]
 [4.22061942e+00]
 [4.60411357e+00]
 [4.98760771e+00]
 [5.37110185e+00]
 [5.75459599e+00]
 [6.13809013e+00]
 [6.52158428e+00]
 [6.90507842e+00]
 [7.28857256e+00]
 [7.67206670e+00]
 [8.05556085e+00]
 [8.43905499e+00]
 [8.82254913e+00]
 [9.20604327e+00]
 [9.58953741e+00]
 [9.97303156e+00]
 [1.03565257e+01]
 [1.07400198e+01]
 [1.11235140e+01]
 [1.15070081e+01]
 [1.18905023e+01]
 [1.22739964e+01]
 [1.26574906e+01]
 [1.30409847e+01]
 [1.34244788e+01]
 [1.38079730e+01]
 [1.41914671e+01]
 [1.45749613e+01]
 [1.49584554e+01]
 [1.53419495e+01]]
In [389]:
# Count how many values of x fall in each bin. np.histogram returns the
# per-bin counts together with the edges it used.
counts, edges = np.histogram(x, bins=bins)

# Print both as single columns; the lengths show there is one more edge than there are counts.
print(f'Counts for different bins are ({len(counts)}):\n{counts.reshape(nbin,-1)}')
print(f'Counts are for these edges ({len(edges)}):\n{edges.reshape(nbin+1,-1)}')
Counts for different bins are (40):
[[ 89]
 [106]
 [ 95]
 [111]
 [ 98]
 [ 87]
 [ 88]
 [ 95]
 [ 81]
 [109]
 [ 47]
 [  0]
 [  5]
 [ 10]
 [ 20]
 [ 23]
 [ 31]
 [ 33]
 [ 60]
 [ 56]
 [ 59]
 [ 74]
 [ 66]
 [ 91]
 [ 84]
 [ 77]
 [ 65]
 [ 47]
 [ 59]
 [ 20]
 [ 39]
 [ 23]
 [ 18]
 [ 10]
 [  8]
 [  7]
 [  5]
 [  3]
 [  0]
 [  1]]
Counts are for these edges (41):
[[2.18385959e-03]
 [3.85678002e-01]
 [7.69172144e-01]
 [1.15266629e+00]
 [1.53616043e+00]
 [1.91965457e+00]
 [2.30314871e+00]
 [2.68664285e+00]
 [3.07013700e+00]
 [3.45363114e+00]
 [3.83712528e+00]
 [4.22061942e+00]
 [4.60411357e+00]
 [4.98760771e+00]
 [5.37110185e+00]
 [5.75459599e+00]
 [6.13809013e+00]
 [6.52158428e+00]
 [6.90507842e+00]
 [7.28857256e+00]
 [7.67206670e+00]
 [8.05556085e+00]
 [8.43905499e+00]
 [8.82254913e+00]
 [9.20604327e+00]
 [9.58953741e+00]
 [9.97303156e+00]
 [1.03565257e+01]
 [1.07400198e+01]
 [1.11235140e+01]
 [1.15070081e+01]
 [1.18905023e+01]
 [1.22739964e+01]
 [1.26574906e+01]
 [1.30409847e+01]
 [1.34244788e+01]
 [1.38079730e+01]
 [1.41914671e+01]
 [1.45749613e+01]
 [1.49584554e+01]
 [1.53419495e+01]]

Let's find the bin centers using the edges.

In [390]:
# The center of each bin is the midpoint of its left and right edges.
left_edges, right_edges = edges[:-1], edges[1:]
centers = (left_edges + right_edges) / 2

print(f'Bin centers are({len(centers)}):\n{centers.reshape(nbin,-1)}')
Bin centers are(40):
[[ 0.19393093]
 [ 0.57742507]
 [ 0.96091922]
 [ 1.34441336]
 [ 1.7279075 ]
 [ 2.11140164]
 [ 2.49489578]
 [ 2.87838993]
 [ 3.26188407]
 [ 3.64537821]
 [ 4.02887235]
 [ 4.41236649]
 [ 4.79586064]
 [ 5.17935478]
 [ 5.56284892]
 [ 5.94634306]
 [ 6.32983721]
 [ 6.71333135]
 [ 7.09682549]
 [ 7.48031963]
 [ 7.86381377]
 [ 8.24730792]
 [ 8.63080206]
 [ 9.0142962 ]
 [ 9.39779034]
 [ 9.78128449]
 [10.16477863]
 [10.54827277]
 [10.93176691]
 [11.31526105]
 [11.6987552 ]
 [12.08224934]
 [12.46574348]
 [12.84923762]
 [13.23273176]
 [13.61622591]
 [13.99972005]
 [14.38321419]
 [14.76670833]
 [15.15020248]]

Plotting counts

In [391]:
# One dot per bin: bin center on the x axis, number of samples in the bin on the y axis.
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('Plotting counts vs bin centers')
ax.set_xlabel('Bin centers')
ax.set_ylabel('Counts')
ax.plot(centers, counts, '.');

Using Bar plot to visualize a histogram

Let's plot this as a bar graph.

In [392]:
# plt.get_cmap is used rather than cm.get_cmap: the latter was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')

# Scale the counts to [0, 1] so that each bar is coloured by its height
# relative to the tallest bar.
max_count = np.max(counts)
colors = cmap(counts / max_count)

plt.figure(figsize=(15,7))
plt.title('Histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Counts')
# Make every bar exactly as wide as its bin. A fixed width of .5 is larger
# than the bin spacing here, which makes neighbouring bars overlap.
plt.bar(centers, counts, width=np.diff(edges), color=colors);

Histogram normalization

Converting a histogram into a probability distribution

In [393]:
# Normalization: divide by the total number of samples so that the bin values
# sum to 1 and can be read as probabilities. (Dividing by np.linalg.norm would
# scale the vector to unit Euclidean length instead, which does not sum to 1.)
# The result gets its own name so `counts` keeps the raw counts and this cell
# gives the same answer however many times it is run.
probabilities = counts / np.sum(counts)

# Colour each bar by its height relative to the tallest bar.
max_probability = np.max(probabilities)
colors = cmap(probabilities / max_probability)

plt.figure(figsize=(15,7))
plt.title('Normalized histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Probability')
# Bars are as wide as their bins so that neighbouring bars do not overlap.
plt.bar(centers, probabilities, width=np.diff(edges), color=colors);

Visualizing histogram bins

In [394]:
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title('Visualizing bins')

# Put one major y tick at every bin boundary and draw a horizontal grid line
# there, so the bands between lines are the histogram bins.
ax.set_yticks(bins)
ax.grid(True, axis='y', which='major')

# Raw samples: value on the y axis against position in the array.
ax.plot(x, '.');

Example in 2D

In [467]:
# Three groups of 2D points drawn with make_blobs. random_state is fixed (and
# different for each call) so the same points are produced every time this
# cell runs, independently of the state of NumPy's global generator.
d1, _ = make_blobs(n_samples=50000, centers=[[-4, -6], [2, -4]], cluster_std=[1.5, 1.5], random_state=0)
d2, _ = make_blobs(n_samples=10000, centers=[[2, 1]], cluster_std=[1.5], random_state=1)
d3, _ = make_blobs(n_samples=10000, centers=[[-1, -4.5]], cluster_std=[0.3], random_state=2)

# data 1: the three clusters with standard deviation 1.5
data = np.vstack([d1, d2])
# data 2: data 1 plus the tight cluster d3 (standard deviation 0.3).
# Comment out the line above and uncomment the line below to use it.
#data = np.vstack([d1, d2, d3])

print(f'Shape of data = {data.shape}')
Shape of data = (60000, 2)

Plotting raw data

In [468]:
# Scatter of the 2D points on a fixed [-10, 8] x [-10, 8] window.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Raw data')
ax.set_xlim(-10, 8)
ax.set_ylim(-10, 8)
ax.plot(data[:, 0], data[:, 1], '.');

Setting up 2D grid

Let's define a grid over this 2D space. We will then count the number of points in each cell.

In [469]:
# Number of histogram bins along each axis.
nbins_x = 100
nbins_y = 100

# n bins are delimited by n + 1 edges, so ask linspace for nbins + 1 values.
# (Requesting only nbins values would give nbins - 1 bins.)
edges_x = np.linspace(-10, 8, nbins_x + 1)
edges_y = np.linspace(-10, 8, nbins_y + 1)
In [470]:
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Visualizing 2D grid')

# x direction: fix the view, then place a minor tick at every bin edge and
# draw grid lines at both major and minor ticks.
ax.set_xlim(-10, 8)
ax.set_xticks(edges_x, minor=True)
ax.xaxis.grid(True, which='both')

# y direction: same treatment.
ax.set_ylim(-10, 8)
ax.set_yticks(edges_y, minor=True)
ax.yaxis.grid(True, which='both')

# Overlay the data points on the grid.
ax.plot(data[:, 0], data[:, 1], '.');

Constructing 2D histogram

Use numpy.histogram2d to compute the histogram. We can visualize this histogram as an image, for example.

In [471]:
# Count the points that fall in each grid cell. np.histogram2d also returns the
# x and y edges it used; they are the ones passed in, so only the counts are kept.
H, _, _ = np.histogram2d(
    x=data[:, 0],
    y=data[:, 1],
    bins=(edges_x, edges_y),
)

Visualizing 2D histogram as an image

In [472]:
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, title='Visualizing histogram as an image')

# Data coordinates covered by the image: [left, right, bottom, top].
extent = [edges_x[0], edges_x[-1], edges_y[0], edges_y[-1]]

# H is transposed before display. origin='lower' puts the first row of the
# image at the bottom of the axes; Matplotlib accepts only 'upper' or 'lower'
# for this argument, so the abbreviation 'low' must not be used.
image = ax.imshow(H.T, interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
# To show log-scaled counts instead, comment out the line above and uncomment the
# line below. The +1 ensures that np.log never receives a 0 count.
#image = ax.imshow(np.log(H.T+1), interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
fig.colorbar(image, ax=ax);

Visualizing 2D histogram as a bar plot

In [473]:
# Bin centers along each axis: the midpoint of every pair of consecutive edges.
centers_x = (edges_x[:-1] + edges_x[1:]) / 2
centers_y = (edges_y[:-1] + edges_y[1:]) / 2

# Build the grid with matrix ('ij') indexing so that cxx[i, j] == centers_x[i]
# and cyy[i, j] == centers_y[j]. np.histogram2d bins x along the first axis of
# its result and y along the second, so with this layout the flattened
# coordinates line up element-for-element with the flattened histogram.
# The default 'xy' indexing would pair each count with the transposed cell.
cxx, cyy = np.meshgrid(centers_x, centers_y, indexing='ij')
cxx_, cyy_ = cxx.ravel(), cyy.ravel()
In [474]:
# plt.get_cmap is used rather than cm.get_cmap: the latter was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d', title='2D histogram visualized as a bar plot')

# Flatten the counts; every bar starts at height 0.
H_ = H.ravel()
bottom_ = np.zeros_like(H_)

# Footprint of each bar in data units.
width = .1
depth = .1

# Counts scaled to [0, 1], used to look up each bar's colour.
colors = H_ / np.max(H_)

# bar3d treats the given (x, y) as the anchor corner of each bar, so shift by
# half the footprint to centre the bars on the supplied coordinates.
ax.bar3d(cxx_ - width / 2, cyy_ - depth / 2, bottom_, width, depth, H_, color=cmap(colors));