# Histograms¶

Faisal Qureshi
Professor
Faculty of Science
Ontario Tech University
http://vclab.science.ontariotechu.ca

In [384]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import make_blobs
from matplotlib.image import NonUniformImage


## Lesson Plan¶

• Histograms in 1D and 2D

## Histogram¶

• A plot that lets us discover the underlying frequency distribution of some data.

## Example 1D¶

In [385]:
# Reproducible synthetic sample: 1000 uniform draws scaled to [0, 4) followed by
# 1000 Gaussian draws (mean 9, std 2), shuffled so sample order carries no structure.
np.random.seed(0)

uniform_part = 4 * np.random.rand(1000)
gaussian_part = 9 + 2 * np.random.randn(1000)
x = np.concatenate((uniform_part, gaussian_part))
np.random.shuffle(x)


### Plotting raw data¶

Let's plot this data using matplotlib's `plot` method and see if we can make any sense of it.

In [386]:
# Plot every sample against its position in the array (a single-argument plot
# uses the index as the horizontal coordinate).
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title("This data plot doesn't provide any insight")
ax.plot(x, '.');


### Histogram construction¶

That didn't work out. Let's construct a histogram instead. We will use the following steps:

• Find the minimum and maximum value in x
• Use these values to define bins
• Count how many items in x fall into each of these bins
• Plot bin counts
In [387]:
# The smallest and largest samples define the range the histogram bins must cover.
minimum, maximum = x.min(), x.max()

print(f'Minimum value in x is {minimum}')
print(f'Maximum value in x is {maximum}')

Minimum value in x is 0.0021838595879826173
Maximum value in x is 15.34194954658036

In [388]:
# nbin equal-width bins are delimited by nbin + 1 boundary values
# spanning the closed interval [minimum, maximum].
nbin = 40
bins = np.linspace(start=minimum, stop=maximum, num=nbin + 1)

print(f'Bins are defined by these boundary values:\n{bins.reshape(nbin+1,-1)}')

Bins are defined by these boundary values:
[[2.18385959e-03]
[3.85678002e-01]
[7.69172144e-01]
[1.15266629e+00]
[1.53616043e+00]
[1.91965457e+00]
[2.30314871e+00]
[2.68664285e+00]
[3.07013700e+00]
[3.45363114e+00]
[3.83712528e+00]
[4.22061942e+00]
[4.60411357e+00]
[4.98760771e+00]
[5.37110185e+00]
[5.75459599e+00]
[6.13809013e+00]
[6.52158428e+00]
[6.90507842e+00]
[7.28857256e+00]
[7.67206670e+00]
[8.05556085e+00]
[8.43905499e+00]
[8.82254913e+00]
[9.20604327e+00]
[9.58953741e+00]
[9.97303156e+00]
[1.03565257e+01]
[1.07400198e+01]
[1.11235140e+01]
[1.15070081e+01]
[1.18905023e+01]
[1.22739964e+01]
[1.26574906e+01]
[1.30409847e+01]
[1.34244788e+01]
[1.38079730e+01]
[1.41914671e+01]
[1.45749613e+01]
[1.49584554e+01]
[1.53419495e+01]]

In [389]:
# np.histogram returns the number of samples per bin together with the bin
# edges it used; there is always one more edge than there are bins.
counts, edges = np.histogram(x, bins=bins)

print(f'Counts for different bins are ({len(counts)}):\n{counts.reshape(nbin,-1)}')
print(f'Counts are for these edges ({len(edges)}):\n{edges.reshape(nbin+1,-1)}')

Counts for different bins are (40):
[[ 89]
[106]
[ 95]
[111]
[ 98]
[ 87]
[ 88]
[ 95]
[ 81]
[109]
[ 47]
[  0]
[  5]
[ 10]
[ 20]
[ 23]
[ 31]
[ 33]
[ 60]
[ 56]
[ 59]
[ 74]
[ 66]
[ 91]
[ 84]
[ 77]
[ 65]
[ 47]
[ 59]
[ 20]
[ 39]
[ 23]
[ 18]
[ 10]
[  8]
[  7]
[  5]
[  3]
[  0]
[  1]]
Counts are for these edges (41):
[[2.18385959e-03]
[3.85678002e-01]
[7.69172144e-01]
[1.15266629e+00]
[1.53616043e+00]
[1.91965457e+00]
[2.30314871e+00]
[2.68664285e+00]
[3.07013700e+00]
[3.45363114e+00]
[3.83712528e+00]
[4.22061942e+00]
[4.60411357e+00]
[4.98760771e+00]
[5.37110185e+00]
[5.75459599e+00]
[6.13809013e+00]
[6.52158428e+00]
[6.90507842e+00]
[7.28857256e+00]
[7.67206670e+00]
[8.05556085e+00]
[8.43905499e+00]
[8.82254913e+00]
[9.20604327e+00]
[9.58953741e+00]
[9.97303156e+00]
[1.03565257e+01]
[1.07400198e+01]
[1.11235140e+01]
[1.15070081e+01]
[1.18905023e+01]
[1.22739964e+01]
[1.26574906e+01]
[1.30409847e+01]
[1.34244788e+01]
[1.38079730e+01]
[1.41914671e+01]
[1.45749613e+01]
[1.49584554e+01]
[1.53419495e+01]]


Let's find the bin centers using the edges.

In [390]:
# Each bin's center is the midpoint between its left and right edge.
left_edges, right_edges = edges[:-1], edges[1:]
centers = 0.5 * (left_edges + right_edges)
print(f'Bin centers are({len(centers)}):\n{centers.reshape(nbin,-1)}')

Bin centers are(40):
[[ 0.19393093]
[ 0.57742507]
[ 0.96091922]
[ 1.34441336]
[ 1.7279075 ]
[ 2.11140164]
[ 2.49489578]
[ 2.87838993]
[ 3.26188407]
[ 3.64537821]
[ 4.02887235]
[ 4.41236649]
[ 4.79586064]
[ 5.17935478]
[ 5.56284892]
[ 5.94634306]
[ 6.32983721]
[ 6.71333135]
[ 7.09682549]
[ 7.48031963]
[ 7.86381377]
[ 8.24730792]
[ 8.63080206]
[ 9.0142962 ]
[ 9.39779034]
[ 9.78128449]
[10.16477863]
[10.54827277]
[10.93176691]
[11.31526105]
[11.6987552 ]
[12.08224934]
[12.46574348]
[12.84923762]
[13.23273176]
[13.61622591]
[13.99972005]
[14.38321419]
[14.76670833]
[15.15020248]]


### Plotting counts¶

In [391]:
# One marker per bin: horizontal position is the bin center, height is the count.
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('Plotting counts vs bin centers')
ax.set_xlabel('Bin centers')
ax.set_ylabel('Counts')
ax.plot(centers, counts, '.');


### Using Bar plot to visualize a histogram¶

Let's plot this as a bar graph.

In [392]:
# Look the colormap up through pyplot: `matplotlib.cm.get_cmap` has been removed
# from recent Matplotlib releases, while `plt.get_cmap` remains supported.
cmap = plt.get_cmap('Spectral')

# Colour each bar by its height relative to the tallest bin (values in [0, 1]).
max_count = np.max(counts)
colors = cmap(counts / max_count)

plt.figure(figsize=(15,7))
plt.title('Histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Counts')
# Draw every bar with its true bin width. A fixed width of 0.5 is wider than the
# bins built above (about 0.38 each), which makes neighbouring bars overlap.
plt.bar(centers, counts, width=np.diff(edges), color=colors);


### Histogram normalization¶

Converting a histogram into a probability distribution

In [393]:
# A probability distribution must sum to 1, so divide each count by the total
# number of samples. (Dividing by np.linalg.norm yields a unit-length vector in
# the L2 sense, whose entries do not sum to 1.)
# The result gets its own name so the raw `counts` stay available and the cell
# gives the same result when re-run.
probabilities = counts / counts.sum()

# Colour each bar by its height relative to the most probable bin.
max_probability = np.max(probabilities)
colors = cmap(probabilities / max_probability)

plt.figure(figsize=(15,7))
plt.title('Normalized histogram shown as a bar plot')
plt.xlabel('Bin centers')
plt.ylabel('Probability')
# Use the true bin widths so neighbouring bars touch instead of overlapping.
plt.bar(centers, probabilities, width=np.diff(edges), color=colors);


### Visualizing histogram bins¶

In [394]:
# Overlay the bin boundaries on the raw samples: every major y tick (and its
# grid line) marks one bin edge, so each horizontal band is one histogram bin.
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title('Visualizing bins')
ax.set_yticks(bins)
ax.grid(True, axis='y', which='major')
ax.plot(x, '.');


## Example in 2D¶

In [467]:
# data 1
d1, _ = make_blobs(n_samples=50000, centers=[[-4, -6], [2, -4]], cluster_std=[1.5, 1.5])
d2, _ = make_blobs(n_samples=10000, centers=[[2, 1]], cluster_std=[1.5])
d3, _ = make_blobs(n_samples=10000, centers=[[-1, -4.5]], cluster_std=[0.3])

data = np.vstack([d1, d2])
#data = np.vstack([d1, d2, d3])

print(f'Shape of data = {data.shape}')

Shape of data = (60000, 2)


### Plotting raw data¶

In [468]:
# Scatter of the raw 2D samples on a fixed [-10, 8] x [-10, 8] window.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Raw data')
ax.set_xlim(-10, 8)
ax.set_ylim(-10, 8)
ax.plot(data[:, 0], data[:, 1], '.');


### Setting up 2D grid¶

Let's define a grid over this 2D space. We will then count the number of points that fall into each cell.

In [469]:
# Number of grid cells along each axis.
nbins_x = 100
nbins_y = 100

# n bins are delimited by n + 1 edges (same convention as the 1D example);
# linspace(..., n) would produce only n - 1 bins.
edges_x = np.linspace(-10, 8, nbins_x + 1)
edges_y = np.linspace(-10, 8, nbins_y + 1)

In [470]:
# Draw the histogram grid on top of the samples: the cell edges are installed as
# minor ticks on both axes and grid lines are enabled for major and minor ticks.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Visualizing 2D grid')

ax.set_xlim(-10, 8)
ax.set_ylim(-10, 8)

ax.set_xticks(edges_x, minor=True)
ax.set_yticks(edges_y, minor=True)

ax.grid(True, which='both')
ax.plot(data[:, 0], data[:, 1], '.');


### Constructing 2D histogram¶

Use numpy.histogram2d to compute the histogram. We can visualize this histogram as an image, for example.

In [471]:
# Count the samples falling into each grid cell. Only the count matrix is kept;
# the returned edge arrays are discarded.
pts_x, pts_y = data[:, 0], data[:, 1]
H, _, _ = np.histogram2d(pts_x, pts_y, bins=(edges_x, edges_y))


### Visualizing 2D histogram as an image¶

In [472]:
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, title='Visualizing histogram as an image')

# histogram2d puts the x bins along the first axis of H, while imshow draws the
# first axis vertically, hence the transpose. `origin` must be 'lower' (only
# 'upper' and 'lower' are valid values) so the first y bin is drawn at the bottom.
extent = [edges_x[0], edges_x[-1], edges_y[0], edges_y[-1]]
image = ax.imshow(H.T, interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
# Use the following line instead when the tight cluster d3 is included in `data`:
# its cells have far higher counts, so a log scale keeps the other clusters visible.
# The +1 ensures that np.log never receives a 0 count.
#image = ax.imshow(np.log(H.T+1), interpolation='nearest', origin='lower', extent=extent, cmap='Spectral')
fig.colorbar(image, ax=ax);


### Visualizing 2D histogram as a bar plot¶

In [473]:
# Midpoints of the grid cells along each axis.
centers_x = (edges_x[:-1] + edges_x[1:]) / 2
centers_y = (edges_y[:-1] + edges_y[1:]) / 2

# indexing='ij' makes cxx[i, j] == centers_x[i] and cyy[i, j] == centers_y[j], the
# same [x_bin, y_bin] layout that np.histogram2d uses for H. With the default
# indexing='xy' the flattened coordinates would be transposed relative to
# H.ravel(), so every bar would be drawn at the mirrored (y, x) position.
cxx, cyy = np.meshgrid(centers_x, centers_y, indexing='ij')
cxx_, cyy_ = cxx.ravel(), cyy.ravel()

In [474]:
# Look the colormap up through pyplot: `matplotlib.cm.get_cmap` has been removed
# from recent Matplotlib releases, while `plt.get_cmap` remains supported.
cmap = plt.get_cmap('Spectral')

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d', title='2D histogram visualized as a bar plot')

# Flatten the counts; the coordinate arrays must be flattened in the same order.
H_ = H.ravel()
bottom_ = np.zeros_like(H_)  # every bar starts at height 0
width = .1   # bar footprint along x
depth = .1   # bar footprint along y

# Bar colour encodes the count relative to the fullest cell (values in [0, 1]).
colors = H_ / np.max(H_)

# bar3d interprets x/y as the anchor corner of each bar, so shift by half the
# footprint to centre the bars on the cell centers.
ax.bar3d(cxx_ - width / 2, cyy_ - depth / 2, bottom_, width, depth, H_, color=cmap(colors));