Faisal Qureshi
Professor
Faculty of Science
Ontario Tech University
Oshawa ON Canada
http://vclab.science.ontariotechu.ca
© Faisal Qureshi
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.cm as cm
# Generate some data
np.random.seed(0)
m_true = .75
c_true = 1
x = np.hstack([np.linspace(3, 5, 5), np.linspace(7, 8, 5)])
y = m_true * x + c_true + np.random.rand(len(x))
# Outliers
y[9] = 1
y[0] = 9
data = np.vstack([x, y]).T
n = data.shape[0]
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Generating some points from a line');
x = data[:,0]
y = data[:,1]
print(f'x =\n{x.reshape(n,1)}')
print(f'y =\n{y.reshape(n,1)}')
theta = np.polyfit(x, y, 1)
m = theta[0]
c = theta[1]
print(f'c = {c}')
print(f'm = {m}')
x_plotting = np.linspace(0,10,100)
y_plotting = m * x_plotting + c
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.plot(x_plotting, y_plotting, 'b-')
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('The effect of outliers on least squares');
An outlier is a data point that differs significantly from the other observations. Outliers can have a large negative impact on model fitting, and they are difficult to deal with because we have no way of knowing a priori that a particular data item is an outlier.
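An outlier typically only reveals itself after a model has been fit. As a minimal sketch (reusing x, y, m, and c from the cells above; the 2x-median-absolute-residual threshold is an arbitrary choice for illustration), we can inspect the residuals of the least-squares fit, where the two injected outliers should stand out:
# Rough after-the-fact check: flag points whose residual is unusually large
residuals = y - (m * x + c)
typical = np.median(np.abs(residuals))      # a robust notion of a "typical" residual
suspect = np.abs(residuals) > 2 * typical   # threshold chosen arbitrarily for illustration
print(f'residuals = {np.round(residuals, 2)}')
print(f'suspected outliers at indices: {np.where(suspect)[0]}')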
# Generate some data
np.random.seed(0)
m_true = .75
c_true = 1
x = np.hstack([np.linspace(1, 5, 50), np.linspace(7, 10, 50)])
y = m_true * x + c_true + np.random.rand(len(x))*.2
data = np.vstack([x, y]).T
n = data.shape[0]
outlier_percentage = 5
ind = (np.random.rand(int(n*outlier_percentage/100))*n).astype('int')
data[ind, 1] = np.random.rand(len(ind))*9
#data[ind, 1] = np.random.rand(len(ind))*10+90
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
#plt.ylim([0,100])
plt.title('Noisy data');
def draw_line(theta, ax, **kwargs):
    """Draw the line y = theta[0] * x + theta[1] across the x-extent of the given axes."""
    m = theta[0]
    c = theta[1]
    xmin, xmax = ax.get_xbound()
    ymin = m * xmin + c
    ymax = m * xmax + c
    l = mlines.Line2D([xmin, xmax], [ymin, ymax], **kwargs)
    ax.add_line(l)
theta = np.polyfit(data[:,0], data[:,1], 1)
print(f'#data = {n}')
print(f'theta = {theta}')
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
#plt.ylim([0,100])
plt.title('Least squares in the presence of outliers');
draw_line(theta, plt.gca(), c='black', linewidth=3)
In the discussion below, set
$$ e = y_{\mathrm{predicted}} - y_{\mathrm{true}} $$
and
$$ z = e^2. $$
In other words, $z$ is the squared error.

The following expressions are taken from the scipy implementation of least squares. For an in-depth discussion, I refer the interested reader to "Convex Optimization" by Boyd and Vandenberghe, 2004.

Huber loss

Generally,
$$ \rho(e) = \begin{cases} \frac{1}{2} e^2 & \text{for } |e| \le k \\ k|e| - \frac{1}{2}k^2 & \text{otherwise} \end{cases} $$
Here $k$ is referred to as the tuning constant. Smaller values of $k$ produce more resistance to outliers, but at the expense of lower efficiency when the errors are normally distributed. The tuning constant is generally picked to give reasonably high efficiency in the normal case. Typically, setting $k = 1.345\sigma$ for the Huber loss (where $\sigma$ is the standard deviation of the errors) produces 95-percent efficiency when the errors are normal and still offers protection against outliers.

In scipy.optimize.least_squares, the Huber loss is expressed in terms of $z$ as
$$ \rho(z) = \begin{cases} z & \text{for } z \le 1 \\ 2\sqrt{z} - 1 & \text{otherwise} \end{cases} $$
which is the form plotted below.

Bisquare (Tukey) loss

Generally,
$$ \rho(e) = \begin{cases} \frac{k^2}{6} \left\{ 1 - \left[ 1 - \left( \frac{e}{k} \right)^2 \right]^3 \right\} & \text{for } |e| \le k \\ \frac{k^2}{6} & \text{otherwise} \end{cases} $$

Another robust loss, used in the plots below, is
$$ \rho(z) = \frac{z}{z + \sigma^2} $$
Here $\sigma$ is often referred to as a scale parameter.
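To get a feel for the tuning constant, here is a small illustrative sketch (the values of $k$ are arbitrary choices, not recommendations) that plots the general Huber loss $\rho(e)$ for a few values of $k$: larger $k$ keeps the loss quadratic over a wider range of errors, while smaller $k$ switches to the linear regime sooner, which is what buys resistance to outliers. In scipy.optimize.least_squares the analogous knob is the f_scale argument, which rescales the residuals before the chosen loss is applied.
# Illustrative only: general Huber loss for a few arbitrary tuning constants k
e_demo = np.linspace(-3, 3, 200)
plt.figure(figsize=(5, 5))
for k in [0.5, 1.0, 1.345, 2.0]:
    rho = np.where(np.abs(e_demo) <= k,
                   0.5 * e_demo**2,
                   k * np.abs(e_demo) - 0.5 * k**2)
    plt.plot(e_demo, rho, label=f'k = {k}')
plt.grid()
plt.legend()
plt.title('Huber loss for different tuning constants');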
e = np.linspace(-3,3,100)
z = np.square(e)
linear_loss = z
l1_loss = np.abs(e)
soft_l1_loss = 2*(np.power(1+z, 0.5) - 1) # Smooth approx. of l1 loss.
huber_loss = 2*np.power(z, 0.5) - 1
huber_loss[z <= 1] = z[z <= 1]
cauchy_loss = np.log(1 + z)
arctan_loss = np.arctan(z)
scale = 0.1
robust_loss = z / (z + scale**2)
plt.figure(figsize=(15,15))
plt.subplot(331)
plt.title('Linear loss')
plt.ylim(0,10)
plt.grid()
plt.plot(e, linear_loss, label='Linear loss')
plt.subplot(332)
plt.grid()
plt.ylim(0,10)
plt.title('L1 loss')
plt.plot(e, l1_loss, label='L1 loss')
plt.subplot(333)
plt.grid()
plt.ylim(0,10)
plt.title('Soft L1 loss')
plt.plot(e, soft_l1_loss, label='Soft L1 loss')
plt.subplot(334)
plt.grid()
plt.ylim(0,10)
plt.title('Huber loss')
plt.plot(e, huber_loss, label='Huber loss')
plt.subplot(335)
plt.grid()
plt.ylim(0,10)
plt.title('Cauchy loss')
plt.plot(e, cauchy_loss, label='Cauchy loss')
plt.subplot(336)
plt.grid()
plt.ylim(0,10)
plt.title('Arctan loss')
plt.plot(e, arctan_loss, label='Arctan loss')
plt.subplot(337)
plt.grid()
plt.ylim(0,10)
plt.title('?Loss')
plt.plot(e, robust_loss, label='Robust loss')
plt.subplot(338)
plt.grid()
plt.title('?Loss (Rescaled y-axis)')
plt.plot(e, z / (z + 2**2), label=r'$\sigma=2$')
plt.plot(e, z / (z + 1**2), label=r'$\sigma=1$')
plt.plot(e, z / (z + .5**2), label=r'$\sigma=.5$')
plt.plot(e, z / (z + .25**2), label=r'$\sigma=.25$')
plt.plot(e, z / (z + .1**2), label=r'$\sigma=.1$')
plt.legend();
# Generate some data
np.random.seed(0)
m_true = .75
c_true = 1
x = np.hstack([np.linspace(1, 5, 50), np.linspace(7, 10, 50)])
y = m_true * x + c_true + np.random.rand(len(x))*.2
data = np.vstack([x, y]).T
n = data.shape[0]
outlier_percentage = 50
ind = (np.random.rand(int(n*outlier_percentage/100))*n).astype('int')
data[ind, 1] = np.random.rand(len(ind))*9
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Noisy data');
def line_fitting_errors(x, data):
    """Return residuals y_predicted - y_true for line parameters x = (m, c)."""
    m = x[0]
    c = x[1]
    y_predicted = m * data[:,0] + c
    e = y_predicted - data[:,1]
    return e
theta_guess = [1, 1] # Need an initial guess
# Available losses
losses = [
'linear',
'soft_l1',
'huber',
'cauchy',
'arctan'
]
loss_id = 4 # Pick a loss
# Non-linear least square fitting
from scipy.optimize import least_squares
retval = least_squares(line_fitting_errors, theta_guess, loss=losses[loss_id], args=[data])
print(f'Reasons for stopping: {retval["message"]}')
print(f'Success: {retval["success"]}')
theta = retval['x']
print(f'#data = {n}')
print(f'theta = {theta}')
plt.figure(figsize=(7,7))
ax = plt.gca()
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title(f'Non-linear least squares. Loss = {losses[loss_id]}')
draw_line(theta, ax, c='black', linewidth=3);
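To compare how the different losses behave on this data, here is a quick sketch (reusing data, line_fitting_errors, theta_guess, losses, and least_squares from the cells above) that simply loops over the available losses and prints the estimated parameters; recall the data was generated with m_true = .75 and c_true = 1:
# Compare all available losses on the same data (illustrative)
for loss in losses:
    res = least_squares(line_fitting_errors, theta_guess, loss=loss, args=[data])
    print(f'{loss:>8}: theta = {res.x}')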
# Generate some data
np.random.seed(0)
m_true = .75
c_true = 0
x = np.linspace(1, 8, 150)
y = m_true * x + c_true + np.random.rand(len(x))*3
data = np.vstack([x, y]).T
n = data.shape[0]
plt.figure(figsize=(7,7))
plt.scatter(x, y, c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Complete data');
cutoff = 5
x_available = x[:cutoff]
y_available = y[:cutoff]
data = np.vstack([x_available, y_available]).T
n = data.shape[0]
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.scatter(x[cutoff:], y[cutoff:], c='red', alpha=0.1)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Available data');
Fitting a model to the available data
theta = np.polyfit(data[:,0], data[:,1], 1)
print(f'#available data = {n}')
print(f'theta = {theta}')
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.scatter(x[cutoff:], y[cutoff:], c='red', alpha=0.1)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Model fitting to available data');
draw_line(theta, plt.gca(), c='black', linewidth=3)
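Because the model above was fit using only the first few points, a quick check (reusing x, y, cutoff, and theta from the cells above) is to compare its error on the points it saw with its error on the withheld points; the extrapolation error is likely to be much larger:
# Compare fit quality on the available points vs. the withheld points
y_hat = theta[0] * x + theta[1]
err_available = np.mean(np.abs(y_hat[:cutoff] - y[:cutoff]))
err_withheld = np.mean(np.abs(y_hat[cutoff:] - y[cutoff:]))
print(f'mean abs error on available data: {err_available:.3f}')
print(f'mean abs error on withheld data:  {err_withheld:.3f}')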
# Generate some data
np.random.seed(0)
m_true_1 = .75
c_true_1 = 1
x1 = np.linspace(1, 8, 150)
y1 = m_true_1 * x1 + c_true_1 + np.random.rand(len(x1))*2
m_true_2 = -.5
c_true_2 = 8
x2 = np.linspace(4, 9.5, 20)
y2 = m_true_2 * x2 + c_true_2 + np.random.rand(len(x2))*1.4
x = np.hstack([x1, x2])
y = np.hstack([y1, y2])
data = np.vstack([x, y]).T
n = data.shape[0]
plt.figure(figsize=(7,7))
plt.scatter(x1, y1, c='red', alpha=0.9)
plt.scatter(x2, y2, c='blue', alpha=0.5)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Noisy data');
theta = np.polyfit(data[:,0], data[:,1], 1)
print(f'#data = {n}')
print(f'theta = {theta}')
plt.figure(figsize=(7,7))
ax = plt.gca()
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Least squares when the data contains two structures');
draw_line(theta, ax, c='black', linewidth=3)
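As a closing sketch (reusing line_fitting_errors, theta_guess, least_squares, and draw_line from the cells above; the choice of the 'cauchy' loss here is arbitrary), we can repeat the fit on this two-line data with a robust loss. Because the red line contributes far more points, a robust loss tends to lock onto it instead of splitting the difference between the two structures:
# Robust fit on data drawn from two lines (illustrative; loss choice is arbitrary)
theta_robust = least_squares(line_fitting_errors, theta_guess, loss='cauchy', args=[data]).x
print(f'robust theta = {theta_robust} (red points generated with m = {m_true_1}, c = {c_true_1})')
plt.figure(figsize=(7,7))
plt.scatter(data[:,0], data[:,1], c='red', alpha=0.9)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Robust fit on data drawn from two lines')
draw_line(theta_robust, plt.gca(), c='black', linewidth=3);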