Citation Note: The content and the structure of this article are based on the deep learning lectures from One-Fourth Labs — PadhAI.
Axes3D from mpl_toolkits.mplot3d provides some basic 3D plotting (scatter, surf, line, mesh) tools. It is not the fastest or most feature-complete 3D library out there, but it ships with Matplotlib. We are also importing colormap (cm) and colors from Matplotlib to make the graphs look good. We would like to have animated plots to demonstrate how each optimization algorithm works, so we are importing animation and rc. To display/render HTML content in-line in a Jupyter notebook, import HTML. Finally, import numpy for computation purposes, which does most of our heavy lifting.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors
from matplotlib import animation, rc
from IPython.display import HTML
import numpy as np
class SN:
    """Single sigmoid neuron trained with a selectable gradient-descent variant.

    The model is ``sigmoid(w * x + b)`` with one scalar weight and one scalar
    bias; the loss is the summed half squared error.  ``fit`` appends the
    current ``w``, ``b`` and loss to ``w_h``, ``b_h`` and ``e_h`` after every
    parameter update.
    """

    # Optimiser names understood by ``fit``.
    ALGOS = ('GD', 'MiniBatch', 'Momentum', 'NAG', 'AdaGrad', 'RMSProp', 'Adam')

    #constructor
    def __init__(self, w_init, b_init, algo):
        """Store the starting parameters and the optimiser name.

        Args:
            w_init: initial value of the weight ``w``.
            b_init: initial value of the bias ``b``.
            algo: one of the names in ``SN.ALGOS``.
        """
        self.w = w_init
        self.b = b_init
        self.w_h = []
        self.b_h = []
        self.e_h = []
        self.algo = algo

    #logistic function
    def sigmoid(self, x, w=None, b=None):
        """Return ``1 / (1 + exp(-(w*x + b)))``.

        ``w`` and ``b`` default to the neuron's current parameters when
        they are not supplied.
        """
        if w is None:
            w = self.w
        if b is None:
            b = self.b
        return 1. / (1. + np.exp(-(w * x + b)))

    #loss function
    def error(self, X, Y, w=None, b=None):
        """Return the summed half squared error over the pairs of ``X`` and ``Y``.

        ``w`` and ``b`` default to the current parameters.
        """
        if w is None:
            w = self.w
        if b is None:
            b = self.b
        err = 0
        for x, y in zip(X, Y):
            err += 0.5 * (self.sigmoid(x, w, b) - y) ** 2
        return err

    def grad_w(self, x, y, w=None, b=None):
        """Return the derivative of one sample's loss with respect to ``w``."""
        if w is None:
            w = self.w
        if b is None:
            b = self.b
        y_pred = self.sigmoid(x, w, b)
        return (y_pred - y) * y_pred * (1 - y_pred) * x

    def grad_b(self, x, y, w=None, b=None):
        """Return the derivative of one sample's loss with respect to ``b``."""
        if w is None:
            w = self.w
        if b is None:
            b = self.b
        y_pred = self.sigmoid(x, w, b)
        return (y_pred - y) * y_pred * (1 - y_pred)

    def _batch_gradients(self, X, Y, w=None, b=None):
        """Sum ``grad_w`` and ``grad_b`` over all samples, evaluated at (w, b)."""
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += self.grad_w(x, y, w, b)
            db += self.grad_b(x, y, w, b)
        return dw, db

    def fit(self, X, Y,
            epochs=100, eta=0.01, gamma=0.9, mini_batch_size=100, eps=1e-8,
            beta=0.9, beta1=0.9, beta2=0.9):
        """Train the neuron on ``X``/``Y`` with the optimiser chosen at construction.

        The history lists are reset at the start of every call.

        Args:
            X, Y: equally long sequences of scalar inputs and targets.
            epochs: number of passes over the data.
            eta: learning rate.
            gamma: velocity decay factor (Momentum, NAG).
            mini_batch_size: samples per update (MiniBatch).
            eps: small constant added to the denominator (AdaGrad, RMSProp, Adam).
            beta: decay of the squared-gradient average (RMSProp).
            beta1, beta2: decay of the first / second moment estimates (Adam).

        Raises:
            ValueError: if ``self.algo`` is not one of ``SN.ALGOS``.
        """
        if self.algo not in self.ALGOS:
            raise ValueError(
                "unknown algo {!r}; expected one of {}".format(self.algo, self.ALGOS))

        self.w_h = []
        self.b_h = []
        self.e_h = []
        self.X = X
        self.Y = Y
        n_samples = len(X)

        if self.algo == 'GD':
            for _ in range(epochs):
                dw, db = self._batch_gradients(X, Y)
                self.w -= eta * dw / n_samples
                self.b -= eta * db / n_samples
                self.append_log()

        elif self.algo == 'MiniBatch':
            for _ in range(epochs):
                dw, db = 0, 0
                points_seen = 0
                for x, y in zip(X, Y):
                    dw += self.grad_w(x, y)
                    db += self.grad_b(x, y)
                    points_seen += 1
                    if points_seen % mini_batch_size == 0:
                        self.w -= eta * dw / mini_batch_size
                        self.b -= eta * db / mini_batch_size
                        self.append_log()
                        dw, db = 0, 0
                # NOTE: samples left over when len(X) is not a multiple of
                # mini_batch_size are discarded here, as in the original code.

        elif self.algo == 'Momentum':
            v_w, v_b = 0, 0
            for _ in range(epochs):
                dw, db = self._batch_gradients(X, Y)
                v_w = gamma * v_w + eta * dw
                v_b = gamma * v_b + eta * db
                self.w = self.w - v_w
                self.b = self.b - v_b
                self.append_log()

        elif self.algo == 'NAG':
            v_w, v_b = 0, 0
            for _ in range(epochs):
                # Step 1: decay the velocity and look ahead by that amount.
                v_w = gamma * v_w
                v_b = gamma * v_b
                dw, db = self._batch_gradients(X, Y, self.w - v_w, self.b - v_b)
                # Step 2: add the gradient measured at the look-ahead point.
                v_w = v_w + eta * dw
                v_b = v_b + eta * db
                self.w = self.w - v_w
                self.b = self.b - v_b
                self.append_log()

        elif self.algo == 'AdaGrad':
            v_w, v_b = 0, 0
            for _ in range(epochs):
                dw, db = self._batch_gradients(X, Y)
                v_w += dw ** 2
                v_b += db ** 2
                # eps sits inside the denominator so a zero history cannot
                # cause a division by zero.
                self.w -= eta / (np.sqrt(v_w) + eps) * dw
                self.b -= eta / (np.sqrt(v_b) + eps) * db
                self.append_log()

        elif self.algo == 'RMSProp':
            v_w, v_b = 0, 0
            for _ in range(epochs):
                dw, db = self._batch_gradients(X, Y)
                v_w = beta * v_w + (1 - beta) * dw ** 2
                v_b = beta * v_b + (1 - beta) * db ** 2
                self.w -= eta / (np.sqrt(v_w) + eps) * dw
                self.b -= eta / (np.sqrt(v_b) + eps) * db
                self.append_log()

        elif self.algo == 'Adam':
            v_w, v_b = 0, 0
            m_w, m_b = 0, 0
            num_updates = 0
            for _ in range(epochs):
                # One update per sample: the gradient is taken from a single
                # point and num_updates drives the bias correction.
                for x, y in zip(X, Y):
                    dw = self.grad_w(x, y)
                    db = self.grad_b(x, y)
                    num_updates += 1
                    m_w = beta1 * m_w + (1 - beta1) * dw
                    m_b = beta1 * m_b + (1 - beta1) * db
                    v_w = beta2 * v_w + (1 - beta2) * dw ** 2
                    v_b = beta2 * v_b + (1 - beta2) * db ** 2
                    m_w_c = m_w / (1 - np.power(beta1, num_updates))
                    m_b_c = m_b / (1 - np.power(beta1, num_updates))
                    v_w_c = v_w / (1 - np.power(beta2, num_updates))
                    v_b_c = v_b / (1 - np.power(beta2, num_updates))
                    self.w -= eta / (np.sqrt(v_w_c) + eps) * m_w_c
                    self.b -= eta / (np.sqrt(v_b_c) + eps) * m_b_c
                    self.append_log()

    #logging
    def append_log(self):
        """Append the current ``w``, ``b`` and training loss to the history lists."""
        self.w_h.append(self.w)
        self.b_h.append(self.b)
        self.e_h.append(self.error(self.X, self.Y))
#constructor
def __init__(self, w_init, b_init, algo):
    """Store the starting parameters and optimiser name; start with empty histories.

    Args:
        w_init: initial value of the weight ``w``.
        b_init: initial value of the bias ``b``.
        algo: name of the gradient-descent variant to use.
    """
    self.w = w_init
    self.b = b_init
    # Histories of weight, bias and error values.
    self.w_h = []
    self.b_h = []
    self.e_h = []
    self.algo = algo
The __init__ function (constructor) helps to initialize the parameters of the sigmoid neuron: w (weights) and b (biases). The function takes three arguments:
w_init, b_init — These take the initial values for the parameters ‘w’ and ‘b’. Instead of setting the parameters randomly, we set them to specific values. This allows us to understand how an algorithm performs by visualizing it for different initial points. Some algorithms get stuck in local minima at some parameters.
algo — It tells which variant of the gradient descent algorithm to use for finding the optimal parameters.
def sigmoid(self, x, w=None, b=None):
    """Return the logistic function ``1 / (1 + exp(-(w*x + b)))``.

    ``w`` and ``b`` fall back to ``self.w`` and ``self.b`` when omitted.
    """
    if w is None:
        w = self.w
    if b is None:
        b = self.b
    return 1. / (1. + np.exp(-(w*x + b)))
w & b — By taking ‘w’ and ‘b’ as parameters, the function lets us calculate the value of the sigmoid at explicitly specified parameter values. If these arguments are not passed, it uses the values of the learned parameters to compute the logistic function.
def error(self, X, Y, w=None, b=None):
    """Return the summed half squared error over the pairs of ``X`` and ``Y``.

    ``w`` and ``b`` fall back to ``self.w`` and ``self.b`` when omitted.
    """
    if w is None:
        w = self.w
    if b is None:
        b = self.b
    err = 0
    for x, y in zip(X, Y):
        err += 0.5 * (self.sigmoid(x, w, b) - y) ** 2
    return err
Next, we have the error function, which takes inputs X and Y as mandatory arguments and the same optional parameter arguments as the sigmoid function. In this function, we iterate through each data point and accumulate the (half) squared error between the actual target value and the predicted value computed with the sigmoid function. As with the sigmoid function, it supports calculating the error at specified parameter values.
def grad_w(self, x, y, w=None, b=None):
    ...  # body elided in this excerpt; the full method appears in class SN

def grad_b(self, x, y, w=None, b=None):
    ...  # body elided in this excerpt; the full method appears in class SN
grad_w and grad_b take inputs ‘x’ and ‘y’ as mandatory arguments and compute the gradient of the loss for that data point with respect to the parameters ‘w’ and ‘b’ respectively. Again, we have two optional arguments that allow us to compute the gradient at specified parameter values.
def fit(self, X, Y, epochs=100, eta=0.01, gamma=0.9, mini_batch_size=100, eps=1e-8, beta=0.9, beta1=0.9, beta2=0.9):
    self.w_h = []
    ...  # remainder elided in this excerpt; the full method appears in class SN
def append_log(self):
    """Append the current weight, bias and training error to the history lists."""
    self.w_h.append(self.w)
    self.b_h.append(self.b)
    # The error is evaluated on the data stored on the instance as self.X / self.Y.
    self.e_h.append(self.error(self.X, self.Y))
Finally, we have the append_log function to store the values of the parameters and the loss function value for each epoch in each variant of gradient descent.
# Toy data set: six scalar inputs and their targets.
X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])
Y = np.asarray([0.5, 0.50, 0.5, 0.5, 0.1, 0.3])

# Optimiser choice and starting point.
algo = 'GD'
w_init, b_init = 2.1, 4.0

# Parameter ranges used when drawing the update path.
w_min, w_max = -7, 5
b_min, b_max = -7, 5

# Learning-algorithm options.
epochs, mini_batch_size = 200, 6
gamma, eta = 0.9, 5

# Number of frames in the animation.
animation_frames = 20

# Which plots to draw.
plot_2d, plot_3d = True, False

# Train the neuron with the configuration above.
sn = SN(w_init, b_init, algo)
sn.fit(
    X,
    Y,
    epochs=epochs,
    eta=eta,
    gamma=gamma,
    mini_batch_size=mini_batch_size,
)

# Error (red), weight (blue) and bias (green) histories on one axis.
plt.plot(sn.e_h, 'r')
plt.plot(sn.w_h, 'b')
plt.plot(sn.b_h, 'g')
plt.legend(('error', 'weight', 'bias'))
plt.title("Variation of Parameters and loss function")
plt.xlabel("Epoch")
plt.show()
if plot_3d:
    W = np.linspace(w_min, w_max, 256)  # 256 evenly spaced weight values
    b = np.linspace(b_min, b_max, 256)  # 256 evenly spaced bias values
    WW, BB = np.meshgrid(W, b)
    Z = sn.error(X, Y, WW, BB)  # loss at every (w, b) grid point

    fig = plt.figure(dpi=100)
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) is rejected by Matplotlib >= 3.6
    surf = ax.plot_surface(WW, BB, Z, rstride=3, cstride=3, alpha=0.5, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    cset = ax.contourf(WW, BB, Z, 25, zdir='z', offset=-1, alpha=0.6, cmap=cm.coolwarm)  # filled contours drawn at z = -1
    ax.set_xlabel('w')
    ax.set_xlim(w_min - 1, w_max + 1)
    ax.set_ylabel('b')
    ax.set_ylim(b_min - 1, b_max + 1)
    ax.set_zlabel('error')
    ax.set_zlim(-1, np.max(Z))
    ax.view_init (elev=25, azim=-75) # azim = -20
    ax.dist=12  # NOTE(review): Axes3D.dist is deprecated in newer Matplotlib releases - confirm the target version
    title = ax.set_title('Epoch 0')
The error over the grid of ‘w’ and ‘b’ values is computed by calling the error function in our sigmoid class SN. In line 8, we are creating an axis handle to create a 3D plot. The surface is drawn with the ax.plot_surface function by specifying how often we want to sample the points along the data by setting rstride and cstride. Next, we are plotting the contour of the error with respect to weight and bias on top of the surface using the ax.contourf function by specifying error values as the ‘Z’ direction (lines 9–10). In lines 11–16, we are setting the labels for each axis and the axis limits for all three dimensions. Because we are plotting a 3D plot, we need to define the viewpoint. In lines 17–18 we are setting a viewpoint for our plot at an elevation of 25 degrees in the ‘z’ axis and at a distance of 12 units.
def plot_animate_3d(i):  # frame callback passed to animation.FuncAnimation
    i = int(i*(epochs/animation_frames))  # map the frame number to a history index
    line1.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
    line1.set_3d_properties(sn.e_h[:i+1])  # z values of line1: the logged error
    line2.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
    line2.set_3d_properties(np.zeros(i+1) - 1)  # z values of line2: constant -1
    title.set_text('Epoch: {: d}, Error: {:.4f}'.format(i, sn.e_h[i]))
    return line1, line2, title
if plot_3d:
    #animation plots of gradient descent
    i = 0
    # Both lines start with only the first logged point; plot_animate_3d
    # extends them frame by frame.
    line1, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], sn.e_h[:i+1], color='black',marker='.')
    line2, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], np.zeros(i+1) - 1, color='red', marker='.')
    anim = animation.FuncAnimation(fig, func=plot_animate_3d, frames=animation_frames)
    rc('animation', html='jshtml')
    # NOTE(review): indentation reconstructed. A bare expression inside an
    # `if` body is not auto-displayed by Jupyter; confirm whether `anim` was
    # meant to be the cell's last top-level expression.
    anim
To animate the plot, we call the animation.FuncAnimation function by passing our custom function plot_animate_3d as one of the parameters and also specify the number of frames needed to create the animation. The function plot_animate_3d updates the values of the parameters and the error value for the respective values of ‘w’ and ‘b’. In the same function, at line 7, we are setting the text to show the error value at that particular epoch. Finally, to display the animation in-line we call the rc function to render the HTML content inside the Jupyter notebook.
if plot_2d:
    # Evaluate the loss on a 256 x 256 grid of (w, b) values.
    W = np.linspace(w_min, w_max, 256)
    b = np.linspace(b_min, b_max, 256)
    WW, BB = np.meshgrid(W, b)
    Z = sn.error(X, Y, WW, BB)

    fig = plt.figure(dpi=100)
    ax = plt.subplot(111)
    ax.set_xlabel('w')
    ax.set_xlim(w_min - 1, w_max + 1)
    ax.set_ylabel('b')
    ax.set_ylim(b_min - 1, b_max + 1)
    title = ax.set_title('Epoch 0')
    # Filled contour map of the loss with 25 levels.
    cset = plt.contourf(WW, BB, Z, 25, alpha=0.8, cmap=cm.bwr)
    # NOTE(review): dpi=2000 yields a very large image file - confirm this is intended.
    plt.savefig("temp.jpg",dpi = 2000)
    plt.show()
def plot_animate_2d(i):
    """Frame callback for ``animation.FuncAnimation``: extend the 2D path.

    Args:
        i: frame number; it is scaled by ``epochs / animation_frames`` to
            obtain the index into the logged histories.

    Returns:
        The updated ``line`` and ``title`` artists.
    """
    i = int(i*(epochs/animation_frames))
    line.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
    title.set_text('Epoch: {: d}, Error: {:.4f}'.format(i, sn.e_h[i]))
    return line, title
if plot_2d:
    i = 0
    # The line starts with only the first logged point; plot_animate_2d
    # extends it frame by frame.
    line, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], color='black',marker='.')
    anim = animation.FuncAnimation(fig, func=plot_animate_2d, frames=animation_frames)
    rc('animation', html='jshtml')
    # NOTE(review): indentation reconstructed. A bare expression inside an
    # `if` body is not auto-displayed by Jupyter; confirm whether `anim` was
    # meant to be the cell's last top-level expression.
    anim
# Vanilla gradient descent (excerpt from SN.fit): one update per epoch.
for i in range(epochs):
    dw, db = 0, 0
    # Accumulate the gradients over every data point.
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y)
        db += self.grad_b(x, y)
    # Step along the gradient averaged over the number of points.
    self.w -= eta * dw / X.shape[0]
    self.b -= eta * db / X.shape[0]
    self.append_log()
# Two-point toy data set.
X = np.asarray([0.5, 2.5])
Y = np.asarray([0.2, 0.9])

# Optimiser choice and starting point.
algo = 'GD'
w_init, b_init = -2, -2

# Parameter ranges for the plots.
w_min, w_max = -7, 5
b_min, b_max = -7, 5

# Training options.
epochs, eta = 1000, 1

# Animation and plotting options.
animation_frames = 20
plot_2d, plot_3d = True, True
Here we set the variable algo to ‘GD’ to indicate that we want to execute the vanilla gradient descent algorithm in our sigmoid neuron to find the best parameter values. After we set up our configuration parameters, we will go ahead and execute the SN class ‘fit’ method to train the sigmoid neuron on toy data. The plots are controlled by the flags plot_2d and plot_3d. I will show what the 3D error surface looks like for the corresponding values of ‘w’ and ‘b’. The objective of the learning algorithm is to move towards the deep blue color region where the error/loss is minimum. The animation is driven by the function plot_animate_3d. As you play the animation, you can see the epoch number and the corresponding error value at that epoch.
# Momentum gradient descent (excerpt from SN.fit).
v_w, v_b = 0, 0
for i in range(epochs):
    dw, db = 0, 0
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y)
        db += self.grad_b(x, y)
    # New velocity = decayed previous velocity + current gradient step.
    v_w = gamma * v_w + eta * dw
    v_b = gamma * v_b + eta * db
    self.w = self.w - v_w
    self.b = self.b - v_b
    self.append_log()
The variables v_w and v_b will be used to compute the movement based on the history as well as the current gradient. At the end of each epoch, we are calling the append_log function to store the history of parameters and loss function values.
# Two-point toy data set.
X = np.asarray([0.5, 2.5])
Y = np.asarray([0.2, 0.9])

# Optimiser choice and starting point.
algo = 'Momentum'
w_init, b_init = -2, -2

# Parameter ranges for the plots.
w_min, w_max = -7, 5
b_min, b_max = -7, 5

# Training options.
epochs, mini_batch_size = 1000, 6
gamma, eta = 0.9, 1

# Animation and plotting options.
animation_frames = 20
plot_2d, plot_3d = True, True
The variable algo is set to ‘Momentum’ to indicate that we want to use Momentum GD for finding the best parameters for our sigmoid neuron. Another important change is the gamma variable, which is used to control how much momentum we need to impart into the learning algorithm. The gamma value varies between 0 and 1. After we set up our configuration parameters, we will go ahead and execute the SN class ‘fit’ method to train the sigmoid neuron on toy data.
# Nesterov accelerated gradient descent (excerpt from SN.fit).
v_w, v_b = 0, 0
for i in range(epochs):
    dw, db = 0, 0
    # Step 1: decay the previous velocity.
    v_w = gamma * v_w
    v_b = gamma * v_b
    # Gradients are evaluated at the look-ahead point (w - v_w, b - v_b).
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y, self.w - v_w, self.b - v_b)
        db += self.grad_b(x, y, self.w - v_w, self.b - v_b)
    # Step 2: add the look-ahead gradient step to the velocity.
    v_w = v_w + eta * dw
    v_b = v_b + eta * db
    self.w = self.w - v_w
    self.b = self.b - v_b
    self.append_log()
The main change in the code for NAG GD is the computation of v_w and v_b. In Momentum GD, we are computing these variables in one step, but in NAG we are doing it in two steps.
# Step 1: decay the previous velocity.
v_w = gamma * v_w
v_b = gamma * v_b
# Gradients are evaluated at the look-ahead point (w - v_w, b - v_b).
for x, y in zip(X, Y):
    dw += self.grad_w(x, y, self.w - v_w, self.b - v_b)
    db += self.grad_b(x, y, self.w - v_w, self.b - v_b)
# Step 2: add the look-ahead gradient step to the velocity.
v_w = v_w + eta * dw
v_b = v_b + eta * db
The gradients are evaluated at the look-ahead point obtained by subtracting the decayed velocities from self.w and self.b. To execute the NAG GD, we just need to set the algo variable to ‘NAG’. You can generate the 3D or 2D animations to see how the NAG GD is different from Momentum GD in reaching the global minima.
# Mini-batch gradient descent (excerpt from SN.fit).
for i in range(epochs):
    dw, db = 0, 0
    points_seen = 0
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y)
        db += self.grad_b(x, y)
        points_seen += 1
        # Update once every mini_batch_size points, then start a new batch.
        if points_seen % mini_batch_size == 0:
            self.w -= eta * dw / mini_batch_size
            self.b -= eta * db / mini_batch_size
            self.append_log()
            dw, db = 0, 0
# AdaGrad (excerpt from SN.fit).
v_w, v_b = 0, 0
for i in range(epochs):
    dw, db = 0, 0
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y)
        db += self.grad_b(x, y)
    # Running sum of squared gradients, kept per parameter.
    v_w += dw**2
    v_b += db**2
    # eps belongs inside the denominator: it guards the division when the
    # history is zero rather than being added to the step size.
    self.w -= eta / (np.sqrt(v_w) + eps) * dw
    self.b -= eta / (np.sqrt(v_b) + eps) * db
    self.append_log()
# RMSProp (excerpt from SN.fit).
v_w, v_b = 0, 0
for i in range(epochs):
    dw, db = 0, 0
    for x, y in zip(X, Y):
        dw += self.grad_w(x, y)
        db += self.grad_b(x, y)
    # Exponentially decaying average of squared gradients (decay rate beta).
    v_w = beta * v_w + (1 - beta) * dw**2
    v_b = beta * v_b + (1 - beta) * db**2
    # eps belongs inside the denominator: it guards the division when the
    # history is zero rather than being added to the step size.
    self.w -= eta / (np.sqrt(v_w) + eps) * dw
    self.b -= eta / (np.sqrt(v_b) + eps) * db
    self.append_log()
The only change we need to make to the AdaGrad code is how we update the variables v_w and v_b. In AdaGrad, v_w and v_b always increase by the squares of the gradient, per parameter, since the first epoch, but in RMSProp v_w and v_b are exponentially decaying weighted sums of squared gradients, controlled by a hyperparameter (called ‘beta’ in the code above). To execute the RMSProp GD, we just need to set the algo variable to ‘RMSProp’. You can generate the 3D or 2D animations to see how the RMSProp GD is different from AdaGrad GD in reaching the global minima.
# Adam (excerpt from SN.fit).
v_w, v_b = 0, 0
m_w, m_b = 0, 0
num_updates = 0
for i in range(epochs):
    dw, db = 0, 0
    # NOTE(review): indentation reconstructed. The gradient is assigned (not
    # accumulated) and num_updates counts samples, so the update is taken once
    # per sample - confirm against the original notebook.
    for x, y in zip(X, Y):
        dw = self.grad_w(x, y)
        db = self.grad_b(x, y)
        num_updates += 1
        # First-moment (momentum) and second-moment (squared-gradient) averages.
        m_w = beta1 * m_w + (1-beta1) * dw
        m_b = beta1 * m_b + (1-beta1) * db
        v_w = beta2 * v_w + (1-beta2) * dw**2
        v_b = beta2 * v_b + (1-beta2) * db**2
        # Bias correction for the zero-initialised averages.
        m_w_c = m_w / (1 - np.power(beta1, num_updates))
        m_b_c = m_b / (1 - np.power(beta1, num_updates))
        v_w_c = v_w / (1 - np.power(beta2, num_updates))
        v_b_c = v_b / (1 - np.power(beta2, num_updates))
        # eps belongs inside the denominator: it guards the division rather
        # than being added to the step size.
        self.w -= eta / (np.sqrt(v_w_c) + eps) * m_w_c
        self.b -= eta / (np.sqrt(v_b_c) + eps) * m_b_c
        self.append_log()
In the Adam optimizer, we compute m_w & m_b to keep track of the momentum history, and v_w & v_b, which are used to decay the denominator and prevent its rapid growth just like in RMSProp. After that, we implement the bias correction for the Momentum-based history variables and the RMSProp-based history variables. Once we compute the bias-corrected values, we use them to update the parameters ‘w’ and ‘b’.
# Six-point toy data set.
X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])
Y = np.asarray([0.5, 0.50, 0.5, 0.5, 0.1, 0.3])

# Optimiser choice and starting point.
algo = 'Adam'
w_init, b_init = -6, 4.0

# Parameter ranges for the plots.
w_min, w_max = -7, 5
b_min, b_max = -7, 5

# Training options.
epochs = 200
gamma, eta, eps = 0.9, 0.5, 1e-8

# Animation and plotting options.
animation_frames = 20
plot_2d, plot_3d = True, False
