Backpropagation

import numpy as np
import tensorflow as tf

From Scratch
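The code below implements a network with one hidden layer of three sigmoid units and a single output. For inputs $X \in \mathbb{R}^{n \times 2}$, labels $Y \in \mathbb{R}^{n \times 1}$, and parameters $W_1 \in \mathbb{R}^{2 \times 3}$, $b_1 \in \mathbb{R}^{3 \times 1}$, $W_2 \in \mathbb{R}^{3 \times 1}$, $b_2 \in \mathbb{R}^{1 \times 1}$, the forward pass and loss are

$$H = \sigma(X W_1 + b_1^\top), \qquad P = \sigma\big(Y \odot (H W_2 + b_2)\big), \qquad L = -\sum_{i=1}^{n} \log P_i,$$

where $\sigma$ is the sigmoid and $\odot$ is element-wise multiplication. Backpropagation is the chain rule applied to this composition, which is what the gradient expressions in L_prime below compute.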

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
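The backward pass below relies on the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), which is why the factor H*(1-H) appears in the gradients. As a small helper sketch (not used by the original code), the derivative could be written explicitly as:

def sigmoid_prime(x):
    # Derivative of the sigmoid; the code below inlines this as H*(1-H).
    s = sigmoid(x)
    return s * (1.0 - s)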
# Gradient of the loss function: L'(W1, b1, W2, b2).
def L_prime(X, Y, W1, b1, W2, b2):
    """ L'(W,b) function.
    X: Feature matrix. Shape: [n,2].
    Y: Label vector. Shape: [n,1].
    W1: Weight matrix W1. Shape: [2,3].
    b1: Bias vector b1. Shape: [3,1].
    W2: Weight matrix W2. Shape: [3,1].
    b2: Bias vector b2. Shape: [1,1].
    Return the gradients: dL/dW1 (Shape: [2,3]), dL/db1 (Shape: [3,1]),
    dL/dW2 (Shape: [3,1]), dL/db2 (Shape: [1,1]).
    """
    # Get dimensions.
    n = X.shape[0]

    # Calculate feed-forward values (must match the forward pass in L below,
    # including the sigmoid on the output).
    H = sigmoid(W1.T.dot(X.T) + b1).T                  # Shape: [n, 3].
    P = sigmoid(Y * (W2.T.dot(H.T) + b2).T)            # Shape: [n, 1].

    # Calculate the gradients: dL/dW1, dL/db1, dL/dW2, dL/db2.
    dL_by_dW2 = H.T.dot((P - 1) * Y)                   # Shape: [3,1].
    dL_by_db2 = np.ones((n, 1)).T.dot((P - 1) * Y)     # Shape: [1,1].

    dL_by_dH = ((P - 1) * Y).dot(W2.T)                 # Shape: [n,3].
    dL_by_dW1 = X.T.dot(dL_by_dH * H * (1 - H))        # Shape: [2,3].
    dL_by_db1 = (dL_by_dH * H * (1 - H)).T.dot(np.ones((n, 1)))  # Shape: [3,1].

    return dL_by_dW1, dL_by_db1, dL_by_dW2, dL_by_db2
# Loss function.
def L(X, Y, W1, b1, W2, b2):
    """ L(W,b) function.
    X: Feature matrix. Shape: [n,2].
    Y: Label vector. Shape: [n,1].
    W1: Weight matrix W1. Shape: [2,3].
    b1: Bias vector b1. Shape: [3,1].
    W2: Weight matrix W2. Shape: [3,1].
    b2: Bias vector b2. Shape: [1,1].
    Return the loss. Shape: Scalar.
    """
    # Calculate feed-forward values.
    H = sigmoid(W1.T.dot(X.T) + b1).T                  # Shape: [n, 3].
    P = sigmoid(Y * (W2.T.dot(H.T) + b2).T)            # Shape: [n, 1].

    # Get the loss.
    loss = -np.sum(np.log(P))                          # Shape: scalar.

    return loss
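As an optional sanity check (a sketch that is not part of the original notebook; check_gradients and eps are names introduced here), we can compare L_prime against a central finite difference of L on a small random problem:

def check_gradients(eps=1e-5):
    rng = np.random.RandomState(1)
    X = rng.randn(5, 2)
    Y = rng.randn(5, 1)
    W1 = rng.randn(2, 3); b1 = rng.randn(3, 1)
    W2 = rng.randn(3, 1); b2 = rng.randn(1, 1)

    # Analytic gradient from backpropagation.
    dW1, _, _, _ = L_prime(X, Y, W1, b1, W2, b2)

    # Numerical gradient for one entry of W1, via central differences.
    W1_plus, W1_minus = W1.copy(), W1.copy()
    W1_plus[0, 0] += eps
    W1_minus[0, 0] -= eps
    numeric = (L(X, Y, W1_plus, b1, W2, b2) - L(X, Y, W1_minus, b1, W2, b2)) / (2 * eps)

    print('analytic dL/dW1[0,0]: {:.6f}  numeric: {:.6f}'.format(dW1[0, 0], numeric))

check_gradients()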
# Let's generate some data using a function mapping R^2 -> R^1 (2D coordinates to scalar values).
def generate_data():

    # Generates 1000 ordered data points from 0 to 1 with a bit of noise, using np.random.uniform.
    def generate_linear_noisy():
        return np.linspace(0, 1, num=1000) + np.random.uniform(-0.05, 0.05, (1000,))

    X_train = np.array([generate_linear_noisy(), generate_linear_noisy()]).T

    # The function modeled here is F(x, y) = x / 2 + y / 2.
    Y_train = (X_train[:, 0] * 0.5 + X_train[:, 1] * 0.5).reshape(1000, 1)
    return X_train, Y_train
X_train, Y_train = generate_data()
print(X_train)
# Gradient descent: iteratively step the parameters in the direction that reduces the loss.
learning_rate = 0.0001
n_iter = 200000  # Number of iterations.
np.random.seed(0)
W1 = np.random.randn(2, 3) / ((2 * 3) ** 2)  # Weight matrix 1.
b1 = np.random.randn(3, 1) / ((3 * 1) ** 2)  # Bias vector 1.
W2 = np.random.randn(3, 1) / ((3 * 1) ** 2)  # Weight matrix 2.
b2 = np.random.randn(1, 1) / ((1 * 1) ** 2)  # Bias vector 2.

# Keep track of the training loss over iterations.
iterations = [0]
L_list = [L(X_train, Y_train, W1, b1, W2, b2)]
for i in range(n_iter):

    # Compute the gradients at the current parameters.
    gradient_W1, gradient_b1, gradient_W2, gradient_b2 = \
        L_prime(X_train, Y_train, W1, b1, W2, b2)

    # Take one gradient-descent step.
    W1_new = W1 - learning_rate * gradient_W1
    b1_new = b1 - learning_rate * gradient_b1
    W2_new = W2 - learning_rate * gradient_W2
    b2_new = b2 - learning_rate * gradient_b2

    iterations.append(i + 1)
    L_list.append(L(X_train, Y_train, W1_new, b1_new, W2_new, b2_new))

    # L1 norm of the change in the weights/biases.
    norm = np.abs(W1_new - W1).sum() + np.abs(b1_new - b1).sum() + \
           np.abs(W2_new - W2).sum() + np.abs(b2_new - b2).sum()

    if i % 500 == 0:
        print('i: {:6d} L: {:.3f} norm: {:.6f}'.format(i, L_list[-1], norm))

    W1 = W1_new
    b1 = b1_new
    W2 = W2_new
    b2 = b2_new

print('W1 matrix: \n' + str(W1))
print('b1 vector: \n' + str(b1))
print('W2 matrix: \n' + str(W2))
print('b2 vector: \n' + str(b2))
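As a quick look at what the trained parameters do (a sketch added here, not in the original), we can run the forward pass on a few training points and compare the raw network output H·W2 + b2 against the targets:

# Forward pass with the trained parameters (equivalent to the one inside L/L_prime).
H = sigmoid(X_train.dot(W1) + b1.T)   # Shape: [n, 3].
scores = H.dot(W2) + b2               # Shape: [n, 1]; raw output before the loss.
for x, y, s in zip(X_train[:5], Y_train[:5], scores[:5]):
    print('X = {}, Y = {}, network output = {}'.format(x, y, s))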

Backpropagation in TensorFlow

# See how simple this is using Keras :)
# Try adding more layers or changing the activation to e.g. 'tanh', 'relu', or 'sigmoid' and compare results!
# You may notice that 'linear' works best here, which makes sense because our data is generated by a linear function.
# You can also try changing the data being generated.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation='linear'),
    # Typically, the more neurons, the more capable the network, but be wary of overfitting.
    tf.keras.layers.Dense(32, activation='linear'),
    tf.keras.layers.Dense(1)
])

# We specify the Adam optimizer to minimize the loss.
# Adam, like SGD, tries to minimize the loss function. In a future workshop we will explain why Adam
# often converges faster and can reach higher accuracy.
# The loss we use here is the mean squared error (MSE).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='mse',
              metrics=['mse'])
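For the curious: model.fit below hides the gradient computation, but TensorFlow exposes it through tf.GradientTape. This is only an illustrative sketch of a single training step (loss_fn, optimizer, and train_step are names introduced here, not part of the original workshop code); the actual training still happens in model.fit:

loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

def train_step(x_batch, y_batch):
    with tf.GradientTape() as tape:
        preds = model(x_batch, training=True)   # Forward pass.
        loss = loss_fn(y_batch, preds)          # Mean squared error.
    # Backpropagation: gradients of the loss w.r.t. every trainable weight.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss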

# Fit the model to our dataset and run for 100 epochs.
model.fit(X_train, Y_train, epochs=100)
# Let's look at 20 data points and see how we do.
for x, y in zip(X_train[0:80:4], Y_train[0:80:4]):
    print("X = {}, Y = {}, Predicted = {}".format(x, y, model.predict(np.array([[x[0], x[1]]]))))