I watched Josh Starmer’s video on how CNNs work. I understood the theoretical part of it and decided to implement it in Python. I have read tons (I really mean it) of articles on medium.com and towardsdatascience.com that talk about implementing a CNN from scratch using only NumPy.
However, the problem I am having is that most of them take their training and testing datasets from external libraries (e.g. MNIST loaded through Keras), which is not what I want. Since this is my first time, I want to practice exactly what I learned in Josh Starmer’s video.
Here is the summary of it:
- What I am trying to do:
A model that reads the letters ‘O’ and ‘X’ from my Excel file (photo), which I made similar to the one in the StatQuest YouTube video. So the input will be a matrix from a .csv file, not from MNIST or Keras (sadly, most of the articles are like that for some reason!). I want to train the model, then save the weight and bias parameters into another file, parameters.csv, so that I can test the model using a test.py file and see the results. (I will also make another .csv file for the test dataset (photo), which will contain shifted and deformed ‘O’ and ‘X’ letters like the ones in the video, so I can see how my model performs.)
Here is the code I’ve written (or rather collected) from about 7-10 articles I found online:
# Layer-by-layer description of the network. Each layer's "input_dim" must
# equal the previous layer's "output_dim": the weight matrix of a layer is
# created with shape (output_dim, input_dim) and is multiplied with the
# previous layer's activations, so a mismatch makes the matrix product fail.
nn_architecture = [
    {"input_dim": 2, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 4, "activation": "relu"},
    # Was "input_dim": 2, which did not match the 4 outputs of the layer above.
    {"input_dim": 4, "output_dim": 2, "activation": "sigmoid"},
]
def init_layers(nn_architecture, seed=99):
    """Create small random weights and biases for every layer.

    Args:
        nn_architecture: sequence of layer dicts providing "input_dim" and
            "output_dim".
        seed: value passed to ``np.random.seed`` so runs are reproducible.

    Returns:
        dict mapping "W<k>" to an array of shape (output_dim, input_dim) and
        "b<k>" to an array of shape (output_dim, 1), with k counted from 1.
    """
    np.random.seed(seed)
    params_values = {}

    for layer_no, spec in enumerate(nn_architecture, start=1):
        fan_in = spec["input_dim"]
        fan_out = spec["output_dim"]

        # Draw the weights before the bias so the random stream is consumed
        # in a fixed order for a given seed.
        weights = np.random.randn(fan_out, fan_in) * 0.1
        bias = np.random.randn(fan_out, 1) * 0.1

        params_values[f"W{layer_no}"] = weights
        params_values[f"b{layer_no}"] = bias

    return params_values
def sigmoid(Z):
    """Return the logistic function 1 / (1 + exp(-Z)), applied element-wise."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator
def relu(Z):
    """Return the rectified linear unit of Z: element-wise max(0, Z)."""
    floor = 0
    return np.maximum(floor, Z)
def sigmoid_backward(dA, Z):
    """Back-propagate dA through a sigmoid activation.

    Multiplies the incoming gradient ``dA`` by the sigmoid derivative
    sigmoid(Z) * (1 - sigmoid(Z)) and returns the result.
    """
    activated = sigmoid(Z)
    complement = 1 - activated
    return dA * activated * complement
def relu_backward(dA, Z):
    """Back-propagate dA through a ReLU activation.

    Returns a copy of ``dA`` in which every position where ``Z <= 0`` has
    been zeroed; ``dA`` itself is left untouched.
    """
    grad = np.array(dA, copy=True)
    inactive = Z <= 0
    grad[inactive] = 0
    return grad
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation="sigmoid"):
    """Run one layer forward: affine transform followed by an activation.

    Args:
        A_prev: activations coming from the previous layer.
        W_curr: weight matrix of this layer.
        b_curr: bias of this layer.
        activation: "relu" or "sigmoid".

    Returns:
        Tuple ``(A_curr, Z_curr)``: the activated output and the
        pre-activation value ``W_curr . A_prev + b_curr``.

    Raises:
        Exception: if ``activation`` is neither "relu" nor "sigmoid".
    """
    pre_activation = np.dot(W_curr, A_prev) + b_curr

    if activation == "sigmoid":
        return sigmoid(pre_activation), pre_activation
    if activation == "relu":
        return relu(pre_activation), pre_activation

    raise Exception('Non-supported activation function')
def full_forward_propagation(X, params_values, nn_architecture):
    """Push X through every layer and record what back-propagation needs.

    Args:
        X: network input.
        params_values: dict holding "W<k>" and "b<k>" for k = 1..number of layers.
        nn_architecture: sequence of layer dicts providing "activation".

    Returns:
        Tuple ``(A_last, memory)``. ``memory["A<k-1>"]`` is the input fed to
        layer k and ``memory["Z<k>"]`` is that layer's pre-activation value.
        With an empty architecture, ``X`` and an empty dict are returned.
    """
    memory = {}
    layer_output = X

    for layer_no, spec in enumerate(nn_architecture, start=1):
        layer_input = layer_output
        activation_name = spec["activation"]
        weights = params_values[f"W{layer_no}"]
        bias = params_values[f"b{layer_no}"]

        layer_output, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name
        )

        # The layer's input is stored under the previous layer's index.
        memory[f"A{layer_no - 1}"] = layer_input
        memory[f"Z{layer_no}"] = pre_activation

    return layer_output, memory
def get_cost_value(Y_hat, Y):
    """Return the binary cross-entropy cost as a scalar.

    The loss is summed element-wise over all output units and samples and
    divided by the number of samples (``Y_hat.shape[1]``). For a single
    output row this equals the dot-product form; for several output rows it
    avoids the matrix of cross terms that ``np.dot(Y, log(Y_hat).T)`` would
    produce.

    Args:
        Y_hat: predicted probabilities, one column per sample.
        Y: target values, broadcast-compatible with ``Y_hat``.

    Returns:
        Scalar cost value.
    """
    m = Y_hat.shape[1]

    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    eps = 1e-12
    clipped = np.clip(Y_hat, eps, 1 - eps)

    log_likelihood = Y * np.log(clipped) + (1 - Y) * np.log(1 - clipped)
    return -np.sum(log_likelihood) / m
def get_accuracy_value(Y_hat, Y, threshold=0.5):
    """Return the fraction of samples whose predicted classes all match Y.

    Probabilities are turned into 0/1 class labels inline (values strictly
    greater than ``threshold`` become 1), so the function does not depend on
    a separate conversion helper. A sample (column) counts as correct only
    when every output unit matches its target.

    Args:
        Y_hat: predicted probabilities, one column per sample.
        Y: target class labels, comparable element-wise with ``Y_hat``.
        threshold: probability above which a unit is classified as 1.

    Returns:
        Accuracy in the range [0, 1].
    """
    predicted_classes = (np.asarray(Y_hat) > threshold).astype(int)
    return (predicted_classes == Y).all(axis=0).mean()
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="sigmoid"):
    """Back-propagate through one layer.

    Args:
        dA_curr: gradient of the cost with respect to this layer's output.
        W_curr: weight matrix of this layer.
        b_curr: bias of this layer (accepted but not used in the computation).
        Z_curr: pre-activation value stored during the forward pass.
        A_prev: input this layer received during the forward pass.
        activation: "relu" or "sigmoid".

    Returns:
        Tuple ``(dA_prev, dW_curr, db_curr)``: the gradient with respect to
        the layer's input, its weights and its bias. The weight and bias
        gradients are divided by ``A_prev.shape[1]``.

    Raises:
        Exception: if ``activation`` is neither "relu" nor "sigmoid".
    """
    sample_count = A_prev.shape[1]

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA_curr, Z_curr)
    elif activation == "relu":
        dZ = relu_backward(dA_curr, Z_curr)
    else:
        raise Exception('Non-supported activation function')

    weight_grad = np.dot(dZ, A_prev.T) / sample_count
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / sample_count
    input_grad = np.dot(W_curr.T, dZ)

    return input_grad, weight_grad, bias_grad
def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    """Compute weight and bias gradients for every layer.

    Starts from the derivative of the binary cross-entropy cost with respect
    to the network output and walks the layers from last to first.

    Args:
        Y_hat: network output from the forward pass.
        Y: targets; reshaped to ``Y_hat.shape``, so any array with the same
            number of elements is accepted (including 1-D label vectors).
        memory: dict with "A<k-1>" and "Z<k>" entries from the forward pass.
        params_values: dict holding "W<k>" and "b<k>" for each layer.
        nn_architecture: sequence of layer dicts providing "activation".

    Returns:
        dict mapping "dW<k>" and "db<k>" to the gradients of layer k.
    """
    grads_values = {}

    # Reshape first: the targets may arrive in a different (e.g. 1-D) layout.
    Y = Y.reshape(Y_hat.shape)

    # Keep predictions strictly inside (0, 1) so a saturated output does not
    # produce a division by zero (inf/NaN gradients).
    eps = 1e-12
    Y_hat = np.clip(Y_hat, eps, 1 - eps)

    dA_prev = -(np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        # Parameters and cached Z are keyed from 1, cached inputs from 0.
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer["activation"]

        dA_curr = dA_prev

        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]
        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr

    return grads_values
def update(params_values, grads_values, nn_architecture, learning_rate):
    """Apply one gradient-descent step to every layer's parameters.

    Args:
        params_values: dict holding "W<k>" and "b<k>", with k counted from 1.
        grads_values: dict holding the matching "dW<k>" and "db<k>" gradients.
        nn_architecture: sequence of layer descriptions; only its length is
            used, to know how many layers to update.
        learning_rate: step size multiplied with each gradient.

    Returns:
        The same ``params_values`` dict, with its entries updated in place.
    """
    # Parameter keys are 1-based ("W1", "b1", ...), so count layers from 1;
    # counting from 0 would look up the non-existent "W0"/"dW0".
    for layer_idx, _ in enumerate(nn_architecture, start=1):
        params_values["W" + str(layer_idx)] -= learning_rate * grads_values["dW" + str(layer_idx)]
        params_values["b" + str(layer_idx)] -= learning_rate * grads_values["db" + str(layer_idx)]

    return params_values
def train(X, Y, nn_architecture, epochs, learning_rate):
    """Train the network with full-batch gradient descent.

    Parameters are initialised with a fixed seed of 2. Each epoch runs a
    forward pass, records cost and accuracy, runs a backward pass and then
    updates the parameters.

    Args:
        X: training inputs.
        Y: training targets.
        nn_architecture: sequence of layer descriptions.
        epochs: number of passes over the data.
        learning_rate: step size used for the parameter update.

    Returns:
        Tuple ``(params_values, cost_history, accuracy_history)``; the two
        histories hold one entry per epoch.
    """
    params_values = init_layers(nn_architecture, 2)
    cost_history, accuracy_history = [], []

    for _ in range(epochs):
        Y_hat, cache = full_forward_propagation(X, params_values, nn_architecture)

        # Metrics are taken before this epoch's update is applied.
        cost_history.append(get_cost_value(Y_hat, Y))
        accuracy_history.append(get_accuracy_value(Y_hat, Y))

        grads_values = full_backward_propagation(
            Y_hat, Y, cache, params_values, nn_architecture
        )
        params_values = update(
            params_values, grads_values, nn_architecture, learning_rate
        )

    return params_values, cost_history, accuracy_history
Any help is really appreciated. Thanks in advance!