I'm a young programmer that was interested by machine learning. I watched videos and read articles about the theory behind simple neural networks. However, I can't manage to set it up correctly. I've already managed a simple linear regression AI but I can't manage to make it deeper. I wanted to make An AI that could act as a XOR gate so that it outputs 1 if only one of the inputs is 1.
I then asked Chat GPT to give an example of code for a neural network from scratch and this was the result:
import numpy as np
class NeuralNetwork:
def __init__(self, input_size, hidden_size, output_size):
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
# Initialize weights with random values
self.W1 = np.random.randn(self.input_size, self.hidden_size)
self.b1 = np.zeros((1, self.hidden_size))
self.W2 = np.random.randn(self.hidden_size, self.output_size)
self.b2 = np.zeros((1, self.output_size))
def forward(self, X):
# Forward pass
self.z1 = np.dot(X, self.W1) + self.b1
self.a1 = self.sigmoid(self.z1)
self.z2 = np.dot(self.a1, self.W2) + self.b2
self.a2 = self.sigmoid(self.z2)
return self.a2
def backward(self, X, y, learning_rate):
# Backward pass
m = X.shape[0]
# Calculate gradients
self.dz2 = self.a2 - y
self.dW2 = np.dot(self.a1.T, self.dz2) / m
self.db2 = np.sum(self.dz2, axis=0, keepdims=True) / m
self.dz1 = np.dot(self.dz2, self.W2.T) * self.sigmoid_derivative(self.z1)
self.dW1 = np.dot(X.T, self.dz1) / m
self.db1 = np.sum(self.dz1, axis=0, keepdims=True) / m
# Update weights and biases
self.W2 -= learning_rate * self.dW2
self.b2 -= learning_rate * self.db2
self.W1 -= learning_rate * self.dW1
self.b1 -= learning_rate * self.db1
def train(self, X, y, num_iterations, learning_rate):
for i in range(num_iterations):
# Forward and backward pass
output = self.forward(X)
self.backward(X, y, learning_rate)
# Compute and print the loss
loss = self.compute_loss(y, output)
if i % 1000 == 0:
print(f"Loss after iteration {i}: {loss}")
def predict(self, X):
# Perform forward pass to get predictions
return self.forward(X)
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def sigmoid_derivative(self, x):
return self.sigmoid(x) * (1 - self.sigmoid(x))
def compute_loss(self, y, y_hat):
# Calculate binary cross-entropy loss
loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
return loss
# Example usage
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
# Create a neural network with 2 input units, 2 hidden units, and 1 output unit
nn = NeuralNetwork(2, 3, 1)
# Train the neural network
nn.train(X, y, num_iterations=10000, learning_rate=0.1)
# Make predictions
predictions = nn.predict(X)
print(f"Predictions: {predictions}")
And this works quite well:
Loss after iteration 0: 0.7397250928187518
Loss after iteration 1000: 0.6786126098666916
Loss after iteration 2000: 0.5819465493933369
Loss after iteration 3000: 0.24539622398185007
Loss after iteration 4000: 0.08584491893702054
Loss after iteration 5000: 0.047898404813754
Loss after iteration 6000: 0.032597019709755495
Loss after iteration 7000: 0.024527823714170643
Loss after iteration 8000: 0.019591047168036294
Loss after iteration 9000: 0.016275136075386054
Predictions: [[0.01397655]
[0.98678561]
[0.98342758]
[0.01144902]]
However I wanted to make it myself and on the way implement the possibility of having more than 1 hidden layer. I thought of initializing it with an array like [2, 3, 4, 5, 1] where the items represent the number of neurons in each layer:
import numpy as np
class NeuralNetwork():
def __init__(self, layers):
self.layers = layers
self.W = []
self.B = []
self.O = []
self.A = []
self.dW = []
self.dB = []
for layer in range(len(self.layers)-1):
self.W.append(np.random.randn(self.layers[layer], self.layers[layer+1]))
self.B.append(np.zeros((1, self.layers[layer+1])))
@property
def weights(self):
'''Weights, Biases, Outputs and Activations of the Neural Network'''
return f"Weights:\n{self.W}:\nBiases:\n{self.B}\nOutputs:\n{self.O}\nActivations:\n{self.A}"
def __str__(self):
return f"Neural Network: {self.layers}"
def sigmoid(self, x):
'''Activation function'''
return 1 / (1 + np.exp(x))
def sigmoid_derivative(self, x):
'''Derivative of Activation Function'''
return self.sigmoid(x) * (1 - self.sigmoid(x))
def mean_square_error(self, predict, y):
'''Loss function'''
# Hinge Loss is used to update weights but this is used for showing improvements
return (predict - y)**2
def forward(self, x):
'''Returns the prediction'''
self.O = []
self.A = []
output = x
self.A.append(x)
for layer in range(len(self.layers)-1):
self.O.append(np.dot(output, self.W[layer]) + self.B[layer])
self.A.append(self.sigmoid(self.O[layer]))
output = self.O[layer]
return self.A[len(self.layers) - 1]
def backward(self, X, predict, y, learning_rate):
'''Uses Batch Gradient Descent to update the weights'''
m = X.shape[0] # Batch Gradient Descent divides dW and dB by the number of samples
self.dW = []
self.dB = []
delta = predict - y # Hinge Loss
for layer in reversed(range(len(self.layers)-1)):
self.dW.append(np.dot(self.A[layer].T, delta) / m)
self.dB.append(np.sum(delta, axis=0, keepdims=True) / m)
# Update delta according to previous layer only if it is not input layer
delta = np.dot(delta, self.W[layer].T) * self.sigmoid_derivative(self.O[layer-1]) if layer != 0 else 0
# Reverse the partial derivatives to update in order of layer
self.dW.reverse()
self.dB.reverse()
# Update Parameters
for layer in range(len(self.layers)-1):
self.W[layer] -= self.dW[layer] * learning_rate
self.B[layer] -= self.dB[layer] * learning_rate
def train(self, X, y, epochs, learning_rate):
'''Trains the Network while writing the loss'''
for epoch in range(epochs):
predict = self.forward(X)
self.backward(X, predict, y, learning_rate)
if epoch % 1000 == 0:
loss = 0
for i in range(len(X)):
predict = self.forward(X[i])
loss += self.mean_square_error(predict, y[i])
print(f"Loss after iteration {epoch}: {float(loss)}")
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
nn = NeuralNetwork([2, 3, 1])
nn.train(X, y, epochs=10000, learning_rate=0.01)
predict = nn.forward(X)
print(f"Predict:\n{predict}")
However, this doesn't work. It seems there's only an issue with how batch gradient descent is applied and how parameters are updated.
So I focused on the parameters update part:
for layer in range(len(self.layers)-1):
self.W[layer] -= self.dW[layer] * learning_rate
self.B[layer] -= self.dB[layer] * learning_rate
I think that I have to use - as the loss = predict - y but when I do so:
Loss after iteration 0: 1.705589123036724
Loss after iteration 1000: 1.9999963657643962
Loss after iteration 2000: 1.9999999999940412
Loss after iteration 3000: 1.9999999999999998
Loss after iteration 4000: 2.0
Loss after iteration 5000: 2.0
Loss after iteration 6000: 2.0
Loss after iteration 7000: 2.0
Loss after iteration 8000: 2.0
Loss after iteration 9000: 2.0
Predict:
[[3.23480666e-32]
[9.63652615e-41]
[8.57030848e-40]
[2.55310473e-48]]
Its loss increases and it always outputs 0!
So I tried with a + to see if maybe it was backwards:
Loss after iteration 0: 1.9604741377657597
Loss after iteration 1000: 1.214654311749028
Loss after iteration 2000: 1.0401188666251346
Loss after iteration 3000: 1.0055208047391466
Loss after iteration 4000: 1.0016438815385298
Loss after iteration 5000: 1.0014348630015828
Loss after iteration 6000: 1.0015780199117614
Loss after iteration 7000: 1.0017468994611856
Loss after iteration 8000: 1.0019424663190586
Loss after iteration 9000: 1.0021913419187256
Predict:
[[0.47496774]
[0.52497857]
[0.47819749]
[0.52820627]]
But now it only outputs 0.5 which is right in the middle but that doesn't solve the problem, it just minimizes the error in it's way. Before I had tried before to code (but trashed the code as it didn't work) and I thought it was because my previous code used SGD (Stochastic Gradient Descent) and therefore it would minimize the error for each of the examples so I tried implementing Batch Gradient Descent as in Chat GPT example, therefore handling all of the possible inputs, outputs and correct answers in their respectives matrices. I can't figure out which part is wrong. I'm not sure if this was more convenient to post here or on stack overflow as there's more theory to it than programming but it is possibly a programming issue. Help :(