I've programmed a linear regression model from scratch. I use the "Sum of squared residuals" as the loss function for gradient descent. For testing I use linear data (y=x)
When running the algorithm, intercept b barely changes. Thus the slope m is not calculated correctly.
%matplotlib qt5
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)
class LinearRegression():
def __init__(self):
self.X = None
self.y = None
def ssr(self, m, b):
sum = 0
for i in range(len(self.X)):
sum += (self.y[i] - (m * self.X[i] + b) ) ** 2
return sum
def ssr_gradient(self, m, b):
sum_m = 0
sum_b = 0
n = len(self.X)
for i in range(n):
error = self.y[i] - (m * self.X[i] + b)
derivative_m = -(2/n) * self.X[i] * error # Derivative w.r.t. m
derivative_b = -(2/n) * error # Derivative w.r.t. b
sum_m += derivative_m
sum_b += derivative_b
return sum_m, sum_b
def fit(self, X, y, m, b): # Gradient Descent
self.X = X
self.y = y
M, B = np.meshgrid(np.arange(-10, 10, 0.1), np.arange(-10, 10, 0.1))
SSR = np.zeros_like(M)
for i in range(M.shape[0]):
for j in range(M.shape[1]):
SSR[i, j] = self.ssr(M[i, j], B[i, j])
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
gd_model = fig.add_subplot(121, projection="3d", computed_zorder=False)
lin_reg_model = axes[1]
current_pos = (m, b, self.ssr(m, b))
learning_rate = 0.001
min_step_size = 0.001
max_steps = 1000
current_steps = 0
while(current_steps < max_steps):
M_derivative, B_derivative = self.ssr_gradient(current_pos[0], current_pos[1])
M_step_size, B_step_size = M_derivative * learning_rate, B_derivative * learning_rate
if abs(M_step_size) < min_step_size or abs(B_step_size) < min_step_size:
break
M_new, B_new = current_pos[0] - M_step_size, current_pos[1] - B_step_size
current_pos = (M_new, B_new, self.ssr(M_new, B_new))
print(f"Parameters: m: {current_pos[0]}; b: {current_pos[1]}; SSR: {current_pos[2]}")
current_steps += 1
x = np.arange(0, 10, 1)
y = current_pos[0] * x + current_pos[1]
lin_reg_model.scatter(X_train, y_train, label="Train", s=75, c="#1f77b4")
lin_reg_model.plot(x, y)
gd_model.plot_surface(M, B, SSR, cmap="viridis", zorder=0)
gd_model.scatter(current_pos[0], current_pos[1], current_pos[2], c="red", zorder=1)
gd_model.set_xlabel("Slope m")
gd_model.set_ylabel("Intercept b")
gd_model.set_zlabel("Sum of squared residuals")
plt.tight_layout()
plt.pause(0.001)
gd_model.clear()
lin_reg_model.clear()
self.m = current_pos[0]
self.b = current_pos[1]
def predict(self, X_test):
return self.m * X_test + self.b
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train, 1, 10)
This is the result for initial values m=1 and b=10:
Parameters: m: -0.45129949840919587; b: 9.50972664859535; SSR: 145.06534359577407

Obviously this isn't optimal because my data is linear. So the optimal parameters should be m=1 and b=0 But I cannot find the problem in my code. The algorithm prints different results based on the initial values, but it should print the same result over and over again as long as there is exactly one minimum of the SSR function.
I tried using different learning rates but the problem persists.
n, when the loss function doesn't (unless I missed something)? If the thing you want to minimize is Σᵢ (yᵢ-mxᵢ-b)², then the derivative wrt m is Σᵢ -2xᵢ(yᵢ-mxᵢ-b), without any division byn. It is quite common to use rather (Σᵢ (yᵢ-mxᵢ-b)²)/n as loss function (which changes nothing, of course. Minimum is the same. The interest of doing so is to have some absolute meaning to loss function, independent from the size of the training dataset: the loss function is the average quadratic error, rather than the sum). And then, indeed, derivative for theandhere. Stop as long as none of the parameter evolve sufficiently.