
I'm trying to solve a system of linear equations in Python with the gradient descent method, by minimizing ∥Ax - b∥^2.

The linear equations are:

  • x - 2y + 3z = - 1
  • 3x + 2y - 5z = 3
  • 2x - 5y + 2z = 0

The solution is x = 0.2608, y = -0.0869, z = -0.4782 (a quick check with a direct solver is included below), but my implementation of the method incorrectly gives:

  • x = 0.6397305590418562
  • y = 0.24242515764463315
  • z = -0.4736842052386

Can someone help me realize my mistake?
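
For reference, the expected values above can be checked with a direct solver; a quick sketch using np.linalg.solve:

import numpy as np

A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
b = np.array([-1, 3, 0])

# Exact solution of the 3x3 system, for comparison with gradient descent
print(np.linalg.solve(A, b))  # approximately [ 0.2609 -0.0870 -0.4783]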


import numpy as np
import sympy as sp
from IPython.display import display  # display() is available by default in notebooks

x, y, z = sp.symbols('x y z')

A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
X = np.array([[x, y, z]])
b = np.array([[-1], [3], [0]])

f = (A*X - b).transpose() * (A*X - b)
display(f)
print('---')

# Define the function and derivatives
g = (x + 1)**2 + (1 - 2*y)**2 + (3*z + 1)**2 + (3*x - 3)**2 + (2*y - 3)**2 + (-5*z - 3)**2 + 4*x**2 + 25*y**2 + 4*z**2
g = sp.expand(g)
print(g)
print('---')
g_x = sp.diff(g, x)
g_y = sp.diff(g, y)
g_z = sp.diff(g, z)
print(g_x)
print(g_y)
print(g_z)
print('---')

# Define the NumPy function
def g_func(x, y, z):
    return 14*x**2 - 16*x + 33*y**2 - 16*y + 38*z**2 + 36*z + 30

def grad_g(x, y, z):
    return 28*x - 16, 66*y - 16, 76*z + 36

# Running the Gradient Descent Method
cur_pos = (10, 10, 10)

def gradient_descent(f, grad, x0, alpha = 0.01, max_iter = 1000, tol = 1e-6):
    k = 0
    while k < max_iter and abs(grad(x0[0], x0[1], x0[2])[0]) > tol and abs(grad(x0[0], x0[1], x0[2])[1]) > tol and abs(grad(x0[0], x0[1], x0[2])[2]) > tol:
        x_der, y_der, z_der = grad(x0[0], x0[1], x0[2])
        x0 = x0[0] - alpha * x_der, x0[1] - alpha * y_der, x0[2] - alpha * z_der
        k += 1
    return x0[0], x0[1], x0[2], f(x0[0], x0[1], x0[2]), grad(x0[0], x0[1], x0[2])

gradient_descent(g_func, grad_g, cur_pos)

2 Answers

import numpy as np

# Define the system of equations
A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
b = np.array([[-1], [3], [0]])

# Define the objective function
def objective_function(X):
    return 0.5 * np.linalg.norm(A @ X - b) ** 2

# Define the gradient of the objective function
def gradient(X):
    return A.T @ (A @ X - b)

# Gradient descent method
def gradient_descent(A, b, alpha=0.01, max_iter=1000, tol=1e-6):
    X = np.random.rand(A.shape[1], 1)  # Start with a random initial guess
    for i in range(max_iter):
        grad = gradient(X)
        X_new = X - alpha * grad
        if np.linalg.norm(X_new - X) < tol:
            break
        X = X_new
        # For debugging or analysis, you can print the objective function value
        # print(f"Iteration {i}, Objective Function Value: {objective_function(X)}")
    return X

# Running the Gradient Descent Method
solution = gradient_descent(A, b)
print("Solution found by gradient descent:")
print("x =", solution[0, 0])
print("y =", solution[1, 0])
print("z =", solution[2, 0])
Solution found by gradient descent:
x = 0.2609087468229011
y = -0.08692084284170505
z = -0.47822679318745703

Explanation of the corrections:

  1. Objective function: The objective function is correctly defined as $$f(X) = \frac{1}{2}\|AX - b\|^2$$.
  2. Gradient calculation: The gradient of the objective function is $$\nabla f(X) = A^\top (AX - b)$$ (a short derivation follows this list).
  3. Gradient Descent Implementation: A standard gradient descent loop is implemented, with a stopping criterion based on the change in $$X$$ (the solution vector).
  4. Debugging Information: An optional print of the objective function value at each iteration is included to help track convergence.
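
For completeness, a short derivation of that gradient (standard least-squares algebra, using the same notation as the code above):

$$f(X) = \frac{1}{2}\|AX - b\|^2 = \frac{1}{2}(AX - b)^\top (AX - b) = \frac{1}{2}X^\top A^\top A X - b^\top A X + \frac{1}{2}b^\top b$$

$$\nabla f(X) = A^\top A X - A^\top b = A^\top (AX - b)$$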

This ensures the gradient descent method minimizes the correct objective function and should converge to the correct solution.

Issues in original implementation:

  • Incorrect Objective Function
  • Incorrect Gradient Calculation
  • The gradient descent function was based on an incorrect gradient and objective function, which led to incorrect updates and convergence to the wrong solution.
  • The initial guess cur_pos = (10, 10, 10) was arbitrary, and the convergence criterion based on tolerances for the individual gradient components was not ideal; it is generally better to check the norm of the whole gradient vector (see the sketch below).
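
As an illustration of that last point, here is a minimal sketch in the style of the question's scalar functions, but with the corrected objective and a stopping test on the norm of the whole gradient vector (the function names mirror the original code; the bodies are rewritten):

import numpy as np

# Corrected objective: g(x, y, z) = ||A [x, y, z]^T - b||^2, i.e. the sum of squared residuals per equation
def g_func(x, y, z):
    return (x - 2*y + 3*z + 1)**2 + (3*x + 2*y - 5*z - 3)**2 + (2*x - 5*y + 2*z)**2

def grad_g(x, y, z):
    r1 = x - 2*y + 3*z + 1
    r2 = 3*x + 2*y - 5*z - 3
    r3 = 2*x - 5*y + 2*z
    # Chain rule: each residual contributes 2 * r_i * (its coefficient on the variable)
    return np.array([2*r1 + 6*r2 + 4*r3,
                     -4*r1 + 4*r2 - 10*r3,
                     6*r1 - 10*r2 + 4*r3])

def gradient_descent(f, grad, x0, alpha=0.01, max_iter=10000, tol=1e-6):
    x = np.asarray(x0, dtype=float)
    for _ in range(max_iter):
        g = grad(*x)
        if np.linalg.norm(g) < tol:  # single test on the norm of the gradient vector
            break
        x = x - alpha * g
    return x, f(*x)

print(gradient_descent(g_func, grad_g, (10, 10, 10)))
# converges to approximately [ 0.2609 -0.0870 -0.4783]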

Note:

  • The @ operator can be used as a shorthand for np.matmul on ndarrays.
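
For instance, both spellings below compute the same matrix product (tiny illustrative snippet):

import numpy as np

A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
v = np.array([[1.0], [2.0], [3.0]])

print(np.array_equal(A @ v, np.matmul(A, v)))  # True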

The same approach, wrapped in a class that also records the iterates and plots the convergence history:

import numpy as np
import matplotlib.pyplot as plt


class GradientDescentSolver:
    def __init__(self, A: np.ndarray, b: np.ndarray, alpha: float = 0.01, max_iter: int = 1000, tol: float = 1e-6):
        self.A = A
        self.b = b
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.history = []
        self.objective_values = []

    def objective_function(self, X: np.ndarray) -> float:
        return 0.5 * np.linalg.norm(self.A @ X - self.b) ** 2

    def gradient(self, X: np.ndarray) -> np.ndarray:
        return self.A.T @ (self.A @ X - self.b)

    def solve(self) -> np.ndarray:
        X = np.random.rand(self.A.shape[1], 1)  # Start with a random initial guess
        for i in range(self.max_iter):
            grad = self.gradient(X)
            X_new = X - self.alpha * grad
            if np.linalg.norm(X_new - X) < self.tol:
                break
            X = X_new
            self.history.append(X.copy())
            self.objective_values.append(self.objective_function(X))
        return X

    def plot_solution(self) -> None:
        if not self.history:
            raise ValueError("No history found. Run solve() first.")
        
        history = np.array(self.history).squeeze()
        fig, axs = plt.subplots(2, figsize=(10, 10))

        # Plot objective function value
        axs[0].plot(self.objective_values, label='Objective Function Value')
        axs[0].set_xlabel('Iteration')
        axs[0].set_ylabel('Objective Function Value')
        axs[0].set_title('Gradient Descent Convergence')
        axs[0].legend()

        # Plot parameter values
        axs[1].plot(history[:, 0], label='x')
        axs[1].plot(history[:, 1], label='y')
        axs[1].plot(history[:, 2], label='z')
        axs[1].set_xlabel('Iteration')
        axs[1].set_ylabel('Parameter Values')
        axs[1].set_title('Parameter Trajectories')
        axs[1].legend()

        plt.tight_layout()
        plt.show()

# Define the system of equations
A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
b = np.array([[-1], [3], [0]])

# Running the Gradient Descent Method
solver = GradientDescentSolver(A, b)
solution = solver.solve()

print("Solution found by gradient descent:")
print("x =", solution[0, 0])
print("y =", solution[1, 0])
print("z =", solution[2, 0])

# Plot the solution if desired
solver.plot_solution()
Solution found by gradient descent:
x = 0.2609088084845793
y = -0.08692078669238036
z = -0.4782267395600823

[Plots: objective function value per iteration and the parameter trajectories for x, y, z]


You can use the sum of the squared residuals as the objective function and update with its gradient:

import numpy as np


def gradient_descent(A, b, alpha=0.01, max_iter=1000, tol=1e-6):
    def _grad(X):
        # Gradient of the objective ||A @ X - b||^2
        return 2 * A.T @ (A @ X - b)
    X = np.zeros(A.shape[1])
    for _ in range(max_iter):
        NX = X - alpha * _grad(X)
        if np.linalg.norm(NX - X) < tol:
            break
        X = NX
    return X


A = np.array([[1, -2, 3], [3, 2, -5], [2, -5, 2]])
b = np.array([-1, 3, 0])
print(gradient_descent(A, b))

Prints


[ 0.26088918 -0.08693866 -0.47824381]

