I am trying to fit a linear model to data using Bayesian inference technique. For this, I thought of using PyMC. Naturally, after training a model, I want to test its performance on new data and that's where the problem occurs. I don't seem to be able to set a new dataset. Anybody with experience?
Example script that shows the error:
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
def run_model():
# Generate synthetic data
np.random.seed(42)
x = np.linspace(0, 10, 100)
a_true = 2.5 # True slope
b_true = 1.0 # True intercept
y_true = a_true * x + b_true
y = y_true + np.random.normal(0, 1, size=x.size) # Add some noise
# Split into training and test sets
x_train, x_test = x[:80], x[80:]
y_train, y_test = y[:80], y[80:]
# Define and fit the model
with pm.Model() as linear_model:
# Define x as a pm.Data variable to allow updating with pm.set_data
x_shared = pm.Data("x", x_train)
# Priors for slope and intercept
a = pm.Normal("a", mu=0, sigma=10)
b = pm.Normal("b", mu=0, sigma=10)
sigma = pm.HalfNormal("sigma", sigma=1)
# Expected value of y
mu = a * x_shared + b
# Likelihood
y_obs = pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y_train)
# Sample from the posterior
trace = pm.sample(1000, tune=1000, return_inferencedata=True, chains=1)
# Predict on training data
with linear_model:
pm.set_data({"x": x_train}) # Update data to training
post_pred_train = pm.sample_posterior_predictive(trace)
# Predict on test data
with linear_model:
pm.set_data({"x": x_test}) # Update data to testing
post_pred_test = pm.sample_posterior_predictive(trace)
# Plot results
plt.figure(figsize=(10, 5))
# Plot training data
plt.scatter(x_train, y_train, c="blue", label="Training data")
plt.plot(x_train, y_true[:80], "k--", label="True function")
# Plot posterior predictive for training data
plt.plot(
x_train,
post_pred_train["y_obs"].mean(axis=0),
label="Posterior predictive (train)",
color="red",
)
# Plot test data
plt.scatter(x_test, y_test, c="green", label="Test data")
# Plot posterior predictive for test data
plt.plot(
x_test,
post_pred_test["y_obs"].mean(axis=0),
label="Posterior predictive (test)",
color="orange",
)
plt.legend()
plt.xlabel("x")
plt.ylabel("y")
plt.title("Bayesian Linear Regression with PyMC")
plt.show()
# Summary of the model parameters
print(az.summary(trace, var_names=["a", "b", "sigma"]))
# Only execute if run as the main module
if __name__ == '__main__':
run_model()
Error that results:
ValueError: shape mismatch: objects cannot be broadcast to a single shape. Mismatch is between arg 0 with shape (80,) and arg 1 with shape (20,).
Apply node that caused the error: normal_rv{"(),()->()"}(RNG(<Generator(PCG64) at 0x1F23323B5A0>), [80], Composite{((i0 * i1) + i2)}.0, ExpandDims{axis=0}.0)
Toposort index: 4
Inputs types: [RandomGeneratorType, TensorType(int64, shape=(1,)), TensorType(float64, shape=(None,)), TensorType(float64, shape=(1,))]
Inputs shapes: ['No shapes', (1,), (20,), (1,)]
Inputs strides: ['No strides', (8,), (8,), (8,)]
Inputs values: [Generator(PCG64) at 0x1F23323B5A0, array([80], dtype=int64), 'not shown', array([0.97974278])]
Outputs clients: [[output[1](normal_rv{"(),()->()"}.0)], [output[0](y_obs)]]