Using matplotlib i am trying to draw the regression line i calculated using a CSV dataset of pollution.
# split the dependent and independent variables
X = df[['O3', 'TEMP', 'PRES', 'DEWP',
'RAIN', 'wd', 'WSPM']]
y = df["SO2"]
# training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
I want to use the values of O3 as x and SO2 as y for this particular scatter plot, as X contains multiple elements i used. the .iloc[:,0].values to make sure it is only the first column. However the code below will generate the correct scattergraph but not the regression plot.
# fit the training data to the models
# LINEAR REGRESSION
linReg.fit(X_train, y_train)
print("Linear Regression Intercept: ",linReg.intercept_ )
print("Linear Regression Coeffients: ", linReg.coef_)
# compute accuracy using R-squared
# LINEAR REGRESSION ACCURACY
y_pred = linReg.predict(X_test)
print("Linear Regression Accuracy Score: ", r2_score(y_test, y_pred))
linScore = r2_score(y_test, y_pred)
plt.scatter(X_train.iloc[:,0].values, y_train, edgecolor='k', facecolor='grey', alpha=0.7, label='Sample data')
plt.plot(X_train.iloc[:,0].values, y_pred, color='r')
plt.ylabel('SO2 Values', fontsize=14)
plt.xlabel('O3 Values', fontsize=14)
plt.legend(facecolor='white', fontsize=11)
it generates this traceback.
ValueError Traceback (most recent call last)
Input In [25], in <cell line: 10>()
7 linScore = r2_score(y_test, y_pred)
9 plt.scatter(X_train.iloc[:,0].values, y_train, edgecolor='k', facecolor='grey', alpha=0.7, label='Sample data')
---> 10 plt.plot(X_train.iloc[:,0].values, y_pred, color='r')
11 plt.ylabel('SO2 Values', fontsize=14)
12 plt.xlabel('O3 Values', fontsize=14)
File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/pyplot.py:2757, in plot(scalex, scaley, data, *args, **kwargs)
2755 @_copy_docstring_and_deprecators(Axes.plot)
2756 def plot(*args, scalex=True, scaley=True, data=None, **kwargs):
-> 2757 return gca().plot(
2758 *args, scalex=scalex, scaley=scaley,
2759 **({"data": data} if data is not None else {}), **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axes/_axes.py:1632, in Axes.plot(self, scalex, scaley, data, *args, **kwargs)
1390 """
1391 Plot y versus x as lines and/or markers.
1392
(...)
1629 (``'green'``) or hex strings (``'#008000'``).
1630 """
1631 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D)
-> 1632 lines = [*self._get_lines(*args, data=data, **kwargs)]
1633 for line in lines:
1634 self.add_line(line)
File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axes/_base.py:312, in _process_plot_var_args.__call__(self, data, *args, **kwargs)
310 this += args[0],
311 args = args[1:]
--> 312 yield from self._plot_args(this, kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axes/_base.py:498, in _process_plot_var_args._plot_args(self, tup, kwargs, return_kwargs)
495 self.axes.yaxis.update_units(y)
497 if x.shape[0] != y.shape[0]:
--> 498 raise ValueError(f"x and y must have same first dimension, but "
499 f"have shapes {x.shape} and {y.shape}")
500 if x.ndim > 2 or y.ndim > 2:
501 raise ValueError(f"x and y can be no greater than 2D, but have "
502 f"shapes {x.shape} and {y.shape}")
ValueError: x and y must have same first dimension, but have shapes (44269,) and (11068,)
here is the working scatter graph

what is wrong with the plot command? am i using the wrong value of x and y?
EDIT: Resizing X to be the same size as y_pred does produce a line, but i'm not sure why it isnt a best fit regression line but a line connecting these points together
X_plot = X_train.iloc[:,0].values
print(X_plot.shape)
print(y_pred.shape)
X_size = np.resize(X_plot,(11068,))
plt.scatter(X_plot, y_train, edgecolor='k', facecolor='grey', alpha=0.7, label='Sample data')
plt.plot(X_size, y_pred, color='r')
plt.ylabel('SO2 Values', fontsize=14)
plt.xlabel('O3 Values', fontsize=14)
plt.legend(facecolor='white', fontsize=11)
