
I am using the ols.py code downloaded from the SciPy Cookbook (the download link is in the first paragraph, with the bold OLS), but I need to understand how to use my own data, rather than random data, with the ols function to do a multiple linear regression.

I have a specific dependent variable y, and three explanatory variables. Every time I try to put in my variables in place of the random variables, it gives me the error:

TypeError: this constructor takes no arguments.

Can anyone help? Is this possible to do?
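
For context, this message is what Python 2 raises when an old-style class is instantiated with arguments but defines no __init__ method, which can happen if the indentation of a pasted class body gets broken. A minimal sketch that reproduces it:

# Minimal reproduction: in Python 2, an old-style class with no
# __init__ cannot be called with constructor arguments.
class ols:
    pass

def __init__(self, y, x):  # mis-indented: a module-level function, not a method
    self.y = y
    self.x = x

m = ols([1, 2, 3], [4, 5, 6])  # TypeError: this constructor takes no arguments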


Here is a copy of the ols code I am trying to use, along with the variables that I am trying to input:

from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time

class ols:
    """
    Author: Vincent Nijs (+ ?)

    Email: v-nijs at kellogg.northwestern.edu

    Last Modified: Mon Jan 15 17:56:17 CST 2007

    Dependencies: See import statement at the top of this file

    Doc: Class for multi-variate regression using OLS

    Input:
        y = dependent variable
        y_varnm = string with the variable label for y
        x = independent variables, note that a constant is added by default
        x_varnm = string or list of variable labels for the independent variables

    Output:
        There are no values returned by the class. Summary provides printed output.
        All other measures can be accessed as follows:

        Step 1: Create an OLS instance by passing data to the class

            m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])

        Step 2: Get specific metrics

            To print the coefficients:
                >>> print m.b
            To print the coefficients' p-values:
                >>> print m.p
    """

    y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8,
         38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5,
         77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7]

    # tuition
    x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891,
          971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179,
          2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291]

    # research and development
    x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00,
          72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00,
          151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00,
          267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00,
          397629.00]

    # one/none parents
    x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649,
          15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978,
          22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319]


    def __init__(self,y,x1,y_varnm = 'y',x_varnm = ''):
        """
        Initializing the ols class.
        """
        self.y = y
        #self.x1 = c_[ones(x1.shape[0]),x1]
        self.y_varnm = y_varnm
        if not isinstance(x_varnm,list):
            self.x_varnm = ['const'] + list(x_varnm)
        else:
            self.x_varnm = ['const'] + x_varnm

        # Estimate model using OLS
        self.estimate()

    def estimate(self):

        # estimating coefficients, and basic stats
        self.inv_xx = inv(dot(self.x.T,self.x))
        xy = dot(self.x.T,self.y)
        self.b = dot(self.inv_xx,xy)                    # estimate coefficients

        self.nobs = self.y.shape[0]                     # number of observations
        self.ncoef = self.x.shape[1]                    # number of coef.
        self.df_e = self.nobs - self.ncoef              # degrees of freedom, error
        self.df_r = self.ncoef - 1                      # degrees of freedom, regression

        self.e = self.y - dot(self.x,self.b)            # residuals
        self.sse = dot(self.e,self.e)/self.df_e         # SSE
        self.se = sqrt(diagonal(self.sse*self.inv_xx))  # coef. standard errors
        self.t = self.b / self.se                       # coef. t-statistics
        self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2    # coef. p-values

        self.R2 = 1 - self.e.var()/self.y.var()         # model R-squared
        self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef))   # adjusted R-square

        self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)  # model F-statistic
        self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)  # F-statistic p-value

    def dw(self):
        """
        Calculates the Durbin-Watson statistic
        """
        de = diff(self.e,1)
        dw = dot(de,de) / dot(self.e,self.e)

        return dw

    def omni(self):
        """
        Omnibus test for normality
        """
        return stats.normaltest(self.e)

    def JB(self):
        """
        Calculate residual skewness, kurtosis, and do the JB test for normality
        """

        # Calculate residual skewness and kurtosis
        skew = stats.skew(self.e)
        kurtosis = 3 + stats.kurtosis(self.e)

        # Calculate the Jarque-Bera test for normality
        JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
        JBpv = 1-stats.chi2.cdf(JB,2)

        return JB, JBpv, skew, kurtosis

    def ll(self):
        """
        Calculate model log-likelihood and two information criteria
        """

        # Model log-likelihood, AIC, and BIC criterion values
        ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
        aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
        bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs

        return ll, aic, bic

    def summary(self):
        """
        Printing model output to screen
        """

        # local time & date
        t = time.localtime()

        # extra stats
        ll, aic, bic = self.ll()
        JB, JBpv, skew, kurtosis = self.JB()
        omni, omnipv = self.omni()

        # printing output to screen
        print '\n=============================================================================='
        print "Dependent Variable: " + self.y_varnm
        print "Method: Least Squares"
        print "Date: ", time.strftime("%a, %d %b %Y",t)
        print "Time: ", time.strftime("%H:%M:%S",t)
        print '# obs:               %5.0f' % self.nobs
        print '# variables:     %5.0f' % self.ncoef
        print '=============================================================================='
        print 'variable     coefficient     std. Error      t-statistic     prob.'
        print '=============================================================================='
        for i in range(len(self.x_varnm)):
            print '''% -5s          % -5.6f     % -5.6f     % -5.6f     % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]])
        print '=============================================================================='
        print 'Models stats                         Residual stats'
        print '=============================================================================='
        print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' % tuple([self.R2, self.dw()])
        print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
        print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
        print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
        print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
        print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
        print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
        print '=============================================================================='

if __name__ == '__main__':

    ##########################
    ### testing the ols class
    ##########################

    # intercept is added, by default
    m = ols(y,x1,y_varnm = 'y',x_varnm = ['x1','x2','x3'])
    m.summary()
  • Pretty good first post. I think you posted too much code with the ols class (we probably just needed your code). Also, @Dat Chu is right about the punctuation; your question is hard to read. Commented Sep 18, 2011 at 1:13
  • Anyone can suggest an edit. I've put some punctuation in, which should help. @RenCam - do you mean "multilinear"? I've never heard of mulilinear regression and your post comes up first on Google, which suggests you have made a typo in the title of your question. You might want to edit that. Commented Sep 18, 2011 at 9:54
  • @Verbeia most people call it "multiple linear regression" when you have multiple independent variables (e.g. the model is y[i] = a1*x1[i] + a2*x2[i] + ... + an*xn[i] + e[i], and you wish to minimize sum e[i]^2). Commented Sep 19, 2011 at 2:10
  • @Foo Bah Yeah, and that's what I call it too (I'm an economist). But I didn't want to edit the guy's post for that one without asking. Commented Sep 19, 2011 at 2:20
  • @Verbeia some very old books (for example, Box and Jenkins) actually use the phrase multilinear. Commented Sep 19, 2011 at 5:21

3 Answers


You should distinguish between two cases: (i) you just want to solve the equation, or (ii) you also want the statistical information about your model. You can do (i) with np.linalg.lstsq; for (ii), you are better off using statsmodels.

Below is an example with both solutions:

# The standard imports
import numpy as np
import pandas as pd

# For the statistics
from statsmodels.formula.api import ols

def generatedata():
    ''' Generate and show the data '''
    x = np.linspace(-5,5,101)
    (X,Y) = np.meshgrid(x,x)

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   

    Z = -5 + 3*X-0.5*Y+np.random.randn(np.shape(X)[0], np.shape(X)[1])

    return (X.flatten(),Y.flatten(),Z.flatten())

def regressionmodel(X,Y,Z):
    '''Multilinear regression model, calculating fit, P-values, confidence intervals etc.'''

    # Convert the data into a Pandas DataFrame
    df = pd.DataFrame({'x':X, 'y':Y, 'z':Z})

    # Fit the model
    model = ols("z ~ x + y", df).fit()

    # Print the summary
    print(model.summary())

    return model._results.params  # should be array([-4.99754526,  3.00250049, -0.50514907])

def linearmodel(X,Y,Z):
    '''Just fit the plane'''

    M = np.vstack((np.ones(len(X)), X, Y)).T
    bestfit = np.linalg.lstsq(M,Z)[0]
    print('Best fit plane:', bestfit)

    return bestfit

if __name__ == '__main__':
    (X,Y,Z) = generatedata()    
    regressionmodel(X,Y,Z)    
    linearmodel(X,Y,Z)
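
Note that np.linalg.lstsq returns a tuple (solution, residuals, rank, singular values), which is why linearmodel keeps only element [0]; the statsmodels fit, by contrast, exposes the full inferential output (standard errors, p-values, confidence intervals) through summary().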



Maybe using http://pypi.python.org/pypi/scikits.statsmodels is easier; it also has more features.

import numpy as np
import scikits.statsmodels.api as sm


y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8,
            38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5,
            77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7]
        #tuition
x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891,
            971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179,
            2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291]
        #research and development
x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00,
            72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00,
            151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00, 
            267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00,
            397629.00]
        #one/none parents 
x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649,
            15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978,
            22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319]


x = np.column_stack((x1,x2,x3))  #stack explanatory variables into an array
x = sm.add_constant(x, prepend=True) #add a constant

res = sm.OLS(y,x).fit() #create a model and fit it
print res.params
print res.bse
print res.summary()
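
Note that with prepend=True the constant is the first column of x, so res.params[0] is the intercept and the remaining entries correspond to x1, x2, and x3 in order.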



The ols function takes the entire independent data set as the second argument. Try

m = ols(y, [x1, x2, x3], ...)

though I suspect you may need to wrap it in numpy arrays:

import numpy

x = numpy.ndarray([3, len(x1)])
x[0] = numpy.array(x1)
x[1] = numpy.array(x2)
x[2] = numpy.array(x3)
m = ols(y, x, ...)
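
Note that the cookbook class reads the number of coefficients from x.shape[1], i.e. it expects one row per observation and one column per variable, so an array built row-wise as above would still need transposing. A minimal sketch, assuming numpy is imported as np and a working ols class as in the question:

x = np.column_stack((x1, x2, x3))   # shape (36, 3): one column per regressor
# equivalently: x = np.array([x1, x2, x3]).T

m = ols(np.array(y), x, y_varnm='y', x_varnm=['x1', 'x2', 'x3'])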

5 Comments

Thank you! When I tried passing the entire independent data set, it did not recognize it and output the same error. Then I tried wrapping the arrays (I am not that familiar with that): I added x = np.ndarray([3, len(x1)]); x[0] = np.ndarray(x1); x[1] = np.ndarray(x2); x[2] = np.ndarray(x3), and it keeps giving me the error ValueError: sequence too large; must be smaller than 32
@RenCam Sorry, it should have said x[0] = np.array(x1). I've edited my response.
I'm really sorry to ask again, but now the error says ValueError: setting an array element with a sequence?
You could try x = np.hstack([x1, x2, x3]). It might be vstack. I can't remember for sure. Come to think of it, you might be able to just do x = np.array([x1, x2, x3])
Thank you, I tried all three but it just came back with the error TypeError: this constructor takes no arguments
