1

Can anyone explain to me why this code:

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification 
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve 
import matplotlib.pyplot as plt 
import pickle
#import neptune.new as neptune
import pandas as pd


# Toy data set: two numeric feature columns followed by the class label.
df = pd.DataFrame(
    {
        'Height': [167, 175, 170, 186, 190, 188, 158, 169, 183, 180],
        'Weight': [65, 70, 72, 80, 86, 94, 50, 58, 78, 85],
        'Team': ['A', 'A', 'B', 'B', 'B', 'B', 'A', 'A', 'B', 'A'],
    }
)

# Every column except the last one is a feature; the last column is the target.
full_X_train = df[df.columns[:-1]]
full_y_train = df[df.columns[-1]]


def create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml'):
    """Cross-validate an estimator with stratified k-fold and return per-fold F1 scores.

    For each fold the estimator is fitted on the training part, pickled to
    ``<file_name>.<fold>.fold.json`` (fold numbers start at 1) and scored with
    ``f1_score`` on the held-out part.

    Args:
        X_train: Feature table; rows are selected with ``.iloc``, so a pandas
            DataFrame is expected.
        y_train: Target labels aligned with ``X_train``.
        model_name: Estimator instance exposing ``fit`` and ``predict``. The
            same instance is re-fitted on every fold.
        n_splits: Number of stratified folds.
        file_name: Prefix of the per-fold pickle files.

    Returns:
        list: One F1 score per fold, in fold order.
    """
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    # The splitter yields positional indices, so pick target rows by position
    # when y_train is a pandas object; plain arrays are indexed directly.
    y_rows = y_train.iloc if hasattr(y_train, 'iloc') else y_train

    f1 = []
    for fold, (train_index, test_index) in enumerate(k_fold.split(X_train, y_train), start=1):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_rows[train_index], y_rows[test_index]

        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)

        # The file holds a pickle despite the '.json' suffix, which is kept
        # so existing file names stay the same.
        save_mod = file_name + '.' + str(fold) + '.fold.json'
        with open(save_mod, 'wb') as fold_file:
            pickle.dump(clf, fold_file)

        # NOTE(review): f1_score uses pos_label=1 by default; with string class
        # labels such as 'A'/'B' this likely needs an explicit pos_label - confirm.
        f1.append(f1_score(y_test_fold, y_pred))
    return f1




def get_scores(model,output_file = 'output.txt'):
    """Append a one-line F1 summary to ``output_file``.

    The line has the form ``F1: mean=<m> std=<s>, n=<k>`` where the mean and
    standard deviation are multiplied by 100 and ``k`` is the number of scores.

    Args:
        model: Indexable object whose element at index 2 is a sequence of
            F1 scores.
        output_file: Path of the text file to append to.

    Returns:
        None.
    """
    # NOTE(review): model[2] must be a sequence; if a flat list of scores is
    # passed instead, model[2] is a single number and len() will raise.
    f1_scores = model[2]
    summary = 'F1: mean=%.2f std=%.2f, n=%d' % (mean(f1_scores)*100, std(f1_scores)*100, len(f1_scores))
    # The context manager guarantees the handle is flushed and closed.
    with open(output_file, 'a') as open_output:
        open_output.write(summary + '\n')
    return


def run_model_with_grid_search(model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='test_id', n_splits=5, output_file='', param_grid={}): 
    """Run a 3-fold, accuracy-scored grid search and refit the best estimator.

    Args:
        model_name: Estimator instance to tune.
        X_train: Training features.
        y_train: Training labels.
        model_id: Not used by this function.
        n_splits: Not used by this function; the search always uses ``cv=3``.
        output_file: Not used by this function.
        param_grid: Parameter grid handed to ``GridSearchCV``.

    Returns:
        tuple: ``(fitted_search, best_params, best_score)``.
    """
    grid = GridSearchCV(
        estimator=model_name,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        refit=True,
    )
    # fit() returns the search object itself, so keep working with `grid`.
    grid.fit(X_train, y_train)
    return grid, grid.best_params_, grid.best_score_


# Grid-search max_depth over 5..8 using the helper's default RandomForestClassifier and data.
fit_model,params,best_score = run_model_with_grid_search(param_grid=[{'max_depth':list(range(5,9))}])
# NOTE(review): fit_model is passed positionally, so it binds to create_model's
# first parameter (X_train) rather than to model_name.
model = create_model(fit_model) #n_jobs=-1 
# get_scores ends with a bare `return`, so this prints None.
print(get_scores(model)) 

Returns:

  File "ml_models.py", line 84, in <module>
    model = create_model(fit_model) #n_jobs=-1 
  File "ml_models.py", line 50, in create_model
    for train_index,test_index in k_fold.split(X_train,y_train): 
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 324, in split
    X, y, groups = indexable(X, y, groups)
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 299, in indexable
    check_consistent_length(*result)
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in check_consistent_length
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in <listcomp>
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 203, in _num_samples
    " a valid collection." % x)
TypeError: Singleton array array(GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [5, 6, 7, 8]}], scoring='accuracy'),
      dtype=object) cannot be considered a valid collection.

I have seen this answer, but I don't think this applies to me?

(In case it matters, the overall aim is to implement a grid search with feature selection in a Pipeline object - but I haven't figured out how to do that yet because of this error).

1 Answer 1

1

You are passing fit_model as a positional argument to create_model. The create_model function has this signature:

create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml')

So, currently X_train receives the value of fit_model and is passed to k_fold.split, which raises this error. To fix it, you should pass fit_model as a keyword argument:

model = create_model(model_name=fit_model)
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.