1

I am working on a multi-class classification problem using xgboost. The shape of my data is

print(train_ohe.shape, test_ohe.shape)
# (43266, 190) (18543, 190)

Custom F1 eval function and model training code

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-f1_score(y_true, np.round(y_pred),average='weighted')
    return 'f1_err', err

def train_model(algo,train,test,predictors,useTrainCV=True,
                cv_folds=5,early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = algo.get_params()
        xgb_train = xgb.DMatrix(train[predictors].values,label=train[target].values)
        xgb_test = xgb.DMatrix(test[predictors].values)
        print(xgb_train.num_row())
        print(xgb_test.num_row())
        cv_result = xgb.cv(xgb_param,
                           train,
                           num_boost_round=xgb_param['n_estimators'],
                           nfold=cv_folds,
                           metrics='f1_eval',
                          early_stopping_rounds=early_stopping_rounds)
        algo.set_params(n_estimators=cv_result.shape[0])

    # Fit algorithm on data
    algo.fit(train[predictors],train[target],eval_metric=f1_eval)

    # Predict train data
    train_predictions = algo.predict(train[predictors])
    train_pred_prob = algo.predict_proba(train[predictors])[:,1]

    # Report model performance
    print("Model performance")
    print("F1 Score Train {}".format(f1_score(train[target].values,train_predictions)))

    # Predict test data
    test_predictions = algo.predict(test[predictors])

    # Performance 
    print("F1 Score Test {}".format(f1_score(test[target].values,test_predictions)))

Here is my XgbClassifier code. Trying to find the number of estimators for a high learning rate.

target = 'Complaint-Status'

predictors = [x for x in train_ohe.columns if x not in target]

xgb1 = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=8,
                    scale_pos_weight=1,
                    seed=145)

train_model(xgb1, train_ohe, test_ohe, predictors)

I am getting following Attribute error saying 'DataFrame' object has no attribute 'num_row'in the xgb.cv line in train_model function.

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-116-5933227c171d> in <module>
     18                     seed=145)
     19 print(xgb1.get_params())
---> 20 train_model(xgb1, train_ohe, test_ohe, predictors)
     21 # xgb_param = xgb1.get_params()
     22 # cv_folds=5

<ipython-input-114-a9df39c19abf> in train_model(algo, train, test, predictors, useTrainCV, cv_folds, early_stopping_rounds)
     19                            nfold=cv_folds,
     20                            metrics='f1_eval',
---> 21                           early_stopping_rounds=early_stopping_rounds)
     22         algo.set_params(n_estimators=cv_result.shape[0])
     23 

/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
    413     results = {}
    414     cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
--> 415                       stratified, folds, shuffle)
    416 
    417     # setup callbacks

/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in mknfold(dall, nfold, param, seed, evals, fpreproc, stratified, folds, shuffle)
    246         # Do standard k-fold cross validation
    247         if shuffle is True:
--> 248             idx = np.random.permutation(dall.num_row())
    249         else:
    250             idx = np.arange(dall.num_row())

/opt/virtual_env/py3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
   4374             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4375                 return self[name]
-> 4376             return object.__getattribute__(self, name)
   4377 
   4378     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'num_row'    

1 Answer 1

3

Saw your post when I was searching around for the same error.

Your second parameter train of the code:

 cv_result = xgb.cv(xgb_param,
                           train,
                           num_boost_round=xgb_param['n_estimators'],
                           nfold=cv_folds,
                           metrics='f1_eval',
                          early_stopping_rounds=early_stopping_rounds)
        algo.set_params(n_estimators=cv_result.shape[0])

should be a matrix such as

train = xgb.DMatrix(X_train, y_train)

hope this helps

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.