Problem
Build a prediction accuracy model using LightGBM on the New York Taxi Duration dataset. [Kaggle model: https://www.kaggle.com/code/mobilematthew/newyorkcitytaxidurationprediction/edit/run/123885887]
I am setting up LightGBM (Light Gradient Boosting) with one set of parameters and two models: 1) LGBMClassifier, used via fit / predict, and 2) the native LightGBM API, used via lgb.train / predict.
I tested 2) lgb.train / predict first, and that code works. I then added 1) LGBMClassifier fit / predict with exactly the same parameters. I expected it to work fine, but the fit raises an error.
Model 2) trained fine before this issue. The problem appears when I try to get a prediction from model 1), the LGBMClassifier: the fit step itself fails with a "TypeError" raised from inside the LightGBM fit() function. y_train is one-dimensional. X_train has multiple features, all already reduced via feature importance. I am also trying to analyze the error that says a numpy.ndarray input must be 2 dimensional.
# Reproduction script: one parameter dict is fed to two LightGBM entry points,
# the scikit-learn style wrapper (LGBMClassifier.fit) and the native API (lgb.train).

# Training features only; no label is passed to this Dataset.
# NOTE(review): the X_train printout under "Data" includes a trip_duration column,
# which is also the name of y_train -- confirm the target is not among the features.
lgb_train = lgb.Dataset(X_train)
# NOTE(review): wraps the one-dimensional target in its own Dataset; lgb_y_train is
# not used again in this listing. Possibly the origin of the "must be 2 dimensional"
# ValueError quoted under "Data" -- confirm.
lgb_y_train = lgb.Dataset(y_train)
# Validation Dataset linked to lgb_train via reference=; neither model below receives it.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Despite the name, this holds exactly one value per key (no search grid).
param_grid = { 'boosting_type':'gbdt',
# NOTE(review): n_estimators (50) and num_boost_round (20, below) look like two names
# for the number of boosting iterations with different values -- confirm which applies.
'n_estimators':50,
# Regression objective, although the wrapper constructed below is LGBMClassifier.
'objective': 'regression',
'num_leaves': 5,
'num_boost_round':20,
'class_weight':'balanced',
'colsample_bytree':1.0,
'importance_type':'gain',
'learning_rate':0.001,
'max_depth':-1,
'min_child_samples':20,
'min_child_weight':0.001,
'min_split_gain':0.0,
'n_jobs':-1,
'verbose':0,
# Also given explicitly (random_state=42) in the LGBMClassifier call below.
'random_state':None,
'reg_alpha':0.0,
'reg_lambda':0.05,
'subsample':1.0,
'subsample_freq':0,
'min_data':1,
'force_row_wise' : True,
# NOTE(review): the traceback shows eval_set as a keyword argument of fit(), not a
# model parameter; presumably it belongs in the fit() call as a list of (X, y)
# tuples -- confirm against the LightGBM docs.
'eval_set':[X_test, y_test]
}
# NOTE(review): param_grid is passed positionally, so the whole dict becomes the first
# constructor argument. The traceback's "Unknown type of parameter:boosting_type, got:dict"
# is consistent with that. Keyword unpacking (**param_grid) is presumably what was
# intended; random_state would then be supplied twice (dict key and keyword) -- confirm.
light_model = lgb.LGBMClassifier(param_grid,random_state=42)
# The traceback below points at this fit() call.
light_model_fit = light_model.fit(X_train, y_train)
# Predict on the held-out features with the wrapper; not reached while fit() raises.
light_model_fix_y_pred = light_model_fit.predict(X_test)
# Native API path: the same dict and the label-less lgb_train go to lgb.train.
light_model_trained = lgb.train(param_grid, lgb_train)
# Predict on the held-out features with the natively trained model.
light_model_trained_pred = light_model_trained.predict(X_test)
Code with raised error
With the code above, the lgb.train model is trained, and up to that point everything works fine.
Setup for prediction
Set up the LightGBM fit ahead of the predict invocation; this is where the error (the TypeError shown below) is raised.
# Same statement as in the full listing above; the traceback's arrow marks this call.
light_model_fit = light_model.fit(X_train, y_train)
Error
TypeError Traceback (most recent call last)
/tmp/ipykernel_27/124012372.py in <module>
30
31 light_model = lgb.LGBMClassifier(param_grid,random_state=42)
---> 32 light_model_fit = light_model.fit(X_train, y_train)
33 light_model_fix_y_pred = light_model_fit.predict(X_test)
34
/opt/conda/lib/python3.7/site-packages/lightgbm/sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
970 eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds,
971 verbose=verbose, feature_name=feature_name, categorical_feature=categorical_feature,
--> 972 callbacks=callbacks, init_model=init_model)
973 return self
974
/opt/conda/lib/python3.7/site-packages/lightgbm/sklearn.py in fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
756 init_model=init_model,
757 feature_name=feature_name,
--> 758 callbacks=callbacks
759 )
760
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
269 # construct booster
270 try:
--> 271 booster = Booster(params=params, train_set=train_set)
272 if is_valid_contain_train:
273 booster.set_train_data_name(train_data_name)
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py in __init__(self, params, train_set, model_file, model_str, silent)
2603 )
2604 # construct booster object
-> 2605 train_set.construct()
2606 # copy the parameters from train_set
2607 params.update(train_set.get_params())
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py in construct(self)
1817 init_score=self.init_score, predictor=self._predictor,
1818 silent=self.silent, feature_name=self.feature_name,
-> 1819 categorical_feature=self.categorical_feature, params=self.params)
1820 if self.free_raw_data:
1821 self.data = None
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py in _lazy_init(self, data, label, reference, weight, group, init_score, predictor, silent, feature_name, categorical_feature, params)
1515 params['categorical_column'] = sorted(categorical_indices)
1516
-> 1517 params_str = param_dict_to_str(params)
1518 self.params = params
1519 # process for reference dataset
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py in param_dict_to_str(data)
292 pairs.append(f"{key}={val}")
293 elif val is not None:
--> 294 raise TypeError(f'Unknown type of parameter:{key}, got:{type(val).__name__}')
295 return ' '.join(pairs)
296
TypeError: Unknown type of parameter:boosting_type, got:dict
Data
ValueError: Input numpy.ndarray or list must be 2 dimensional.
Data for this LightGBM model: 1) X_test, 2) X_train, 3) y_train, 4) lgb.Dataset(X_train), inspected via the Dataset get_* methods listed at the end.
X_test
pickup_longitude pickup_latitude dropoff_latitude trip_duration direction week minute_oftheday
139168 -73.990189 40.757259 40.762600 1095 69.265257 6 1289
1401881 -73.955223 40.768841 40.777191 390 39.910385 24 881
1207916 -73.955345 40.764126 40.781013 1171 -49.064405 1 1156
466038 -73.996696 40.733234 40.713543 1626 162.417522 14 683
855381 -74.004532 40.706974 40.717777 689 -5.260115 22 1237
... ... ... ... ... ... ... ...
425268 -73.978287 40.752300 40.763824 858 2.899225 23 900
940105 -73.984207 40.759949 40.751755 432 -166.399449 6 1084
502876 -73.970856 40.753487 40.764393 708 -11.675022 1 924
895439 -74.006882 40.710022 40.713509 486 -46.212070 5 682
699976 -74.002594 40.739544 40.778725 866 27.435443 25 1374
X_train
pickup_longitude pickup_latitude dropoff_latitude trip_duration direction week minute_oftheday
97650 -73.979507 40.765388 40.759701 600 -175.931831 19 1097
1101996 -73.970932 40.765720 40.762833 224 138.507110 8 1151
61397 -73.862785 40.768963 40.753124 884 -106.385968 8 1296
941058 -73.957634 40.782143 40.761646 1239 -136.690811 1 495
909725 -74.013771 40.701969 40.706612 240 51.439139 21 555
... ... ... ... ... ... ... ...
1054126 -73.990929 40.750561 40.728096 646 175.218677 17 549
624177 -73.941513 40.851059 40.849010 197 107.450422 25 559
1175512 -73.989532 40.769600 40.788155 413 40.718927 16 1316
823176 -73.982628 40.751122 40.754730 308 66.554445 19 1042
448716 -73.989456 40.720070 40.770935 969 61.479580 2 274
y_train
97650 600
1101996 224
61397 884
941058 1239
909725 240
...
1234001 831
1403381 1590
454139 2226
557019 312
699873 1337
Name: trip_duration, Length: 2995, dtype: int64
Dataset X_train
, ,pickup_longitude ,pickup_latitude ,dropoff_latitude ,trip_duration ,Direction,week , minute_oftheday
,97650,-73.979507,40.765388,40.759701,600,175.931831,19,1097
,1101996,-73.970932,40.765720,40.762833,224,138.507110,8,1151
,61397,-73.862785,40.768963,40.753124,884,106.385968,8,1296
,941058,-73.957634,40.782143,40.761646,1239,136.690811,1,495
,909725,-74.013771,40.701969,40.706612,240,51.439139,21,555
Methods to get data from LightGBM
# Dump what the lgb_train Dataset exposes: call each accessor in turn and
# print its result on its own line, in the same order as the method list.
for accessor in (
    lgb_train.get_data,
    lgb_train.get_feature_name,
    lgb_train.get_label,
    lgb_train.get_params,
    lgb_train.num_data,
    lgb_train.num_feature,
):
    print(accessor())



