0

Code:

import numpy as np
import sklearn as skl

data = np.genfromtxt("water_potability.csv", delimiter = ",", names = True)  # names=True: the CSV header row becomes field names, giving a 1-D structured array
print(data)
print(data.shape)  # prints (3276,): a single axis of records, not (rows, columns)
print(type(data[0]))  # prints <class 'numpy.void'>: each element is one whole record
imputer = skl.impute.SimpleImputer()
imputer.fit_transform(data)  # raises the TypeError shown below: the structured dtype cannot be cast to float64

(part of) water_potability.csv:

ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
,204.8904554713363,20791.318980747026,7.300211873184757,368.51644134980336,564.3086541722439,10.3797830780847,86.9909704615088,2.9631353806316407,0
3.71608007538699,129.42292051494425,18630.057857970347,6.635245883862,,592.8853591348523,15.180013116357259,56.32907628451764,4.500656274942408,0
8.099124189298397,224.23625939355776,19909.541732292393,9.275883602694089,,418.6062130644815,16.868636929550973,66.42009251176368,3.0559337496641685,0
...
9.41951031641321,175.76264629629543,33155.578218312294,7.350233233214412,,432.04478304536786,11.039069688154314,69.84540029205144,3.298875498646556,1
5.1267629233515315,230.60375750846123,11983.869376336363,6.303356534249105,,402.883113121781,11.168946221056501,77.48821310275477,4.708658467526655,1
7.874671357791283,195.10229858610904,17404.17706105066,7.509305856927908,,327.4597604610721,16.140367626166324,78.69844632549504,2.309149056634923,1

I have an ndarray whose item type is numpy.void, which is actually a numpy.record or a structured array (I'm not sure which; I just know that it has field names). When I try to use sklearn's SimpleImputer (or other imputers) on it, it throws an exception:

[(       nan, 204.89045547, 20791.31898075, 7.30021187, 368.51644135, 564.30865417, 10.37978308, 86.99097046, 2.96313538, 0.)
 (3.71608008, 129.42292051, 18630.05785797, 6.63524588,          nan, 592.88535913, 15.18001312, 56.32907628, 4.50065627, 0.)
 (8.09912419, 224.23625939, 19909.54173229, 9.2758836 ,          nan, 418.60621306, 16.86863693, 66.42009251, 3.05593375, 0.)
 ...
 (9.41951032, 175.7626463 , 33155.57821831, 7.35023323,          nan, 432.04478305, 11.03906969, 69.84540029, 3.2988755 , 1.)
 (5.12676292, 230.60375751, 11983.86937634, 6.30335653,          nan, 402.88311312, 11.16894622, 77.4882131 , 4.70865847, 1.)
 (7.87467136, 195.10229859, 17404.17706105, 7.50930586,          nan, 327.45976046, 16.14036763, 78.69844633, 2.30914906, 1.)]
(3276,)
<class 'numpy.void'>
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[4], line 9
      7 print(type(data[0]))
      8 imputer = skl.impute.SimpleImputer()
----> 9 imputer.fit_transform(data)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:894, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    879         warnings.warn(
    880             (
    881                 f"This object ({self.__class__.__name__}) has a `transform`"
   (...)    889             UserWarning,
    890         )
    892 if y is None:
    893     # fit method of arity 1 (unsupervised transformation)
--> 894     return self.fit(X, **fit_params).transform(X)
    895 else:
    896     # fit method of arity 2 (supervised transformation)
    897     return self.fit(X, y, **fit_params).transform(X)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1358     estimator._validate_params()
   1360 with config_context(
   1361     skip_parameter_validation=(
   1362         prefer_skip_nested_validation or global_skip_validation
   1363     )
   1364 ):
-> 1365     return fit_method(estimator, *args, **kwargs)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:452, in SimpleImputer.fit(self, X, y)
    434 @_fit_context(prefer_skip_nested_validation=True)
    435 def fit(self, X, y=None):
    436     """Fit the imputer on `X`.
    437 
    438     Parameters
   (...)    450         Fitted estimator.
    451     """
--> 452     X = self._validate_input(X, in_fit=True)
    454     # default fill_value is 0 for numerical input and "missing_value"
    455     # otherwise
    456     if self.fill_value is None:

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:360, in SimpleImputer._validate_input(self, X, in_fit)
    357     ensure_all_finite = True
    359 try:
--> 360     X = validate_data(
    361         self,
    362         X,
    363         reset=in_fit,
    364         accept_sparse="csc",
    365         dtype=dtype,
    366         force_writeable=True if not in_fit else None,
    367         ensure_all_finite=ensure_all_finite,
    368         copy=self.copy,
    369     )
    370 except ValueError as ve:
    371     if "could not convert" in str(ve):

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:2954, in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
   2952         out = X, y
   2953 elif not no_val_X and no_val_y:
-> 2954     out = check_array(X, input_name="X", **check_params)
   2955 elif no_val_X and not no_val_y:
   2956     out = _check_y(y, **check_params)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1053, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1051         array = xp.astype(array, dtype, copy=False)
   1052     else:
-> 1053         array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
   1054 except ComplexWarning as complex_warning:
   1055     raise ValueError(
   1056         "Complex data not supported\n{}\n".format(array)
   1057     ) from complex_warning

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_array_api.py:757, in _asarray_with_order(array, dtype, order, copy, xp, device)
    755     array = numpy.array(array, order=order, dtype=dtype)
    756 else:
--> 757     array = numpy.asarray(array, order=order, dtype=dtype)
    759 # At this point array is a NumPy ndarray. We convert it to an array
    760 # container that is consistent with the input's namespace.
    761 return xp.asarray(array)

TypeError: Cannot cast array data from dtype([('ph', '<f8'), ('Hardness', '<f8'), ('Solids', '<f8'), ('Chloramines', '<f8'), ('Sulfate', '<f8'), ('Conductivity', '<f8'), ('Organic_carbon', '<f8'), ('Trihalomethanes', '<f8'), ('Turbidity', '<f8'), ('Potability', '<f8')]) to dtype('float64') according to the rule 'unsafe'

After testing, I found out that data.shape is (3276,), which indicates that data is interpreted as a 1d array instead of a 2d one. So my question is: is there any way to use imputation on the data while preserving the field names, or do I have to cast numpy.void to another compatible type, say numpy.ndarray?

3
  • 1
    Maybe you should convert the data to a normal 2D array, or read the CSV using the standard csv module or pandas.read_csv. Commented Oct 19 at 11:34
  • You need to load or convert this data as a 2d array of floats. numpy.lib.recfunctions.structured_to_unstructured can be used to convert this structured array. Read about structured arrays at numpy.org/doc/stable/user/basics.rec.html Commented Oct 19 at 20:23
  • 1
    Alternatively load with the default float dtype, skipping the 'names' row. Preserving the field names can be convenient in some cases, but generally gets in the way of 2d array calculations. Commented Oct 20 at 18:21

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.