Home SKlearn pipelines cannot work when creating new Dataframe inside custom transformer

Questions

scikit-learn pipeline fails when creating a new DataFrame inside a custom transformer

March 24, 2022

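I have a pipeline made up of nested pipelines and ColumnTransformers, together with some custom transformers. When I fit it through GridSearchCV, my custom `MakeDataFrame` transformer fails in `transform`: the data passed to `pd.DataFrame` has 167 columns, but `self.columns` only provides 163 names.
How can I fix this? The full traceback is below: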

Input In [8], in <cell line: 21>()
     19 # Fit all (1) models defined in our model-search object
     20 print(X_train.shape)
---> 21 best = cv_model_search.fit(X_train,y_train)

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    885     results = self._format_results(
    886         all_candidate_params, n_splits, all_out, all_more_results
    887     )
    889     return results
--> 891 self._run_search(evaluate_candidates)
    893 # multimetric is determined here because in the case of a callable
    894 # self.scoring the return type is only known after calling
    895 first_test_score = all_out[0]["test_scores"]

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:1392, in GridSearchCV._run_search(self, evaluate_candidates)
   1390 def _run_search(self, evaluate_candidates):
   1391     """Search all candidates in param_grid"""
-> 1392     evaluate_candidates(ParameterGrid(self.param_grid))

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    830 if self.verbose > 0:
    831     print(
    832         "Fitting {0} folds for each of {1} candidates,"
    833         " totalling {2} fits".format(
    834             n_splits, n_candidates, n_candidates * n_splits
    835         )
    836     )
--> 838 out = parallel(
    839     delayed(_fit_and_score)(
    840         clone(base_estimator),
    841         X,
    842         y,
    843         train=train,
    844         test=test,
    845         parameters=parameters,
    846         split_progress=(split_idx, n_splits),
    847         candidate_progress=(cand_idx, n_candidates),
    848         **fit_and_score_kwargs,
    849     )
    850     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    851         enumerate(candidate_params), enumerate(cv.split(X, y, groups))
    852     )
    853 )
    855 if len(out) < 1:
    856     raise ValueError(
    857         "No fits were performed. "
    858         "Was the CV iterator empty? "
    859         "Were there no candidates?"
    860     )

File ~\anaconda3\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
   1034 try:
   1035     # Only set self._iterating to True if at least a batch
   1036     # was dispatched. In particular this covers the edge
   (...)
   1040     # was very quick and its callback already dispatched all the
   1041     # remaining jobs.
   1042     self._iterating = False
-> 1043     if self.dispatch_one_batch(iterator):
   1044         self._iterating = self._original_iterator is not None
   1046     while self.dispatch_one_batch(iterator):

File ~\anaconda3\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
    859     return False
    860 else:
--> 861     self._dispatch(tasks)
    862     return True

File ~\anaconda3\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
    777 with self._lock:
    778     job_idx = len(self._jobs)
--> 779     job = self._backend.apply_async(batch, callback=cb)
    780     # A job can complete so quickly than its callback is
    781     # called before we get here, causing self._jobs to
    782     # grow. To ensure correct results ordering, .insert is
    783     # used (rather than .append) in the following line
    784     self._jobs.insert(job_idx, job)

File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
    569 def __init__(self, batch):
    570     # Don't delay the application, to avoid keeping the input
    571     # arguments in memory
--> 572     self.results = batch()

File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
    214 def __call__(self, *args, **kwargs):
    215     with config_context(**self.config):
--> 216         return self.function(*args, **kwargs)

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    678         estimator.fit(X_train, **fit_params)
    679     else:
--> 680         estimator.fit(X_train, y_train, **fit_params)
    682 except Exception:
    683     # Note fit time as time until error
    684     fit_time = time.time() - start_time

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
    364 """Fit the model.
    365 
    366 Fit all the transformers one after the other and transform the
   (...)
    387     Pipeline with fitted steps.
    388 """
    389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
    391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392     if self._final_estimator != "passthrough":

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
    346     cloned_transformer = clone(transformer)
    347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
    349     cloned_transformer,
    350     X,
    351     y,
    352     None,
    353     message_clsname="Pipeline",
    354     message=self._log_message(step_idx),
    355     **fit_params_steps[name],
    356 )
    357 # Replace the transformer of the step with the fitted
    358 # transformer. This is necessary when loading the transformer
    359 # from the cache.
    360 self.steps[step_idx] = (name, fitted_transformer)

File ~\anaconda3\lib\site-packages\joblib\memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    348 def __call__(self, *args, **kwargs):
--> 349     return self.func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891 with _print_elapsed_time(message_clsname, message):
    892     if hasattr(transformer, "fit_transform"):
--> 893         res = transformer.fit_transform(X, y, **fit_params)
    894     else:
    895         res = transformer.fit(X, y, **fit_params).transform(X)

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
    432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    433 if hasattr(last_step, "fit_transform"):
--> 434     return last_step.fit_transform(Xt, y, **fit_params_last_step)
    435 else:
    436     return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

File ~\anaconda3\lib\site-packages\sklearn\base.py:855, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    852     return self.fit(X, **fit_params).transform(X)
    853 else:
    854     # fit method of arity 2 (supervised transformation)
--> 855     return self.fit(X, y, **fit_params).transform(X)

Input In [5], in MakeDataFrame.transform(self, X)
    170 def transform(self, X):
--> 171     return pd.DataFrame(data=X, index=np.arange(len(X)), columns=self.columns)

File ~\anaconda3\lib\site-packages\pandas\core\frame.py:694, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    684         mgr = dict_to_mgr(
    685             # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
    686             # attribute "name"
   (...)
    691             typ=manager,
    692         )
    693     else:
--> 694         mgr = ndarray_to_mgr(
    695             data,
    696             index,
    697             columns,
    698             dtype=dtype,
    699             copy=copy,
    700             typ=manager,
    701         )
    703 # For data is list-like, or Iterable (will consume into list)
    704 elif is_list_like(data):

File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:351, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
    346 # _prep_ndarray ensures that values.ndim == 2 at this point
    347 index, columns = _get_axes(
    348     values.shape[0], values.shape[1], index=index, columns=columns
    349 )
--> 351 _check_values_indices_shape_match(values, index, columns)
    353 if typ == "array":
    355     if issubclass(values.dtype.type, str):

File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:422, in _check_values_indices_shape_match(values, index, columns)
    420 passed = values.shape
    421 implied = (len(index), len(columns))
--> 422 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")

ValueError: Shape of passed values is (730, 167), indices imply (730, 163)

Stack Overflow says my post is mostly code, so I'm adding this filler text:
Lorem ipsum es el texto que se usa habitualmente en diseño gráfico en demostraciones de tipografías o de borradores de diseño para probar el diseño visual antes de insertar el texto final