Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

SKlearn pipelines cannot work when creating new Dataframe inside custom transformer

I have a pipeline with pipelines and columntransformers, with some custom transformers
How can I fix this:

Input In [8], in <cell line: 21>()
     19 # Fit all (1) models defined in our model-search object
     20 print(X_train.shape)
---> 21 best = cv_model_search.fit(X_train,y_train)

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    885     results = self._format_results(
    886         all_candidate_params, n_splits, all_out, all_more_results
    887     )
    889     return results
--> 891 self._run_search(evaluate_candidates)
    893 # multimetric is determined here because in the case of a callable
    894 # self.scoring the return type is only known after calling
    895 first_test_score = all_out[0]["test_scores"]

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:1392, in GridSearchCV._run_search(self, evaluate_candidates)
   1390 def _run_search(self, evaluate_candidates):
   1391     """Search all candidates in param_grid"""
-> 1392     evaluate_candidates(ParameterGrid(self.param_grid))

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    830 if self.verbose > 0:
    831     print(
    832         "Fitting {0} folds for each of {1} candidates,"
    833         " totalling {2} fits".format(
    834             n_splits, n_candidates, n_candidates * n_splits
    835         )
    836     )
--> 838 out = parallel(
    839     delayed(_fit_and_score)(
    840         clone(base_estimator),
    841         X,
    842         y,
    843         train=train,
    844         test=test,
    845         parameters=parameters,
    846         split_progress=(split_idx, n_splits),
    847         candidate_progress=(cand_idx, n_candidates),
    848         **fit_and_score_kwargs,
    849     )
    850     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    851         enumerate(candidate_params), enumerate(cv.split(X, y, groups))
    852     )
    853 )
    855 if len(out) < 1:
    856     raise ValueError(
    857         "No fits were performed. "
    858         "Was the CV iterator empty? "
    859         "Were there no candidates?"
    860     )

File ~\anaconda3\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
   1034 try:
   1035     # Only set self._iterating to True if at least a batch
   1036     # was dispatched. In particular this covers the edge
   (...)
   1040     # was very quick and its callback already dispatched all the
   1041     # remaining jobs.
   1042     self._iterating = False
-> 1043     if self.dispatch_one_batch(iterator):
   1044         self._iterating = self._original_iterator is not None
   1046     while self.dispatch_one_batch(iterator):

File ~\anaconda3\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
    859     return False
    860 else:
--> 861     self._dispatch(tasks)
    862     return True

File ~\anaconda3\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
    777 with self._lock:
    778     job_idx = len(self._jobs)
--> 779     job = self._backend.apply_async(batch, callback=cb)
    780     # A job can complete so quickly than its callback is
    781     # called before we get here, causing self._jobs to
    782     # grow. To ensure correct results ordering, .insert is
    783     # used (rather than .append) in the following line
    784     self._jobs.insert(job_idx, job)

File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
    569 def __init__(self, batch):
    570     # Don't delay the application, to avoid keeping the input
    571     # arguments in memory
--> 572     self.results = batch()

File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
    214 def __call__(self, *args, **kwargs):
    215     with config_context(**self.config):
--> 216         return self.function(*args, **kwargs)

File ~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    678         estimator.fit(X_train, **fit_params)
    679     else:
--> 680         estimator.fit(X_train, y_train, **fit_params)
    682 except Exception:
    683     # Note fit time as time until error
    684     fit_time = time.time() - start_time

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
    364 """Fit the model.
    365 
    366 Fit all the transformers one after the other and transform the
   (...)
    387     Pipeline with fitted steps.
    388 """
    389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
    391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392     if self._final_estimator != "passthrough":

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
    346     cloned_transformer = clone(transformer)
    347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
    349     cloned_transformer,
    350     X,
    351     y,
    352     None,
    353     message_clsname="Pipeline",
    354     message=self._log_message(step_idx),
    355     **fit_params_steps[name],
    356 )
    357 # Replace the transformer of the step with the fitted
    358 # transformer. This is necessary when loading the transformer
    359 # from the cache.
    360 self.steps[step_idx] = (name, fitted_transformer)

File ~\anaconda3\lib\site-packages\joblib\memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    348 def __call__(self, *args, **kwargs):
--> 349     return self.func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891 with _print_elapsed_time(message_clsname, message):
    892     if hasattr(transformer, "fit_transform"):
--> 893         res = transformer.fit_transform(X, y, **fit_params)
    894     else:
    895         res = transformer.fit(X, y, **fit_params).transform(X)

File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
    432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    433 if hasattr(last_step, "fit_transform"):
--> 434     return last_step.fit_transform(Xt, y, **fit_params_last_step)
    435 else:
    436     return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

File ~\anaconda3\lib\site-packages\sklearn\base.py:855, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    852     return self.fit(X, **fit_params).transform(X)
    853 else:
    854     # fit method of arity 2 (supervised transformation)
--> 855     return self.fit(X, y, **fit_params).transform(X)

Input In [5], in MakeDataFrame.transform(self, X)
    170 def transform(self, X):
--> 171     return pd.DataFrame(data=X, index=np.arange(len(X)), columns=self.columns)

File ~\anaconda3\lib\site-packages\pandas\core\frame.py:694, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    684         mgr = dict_to_mgr(
    685             # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
    686             # attribute "name"
   (...)
    691             typ=manager,
    692         )
    693     else:
--> 694         mgr = ndarray_to_mgr(
    695             data,
    696             index,
    697             columns,
    698             dtype=dtype,
    699             copy=copy,
    700             typ=manager,
    701         )
    703 # For data is list-like, or Iterable (will consume into list)
    704 elif is_list_like(data):

File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:351, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
    346 # _prep_ndarray ensures that values.ndim == 2 at this point
    347 index, columns = _get_axes(
    348     values.shape[0], values.shape[1], index=index, columns=columns
    349 )
--> 351 _check_values_indices_shape_match(values, index, columns)
    353 if typ == "array":
    355     if issubclass(values.dtype.type, str):

File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:422, in _check_values_indices_shape_match(values, index, columns)
    420 passed = values.shape
    421 implied = (len(index), len(columns))
--> 422 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")

ValueError: Shape of passed values is (730, 167), indices imply (730, 163)

Stack is saying my post is pure code so I’m adding this:
Lorem ipsum es el texto que se usa habitualmente en diseño gráfico en demostraciones de tipografías o de borradores de diseño para probar el diseño visual antes de insertar el texto final

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

>Solution :

set columns = X.columns in your custom transfomer.

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading