How can I improve this model using GridSearchCV with a Pipeline?

September 14, 2022

I am trying to improve a regression model using GridSearchCV with a Pipeline, but I ran into an error. If I am not wrong, it points to invalid parameters. I've cross-checked the parameters carefully, but I still can't debug the code.

## importing libraries

import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## importing the model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

## setup random seed()
import numpy as np
np.random.seed(42)

## Load the car-sales data and discard rows that have no label
# Rows with a missing "Price" have no target value, so they are dropped up front.
data = pd.read_csv("Data/car-sales-extended-missing-data.csv").dropna(subset=["Price"])

## Columns holding categorical values
categorical_features = ["Make", "Colour"]
# Categorical pipeline: fill missing entries with the constant 'missing',
# then one-hot encode (categories not seen during fit are tolerated).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Column holding the number of doors
door_feature = ["Doors"]
# Door pipeline: every missing door count is replaced with the constant 4
door_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
    ]
)

# Column holding the numeric feature
numeric_features = ["Odometer (KM)"]
# Numeric pipeline: missing values are replaced with the column mean
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Combine the per-column transformers into one preprocessing step.
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("numerical", numeric_transformer, numeric_features),
    ]
)

# Full modelling pipeline: preprocessing followed by a random-forest regressor
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),  # fills missing data and makes sure it's all numbers
        ("regressor", RandomForestRegressor()),  # the estimator that models the data
    ]
)

# Separate the features (x) from the label (y)
x = data.drop("Price", axis=1)
y = data["Price"]

# Split data into train and test sets (20% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the model on the training data
# (note: when fit() is called with a Pipeline(), fit_transform() is used for transformers)
# The split above produced lower-case `x_train` / `x_test`; Python names are
# case-sensitive, so referring to `X_train` / `X_test` raises NameError.
model.fit(x_train, y_train)

# Score the model on the held-out data
# (note: when score() or predict() is called with a Pipeline(), transform() is used for transformers)
model.score(x_test, y_test)

The GridSearch Tuning

Tuning the model above with GridSearchCV using Pipeline

## from sklearn.model_selection import GridSearchCV
## Already Imported above.

# Hyperparameter grid for the pipeline. Keys use scikit-learn's
# "<step>__<parameter>" syntax to reach parameters of nested estimators.
# NOTE(review): the pipeline above names its steps 'preprocessor' and
# 'regressor', and its ColumnTransformer entries 'categorical', 'door' and
# 'numerical'; every prefix used below has to match one of those names.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__e_estimators": [100, 1000],
    "model__max_depth": [None],
    "model__max_features": ["auto"],
    "model__min_samples_split": [2, 4]
    }

# 5-fold cross-validated search over every combination in pipe_grid
# (2 * 2 * 1 * 1 * 2 = 8 candidates, 40 fits in total).
gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(x_train,y_train)

Here’s the error I got after passing some hyperparameters to improve the model.

Fitting 5 folds for each of 8 candidates, totalling 40 fits
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [34], in <cell line: 12>()
      3 pipe_grid = {
      4     "preprocessor__num__imputer__strategy": ["mean", "median"],
      5     "model__e_estimators": [100, 1000],
   (...)
      8     "model__min_samples_split": [2, 4]
      9     }
     11 gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
---> 12 gs_model.fit(x_train,y_train)

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    869     results = self._format_results(
    870         all_candidate_params, n_splits, all_out, all_more_results
    871     )
    873     return results
--> 875 self._run_search(evaluate_candidates)
    877 # multimetric is determined here because in the case of a callable
    878 # self.scoring the return type is only known after calling
    879 first_test_score = all_out[0]["test_scores"]

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:1375, in GridSearchCV._run_search(self, evaluate_candidates)
   1373 def _run_search(self, evaluate_candidates):
   1374     """Search all candidates in param_grid"""
-> 1375     evaluate_candidates(ParameterGrid(self.param_grid))

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:822, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    814 if self.verbose > 0:
    815     print(
    816         "Fitting {0} folds for each of {1} candidates,"
    817         " totalling {2} fits".format(
    818             n_splits, n_candidates, n_candidates * n_splits
    819         )
    820     )
--> 822 out = parallel(
    823     delayed(_fit_and_score)(
    824         clone(base_estimator),
    825         X,
    826         y,
    827         train=train,
    828         test=test,
    829         parameters=parameters,
    830         split_progress=(split_idx, n_splits),
    831         candidate_progress=(cand_idx, n_candidates),
    832         **fit_and_score_kwargs,
    833     )
    834     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    835         enumerate(candidate_params), enumerate(cv.split(X, y, groups))
    836     )
    837 )
    839 if len(out) < 1:
    840     raise ValueError(
    841         "No fits were performed. "
    842         "Was the CV iterator empty? "
    843         "Were there no candidates?"
    844     )

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
   1034 try:
   1035     # Only set self._iterating to True if at least a batch
   1036     # was dispatched. In particular this covers the edge
   (...)
   1040     # was very quick and its callback already dispatched all the
   1041     # remaining jobs.
   1042     self._iterating = False
-> 1043     if self.dispatch_one_batch(iterator):
   1044         self._iterating = self._original_iterator is not None
   1046     while self.dispatch_one_batch(iterator):

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
    859     return False
    860 else:
--> 861     self._dispatch(tasks)
    862     return True

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
    777 with self._lock:
    778     job_idx = len(self._jobs)
--> 779     job = self._backend.apply_async(batch, callback=cb)
    780     # A job can complete so quickly than its callback is
    781     # called before we get here, causing self._jobs to
    782     # grow. To ensure correct results ordering, .insert is
    783     # used (rather than .append) in the following line
    784     self._jobs.insert(job_idx, job)

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
    569 def __init__(self, batch):
    570     # Don't delay the application, to avoid keeping the input
    571     # arguments in memory
--> 572     self.results = batch()

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\utils\fixes.py:117, in _FuncWrapper.__call__(self, *args, **kwargs)
    115 def __call__(self, *args, **kwargs):
    116     with config_context(**self.config):
--> 117         return self.function(*args, **kwargs)

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_validation.py:674, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    671     for k, v in parameters.items():
    672         cloned_parameters[k] = clone(v, safe=False)
--> 674     estimator = estimator.set_params(**cloned_parameters)
    676 start_time = time.time()
    678 X_train, y_train = _safe_split(estimator, X, y, train)

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\pipeline.py:188, in Pipeline.set_params(self, **kwargs)
    169 def set_params(self, **kwargs):
    170     """Set the parameters of this estimator.
    171 
    172     Valid parameter keys can be listed with ``get_params()``. Note that
   (...)
    186         Pipeline class instance.
    187     """
--> 188     self._set_params("steps", **kwargs)
    189     return self

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\utils\metaestimators.py:72, in _BaseComposition._set_params(self, attr, **params)
     69                 self._replace_estimator(attr, name, params.pop(name))
     71 # 3. Step parameters and other initialisation arguments
---> 72 super().set_params(**params)
     73 return self

File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\base.py:246, in BaseEstimator.set_params(self, **params)
    244 if key not in valid_params:
    245     local_valid_params = self._get_param_names()
--> 246     raise ValueError(
    247         f"Invalid parameter {key!r} for estimator {self}. "
    248         f"Valid parameters are: {local_valid_params!r}."
    249     )
    251 if delim:
    252     nested_params[key][sub_key] = value

ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Make', 'Colour']),
                                                 ('door',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=4,
                                                                                 strategy='constant'))]),
                                                  ['Doors']),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['Odometer (KM)'])])),
                ('regressor', RandomForestRegressor())]). Valid parameters are: ['memory', 'steps', 'verbose'].

>Solution :

The prefix should be regressor__, not model__, according to your pipeline steps naming.
There also seems to be a typo in n_estimators:

# Keys follow "<step name>__<parameter>", and every prefix must match a name
# used when the pipeline was built: 'preprocessor' and 'regressor' are the
# Pipeline steps, and 'numerical' (not 'num') is the ColumnTransformer entry
# that handles the numeric column.
pipe_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "regressor__n_estimators": [100, 1000],
    "regressor__max_depth": [None],
    # 1.0 means "consider all features", which is what "auto" meant for
    # RandomForestRegressor; "auto" is deprecated since scikit-learn 1.1
    # and rejected by later releases.
    "regressor__max_features": [1.0],
    "regressor__min_samples_split": [2, 4],
}