I have a data set and I need to perform feature selection on it, with that I will have 4 different models where I need to use the majority vote. Until then it was working but now I need to use gridsearch to check the parameters for my model but I'm having difficulties. I appreciate if someone can help me.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from pandas import read_csv
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble._voting import VotingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score #precision
from sklearn.metrics import recall_score #recall
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
print(__doc__)
fold1_train = 'D:/ARTIGO/TREINAMENTO.CSV'
df_fold1_train = read_csv(fold1_train, header=None)
data_fold1_train = df_fold1_train.values
fold1_test = 'D:/ARTIGO/TESTE.CSV'
df_fold1_test = read_csv(fold1_test, header=None)
data_fold1_test = df_fold1_test.values
X_train_fold1 = data_fold1_train[:, :-1]
y_train_fold1 = data_fold1_train[:, -1]
X_test_fold1 = data_fold1_test[:, :-1]
y_test_fold1 = data_fold1_test[:, -1]
#features selection
features1 = [2, 5, 7]
features2 = [0, 1, 4, 5, 7]
features3 = [0, 1, 4, 5, 6]
features4 = [1, 4]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])
#RandomForest
pipeline = PMMLPipeline([
('classifier', VotingClassifier([
("pipe1", Pipeline(steps=[('preprocessor1', preprocessor1),('classifier1', SVC())])),
("pipe2", Pipeline(steps=[('preprocessor2', preprocessor2),('classifier2', SVC())])),
("pipe3", Pipeline(steps=[('preprocessor3', preprocessor3),('classifier3', SVC())])),
("pipe4", Pipeline(steps=[('preprocessor4', preprocessor4),('classifier4', SVC())]))
]))
])
# Set the parameters by cross-validation
tuned_parameters = [{'classifier__kernel': ['rbf', 'linear']
# , 'pipe1__gamma': [1e-3, 1e-4]
}]
scores = ['precision']
for score in scores:
print("# Tuning hyper-parameters for %s" % score)
print()
cv = KFold(n_splits=5)
clf = GridSearchCV(
estimator=pipeline, param_grid=tuned_parameters, n_jobs=-1, verbose=1, cv=cv, scoring='%s_macro' % score
)
clf.fit(X_train_fold1, y_train_fold1)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
print("%0.3f (+/-%0.03f) for %r"
% (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test_fold1, clf.predict(X_test_fold1)
print(classification_report(y_true, y_pred))
print()
I have the following error
ValueError: Invalid parameter kernel for estimator VotingClassifier(estimators=[('pipe1',
Pipeline(steps=[('preprocessor1',
ColumnTransformer(transformers=[('numerical',
Pipeline(steps=[('scaler',
StandardScaler())]),
[2,
5,
7])])),
('classifier1', SVC())])),
('pipe2',
Pipeline(steps=[('preprocessor2',
ColumnTransformer(transformers=[('numerical',
Pipeline(steps=[('scaler',
StandardScaler())]),
[0,
1,
4,
5,
7])])),
('...())])),
('pipe3',
Pipeline(steps=[('preprocessor3',
ColumnTransformer(transformers=[('numerical',
Pipeline(steps=[('scaler',
StandardScaler())]),
[0,
1,
4,
5,
6])])),
('classifier3', SVC())])),
('pipe4',
Pipeline(steps=[('preprocessor4',
ColumnTransformer(transformers=[('numerical',
Pipeline(steps=[('scaler',
StandardScaler())]),
[1,
4])])),
('classifier4', SVC())]))]). Check the list of available parameters with `estimator.get_params().keys()`.
"""