Data analysis
Preprocessing
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import (train_test_split, StratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Hold out 20% of the training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Keep numeric columns plus low-cardinality categorical columns
num_features = [feature for feature in X_train.columns if X_train[feature].dtype in ['int64', 'float64']]
cat_features = [feature for feature in X_train.columns if X_train[feature].nunique() < 10 and X_train[feature].dtype == "object"]

my_cols = cat_features + num_features
X_train = X_train[my_cols].copy()
X_val = X_val[my_cols].copy()
X_test = test_df[my_cols].copy()
# Categoricals: impute with the mode, then one-hot encode
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numerics: impute with the median, then standardize
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

# Baseline model: preprocessing followed by a random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
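The baseline pipeline above is defined but never fitted in this flow; as a quick sanity check, a minimal sketch (assuming the train/validation split from the preprocessing step) could look like this:

pipeline.fit(X_train, y_train)
print(f"Baseline RF validation accuracy: {pipeline.score(X_val, y_val):.4f}")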
Hyperparameter tuning
# XGBoost with a conservative starting configuration; the search below tunes the rest
xgb_model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=400,
    objective='binary:logistic',
    n_jobs=1  # 'nthread' is deprecated in the sklearn wrapper; n_jobs replaces it
)

pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])
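The classifier__ prefix in the grid below routes each parameter through the pipeline to its XGBoost step. If in doubt, the valid parameter names can be listed from the pipeline itself:

print(sorted(pipeline_xgb.get_params().keys()))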
params_xgb = {
    'classifier__min_child_weight': [1, 5, 7, 10],
    'classifier__gamma': [0.2, 0.5, 1, 1.5, 2, 5],
    'classifier__subsample': [0.4, 0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'classifier__max_depth': [2, 3, 4, 5, 6, 7],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__n_estimators': [100, 200, 400, 600]
}
folds = 20
param_comb = 20

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
random_search = RandomizedSearchCV(pipeline_xgb, param_distributions=params_xgb,
                                   n_iter=param_comb, scoring='accuracy', n_jobs=4,
                                   cv=skf,  # pass the splitter itself, not a one-shot generator
                                   verbose=3, random_state=1001)
random_search.fit(X_train, y_train)
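Before refitting, it helps to inspect the best cross-validated score and the winning parameters:

print(f"Best CV accuracy: {random_search.best_score_:.4f}")
print(random_search.best_params_)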
# Transfer the winning parameters onto a fresh fit of the pipeline
pipeline_xgb.set_params(**random_search.best_params_)
pipeline_xgb.fit(X_train, y_train)
y_val_xgb_pred = pipeline_xgb.predict(X_val)
print(f"Validation accuracy: {accuracy_score(y_val, y_val_xgb_pred):.4f}")
print(classification_report(y_val, y_val_xgb_pred))
# Refit on the full training data before predicting on the test set
pipeline_xgb.fit(X, y)
y_pred_test_xgb = pipeline_xgb.predict(X_test)
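A typical next step is saving the test predictions; a minimal sketch, where 'Id' and 'Target' are placeholder column names not taken from the original data:

submission = pd.DataFrame({'Id': test_df['Id'], 'Target': y_pred_test_xgb})  # placeholder column names
submission.to_csv('submission.csv', index=False)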
Model Selection
classifiers = {
    "LogisticRegression" : LogisticRegression(random_state=0, solver='liblinear'),  # liblinear supports both l1 and l2 penalties in LR_grid
    "KNN" : KNeighborsClassifier(),
    "SVC" : SVC(random_state=0, probability=True),
    "RandomForest" : RandomForestClassifier(random_state=0),
    "XGBoost" : XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss'),  # XGBoost takes too long
    "LGBM" : LGBMClassifier(random_state=0),
    "CatBoost" : CatBoostClassifier(random_state=0, verbose=False),
    "NaiveBayes": GaussianNB()
}
LR_grid = {'penalty': ['l1', 'l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]}

KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
           'max_depth': [4, 6, 8, 10, 12]}

boosted_grid = {'n_estimators': [50, 100, 150, 200],
                'max_depth': [4, 8, 12],
                'learning_rate': [0.05, 0.1, 0.15]}

NB_grid = {'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}
# Dictionary of all grids
grid = {
    "LogisticRegression" : LR_grid,
    "KNN" : KNN_grid,
    "SVC" : SVC_grid,
    "RandomForest" : RF_grid,
    "XGBoost" : boosted_grid,
    "LGBM" : boosted_grid,
    "CatBoost" : boosted_grid,
    "NaiveBayes": NB_grid
}
i = 0
clf_best_params = classifiers.copy()
valid_scores = pd.DataFrame({'Classifier': classifiers.keys(),
                             'Validation accuracy': np.zeros(len(classifiers)),
                             'Training time (mins)': np.zeros(len(classifiers))})

for key, classifier in classifiers.items():
    start = time.time()
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)  # cv=None uses the default 5-fold CV

    # Train and score on the held-out validation set
    clf.fit(X_train, y_train)
    valid_scores.iloc[i, 1] = clf.score(X_val, y_val)

    # Save best hyperparameters
    clf_best_params[key] = clf.best_params_

    # Print iteration and training time
    stop = time.time()
    valid_scores.iloc[i, 2] = np.round((stop - start) / 60, 2)
    print('Model:', key)
    print('Training time (mins):', valid_scores.iloc[i, 2])
    print('')
    i += 1
valid_scores
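To compare the candidates at a glance, the scores table can be sorted by validation accuracy:

valid_scores.sort_values('Validation accuracy', ascending=False)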