import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# Set matplotlib sizes
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('legend', fontsize=20)
plt.rc('figure', titlesize=20)

import tensorflow as tf
from tensorflow import keras

# The random seed
random_seed = 42

# Set random seed in tensorflow
tf.random.set_seed(random_seed)

# Set random seed in numpy
import numpy as np
np.random.seed(random_seed)

import pandas as pd

# Load the raw training data from github to avoid downloading the data
df_raw_train = pd.read_csv("https://raw.githubusercontent.com/phanindra-max/Machine-Learning-I/refs/heads/main/data/census.csv", header=0)
# Make a copy of df_raw_train
data_train = df_raw_train.copy(deep=True)

# Load the raw test data from github
df_raw_test = pd.read_csv("https://raw.githubusercontent.com/phanindra-max/Machine-Learning-I/refs/heads/main/data/test_census.csv", header=0)
# Make a copy of df_raw_test
data_test = df_raw_test.copy(deep=True)

# Get the name of the target
target = 'income'

# Print the dimension of data_train
pd.DataFrame([[data_train.shape[0], data_train.shape[1]]], columns=['# rows', '# columns'])

# Print the dimension of data_test
pd.DataFrame([[data_test.shape[0], data_test.shape[1]]], columns=['# rows', '# columns'])

# Print the first 5 rows of data_train
data_train.head()

# Print the first 5 rows of data_test
data_test.head()

# gain statistics insight for data_train
data_train.describe()

# gain statistics insight for data_test
data_test.describe()

# Check info for train dataset
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              45222 non-null  int64  
 1   workclass        45222 non-null  object 
 2   education_level  45222 non-null  object 
 3   education-num    45222 non-null  float64
 4   marital-status   45222 non-null  object 
 5   occupation       45222 non-null  object 
 6   relationship     45222 non-null  object 
 7   race             45222 non-null  object 
 8   sex              45222 non-null  object 
 9   capital-gain     45222 non-null  float64
 10  capital-loss     45222 non-null  float64
 11  hours-per-week   45222 non-null  float64
 12  native-country   45222 non-null  object 
 13  income           45222 non-null  object 
dtypes: float64(4), int64(1), object(9)
memory usage: 4.8+ MB

# Check info for test dataset
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       45222 non-null  int64  
 1   age              45205 non-null  float64
 2   workclass        45200 non-null  object 
 3   education_level  45202 non-null  object 
 4   education-num    45208 non-null  float64
 5   marital-status   45201 non-null  object 
 6   occupation       45200 non-null  object 
 7   relationship     45206 non-null  object 
 8   race             45203 non-null  object 
 9   sex              45203 non-null  object 
 10  capital-gain     45207 non-null  float64
 11  capital-loss     45205 non-null  float64
 12  hours-per-week   45209 non-null  float64
 13  native-country   45206 non-null  object 
dtypes: float64(5), int64(1), object(8)
memory usage: 4.8+ MB

from sklearn.model_selection import train_test_split

# Divide the training data into training (80%) and validation (20%)
data_train, data_val = train_test_split(data_train, train_size=0.8, random_state=random_seed)

# Reset the index
data_train, data_val = data_train.reset_index(drop=True), data_val.reset_index(drop=True)

# Print the dimension of data_train
pd.DataFrame([[data_train.shape[0], data_train.shape[1]]], columns=['# rows', '# columns'])

# Print the dimension of data_val
pd.DataFrame([[data_val.shape[0], data_val.shape[1]]], columns=['# rows', '# columns'])

# Utilising common_var_checker mentioned in class

def common_var_checker(df_train, data_val, df_test, target):
    """
    The common variables checker

    Parameters
    ----------
    df_train : the dataframe of training data
    data_val : the dataframe of validation data
    df_test : the dataframe of test data
    target : the name of the target

    Returns
    ----------
    The dataframe of common variables between the training, validation and test data
    """

    # Get the dataframe of common variables between the training, validation and test data
    df_common_var = pd.DataFrame(np.intersect1d(np.intersect1d(df_train.columns, data_val.columns), np.union1d(df_test.columns, [target])),
                                 columns=['common var'])

    return df_common_var

data_common_var = common_var_checker(data_train, data_val, data_test, target)

# Print df_common_var
data_common_var

# Get the features in the training data but not in the validation or test data
uncommon_feature_train_not_val_test = np.setdiff1d(data_train.columns, data_common_var['common var'])

# Print the uncommon features
pd.DataFrame(uncommon_feature_train_not_val_test, columns=['uncommon feature'])

# Get the features in the validation data but not in the training or test data
uncommon_feature_val_not_train_test = np.setdiff1d(data_val.columns, data_common_var['common var'])

# Print the uncommon features
pd.DataFrame(uncommon_feature_val_not_train_test, columns=['uncommon feature'])

# Get the features in the test data but not in the training or validation data
uncommon_feature_test_not_train_val = np.setdiff1d(data_test.columns, data_common_var['common var'])

# Print the uncommon features
pd.DataFrame(uncommon_feature_test_not_train_val, columns=['uncommon feature'])

# Remove the uncommon features from the test data
data_test = data_test.drop(columns=uncommon_feature_test_not_train_val)

# Print the first 5 rows of data_test
data_test.head()

# Combine data_train, data_val and data_test
data = pd.concat([data_train, data_val, data_test], sort=False)

#utilising id-checker as mentioned in class
def id_checker(df, dtype='float'):
    """
    The identifier checker

    Parameters
    ----------
    df : dataframe
    dtype : the data type identifiers cannot have, 'float' by default
            i.e., if a feature has this data type, it cannot be an identifier

    Returns
    ----------
    The dataframe of identifiers
    """

    # Get the dataframe of identifiers
    df_id = df[[var for var in df.columns
                # If the data type is not dtype
                if (df[var].dtype != dtype
                    # If the value is unique for each sample
                    and df[var].nunique(dropna=True) == df[var].notnull().sum())]]

    return df_id
df_id = id_checker(data)

# Print the first 5 rows of df_id
df_id.head()

# Utilising nan_checker from class
def nan_checker(df):
    """
    The NaN checker

    Parameters
    ----------
    df : the dataframe

    Returns
    ----------
    The dataframe of variables with NaN, their proportion of NaN and data type
    """

    # Get the dataframe of variables with NaN, their proportion of NaN and data type
    df_nan = pd.DataFrame([[var, df[var].isna().sum() / df.shape[0], df[var].dtype]
                           for var in df.columns if df[var].isna().sum() > 0],
                          columns=['var', 'proportion', 'dtype'])

    # Sort df_nan in accending order of the proportion of NaN
    df_nan = df_nan.sort_values(by='proportion', ascending=False).reset_index(drop=True)

    return df_nan

# Checking for missing values in training data
missing_train_df = nan_checker(data_train)
missing_train_df

# Checking for missing values in validation data
missing_val_df = nan_checker(data_val)
missing_val_df

# Checking for missing values in test data
missing_test_df = nan_checker(data_test)
missing_test_df

# Print the unique data type of variables with NaN in test dataframe
pd.DataFrame(missing_test_df['dtype'].unique(), columns=['dtype'])

# Checking the number of null values in each column
null_counts = data_test.isnull().sum()
print(null_counts)

age                17
workclass          22
education_level    20
education-num      14
marital-status     21
occupation         22
relationship       16
race               19
sex                19
capital-gain       15
capital-loss       17
hours-per-week     13
native-country     16
dtype: int64

# Combine data_train, data_val
data_trv = pd.concat([data_train, data_val], sort=False)

# Utilising describe method in pandas to find out continuous or numerical variables in the dataframe
continuous = list(data_trv.describe().columns)
print("Non-categorical/continuous columns are:", continuous)

Non-categorical/continuous columns are: ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

## Utilising cat_var_checker as mentioned in class for finding categorical variables
def cat_var_checker(df, dtype='object'):
    """
    The categorical variable checker

    Parameters
    ----------
    df : the dataframe
    dtype : the data type categorical variables should have, 'object' by default
            i.e., if a variable has this data type, it should be a categorical variable

    Returns
    ----------
    The dataframe of categorical variables and their number of unique value
    """

    # Get the dataframe of categorical variables and their number of unique value
    df_cat = pd.DataFrame([[var, df[var].nunique(dropna=False)]
                           # If the data type is dtype
                           for var in df.columns if df[var].dtype == dtype],
                          columns=['var', 'nunique'])

    # Sort df_cat in accending order of the number of unique value
    df_cat = df_cat.sort_values(by='nunique', ascending=False).reset_index(drop=True)

    return df_cat

data_cat = cat_var_checker(data_trv)

# Print the dataframe containing categorical column names and count of unique values
data_cat

categorical = data_cat['var'].tolist()

# checking unique values in each categorical column
for col in data_trv.columns:
    if col in categorical:
        values = list(data_trv[col].value_counts().index)
        print('{}: {}'.format(col, ', '.join(values)))
        print('\n')

workclass:  Private,  Self-emp-not-inc,  Local-gov,  State-gov,  Self-emp-inc,  Federal-gov,  Without-pay


education_level:  HS-grad,  Some-college,  Bachelors,  Masters,  Assoc-voc,  11th,  Assoc-acdm,  10th,  7th-8th,  Prof-school,  9th,  12th,  Doctorate,  5th-6th,  1st-4th,  Preschool


marital-status:  Married-civ-spouse,  Never-married,  Divorced,  Separated,  Widowed,  Married-spouse-absent,  Married-AF-spouse


occupation:  Craft-repair,  Prof-specialty,  Exec-managerial,  Adm-clerical,  Sales,  Other-service,  Machine-op-inspct,  Transport-moving,  Handlers-cleaners,  Farming-fishing,  Tech-support,  Protective-serv,  Priv-house-serv,  Armed-Forces


relationship:  Husband,  Not-in-family,  Own-child,  Unmarried,  Wife,  Other-relative


race:  White,  Black,  Asian-Pac-Islander,  Amer-Indian-Eskimo,  Other


sex:  Male,  Female


native-country:  United-States,  Mexico,  Philippines,  Germany,  Puerto-Rico,  Canada,  El-Salvador,  India,  Cuba,  England,  China,  Jamaica,  South,  Italy,  Dominican-Republic,  Japan,  Guatemala,  Vietnam,  Columbia,  Poland,  Haiti,  Portugal,  Iran,  Taiwan,  Greece,  Nicaragua,  Peru,  Ecuador,  France,  Ireland,  Thailand,  Hong,  Trinadad&Tobago,  Cambodia,  Yugoslavia,  Outlying-US(Guam-USVI-etc),  Laos,  Scotland,  Honduras,  Hungary,  Holand-Netherlands


income: <=50K, >50K

# importing seaborn for visualisations
import seaborn as sns

# check the distributions of continuous features
for col in continuous:
    a = sns.FacetGrid(data_trv, height=8, aspect=2)
    a.map(sns.histplot, col, kde_kws={'bw': 25})
    a.add_legend
    print('{} skew: {}'.format(col, data_trv[col].skew()))

age skew: 0.5328158942555791
education-num skew: -0.3106209464770188
capital-gain skew: 11.789002180291192
capital-loss skew: 4.516304152981587
hours-per-week skew: 0.34054514378520867

# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data = data_trv)
features_log_transformed[skewed] = data_trv[skewed].apply(lambda x: np.log(x + 1))

# checking the skew after applying log transformation
for col in skewed:
    print('{} skew: {}'.format(col, features_log_transformed[col].skew()))

capital-gain skew: 3.0823859202151755
capital-loss skew: 4.271194769830899

# Displaying the distributions after log-transform
sns.set()
fig = plt.figure(figsize=(11,5))
fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", fontsize=16)

for i, feature in enumerate(skewed):
    ax = fig.add_subplot(1, 2, i+1)
    ax.hist(features_log_transformed[feature], bins=25)
    ax.set_title("{} Feature Distribution".format(feature), fontsize=14)
    ax.set_xlabel("Value")
    ax.set_ylabel("Number of Records")
    ax.set_ylim((0,2000))
    ax.set_yticks([0, 500, 1000, 1500, 2000])
    ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

# Separating the training data
data_train = data_trv.iloc[:data_train.shape[0], :]

# Separating the validation data
data_val = data_trv.iloc[data_train.shape[0]:data_train.shape[0] + data_val.shape[0], :]

# Print the dimension of data_train
pd.DataFrame([[data_train.shape[0], data_train.shape[1]]], columns=['# rows', '# columns'])

# Print the dimension of data_val
pd.DataFrame([[data_val.shape[0], data_val.shape[1]]], columns=['# rows', '# columns'])

print(f'Continuous data columns: {continuous}')
print("----")
print(f'Skewed data columns: {skewed}')
print("----")
print(f'Categorical data columns: {categorical}')
print("----")
cat_nottarget = [item for item in categorical if item != target]
print(f'Categorical without target data columns: {cat_nottarget}')
print("----")
cont_notskewed = list(set(continuous) - set(skewed))
print(f'Continuous but not skewed data columns: {cont_notskewed}')

Continuous data columns: ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
----
Skewed data columns: ['capital-gain', 'capital-loss']
----
Categorical data columns: ['native-country', 'education_level', 'occupation', 'workclass', 'marital-status', 'relationship', 'race', 'sex', 'income']
----
Categorical without target data columns: ['native-country', 'education_level', 'occupation', 'workclass', 'marital-status', 'relationship', 'race', 'sex']
----
Continuous but not skewed data columns: ['hours-per-week', 'education-num', 'age']

from sklearn.impute import SimpleImputer

# for imputing missing values with mean for continuous features except skewed data columns
mean_imputer = SimpleImputer(strategy='mean')

# for imputing missing values with median for skewed features
median_imputer = SimpleImputer(strategy='median')

# for imputing missing values with mode for categorical features
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

data_test[cont_notskewed] = mean_imputer.fit_transform(data_test[cont_notskewed])
data_test[skewed] = median_imputer.fit_transform(data_test[skewed])
data_test[cat_nottarget] = most_frequent_imputer.fit_transform(data_test[cat_nottarget])

# Importing Min Max Scaler from sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()

# Standardize the training data
data_train[continuous] = scaler.fit_transform(data_train[continuous])

# Standardize the validation data
data_val[continuous] = scaler.fit_transform(data_val[continuous])

# Standardize the test data
data_test[continuous] = scaler.fit_transform(data_test[continuous])

# Combine data_train, data_val and data_test
data = pd.concat([data_train, data_val, data_test], sort=False)

# Print the unique data type of variables in df
pd.DataFrame(data.dtypes.unique(), columns=['dtype'])

# categorical variables are already identified in above step
data_cat

# One-hot-encode the categorical features in the combined data
data = pd.get_dummies(data, columns= cat_nottarget)
# Print the first 5 rows of df
data.head()

data['income'] = data['income'].map({"<=50K":0, ">50K":1})

data['income'].unique()

array([ 0.,  1., nan])

# Separating the training data
data_train = data.iloc[:data_train.shape[0], :]

# Separating the validation data
data_val = data.iloc[data_train.shape[0]:data_train.shape[0] + data_val.shape[0], :]

# Separating the test data
data_test = data.iloc[data_train.shape[0] + data_val.shape[0]:, :]

# Print the dimension of data_train
pd.DataFrame([[data_train.shape[0], data_train.shape[1]]], columns=['# rows', '# columns'])

# Print the dimension of data_val
pd.DataFrame([[data_val.shape[0], data_val.shape[1]]], columns=['# rows', '# columns'])

# Print the dimension of data_test
pd.DataFrame([[data_test.shape[0], data_test.shape[1]]], columns=['# rows', '# columns'])

# Get the feature matrix
X_train = data_train[np.setdiff1d(data_train.columns, [target])].values
X_val = data_val[np.setdiff1d(data_val.columns, [target])].values
X_test = data_test[np.setdiff1d(data_test.columns, [target])].values

# Get the target vector
y_train = data_train[target].values
y_val = data_val[target].values
y_test = data_test[target].values

data_test['income'].unique()

array([nan])

pd.Series(y_train).value_counts()

0.0    27269
1.0     8908
Name: count, dtype: int64

# Import the supervised learning models from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

models = {'Gaussian Naive Bayes' : GaussianNB(),
          'KNeighbors Classifier' : KNeighborsClassifier(),
          'Logistic Regression Classifer': LogisticRegression(class_weight='balanced', random_state=random_seed),
          'Decision Tree Classifier' : DecisionTreeClassifier(class_weight='balanced', random_state=random_seed),
          'Random Forest Classifier': RandomForestClassifier(class_weight='balanced', random_state=random_seed),
          'GradientBoosting Classifier' : GradientBoostingClassifier(random_state = random_seed),
          'AdaBoost Classifier' : AdaBoostClassifier(random_state = random_seed, learning_rate=0.5, n_estimators=100)
          }

from sklearn.pipeline import Pipeline

pipes = {}

for acronym, model in models.items():
    pipes[acronym] = Pipeline([('model', model)])

# Get the:
# feature matrix and target velctor in the combined training and validation data
# target vector in the combined training and validation data
# PredefinedSplit
# See the implementation in pmlm_utilities.ipynb

## utilising function defined in class for predefined split cross-validator

from sklearn.model_selection import PredefinedSplit

def get_train_val_ps(X_train, y_train, X_val, y_val):
    """
    Get the:
    feature matrix and target velctor in the combined training and validation data
    target vector in the combined training and validation data
    PredefinedSplit

    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    X_val : the feature matrix in the validation data
    y_val : the target vector in the validation data

    Return
    ----------
    The feature matrix in the combined training and validation data
    The target vector in the combined training and validation data
    PredefinedSplit
    """

    # Combine the feature matrix in the training and validation data
    X_train_val = np.vstack((X_train, X_val))

    # Combine the target vector in the training and validation data
    y_train_val = np.vstack((y_train.reshape(-1, 1), y_val.reshape(-1, 1))).reshape(-1)

    # Get the indices of training and validation data
    train_val_idxs = np.append(np.full(X_train.shape[0], -1), np.full(X_val.shape[0], 0))

    # The PredefinedSplit
    ps = PredefinedSplit(train_val_idxs)

    return X_train_val, y_train_val, ps


# calling the function for obtaining predefined split cross validator
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

param_grids = {}

# The parameter grid for var_smoothing
var_smoothing_gnbc = np.logspace(-12, -6, num=7)

# Update param_grids
param_grids['Gaussian Naive Bayes'] = [{'model__var_smoothing' : var_smoothing_gnbc}]

# The parameter grid for tol
tol_lr = [10 ** -6, 10 ** -5, 10 ** -4]

# The parameter grid for C
C_lr = [8, 10, 13, 15]

# Update param_grids
param_grids['Logistic Regression Classifer'] = [{'model__tol': tol_lr,
                      'model__C': C_lr}]

# The parameter grid for n_neighbors
n_neighbors_knbc = [10, 20, 30]

# The parameter grid for weights
weights_knbc = ['uniform', 'distance']

# The parameter grid for metric
metric_knbc = ['euclidean', 'manhattan']

# Update param_grids
param_grids['KNeighbors Classifier'] = [{'model__n_neighbors': n_neighbors_knbc,
                        'model__weights': weights_knbc,
                        'model__metric': metric_knbc}]

# The parameter grid for min_samples_split
min_samples_split_dtc = [2, 5, 10]

# The parameter grid for min_samples_leaf
min_samples_leaf_dtc = [50, 75, 100]

# Update param_grids
param_grids['Decision Tree Classifier'] = [{'model__min_samples_split': min_samples_split_dtc,
                       'model__min_samples_leaf': min_samples_leaf_dtc}]

# The parameter grid for min_samples_split
min_samples_split_rfc = [10, 50, 100]

# The parameter grid for min_samples_leaf
min_samples_leaf_rfc = [0.9, 1, 3, 5]

# Update param_grids
param_grids['Random Forest Classifier'] = [{'model__min_samples_split': min_samples_split_rfc,
                       'model__min_samples_leaf': min_samples_leaf_rfc}]

# The parameter grid for n_estimators
n_estimators_abc = list(range(50, 201, 25))

# Update param_grids
param_grids['AdaBoost Classifier'] = [{ 'model__n_estimators': n_estimators_abc}]

# The parameter grid for n_estimators
n_estimators_gbc = [40]

# The parameter grid for learning_rate
learning_rate_gbc = [0.3]

# The parameter grid for min_samples_split
min_samples_split_gbc = [100, 250, 500]

# The parameter grid for min_samples_leaf
min_samples_leaf_gbc = [25, 50, 75, 100]

# The parameter grid for max_depth
max_depth_gbc = [8]

# The parameter grid for subsample
subsample_gbc = [0.8]

# update param_grids
param_grids['GradientBoosting Classifier'] = [{'model__n_estimators': n_estimators_gbc,
                       'model__learning_rate': learning_rate_gbc,
                       'model__min_samples_split': min_samples_split_gbc,
                       'model__min_samples_leaf' : min_samples_leaf_gbc,
                       'model__max_depth' : max_depth_gbc,
                       'model__subsample' : subsample_gbc}]

from sklearn.model_selection import GridSearchCV
import time
from IPython.display import display

# The list of [best_score_, best_params_, best_estimator_] obtained by GridSearchCV
best_score_params_estimator_gs = []

# For each model
for acronym in pipes.keys():

    print(f"{acronym}:")

    # GridSearchCV
    gs = GridSearchCV(estimator=pipes[acronym],
                      param_grid=param_grids[acronym],
                      scoring='f1_macro',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True,
                      verbose=1)

    print(f'Parameter tuning started for {acronym}')
    start_time = time.time()

    # Fit the pipeline
    gs = gs.fit(X_train_val, y_train_val)

    # Update best_score_params_estimator_gs
    best_score_params_estimator_gs.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

    # Sort cv_results in ascending order of 'rank_test_score' and 'std_test_score'
    cv_results = pd.DataFrame.from_dict(gs.cv_results_).sort_values(by=['rank_test_score', 'std_test_score'])

    # Get the important columns in cv_results
    important_columns = ['rank_test_score',
                         'mean_test_score',
                         'std_test_score',
                         'mean_train_score',
                         'std_train_score',
                         'mean_fit_time',
                         'std_fit_time',
                         'mean_score_time',
                         'std_score_time']

    # Move the important columns ahead
    cv_results = cv_results[important_columns + sorted(list(set(cv_results.columns) - set(important_columns)))]

    # print estimator status
    print(f'Parameter tuning is done for {acronym}')

    # calculating time taken by each classifier for parameter tuning
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time for {acronym}: {execution_time:.4f} seconds \n")

    # display cv_results
    display(cv_results)

    # adding line space before displaying next estimator status
    print("\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")

Gaussian Naive Bayes:
Parameter tuning started for Gaussian Naive Bayes
Fitting 1 folds for each of 7 candidates, totalling 7 fits
Parameter tuning is done for Gaussian Naive Bayes
Execution time for Gaussian Naive Bayes: 8.2264 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

KNeighbors Classifier:
Parameter tuning started for KNeighbors Classifier
Fitting 1 folds for each of 12 candidates, totalling 12 fits
Parameter tuning is done for KNeighbors Classifier
Execution time for KNeighbors Classifier: 188.9343 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Logistic Regression Classifer:
Parameter tuning started for Logistic Regression Classifer
Fitting 1 folds for each of 12 candidates, totalling 12 fits
Parameter tuning is done for Logistic Regression Classifer
Execution time for Logistic Regression Classifer: 9.0568 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Decision Tree Classifier:
Parameter tuning started for Decision Tree Classifier
Fitting 1 folds for each of 9 candidates, totalling 9 fits
Parameter tuning is done for Decision Tree Classifier
Execution time for Decision Tree Classifier: 5.3405 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Random Forest Classifier:
Parameter tuning started for Random Forest Classifier
Fitting 1 folds for each of 12 candidates, totalling 12 fits
Parameter tuning is done for Random Forest Classifier
Execution time for Random Forest Classifier: 28.9782 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

GradientBoosting Classifier:
Parameter tuning started for GradientBoosting Classifier
Fitting 1 folds for each of 12 candidates, totalling 12 fits
Parameter tuning is done for GradientBoosting Classifier
Execution time for GradientBoosting Classifier: 68.1616 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

AdaBoost Classifier:
Parameter tuning started for AdaBoost Classifier
Fitting 1 folds for each of 7 candidates, totalling 7 fits
Parameter tuning is done for AdaBoost Classifier
Execution time for AdaBoost Classifier: 297.8931 seconds

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Sort best_score_params_estimator_gs in descending order of the best_score_
best_score_params_estimator_gs = sorted(best_score_params_estimator_gs, key=lambda x : x[0], reverse=True)

# Print best_score_params_estimator_gs
results = pd.DataFrame(best_score_params_estimator_gs, columns=['best_score', 'best_param', 'best_estimator'])
results

# Get the best_score, best_params and best_estimator obtained by GridSearchCV
best_score_gs, best_params_gs, best_estimator_gs = best_score_params_estimator_gs[0]
pd.DataFrame([[best_score_gs, best_params_gs, best_estimator_gs]], columns=['Best Score', 'Best Parameters', 'Best Estimator'])

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

# Get the prediction on the testing data using best_model
y_val_pred = best_estimator_gs.predict(X_val)

# Get the precision, recall, fscore, support
precision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_pred)

# Get the auc
auc = roc_auc_score(y_val, y_val_pred)

# Get the dataframe of precision, recall, fscore and auc
pd.DataFrame([[precision, recall, fscore, auc]], columns=['Precision', 'Recall', 'F1-score', 'AUC'])

from sklearn.metrics import roc_curve, roc_auc_score

# Calculate the false positive rate and true positive rate
fpr, tpr, _ = roc_curve(y_val, y_val_pred)

# Calculate the AUC
auc = roc_auc_score(y_val, y_val_pred)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line (random classifier)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Get the best_score, best_param and best_estimator of random forest obtained by GridSearchCV
best_score_rfc, best_param_rfc, best_estimator_rfc = best_score_params_estimator_gs[1]

# Get the dataframe of feature and importance
df_fi_rfc = pd.DataFrame(np.hstack((np.setdiff1d(data_train.columns, [target]).reshape(-1, 1), best_estimator_rfc.named_steps['model'].feature_importances_.reshape(-1, 1))),
                         columns=['Features', 'Importance'])

# Sort df_fi_rfc in descending order of the importance
df_fi_rfc = df_fi_rfc.sort_values(ascending=False, by='Importance').reset_index(drop=True)

# Print the first 5 rows of df_fi_rfc
df_fi_rfc.head()

# Create a figure
fig = plt.figure(figsize=(10, 5))

# Implement me
# The bar plot of the top 5 feature importance
plt.bar(df_fi_rfc['Features'][:5], df_fi_rfc['Importance'][:5], color='green')

# Set x-axis
plt.xlabel('Features')
plt.xticks(rotation=45)

# Set y-axis
plt.ylabel('Importance')

# Save and show the figure
plt.tight_layout()
plt.show()

# Create a submission file for Kaggle
submission = pd.DataFrame({'id': data_test.index, 'income': best_estimator_gs.predict(X_test)})

# Save the submission file
submission.to_csv("submission.csv", index=False)

	age	education-num	capital-gain	capital-loss	hours-per-week
count	45222.000000	45222.000000	45222.000000	45222.000000	45222.000000
mean	38.547941	10.118460	1101.430344	88.595418	40.938017
std	13.217870	2.552881	7506.430084	404.956092	12.007508
min	17.000000	1.000000	0.000000	0.000000	1.000000
25%	28.000000	9.000000	0.000000	0.000000	40.000000
50%	37.000000	10.000000	0.000000	0.000000	40.000000
75%	47.000000	13.000000	0.000000	0.000000	45.000000
max	90.000000	16.000000	99999.000000	4356.000000	99.000000

	Unnamed: 0	age	education-num	capital-gain	capital-loss	hours-per-week
count	45222.000000	45205.000000	45208.000000	45207.000000	45205.000000	45209.000000
mean	22610.500000	38.549983	10.118519	1099.583781	88.595885	40.939348
std	13054.611273	13.218616	2.552811	7493.227006	404.975547	12.007470
min	0.000000	17.000000	1.000000	0.000000	0.000000	1.000000
25%	11305.250000	28.000000	9.000000	0.000000	0.000000	40.000000
50%	22610.500000	37.000000	10.000000	0.000000	0.000000	40.000000
75%	33915.750000	47.000000	13.000000	0.000000	0.000000	45.000000
max	45221.000000	90.000000	16.000000	99999.000000	4356.000000	99.000000

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__var_smoothing	params	split0_test_score	split0_train_score
6	1	0.647918	0.637177	0.200689	0.057572	1.000000e-06	{'model__var_smoothing': 1e-06}	0.647918	0.637177
5	2	0.642795	0.630785	0.239884	0.050012	1.000000e-07	{'model__var_smoothing': 1e-07}	0.642795	0.630785
4	3	0.634789	0.624228	0.235980	0.063560	1.000000e-08	{'model__var_smoothing': 1e-08}	0.634789	0.624228
3	4	0.627703	0.618466	0.245632	0.068480	1.000000e-09	{'model__var_smoothing': 1e-09}	0.627703	0.618466
2	5	0.623973	0.613738	0.290623	0.060556	1.000000e-10	{'model__var_smoothing': 1e-10}	0.623973	0.613738
1	6	0.619126	0.610720	0.252528	0.062383	1.000000e-11	{'model__var_smoothing': 1e-11}	0.619126	0.610720
0	7	0.616548	0.607055	0.263637	0.067887	1.000000e-12	{'model__var_smoothing': 1e-12}	0.616548	0.607055

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__C	param_model__tol	params	split0_test_score	split0_train_score
8	1	0.782308	0.771391	0.716677	0.039024	13	0.000100	{'model__C': 13, 'model__tol': 0.0001}	0.782308	0.771391
9	2	0.782200	0.771751	0.806934	0.035851	15	0.000001	{'model__C': 15, 'model__tol': 1e-06}	0.782200	0.771751
10	2	0.782200	0.771751	0.845628	0.042376	15	0.000010	{'model__C': 15, 'model__tol': 1e-05}	0.782200	0.771751
3	4	0.782048	0.771667	0.969281	0.046206	10	0.000001	{'model__C': 10, 'model__tol': 1e-06}	0.782048	0.771667
4	4	0.782048	0.771667	0.805568	0.040276	10	0.000010	{'model__C': 10, 'model__tol': 1e-05}	0.782048	0.771667
6	4	0.782048	0.771674	0.812758	0.045614	13	0.000001	{'model__C': 13, 'model__tol': 1e-06}	0.782048	0.771674
7	4	0.782048	0.771674	0.814858	0.036132	13	0.000010	{'model__C': 13, 'model__tol': 1e-05}	0.782048	0.771674
5	8	0.782004	0.771406	0.729433	0.037260	10	0.000100	{'model__C': 10, 'model__tol': 0.0001}	0.782004	0.771406
0	9	0.781940	0.771483	1.129106	0.041819	8	0.000001	{'model__C': 8, 'model__tol': 1e-06}	0.781940	0.771483
1	9	0.781940	0.771483	1.134537	0.038162	8	0.000010	{'model__C': 8, 'model__tol': 1e-05}	0.781940	0.771483
11	9	0.781940	0.771391	0.721066	0.038878	15	0.000100	{'model__C': 15, 'model__tol': 0.0001}	0.781940	0.771391
2	12	0.781355	0.771177	0.815686	0.043849	8	0.000100	{'model__C': 8, 'model__tol': 0.0001}	0.781355	0.771177

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__min_samples_leaf	param_model__min_samples_split	params	split0_test_score	split0_train_score
6	1	0.777969	0.771858	0.443637	0.041894	100	2	{'model__min_samples_leaf': 100, 'model__min_s...	0.777969	0.771858
7	1	0.777969	0.771858	0.428084	0.037903	100	5	{'model__min_samples_leaf': 100, 'model__min_s...	0.777969	0.771858
8	1	0.777969	0.771858	0.404086	0.038292	100	10	{'model__min_samples_leaf': 100, 'model__min_s...	0.777969	0.771858
0	4	0.777925	0.783718	0.532796	0.047931	50	2	{'model__min_samples_leaf': 50, 'model__min_sa...	0.777925	0.783718
1	4	0.777925	0.783718	0.525226	0.045173	50	5	{'model__min_samples_leaf': 50, 'model__min_sa...	0.777925	0.783718
2	4	0.777925	0.783718	0.523500	0.046729	50	10	{'model__min_samples_leaf': 50, 'model__min_sa...	0.777925	0.783718
3	7	0.772367	0.773796	0.507775	0.041900	75	2	{'model__min_samples_leaf': 75, 'model__min_sa...	0.772367	0.773796
4	7	0.772367	0.773796	0.480251	0.039921	75	5	{'model__min_samples_leaf': 75, 'model__min_sa...	0.772367	0.773796
5	7	0.772367	0.773796	0.454637	0.039043	75	10	{'model__min_samples_leaf': 75, 'model__min_sa...	0.772367	0.773796

	age	workclass	education_level	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	income
0	39	State-gov	Bachelors	13.0	Never-married	Adm-clerical	Not-in-family	White	Male	2174.0	40.0	United-States	<=50K
1	50	Self-emp-not-inc	Bachelors	13.0	Married-civ-spouse	Exec-managerial	Husband	White	Male	0.0	13.0	United-States	<=50K
2	38	Private	HS-grad	9.0	Divorced	Handlers-cleaners	Not-in-family	White	Male	0.0	40.0	United-States	<=50K
3	53	Private	11th	7.0	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0.0	40.0	United-States	<=50K
4	28	Private	Bachelors	13.0	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0.0	40.0	Cuba	<=50K

	Unnamed: 0	age	workclass	education_level	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country
0	0	21.0	Private	10th	6.0	Married-civ-spouse	Craft-repair	Husband	White	Male	0.0	40.0	United-States
1	1	49.0	Private	Bachelors	13.0	Married-civ-spouse	Adm-clerical	Wife	White	Female	0.0	40.0	United-States
2	2	44.0	Self-emp-not-inc	Assoc-acdm	12.0	Married-civ-spouse	Other-service	Wife	White	Female	0.0	99.0	United-States
3	3	34.0	Private	Bachelors	13.0	Married-civ-spouse	Sales	Husband	White	Male	7298.0	46.0	United-States
4	4	24.0	Private	HS-grad	9.0	Married-civ-spouse	Machine-op-inspct	Husband	White	Male	0.0	40.0	United-States

	var	proportion	dtype
0	workclass	0.000486	object
1	occupation	0.000486	object
2	marital-status	0.000464	object
3	education_level	0.000442	object
4	race	0.000420	object
5	sex	0.000420	object
6	age	0.000376	float64
7	capital-loss	0.000376	float64
8	relationship	0.000354	object
9	native-country	0.000354	object
10	capital-gain	0.000332	float64
11	education-num	0.000310	float64
12	hours-per-week	0.000287	float64

	age	education-num	hours-per-week	income	native-country_ Cambodia	native-country_ Canada	native-country_ China	native-country_ Columbia	...	relationship_ Own-child	relationship_ Unmarried	relationship_ Wife	race_ Amer-Indian-Eskimo	race_ Asian-Pac-Islander	race_ Black	race_ Other	race_ White	sex_ Female	sex_ Male
0	0.205479	0.800000	0.397959	<=50K	False	False	False	False	...	False	False	False	False	False	False	False	True	True	False
1	0.383562	0.733333	0.663265	<=50K	False	False	False	False	...	False	False	False	False	False	False	False	True	False	True
2	0.383562	0.400000	0.500000	<=50K	False	False	False	False	...	False	False	False	False	False	False	False	True	False	True
3	0.301370	0.533333	0.704082	<=50K	False	False	False	False	...	False	False	False	False	False	False	False	True	False	True
4	0.465753	0.533333	0.377551	<=50K	False	False	False	False	...	False	False	False	False	False	False	False	True	True	False

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__metric	param_model__n_neighbors	param_model__weights	params	split0_test_score	split0_train_score
8	1	0.756003	0.774393	0.170373	10.826124	manhattan	20	uniform	{'model__metric': 'manhattan', 'model__n_neigh...	0.756003	0.774393
4	2	0.755242	0.769807	0.179639	1.779021	euclidean	30	uniform	{'model__metric': 'euclidean', 'model__n_neigh...	0.755242	0.769807
2	3	0.754691	0.776388	0.169055	1.371828	euclidean	20	uniform	{'model__metric': 'euclidean', 'model__n_neigh...	0.754691	0.776388
10	4	0.754428	0.769568	0.160955	10.122170	manhattan	30	uniform	{'model__metric': 'manhattan', 'model__n_neigh...	0.754428	0.769568
5	5	0.754354	0.961486	0.162438	1.438892	euclidean	30	distance	{'model__metric': 'euclidean', 'model__n_neigh...	0.754354	0.961486
3	6	0.753519	0.961486	0.200519	1.682414	euclidean	20	distance	{'model__metric': 'euclidean', 'model__n_neigh...	0.753519	0.961486
1	7	0.753397	0.961453	0.163555	1.750763	euclidean	10	distance	{'model__metric': 'euclidean', 'model__n_neigh...	0.753397	0.961453
7	8	0.751266	0.961398	0.184966	9.837932	manhattan	10	distance	{'model__metric': 'manhattan', 'model__n_neigh...	0.751266	0.961398
11	9	0.750819	0.961434	0.168707	10.179976	manhattan	30	distance	{'model__metric': 'manhattan', 'model__n_neigh...	0.750819	0.961434
9	10	0.750659	0.961434	0.154550	10.395100	manhattan	20	distance	{'model__metric': 'manhattan', 'model__n_neigh...	0.750659	0.961434
0	11	0.745135	0.787930	0.159826	2.191112	euclidean	10	uniform	{'model__metric': 'euclidean', 'model__n_neigh...	0.745135	0.787930
6	12	0.742126	0.788557	0.178821	10.353799	manhattan	10	uniform	{'model__metric': 'manhattan', 'model__n_neigh...	0.742126	0.788557

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__min_samples_leaf	param_model__min_samples_split	params	split0_test_score	split0_train_score
4	1	0.796564	0.825241	3.739510	0.168113	1.0	50	{'model__min_samples_leaf': 1, 'model__min_sam...	0.796564	0.825241
3	2	0.795629	0.889979	4.372025	0.194828	1.0	10	{'model__min_samples_leaf': 1, 'model__min_sam...	0.795629	0.889979
5	3	0.792938	0.808658	3.749197	0.153605	1.0	100	{'model__min_samples_leaf': 1, 'model__min_sam...	0.792938	0.808658
6	4	0.792703	0.811290	3.781244	0.170853	3.0	10	{'model__min_samples_leaf': 3, 'model__min_sam...	0.792703	0.811290
7	5	0.787059	0.799534	3.473020	0.135167	3.0	50	{'model__min_samples_leaf': 3, 'model__min_sam...	0.787059	0.799534
10	6	0.786269	0.793671	3.215243	0.155477	5.0	50	{'model__min_samples_leaf': 5, 'model__min_sam...	0.786269	0.793671
9	7	0.785388	0.797029	3.299765	0.164196	5.0	10	{'model__min_samples_leaf': 5, 'model__min_sam...	0.785388	0.797029
11	8	0.785314	0.790501	3.117904	0.135673	5.0	100	{'model__min_samples_leaf': 5, 'model__min_sam...	0.785314	0.790501
8	9	0.785163	0.794564	3.293961	0.137157	3.0	100	{'model__min_samples_leaf': 3, 'model__min_sam...	0.785163	0.794564
0	10	0.202732	0.197582	0.503190	0.046250	0.9	10	{'model__min_samples_leaf': 0.9, 'model__min_s...	0.202732	0.197582
1	10	0.202732	0.197582	0.580184	0.045858	0.9	50	{'model__min_samples_leaf': 0.9, 'model__min_s...	0.202732	0.197582
2	10	0.202732	0.197582	0.608448	0.049123	0.9	100	{'model__min_samples_leaf': 0.9, 'model__min_s...	0.202732	0.197582

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__learning_rate	param_model__max_depth	param_model__min_samples_leaf	param_model__min_samples_split	param_model__n_estimators	param_model__subsample	params	split0_test_score	split0_train_score
5	1	0.802002	0.826607	9.478559	0.066335	0.3	8	50	500	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.802002	0.826607
6	2	0.799953	0.828675	9.790354	0.056207	0.3	8	75	100	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.799953	0.828675
3	3	0.799014	0.831488	9.388532	0.070796	0.3	8	50	100	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.799014	0.831488
8	4	0.799002	0.823248	10.018849	0.059763	0.3	8	75	500	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.799002	0.823248
10	5	0.798920	0.825210	9.565602	0.054077	0.3	8	100	250	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.798920	0.825210
9	6	0.798621	0.825271	9.946554	0.060660	0.3	8	100	100	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.798621	0.825271
0	7	0.797362	0.836347	9.490401	0.061779	0.3	8	25	100	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.797362	0.836347
4	8	0.797322	0.829079	9.532293	0.054835	0.3	8	50	250	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.797322	0.829079
7	9	0.796874	0.827342	9.673643	0.107158	0.3	8	75	250	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.796874	0.827342
2	10	0.796697	0.825826	9.403579	0.056762	0.3	8	25	500	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.796697	0.825826
11	11	0.794152	0.823807	9.436072	0.051999	0.3	8	100	500	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.794152	0.823807
1	12	0.792167	0.828532	9.650312	0.063502	0.3	8	25	250	40	0.8	{'model__learning_rate': 0.3, 'model__max_dept...	0.792167	0.828532

	rank_test_score	mean_test_score	mean_train_score	mean_fit_time	mean_score_time	param_model__n_estimators	params	split0_test_score	split0_train_score
0	1	0.793320	0.788393	18.674231	2.040284	50	{'model__n_estimators': 50}	0.793320	0.788393
4	2	0.788837	0.794279	54.621016	5.903436	150	{'model__n_estimators': 150}	0.788837	0.794279
1	3	0.788642	0.789887	27.552853	2.956101	75	{'model__n_estimators': 75}	0.788642	0.789887
5	4	0.788569	0.795536	63.063454	6.956146	175	{'model__n_estimators': 175}	0.788569	0.795536
6	5	0.786053	0.796325	71.141048	7.675160	200	{'model__n_estimators': 200}	0.786053	0.796325
3	6	0.785383	0.792420	44.966056	4.980326	125	{'model__n_estimators': 125}	0.785383	0.792420
2	7	0.784964	0.791249	36.175189	4.035005	100	{'model__n_estimators': 100}	0.784964	0.791249

	best_score	best_param	best_estimator
0	0.802002	{'model__learning_rate': 0.3, 'model__max_dept...	(([DecisionTreeRegressor(criterion='friedman_m...
1	0.796564	{'model__min_samples_leaf': 1, 'model__min_sam...	((DecisionTreeClassifier(max_features='sqrt', ...
2	0.793320	{'model__n_estimators': 50}	((DecisionTreeClassifier(max_depth=1, random_s...
3	0.782308	{'model__C': 13, 'model__tol': 0.0001}	(LogisticRegression(C=13, class_weight='balanc...
4	0.777969	{'model__min_samples_leaf': 100, 'model__min_s...	(DecisionTreeClassifier(class_weight='balanced...
5	0.756003	{'model__metric': 'manhattan', 'model__n_neigh...	(KNeighborsClassifier(metric='manhattan', n_ne...
6	0.647918	{'model__var_smoothing': 1e-06}	(GaussianNB(var_smoothing=np.float64(1e-06)))

Classifier Model	Accuracy (Best Score)
Gradient Boosting	0.802
RandomForest	0.796
Adaboost	0.793
Logistic Regression	0.782
DecisionTree	0.777
KNeighbors	0.756
Gaussian Naive Bayes	0.648

Metric	Value
Precision	0.895 (class 0), 0.808 (class 1)
Recall	0.945 (class 0), 0.676 (class 1)
F1-Score	0.920 (class 0), 0.736 (class 1)

	Features	Importance
0	marital-status_ Married-civ-spouse	0.136925
1	capital-gain	0.102263
2	age	0.100958
3	education-num	0.100553
4	relationship_ Husband	0.093712

Introduction¶

Why did we choose this project?¶

Introduction of Project¶

Data Source¶

How Machine Learning helps solve this project?¶

Experiment¶

Notebook Configuration¶

1. Warnings¶

2. Matplotlib¶

3. TensorFlow¶

4. Random seed¶

Data Preprocessing¶

Loading the data¶

Splitting the data¶

Handling uncommon features¶

Identifying uncommon features¶

Removing uncommon features¶

Handling identifiers¶

Combining the training, validation and test data¶

Identifying identifiers¶

Handling missing data¶

Identifying missing values¶

Imputing missing values¶

Combining the training and validation data¶

Identifying categorical and continuous data columns¶

Summary of features in terms of datatype:¶

Distribution check of features¶

Log-transform highly skewed features¶

Splitting train and validation data¶

Imputing the null values¶

Scaling the data¶

Standardization¶

Encoding the data¶

Combining the training, validation and test data¶

Identifying categorical variables¶

Encoding categorical features¶

Encoding categorical target¶

Separating the training, validation and test data¶

Splitting the feature and target¶

Class Imbalance¶

Hyperparameter Tuning¶

Creating the dictionary of the models¶

Creating the dictionary of the pipelines¶

Getting the predefined split cross-validator¶

GridSearchCV¶

Creating the dictionary of the parameter grids¶

The parameter grid for Gaussian Naive Bayes¶

The parameter grid for LogisticRegression¶

The parameter grid for KNeighbors Classification¶

The parameter grid for Decision Tree Classifier¶

The parameter grid for Random Forest¶

The parameter grid for Adaboost Classifier¶

The parameter grid for Gradient Boosting¶

Tuning the hyperparameters¶

Model Selection¶

Model Evaluation¶

Area Under Curve¶

Interpretation¶

Getting the feature importance detected by random forest¶

Plotting the Feature Importance¶

Submission¶

Conclusion¶