from assets.utils.functions import *

# Core libraries and configurations
import math
import warnings
import joblib
from typing import Dict, Tuple, List, Any, NoReturn, Optional
import pandas as pd
import numpy as np
import lightgbm as lgb
from contextlib import contextmanager
import sys
import os
warnings.filterwarnings("ignore", message="The default of observed=False is deprecated")

# Data analysis and manipulation
import sidetable as stb
from skimpy import skim

# Statistical analysis and hypothesis testing
import scipy.stats as stats
from scipy.stats import norm, ttest_ind, chi2_contingency, t, zscore
import statsmodels.api as sms
import statsmodels.stats.api as sms_stats
from statsmodels.stats.weightstats import CompareMeans, DescrStatsW
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Machine learning and preprocessing
from sklearn import ensemble, linear_model, neighbors, svm, tree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier,
    HistGradientBoostingClassifier, VotingClassifier
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    accuracy_score, mean_squared_error, mean_absolute_error, r2_score, f1_score,
    classification_report, roc_auc_score, confusion_matrix, roc_curve,
    plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, learning_curve, StratifiedKFold, GridSearchCV,
    RandomizedSearchCV, ShuffleSplit
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, PolynomialFeatures
)
from sklearn.tree import DecisionTreeClassifier

# Advanced machine learning algorithms (MLAs)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

# Categorical encoding
from category_encoders import TargetEncoder

# Notebook-only IPython magics: load the pycodestyle_magic extension, switch
# on its pycodestyle/flake8 cell checks, and render matplotlib plots inline.
%load_ext pycodestyle_magic
# %reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on
# NOTE(review): flake8 is switched on twice; the second call presumably only
# adds the 79-character limit - confirm whether the first call is needed.
%flake8_on --max_line_length 79
%matplotlib inline

# Read the train and test CSV files into pandas DataFrames
train_df = pd.read_csv('./assets/data/train.csv')
test_df = pd.read_csv('./assets/data/test.csv')
print(f"Full train dataset shape is {train_df.shape}")
print(f"Full test dataset shape is {test_df.shape}")

# Working copy of the training data, so train_df itself is never modified
train_adj_df = train_df.copy()

# Frames that the cleaning / feature-engineering loops update in place
clean_df_list = [train_adj_df, test_df]

Full train dataset shape is (8693, 14)
Full test dataset shape is (4277, 13)

# Preview the first rows of the working training copy
train_adj_df.head()

# Preview the first rows of the test set
test_df.head()

# Render the skimpy summary report for the working training copy
skim(train_adj_df)

╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮
│          Data Summary                Data Types                                                                 │
│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓                                                          │
│ ┃ dataframe         ┃ Values ┃ ┃ Column Type ┃ Count ┃                                                          │
│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩                                                          │
│ │ Number of rows    │ 8693   │ │ float64     │ 6     │                                                          │
│ │ Number of columns │ 14     │ │ string      │ 5     │                                                          │
│ └───────────────────┴────────┘ │ bool        │ 3     │                                                          │
│                                └─────────────┴───────┘                                                          │
│                                                     number                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓  │
│ ┃ column_name       ┃ NA    ┃ NA %   ┃ mean     ┃ sd       ┃ p0   ┃ p25   ┃ p50   ┃ p75  ┃ p100    ┃ hist    ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩  │
│ │ Age               │   179 │   2.06 │    28.83 │    14.49 │    0 │    19 │    27 │   38 │      79 │  ▂▇▆▃▁  │  │
│ │ RoomService       │   181 │   2.08 │    224.7 │    666.7 │    0 │     0 │     0 │   47 │   14330 │    ▇    │  │
│ │ FoodCourt         │   183 │   2.11 │    458.1 │     1611 │    0 │     0 │     0 │   76 │   29810 │    ▇    │  │
│ │ ShoppingMall      │   208 │   2.39 │    173.7 │    604.7 │    0 │     0 │     0 │   27 │   23490 │    ▇    │  │
│ │ Spa               │   183 │   2.11 │    311.1 │     1137 │    0 │     0 │     0 │   59 │   22410 │    ▇    │  │
│ │ VRDeck            │   188 │   2.16 │    304.9 │     1146 │    0 │     0 │     0 │   46 │   24130 │    ▇    │  │
│ └───────────────────┴───────┴────────┴──────────┴──────────┴──────┴───────┴───────┴──────┴─────────┴─────────┘  │
│                                                      bool                                                       │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name                        ┃ true            ┃ true rate                     ┃ hist                ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩  │
│ │ CryoSleep                          │            3254 │                          0.37 │       ▇    ▅        │  │
│ │ VIP                                │             402 │                         0.046 │          ▇          │  │
│ │ Transported                        │            4378 │                           0.5 │       ▇    ▇        │  │
│ └────────────────────────────────────┴─────────────────┴───────────────────────────────┴─────────────────────┘  │
│                                                     string                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name              ┃ NA       ┃ NA %       ┃ words per row                ┃ total words              ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩  │
│ │ PassengerId              │        0 │          0 │                            1 │                     8693 │  │
│ │ HomePlanet               │      201 │       2.31 │                         0.98 │                     8492 │  │
│ │ Cabin                    │      199 │       2.29 │                         0.98 │                     8494 │  │
│ │ Destination              │      182 │       2.09 │                          1.5 │                    12907 │  │
│ │ Name                     │      200 │        2.3 │                            2 │                    16986 │  │
│ └──────────────────────────┴──────────┴────────────┴──────────────────────────────┴──────────────────────────┘  │
╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯

# Project helper (not defined in this file; presumably provided by the star
# import from assets.utils.functions). Its recorded output lists the distinct
# values per column.
check_distinct_values(train_adj_df)

Distinct values for 'PassengerId': ['0001_01' '0002_01' '0003_01' ... '9279_01' '9280_01' '9280_02']
Distinct values for 'HomePlanet': ['Europa' 'Earth' 'Mars' nan]
Distinct values for 'CryoSleep': [False True nan]
Distinct values for 'Cabin': ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Distinct values for 'Destination': ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Distinct values for 'VIP': [False True nan]
Distinct values for 'Name': ['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Propsh Hontichre']

# Count rows that are exact duplicates of an earlier row
duplicate_row_count = train_adj_df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_row_count}")

Number of duplicate rows: 0

# Project helper (not defined in this file). Its recorded output is a
# per-column count of empty strings or single spaces.
check_blank_or_whitespace(train_adj_df)

Count of empty strings or single spaces per column:
PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

# Print column dtypes, non-null counts and memory usage
train_adj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

# Numeric columns whose spread and outliers are inspected
numerical_columns = ['Age', 'RoomService', 'FoodCourt',
                     'ShoppingMall', 'Spa', 'VRDeck']

# One boxplot per column, laid out on a 2 x 3 grid
plt.figure(figsize=(15, 8))
for position, column in enumerate(numerical_columns, start=1):
    axis = plt.subplot(2, 3, position)
    sns.boxplot(data=train_adj_df, x=column, ax=axis)
    axis.set_title(f"Boxplot of {column}")
plt.tight_layout()
plt.show()

# Amenity spending columns compared across VIP status
spending_columns = ['RoomService', 'FoodCourt',
                    'ShoppingMall', 'Spa', 'VRDeck']

# One VIP-vs-spending boxplot per amenity; the sixth grid slot stays unused
plt.figure(figsize=(14, 10))
for position, column in enumerate(spending_columns, start=1):
    axis = plt.subplot(2, 3, position)
    sns.boxplot(data=train_adj_df, x='VIP', y=column, ax=axis)
    axis.set_title(f"Spending on {column} by VIP Status")
plt.tight_layout()
plt.show()

# Missing-value totals and percentages per column (project helper)
train_null_summary = calculate_null_total_and_percentage(train_adj_df)
test_null_summary = calculate_null_total_and_percentage(test_df)

# Report both summaries, separated by a dashed rule
train_header = "Train columns with total and percentage of null values:\n"
test_header = (
    "Test/Validation columns with total and percentage of null values:\n")
print(train_header, train_null_summary)
print("-" * 50)
print(test_header, test_null_summary)

Train columns with total and percentage of null values:
               Total Missing  Percentage Missing
PassengerId               0            0.000000
HomePlanet              201            2.312205
CryoSleep               217            2.496261
Cabin                   199            2.289198
Destination             182            2.093639
Age                     179            2.059128
VIP                     203            2.335212
RoomService             181            2.082135
FoodCourt               183            2.105142
ShoppingMall            208            2.392730
Spa                     183            2.105142
VRDeck                  188            2.162660
Name                    200            2.300702
Transported               0            0.000000
--------------------------------------------------
Test/Validation columns with total and percentage of null values:
               Total Missing  Percentage Missing
PassengerId               0            0.000000
HomePlanet               87            2.034136
CryoSleep                93            2.174421
Cabin                   100            2.338087
Destination              92            2.151040
Age                      91            2.127660
VIP                      93            2.174421
RoomService              82            1.917232
FoodCourt               106            2.478373
ShoppingMall             98            2.291326
Spa                     101            2.361468
VRDeck                   80            1.870470
Name                     94            2.197802

# Disable flake8 for this cell
# flake8: noqa

for dataset in clean_df_list:
    # Convert 'CryoSleep' and 'VIP' to 'object' for imputation
    dataset['CryoSleep'] = dataset['CryoSleep'].astype('object')
    dataset['VIP'] = dataset['VIP'].astype('object')

    # Mode imputation for categorical columns
    mode_imputer = SimpleImputer(strategy='most_frequent')
    dataset['HomePlanet'] = mode_imputer.fit_transform(
        dataset[['HomePlanet']]).ravel()
    dataset['CryoSleep'] = mode_imputer.fit_transform(
        dataset[['CryoSleep']]).ravel()
    dataset['VIP'] = mode_imputer.fit_transform(
        dataset[['VIP']]).ravel()
    dataset['Destination'] = mode_imputer.fit_transform(
        dataset[['Destination']]).ravel()

    # Convert 'CryoSleep' and 'VIP' back to boolean
    dataset['CryoSleep'] = dataset['CryoSleep'].astype(bool)
    dataset['VIP'] = dataset['VIP'].astype(bool)

    # Median imputation for 'Age'
    median_imputer = SimpleImputer(strategy='median')
    dataset['Age'] = median_imputer.fit_transform(dataset[['Age']]).ravel()

    # Fill missing values with 0 for spending-related columns
    dataset[['RoomService', 'FoodCourt',
             'ShoppingMall', 'Spa', 'VRDeck']] = dataset[[
                 'RoomService', 'FoodCourt',
                 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)

    # Drop unnecessary features for both training and test datasets
    drop_features = ['Cabin', 'Name']
    dataset.drop(drop_features, axis=1, inplace=True)

# Dropping unnecessary features - Train dataset
drop_feat_id = ['PassengerId']
train_adj_df.drop(drop_feat_id, axis=1, inplace=True)

# Print information to verify the process
print(f"Train dataset after imputation and dropping features:\n",
      train_adj_df.isnull().sum())
print("-"*50)
print(f"Test dataset after imputation and dropping features:\n",
      test_df.isnull().sum())

# Re-enable flake8 for the next cells if desired
%flake8_on

Train dataset after imputation and dropping features:
 HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64
--------------------------------------------------
Test dataset after imputation and dropping features:
 PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

# TotalSpending = row-wise sum of the five amenity spending columns
amenity_columns = ['RoomService', 'FoodCourt',
                   'ShoppingMall', 'Spa', 'VRDeck']
for dataset in clean_df_list:
    row_totals = dataset[amenity_columns].sum(axis=1)
    dataset['TotalSpending'] = row_totals

# Show the resulting schema of both frames
print("Train dataset after Feature Engineering:")
train_adj_df.info()
print("-" * 50)

print("Test dataset after Feature Engineering:")
test_df.info()

Train dataset after Feature Engineering:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   HomePlanet     8693 non-null   object 
 1   CryoSleep      8693 non-null   bool   
 2   Destination    8693 non-null   object 
 3   Age            8693 non-null   float64
 4   VIP            8693 non-null   bool   
 5   RoomService    8693 non-null   float64
 6   FoodCourt      8693 non-null   float64
 7   ShoppingMall   8693 non-null   float64
 8   Spa            8693 non-null   float64
 9   VRDeck         8693 non-null   float64
 10  Transported    8693 non-null   bool   
 11  TotalSpending  8693 non-null   float64
dtypes: bool(3), float64(7), object(2)
memory usage: 636.8+ KB
--------------------------------------------------
Test dataset after Feature Engineering:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    4277 non-null   object 
 1   HomePlanet     4277 non-null   object 
 2   CryoSleep      4277 non-null   bool   
 3   Destination    4277 non-null   object 
 4   Age            4277 non-null   float64
 5   VIP            4277 non-null   bool   
 6   RoomService    4277 non-null   float64
 7   FoodCourt      4277 non-null   float64
 8   ShoppingMall   4277 non-null   float64
 9   Spa            4277 non-null   float64
 10  VRDeck         4277 non-null   float64
 11  TotalSpending  4277 non-null   float64
dtypes: bool(2), float64(7), object(3)
memory usage: 342.6+ KB

# Age bands: (-inf, 18], (18, 35], (35, 60], (60, inf)
age_bins = [-np.inf, 18, 35, 60, np.inf]
age_labels = ['Young', 'Adult', 'Middle Aged', 'Senior']

# TotalSpending bands: (-inf, 0], (0, 716], (716, 1441], (1441, inf)
spending_bins = [-np.inf, 0, 716, 1441, np.inf]
spending_labels = ['Non-Spender', 'Low Spender',
                   'Medium Spender', 'High Spender']

# Add both binned columns to every frame in clean_df_list
bin_specs = (
    ('AgeBin', 'Age', age_bins, age_labels),
    ('SpendingBin', 'TotalSpending', spending_bins, spending_labels),
)
for dataset in clean_df_list:
    for binned_col, source_col, edges, names in bin_specs:
        dataset[binned_col] = pd.cut(
            dataset[source_col], bins=edges, labels=names)

# Univariate plots on a copy without the individual amenity columns
# (drop() returns a new frame, so train_adj_df is left unchanged)
amenity_only_columns = ['RoomService', 'FoodCourt',
                        'ShoppingMall', 'Spa', 'VRDeck']
univariate_df_train = train_adj_df.drop(columns=amenity_only_columns)
univariate_analysis(univariate_df_train)

# Feature-vs-target plots on a copy without the raw spending columns
# (drop() returns a new frame, so train_adj_df is left unchanged)
raw_spending_columns = ['RoomService', 'FoodCourt',
                        'ShoppingMall', 'Spa', 'VRDeck',
                        'TotalSpending']
bivariate_df_train = train_adj_df.drop(columns=raw_spending_columns)
features_vs_target(bivariate_df_train)

# Columns summarised with one sidetable frequency table each
columns_to_analyze = ['AgeBin', 'CryoSleep']

# Build each table without cumulative columns, rounded to two decimals
for column in columns_to_analyze:
    print(f"\nFrequency table for {column}:\n")
    freq_table = train_adj_df.stb.freq([column], cum_cols=False)
    display(freq_table.round(2))

Frequency table for AgeBin:

Frequency table for CryoSleep:

# Plot from a copy so the categorical cast never touches train_adj_df
df_age = train_adj_df.copy()
# Casting an already-categorical column changes nothing, so no dtype check
df_age['CryoSleep'] = df_age['CryoSleep'].astype('category')

# Count plot of AgeBin split by Transported, one facet per CryoSleep value
facet_options = dict(kind='count', palette='coolwarm',
                     col_wrap=2, height=5, aspect=1)
g = sns.catplot(data=df_age, x='AgeBin', hue='Transported',
                col='CryoSleep', **facet_options)

# Facet titles, axis labels and legend title
g.set_titles("CryoSleep: {col_name}")
g.set_axis_labels("Age Bin", "Count")
g._legend.set_title("Transported")

# Tilt the x tick labels on every facet so they do not overlap
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

# Columns summarised with one sidetable frequency table each
columns_to_analyze = ['SpendingBin', 'CryoSleep']

# Build each table without cumulative columns, rounded to two decimals
for column in columns_to_analyze:
    print(f"\nFrequency table for {column}:\n")
    freq_table = train_adj_df.stb.freq([column], cum_cols=False)
    display(freq_table.round(2))

Frequency table for SpendingBin:

Frequency table for CryoSleep:

# Create a copy of train_adj_df so the plot-only columns below never leak
# back into the modelling frame
df_spending = train_adj_df.copy(deep=True)

# Ensure CryoSleep is categorical to avoid blank plot issues
df_spending['CryoSleep'] = df_spending['CryoSleep'].astype('category')

# Recompute TotalSpending on the copy so this cell does not depend on the
# feature-engineering cell having been run
plot_amenity_columns = ['RoomService', 'FoodCourt',
                        'ShoppingMall', 'Spa', 'VRDeck']
df_spending['TotalSpending'] = df_spending[plot_amenity_columns].sum(axis=1)

# Plot-only spending bands. They use their own names so the notebook-level
# spending_bins / spending_labels (used for the SpendingBin feature) are not
# overwritten. The last edge is open-ended instead of the data maximum, which
# keeps the edges strictly increasing even if no passenger spends over 5000.
plot_spending_bins = [-1, 0, 500, 2000, 5000, float('inf')]
plot_spending_labels = ['Non-Spender', 'Low Spender',
                        'Medium Spender', 'High Spender',
                        'Very High Spender']
df_spending['SpendingBin'] = pd.cut(df_spending['TotalSpending'],
                                    bins=plot_spending_bins,
                                    labels=plot_spending_labels)

# Count plot of SpendingBin split by Transported, one facet per CryoSleep
g = sns.catplot(
    data=df_spending, x='SpendingBin',
    hue='Transported',
    col='CryoSleep',
    kind='count',
    palette='coolwarm', col_wrap=2, height=5, aspect=1
)

# Set titles, labels, and legend
g.set_titles("CryoSleep: {col_name}")
g.set_axis_labels("Spending Bin", "Count")
g._legend.set_title("Transported")

# Rotate x-axis labels for clarity
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

# Columns summarised with one sidetable frequency table each
columns_to_analyze = ['Destination', 'HomePlanet']

# Build each table without cumulative columns, rounded to two decimals
for column in columns_to_analyze:
    print(f"\nFrequency table for {column}:\n")
    freq_table = train_adj_df.stb.freq([column], cum_cols=False)
    display(freq_table.round(2))

Frequency table for Destination:

Frequency table for HomePlanet:

# Count passengers per (Destination, Transported) pair; combinations that
# never occur are filled with 0 directly while unstacking
destination_data = train_adj_df.groupby(
    ['Destination', 'Transported']).size().unstack(fill_value=0)
destination_data.columns = ['Not Transported', 'Transported']

# One pie chart per destination. squeeze=False makes plt.subplots return an
# array of axes even when there is only a single destination, so the loop
# below works for any number of groups.
fig, axes = plt.subplots(
    1, len(destination_data), figsize=(18, 5), squeeze=False)
axes = axes.ravel()
fig.suptitle('Transported Status by Destination', fontsize=16)

for ax, (destination, counts) in zip(axes, destination_data.iterrows()):
    ax.pie(counts, labels=['Not Transported', 'Transported'],
           autopct='%1.1f%%',
           startangle=90,
           colors=['lightcoral', 'skyblue'])
    ax.set_title(destination)

plt.show()

# Count passengers per (HomePlanet, Transported) pair; combinations that
# never occur are filled with 0 directly while unstacking
homeplanet_data = train_adj_df.groupby(
    ['HomePlanet', 'Transported']).size().unstack(fill_value=0)
homeplanet_data.columns = ['Not Transported', 'Transported']

# One pie chart per home planet. squeeze=False makes plt.subplots return an
# array of axes even when there is only a single planet, so the loop below
# works for any number of groups.
fig, axes = plt.subplots(
    1, len(homeplanet_data), figsize=(18, 5), squeeze=False)
axes = axes.ravel()
fig.suptitle('Transported Status by HomePlanet', fontsize=16)

for ax, (homeplanet, counts) in zip(axes, homeplanet_data.iterrows()):
    ax.pie(counts, labels=['Not Transported', 'Transported'],
           autopct='%1.1f%%',
           startangle=90,
           colors=['lightcoral', 'skyblue'])
    ax.set_title(homeplanet)

plt.show()

# Encoded label column -> source column it is derived from
label_sources = {
    'HomeLabel': 'HomePlanet',
    'CryoLabel': 'CryoSleep',
    'DestLabel': 'Destination',
    'VIPLabel': 'VIP',
    'AgeLabel': 'AgeBin',
    'SpendingLabel': 'SpendingBin',
}

label = LabelEncoder()
for label_col, source_col in label_sources.items():
    # Learn the value -> integer mapping once, from the training frame, and
    # apply that same mapping to every frame. Refitting per dataset would let
    # the same category receive different codes in train and test whenever
    # the two frames do not contain exactly the same set of values.
    # A test value never seen in training raises ValueError here instead of
    # being silently assigned an unrelated code.
    label.fit(train_adj_df[source_col])
    for dataset in clean_df_list:
        dataset[label_col] = label.transform(dataset[source_col])

# As before, the bin columns themselves end up holding their integer codes.
# NOTE(review): LabelEncoder orders classes by sorted value, so the bin codes
# follow alphabetical order of the bin names, not the order of the bins.
for dataset in clean_df_list:
    dataset['AgeBin'] = dataset['AgeLabel']
    dataset['SpendingBin'] = dataset['SpendingLabel']

# Correlate the raw features only: leave out every label/bin-encoded column
encoded_helper_columns = ['HomeLabel', 'CryoLabel', 'DestLabel',
                          'VIPLabel', 'AgeLabel', 'SpendingLabel',
                          'AgeBin', 'SpendingBin']
correlation_features = train_adj_df.drop(columns=encoded_helper_columns)

# Dummy-encode the categorical features; the first level of each is dropped
# to reduce multicollinearity
dummy_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
correlation_features = pd.get_dummies(
    correlation_features, columns=dummy_columns, drop_first=True)

# The boolean target is cast to 0/1 for the correlation calculation
correlation_features['Transported'] = (
    train_adj_df['Transported'].astype(int))

# Pairwise correlations, expressed in percent
correlation_matrix = correlation_features.corr()
correlation_matrix_percent = correlation_matrix.mul(100)

# Hide the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones(correlation_matrix_percent.shape, dtype=bool))

# Annotated heatmap of the remaining lower triangle
plt.figure(figsize=(12, 8))
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', center=0,
                       linewidths=0.5, cbar_kws={"shrink": .8})
sns.heatmap(correlation_matrix_percent, mask=mask, **heatmap_options)
plt.title("Correlation Matrix with Target")
plt.show()

# Feature frame for the VIF calculation: the target, the duplicate *Label
# columns and TotalSpending are left out
vif_data = train_adj_df.drop(
    columns=['Transported', 'HomeLabel', 'CryoLabel',
             'DestLabel', 'VIPLabel', 'AgeLabel',
             'SpendingLabel', 'TotalSpending'])

# All categorical columns are encoded to numeric for VIF calculation
label_enc = LabelEncoder()
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    vif_data[col] = label_enc.fit_transform(vif_data[col])

# variance_inflation_factor regresses each column on all the others exactly
# as given, so without an intercept column the auxiliary regressions are
# forced through the origin and features with a non-zero mean get inflated
# values. Prepend a constant column and skip it when reporting.
# has_constant='add' guarantees the constant sits at position 0 even if a
# feature happens to be constant itself.
design_matrix = sms.add_constant(
    vif_data.astype(float), has_constant='add').values

# Calculate VIF for each feature (design-matrix index 0 is the constant)
vif_df = pd.DataFrame()
vif_df["Feature"] = vif_data.columns
vif_df["VIF"] = [variance_inflation_factor(design_matrix, i)
                 for i in range(1, design_matrix.shape[1])]

# Display the VIF results
print("VIF values:")
print(vif_df)

VIF values:
         Feature       VIF
0     HomePlanet  1.897416
1      CryoSleep  3.210495
2    Destination  3.721231
3            Age  3.855633
4            VIP  1.072493
5    RoomService  1.303056
6      FoodCourt  1.276631
7   ShoppingMall  1.156679
8            Spa  1.209944
9         VRDeck  1.199526
10        AgeBin  1.779625
11   SpendingBin  6.785149

# Continuous features go through the project's t-test helper
continuous_features = ['Age', 'RoomService', 'FoodCourt',
                       'ShoppingMall', 'Spa', 'VRDeck',
                       'TotalSpending']

# Categorical features go through the project's chi-square helper
categorical_features = ['HomePlanet', 'CryoSleep',
                        'Destination', 'VIP']

# Run all t-tests first, then all chi-square tests, one feature at a time
test_plan = (
    (perform_ttests, continuous_features),
    (perform_chi2_test, categorical_features),
)
for run_test, features in test_plan:
    for feature in features:
        run_test(train_adj_df, feature)

Feature: Age
T-Statistic: -6.9446, P-Value: 0.000000000004072
95% Confidence Interval for the Difference in Means: (-2.730122829019935, -1.5281421607344878)
Reject the null hypothesis: Significant difference in Age between transported and not transported groups.

Feature: RoomService
T-Statistic: -23.0249, P-Value: 0.000000000000000
95% Confidence Interval for the Difference in Means: (-345.64370808192706, -291.4032500840212)
Reject the null hypothesis: Significant difference in RoomService between transported and not transported groups.

Feature: FoodCourt
T-Statistic: 4.2676, P-Value: 0.000020007607209
95% Confidence Interval for the Difference in Means: (78.65411189189258, 212.3031489726323)
Reject the null hypothesis: Significant difference in FoodCourt between transported and not transported groups.

Feature: ShoppingMall
T-Statistic: 0.8782, P-Value: 0.379845889059888
95% Confidence Interval for the Difference in Means: (-13.838023038900825, 36.300928422238044)
Fail to reject the null hypothesis: No significant difference in ShoppingMall between transported and not transported groups.

Feature: Spa
T-Statistic: -20.7362, P-Value: 0.000000000000000
95% Confidence Interval for the Difference in Means: (-538.4673826416797, -445.4444570794161)
Reject the null hypothesis: Significant difference in Spa between transported and not transported groups.

Feature: VRDeck
T-Statistic: -19.3819, P-Value: 0.000000000000000
95% Confidence Interval for the Difference in Means: (-511.6943456312086, -417.6872614455743)
Reject the null hypothesis: Significant difference in VRDeck between transported and not transported groups.

Feature: TotalSpending
T-Statistic: -18.9400, P-Value: 0.000000000000000
95% Confidence Interval for the Difference in Means: (-1234.2193522611149, -1002.7008864548501)
Reject the null hypothesis: Significant difference in TotalSpending between transported and not transported groups.

Feature: HomePlanet
Chi-Square Statistic: 318.9344, P-Value: 0.000000000000000
Reject the null hypothesis: Significant association between HomePlanet and transported status.

Feature: CryoSleep
Chi-Square Statistic: 1838.5677, P-Value: 0.000000000000000
Reject the null hypothesis: Significant association between CryoSleep and transported status.

Feature: Destination
Chi-Square Statistic: 105.5630, P-Value: 0.000000000000000
Reject the null hypothesis: Significant association between Destination and transported status.

Feature: VIP
Chi-Square Statistic: 11.5760, P-Value: 0.000668102360015
Reject the null hypothesis: Significant association between VIP and transported status.

# Target column and the feature groups that the preprocessor consumes
target = 'Transported'
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
numerical_columns = ['Age', 'RoomService', 'FoodCourt',
                     'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']

# Separate features from the target
y = train_adj_df[target]
X = train_adj_df.drop(columns=[target])

# Stratified 80/20 split, done before any scaling/encoding is fitted
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Scale the numeric columns and one-hot encode the categorical ones
# (first level dropped); columns not listed are not passed through
numeric_step = ('num', StandardScaler(), numerical_columns)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_columns)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step])

# Fit on the training split only, then apply to the validation split
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# Report the resulting shapes
shape_report = (
    ("X_train_processed", X_train_processed),
    ("X_val_processed", X_val_processed),
    ("y_train", y_train),
    ("y_val", y_val),
)
for array_name, array in shape_report:
    print(f"{array_name} shape:", array.shape)

X_train_processed shape: (6954, 13)
X_val_processed shape: (1739, 13)
y_train shape: (6954,)
y_val shape: (1739,)

# Baseline: a default random forest with a fixed seed, fitted on the
# processed training split (fit() returns the estimator itself)
model = RandomForestClassifier(random_state=42).fit(
    X_train_processed, y_train)

# Predict the validation split and score it
y_pred = model.predict(X_val_processed)
accuracy = accuracy_score(y_val, y_pred)
print(f"Baseline Model (RandomForest) Accuracy: {accuracy:.4f}")

Baseline Model (RandomForest) Accuracy: 0.7775

# Confusion matrix of the baseline predictions on the validation split
val_confusion = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:\n", val_confusion)

# Per-class precision, recall and F1
val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)

# F1 from the hard predictions; ROC AUC from the second probability column
val_proba = model.predict_proba(X_val_processed)[:, 1]
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, val_proba)

print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Confusion Matrix:
 [[673 190]
 [197 679]]

Classification Report:
               precision    recall  f1-score   support

       False       0.77      0.78      0.78       863
        True       0.78      0.78      0.78       876

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739

F1 Score: 0.7782
ROC AUC Score: 0.8483

# Candidate classifiers for the side-by-side comparison.
# Every estimator that accepts random_state is seeded with 42 (the same seed
# the Random Forest baseline uses) so that repeated runs produce the same
# ranking. KNeighborsClassifier takes no random_state and is left as is.
MLA = [
    # Ensemble Methods
    ensemble.GradientBoostingClassifier(random_state=42),
    ensemble.RandomForestClassifier(random_state=42),
    ensemble.AdaBoostClassifier(algorithm="SAMME", random_state=42),
    ensemble.BaggingClassifier(random_state=42),

    # Generalized Linear Models (GLM)
    linear_model.LogisticRegressionCV(max_iter=500, random_state=42),

    # Nearest Neighbors
    neighbors.KNeighborsClassifier(),

    # Support Vector Machines (SVM); probability=True enables predict_proba
    svm.SVC(probability=True, random_state=42),

    # Tree-based models
    tree.DecisionTreeClassifier(random_state=42),
    tree.ExtraTreeClassifier(random_state=42),

    # Gradient Boosting Models
    XGBClassifier(random_state=42),
    LGBMClassifier(verbosity=-1, random_state=42),
    CatBoostClassifier(verbose=0, random_state=42),
    ensemble.HistGradientBoostingClassifier(random_state=42),

    # Tree-based models for imbalanced datasets
    BalancedRandomForestClassifier(sampling_strategy='all',
                                   replacement=True,
                                   bootstrap=False,
                                   random_state=42),
    EasyEnsembleClassifier(sampling_strategy='all', replacement=True,
                           random_state=42),
]

# Print the class name of every candidate so the list can be checked by eye.
# This loop is read-only: AdaBoost's algorithm="SAMME" is already passed
# where MLA is built, so re-setting it here was redundant.
print("Machine Learning Algorithms for Comparison (MLA):")
for candidate in MLA:
    print(type(candidate).__name__)

Machine Learning Algorithms for Comparison (MLA):
GradientBoostingClassifier
RandomForestClassifier
AdaBoostClassifier
BaggingClassifier
LogisticRegressionCV
KNeighborsClassifier
SVC
DecisionTreeClassifier
ExtraTreeClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier
HistGradientBoostingClassifier
BalancedRandomForestClassifier
EasyEnsembleClassifier

# One result row (dict) per candidate model
results_list = []

for model in MLA:
    model_name = model.__class__.__name__

    # Mean 5-fold cross-validated accuracy on the training split.
    # The column keeps its historical name 'Train Acc Mean'.
    train_accuracy_scores = cross_val_score(model, X_train_processed,
                                            y_train, cv=5,
                                            scoring='accuracy')
    train_accuracy_mean = train_accuracy_scores.mean()

    # Fit on the whole training split, then predict the validation split
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_val_processed)

    # ROC AUC needs a continuous score: prefer the positive-class
    # probability, fall back to decision_function, otherwise no score.
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_val_processed)[:, 1]
    elif hasattr(model, "decision_function"):
        y_pred_proba = model.decision_function(X_val_processed)
    else:
        y_pred_proba = None

    # Validation metrics. A missing AUC is recorded as NaN (not the string
    # "N/A") so the 'ROC AUC' column stays numeric.
    test_accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = (roc_auc_score(y_val, y_pred_proba)
               if y_pred_proba is not None else np.nan)

    results_list.append({
        'Model': model_name,
        'Train Acc Mean': train_accuracy_mean,
        'Test Acc': test_accuracy,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Tabulate the rows and rank the models by validation F1, best first
results = pd.DataFrame(results_list)
results = results.sort_values(by='F1 Score', ascending=False)
print("Model Comparison Results:")
print(results.to_string())

Model Comparison Results:
                             Model  Train Acc Mean  Test Acc  F1 Score   ROC AUC
11              CatBoostClassifier        0.798822  0.790109  0.798008  0.864953
0       GradientBoostingClassifier        0.795084  0.782634  0.795676  0.865144
10                  LGBMClassifier        0.796809  0.787234  0.793987  0.860660
12  HistGradientBoostingClassifier        0.795514  0.784359  0.793388  0.861215
4             LogisticRegressionCV        0.787460  0.786659  0.791221  0.860331
6                              SVC        0.792494  0.779183  0.785475  0.852502
13  BalancedRandomForestClassifier        0.786742  0.780334  0.781714  0.847410
1           RandomForestClassifier        0.785017  0.778608  0.780627  0.847465
9                    XGBClassifier        0.785736  0.775733  0.780158  0.854954
5             KNeighborsClassifier        0.770061  0.769983  0.771429  0.835128
3                BaggingClassifier        0.776818  0.764807  0.768272  0.829032
2               AdaBoostClassifier        0.772936  0.766532  0.764774  0.838083
14          EasyEnsembleClassifier        0.772074  0.760782  0.757858  0.837873
7           DecisionTreeClassifier        0.735404  0.741806  0.753973  0.754224
8              ExtraTreeClassifier        0.727783  0.718804  0.730875  0.724918

# Build the comparison figure from the sorted results table.
# plot_model_comparison is not defined in this section; presumably it comes
# from the star import of assets.utils.functions (verify).
fig = plot_model_comparison(results)

# Display the plot
fig.show()

# Disable flake8 for this cell
# flake8: noqa

# Candidate values sampled by the randomized search, one grid per model
param_dist_gb = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

param_dist_cb = {
    'iterations': [100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0.5, 1.0],
}

param_dist_lgbm = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [3, 5, -1],  # -1 for no limit in LightGBM
    'num_leaves': [20, 31, 63],
    'min_child_samples': [5, 10, 20],
}

# Number of parameter settings to sample
n_iter_search = 20


def _random_search(estimator, param_distributions):
    """Run the shared randomized search for one estimator.

    Samples ``n_iter_search`` settings from ``param_distributions``, scores
    each with 5-fold cross-validated accuracy on the preprocessed training
    split, and returns the fitted ``RandomizedSearchCV`` object.
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=n_iter_search,
        cv=5,
        scoring='accuracy',
        random_state=0)
    return search.fit(X_train_processed, y_train)


# Gradient Boosting
print("Tuning Gradient Boosting Classifier...")
gb = GradientBoostingClassifier()
random_search_gb = _random_search(gb, param_dist_gb)
print("Best Gradient Boosting parameters:", random_search_gb.best_params_)
print("Best Gradient Boosting accuracy:", random_search_gb.best_score_)

# CatBoost (verbose=0 keeps its training log quiet)
print("\nTuning CatBoost Classifier...")
cb = CatBoostClassifier(verbose=0)
random_search_cb = _random_search(cb, param_dist_cb)
print("Best CatBoost parameters:", random_search_cb.best_params_)
print("Best CatBoost accuracy:", random_search_cb.best_score_)

# LightGBM (verbosity=-1 keeps its training log quiet)
print("\nTuning LightGBM Classifier...")
lgbm = LGBMClassifier(verbosity=-1)
random_search_lgbm = _random_search(lgbm, param_dist_lgbm)
print("Best LightGBM parameters:", random_search_lgbm.best_params_)
print("Best LightGBM accuracy:", random_search_lgbm.best_score_)

Tuning Gradient Boosting Classifier...
Best Gradient Boosting parameters: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 5, 'learning_rate': 0.05}
Best Gradient Boosting accuracy: 0.7983913027737407

Tuning CatBoost Classifier...
Best CatBoost parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 150, 'depth': 8, 'bagging_temperature': 0.5}
Best CatBoost accuracy: 0.8009794723531025

Tuning LightGBM Classifier...
Best LightGBM parameters: {'num_leaves': 63, 'n_estimators': 250, 'min_child_samples': 20, 'max_depth': -1, 'learning_rate': 0.01}
Best LightGBM accuracy: 0.7986773140797211

# Wrap the preprocessed feature matrices in DataFrames so they can be stacked
X_train_df = pd.DataFrame(X_train_processed)
X_val_df = pd.DataFrame(X_val_processed)

# Stack training rows followed by validation rows. ignore_index=True gives
# the result a fresh 0..n-1 index, so the inputs need no index reset first.
X_final_train = pd.concat([X_train_df, X_val_df], ignore_index=True)
y_final_train = pd.concat([y_train, y_val], ignore_index=True)

# Rebuild each model with the best settings found by its randomized search
gb_best = GradientBoostingClassifier(**random_search_gb.best_params_)
cb_best = CatBoostClassifier(verbose=0, **random_search_cb.best_params_)
lgbm_best = LGBMClassifier(**random_search_lgbm.best_params_)

# Fit the tuned models, in order, on the combined train + validation data
for training_message, tuned_model in (
    ("Training Gradient Boosting Classifier with best parameters...",
     gb_best),
    ("Training CatBoost Classifier with best parameters...",
     cb_best),
    ("Training LightGBM Classifier with best parameters...",
     lgbm_best),
):
    print(training_message)
    tuned_model.fit(X_final_train, y_final_train)

Training Gradient Boosting Classifier with best parameters...
Training CatBoost Classifier with best parameters...
Training LightGBM Classifier with best parameters...

LGBMClassifier(learning_rate=0.01, n_estimators=250, num_leaves=63)

LGBMClassifier(learning_rate=0.01, n_estimators=250, num_leaves=63)

# Suppress output during model operations
@contextmanager
def suppress_output():
    """Temporarily redirect ``sys.stdout`` and ``sys.stderr`` to ``os.devnull``.

    Everything written to either stream inside the ``with`` block is
    discarded. The original stream objects are restored on exit, including
    when the body raises, and the devnull handle is always closed.
    """
    original_stdout = sys.stdout
    original_stderr = sys.stderr
    # A single handle serves both streams; the with-statement guarantees it
    # is closed, so no file descriptor is leaked for stderr.
    with open(os.devnull, 'w') as devnull:
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            # Restore before the handle is closed so nothing can write to a
            # closed file through sys.stdout / sys.stderr.
            sys.stdout = original_stdout
            sys.stderr = original_stderr


# Silence LightGBM's training log. The bare `gb_best.set_params` and
# `cb_best.set_params` expressions that used to follow only looked the
# method up without calling it, so they had no effect and are dropped.
lgbm_best.set_params(verbosity=-1)

# Soft-voting ensemble over the three tuned models
ensemble_df = VotingClassifier(
    estimators=[('gb', gb_best),
                ('cb', cb_best),
                ('lgbm', lgbm_best)],
    voting='soft'
)

# Compute the learning curve with console output suppressed: 5-fold
# cross-validated F1 at five training-set sizes from 10% to 100%.
# learning_curve clones the estimator for every fit, so ensemble_df does not
# have to be fitted first. The separate full fit that used to precede this
# call was repeated immediately afterwards and has been removed.
with suppress_output():
    train_sizes, train_scores, val_scores = learning_curve(
        ensemble_df, X_final_train, y_final_train,
        cv=5, scoring='f1', n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5)
    )

# Fit the ensemble on the training split ONLY before scoring it on the
# validation split. X_final_train also contains the validation rows, so
# fitting on it here would leak them into the model and inflate every
# validation metric reported below.
with suppress_output():
    ensemble_df.fit(X_train_processed, y_train)

# Hard predictions and positive-class probabilities for the validation set
y_pred_ensemble_val = ensemble_df.predict(
    X_val_processed)
y_pred_proba_ensemble_val = ensemble_df.predict_proba(
    X_val_processed)[:, 1]

# Print ensemble performance on the validation set
print("\nEnsemble Model Performance on Validation Set:")
print("Accuracy on Validation Set:", accuracy_score(
    y_val, y_pred_ensemble_val))
print("F1 Score on Validation Set:", f1_score(
    y_val, y_pred_ensemble_val))
print("ROC AUC Score on Validation Set:", roc_auc_score(
    y_val, y_pred_proba_ensemble_val))
print("Classification Report (Validation Set):\n", classification_report(
    y_val, y_pred_ensemble_val))

# Confusion matrix of the ensemble's validation predictions
conf_matrix = confusion_matrix(y_val, y_pred_ensemble_val)

# Draw it as an annotated heatmap; the same tick labels are used on both axes
val_tick_labels = ['False', 'True']
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=val_tick_labels,
    yticklabels=val_tick_labels,
)
plt.title("Confusion Matrix (Validation Set)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Ensemble Model Performance on Validation Set:
Accuracy on Validation Set: 0.8125359401955147
F1 Score on Validation Set: 0.8206820682068208
ROC AUC Score on Validation Set: 0.902338396905771
Classification Report (Validation Set):
               precision    recall  f1-score   support

       False       0.84      0.77      0.80       863
        True       0.79      0.85      0.82       876

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739

# Score the fitted ensemble on the training split: hard predictions plus
# positive-class probabilities (second predict_proba column)
y_pred_train = ensemble_df.predict(X_train_processed)
y_pred_proba_train = ensemble_df.predict_proba(X_train_processed)[:, 1]

# Headline metrics on the training set
train_accuracy_value = accuracy_score(y_train, y_pred_train)
train_f1_value = f1_score(y_train, y_pred_train)
train_auc_value = roc_auc_score(y_train, y_pred_proba_train)

print("Ensemble Model Performance on Training Set:")
print("Training Accuracy:", train_accuracy_value)
print("Training F1 Score:", train_f1_value)
print("Training ROC AUC Score:", train_auc_value)

# Per-class precision / recall / F1 on the training set
train_report = classification_report(y_train, y_pred_train)
print("\nClassification Report (Training Set):")
print(train_report)

# Confusion matrix of the ensemble's training-set predictions
conf_matrix_train = confusion_matrix(y_train, y_pred_train)

# Draw it as an annotated heatmap; the same tick labels are used on both axes
train_tick_labels = ['False', 'True']
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix_train,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=train_tick_labels,
    yticklabels=train_tick_labels,
)
plt.title("Confusion Matrix (Training Set)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Ensemble Model Performance on Training Set:
Training Accuracy: 0.8275812482024734
Training F1 Score: 0.833680122069635
Training ROC AUC Score: 0.9179100934212069

Classification Report (Training Set):
              precision    recall  f1-score   support

       False       0.85      0.80      0.82      3452
        True       0.81      0.86      0.83      3502

    accuracy                           0.83      6954
   macro avg       0.83      0.83      0.83      6954
weighted avg       0.83      0.83      0.83      6954

# Rebuild the soft-voting ensemble as a fresh, unfitted estimator
voting_members = [('gb', gb_best), ('cb', cb_best), ('lgbm', lgbm_best)]
ensemble_df = VotingClassifier(estimators=voting_members, voting='soft')

# 5-fold cross-validated F1 on the combined train + validation data,
# using all available cores
cv_scores = cross_val_score(
    ensemble_df,
    X_final_train,
    y_final_train,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Per-fold scores followed by their mean and standard deviation
print("Cross-Validation F1 Scores:", cv_scores)
print("Mean Cross-Validation F1 Score:", cv_scores.mean())
print("Standard Deviation of Cross-Validation F1 Score:", cv_scores.std())

Cross-Validation F1 Scores: [0.80289532 0.80415755 0.8077135  0.8087493  0.79625551]
Mean Cross-Validation F1 Score: 0.8039542352677802
Standard Deviation of Cross-Validation F1 Score: 0.0044171880512989615

import h2o
from h2o.automl import H2OAutoML
from h2o.frame import H2OFrame
import pandas as pd

# Turn off H2O progress output, then connect to (or start) a local H2O server
h2o.no_progress()
h2o.init()

# Refit the preprocessor on X_train and read the names of all generated
# output columns from it.
# NOTE(review): X_train_processed / X_val_processed were produced earlier;
# this assumes refitting on the same X_train yields the same column layout.
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()

# Convert preprocessed data to DataFrames with the correct feature names
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_val_df = pd.DataFrame(X_val_processed, columns=feature_names)

# Convert the processed DataFrames to H2O Frames; the target goes into its
# own single-column frame named "Transported"
X_train_h2o = H2OFrame(X_train_df)
y_train_h2o = H2OFrame(y_train.to_frame(name="Transported"))
X_val_h2o = H2OFrame(X_val_df)

# Combine features and target column-wise into one training frame for H2O
train_h2o = X_train_h2o.cbind(y_train_h2o)

# Define target and feature columns: x is every column except the target
target = 'Transported'
x = train_h2o.columns
x.remove(target)

# Configure AutoML: at most 10 models or 600 seconds, class balancing on,
# AUC as the stopping metric, fixed seed
aml = H2OAutoML(max_runtime_secs=600,
                max_models=10,
                balance_classes=True,
                stopping_metric="AUC",
                seed=42)

# Run AutoML on the training frame (no separate validation frame is passed)
aml.train(x=x, y=target, training_frame=train_h2o)

# Predict the validation features with the leading model and pull both the
# predictions and the leaderboard back into pandas (multi-threaded
# conversion). pred_val_df is not used again in this section.
pred_val_h2o = aml.leader.predict(X_val_h2o)
pred_val_df = pred_val_h2o.as_data_frame(use_multi_thread=True)
leaderboard_df = aml.leaderboard.as_data_frame(use_multi_thread=True)

print("\nLeaderboard:")
print(leaderboard_df.to_string())

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment Homebrew (build 11.0.25+0); OpenJDK 64-Bit Server VM Homebrew (build 11.0.25+0, mixed mode)
  Starting server from /Users/ctw01365/Documents/fabiano/courses/turing/fchapu-DS.v2.5.3.3.5/.dsvenv335/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/4n/8ybtyqy963bfcv7kjsjvg8qwkn3sgb/T/tmpz5d5fetf
  JVM stdout: /var/folders/4n/8ybtyqy963bfcv7kjsjvg8qwkn3sgb/T/tmpz5d5fetf/h2o_ctw01365_started_from_python.out
  JVM stderr: /var/folders/4n/8ybtyqy963bfcv7kjsjvg8qwkn3sgb/T/tmpz5d5fetf/h2o_ctw01365_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.

17:04:29.906: AutoML: XGBoost is not available; skipping it.


Leaderboard:
                                                   model_id       auc   logloss     aucpr  mean_per_class_error      rmse       mse
0      StackedEnsemble_AllModels_1_AutoML_1_20241110_170429  0.882434  0.425683  0.891425              0.202900  0.372636  0.138858
1   StackedEnsemble_BestOfFamily_1_AutoML_1_20241110_170429  0.882358  0.425922  0.891336              0.203953  0.372814  0.138991
2                            GBM_5_AutoML_1_20241110_170429  0.880660  0.436042  0.890327              0.207752  0.375080  0.140685
3                            GBM_2_AutoML_1_20241110_170429  0.879736  0.441515  0.888426              0.204329  0.376459  0.141721
4                            GBM_3_AutoML_1_20241110_170429  0.878689  0.436132  0.888385              0.204778  0.375829  0.141247
5               GBM_grid_1_AutoML_1_20241110_170429_model_1  0.878174  0.440057  0.889226              0.210661  0.376555  0.141793
6                            GBM_1_AutoML_1_20241110_170429  0.877494  0.439596  0.886572              0.209109  0.376602  0.141829
7                            GBM_4_AutoML_1_20241110_170429  0.875633  0.440228  0.885317              0.208075  0.378047  0.142920
8                   DeepLearning_1_AutoML_1_20241110_170429  0.873856  0.445169  0.880417              0.204005  0.381588  0.145610
9                            GLM_1_AutoML_1_20241110_170429  0.872114  0.444958  0.878515              0.208532  0.380792  0.145002
10                           XRT_1_AutoML_1_20241110_170429  0.870377  0.468953  0.878898              0.216933  0.388502  0.150934
11                           DRF_1_AutoML_1_20241110_170429  0.869122  0.482578  0.879354              0.207884  0.383078  0.146749

# Average the per-fold scores at each training-set size
# (only the means are plotted; no spread is computed)
train_scores_mean = train_scores.mean(axis=1)
val_scores_mean = val_scores.mean(axis=1)

# Plot mean training and validation F1 against the training-set size
plt.figure()
for curve_values, curve_color, curve_label in (
    (train_scores_mean, "r", "Training score"),
    (val_scores_mean, "g", "Validation score"),
):
    plt.plot(train_sizes, curve_values, 'o-',
             color=curve_color, label=curve_label)
plt.title("Learning Curves for Ensemble Model")
plt.xlabel("Training Set Size")
plt.ylabel("F1 Score")
plt.legend(loc="best")
plt.show()

import lime
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import matplotlib.pyplot as plt

# Names of the columns produced by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

# (Re)fit the ensemble on the combined train + validation data.
# This fit is unconditional: it runs whether or not the ensemble was
# fitted before.
ensemble_df.fit(X_final_train, y_final_train)

# LIME explainer over the preprocessed training data, with continuous
# features discretized and human-readable class names
lime_class_names = ['Not Transported', 'Transported']
explainer = LimeTabularExplainer(
    X_train_processed,
    mode='classification',
    feature_names=feature_names,
    class_names=lime_class_names,
    discretize_continuous=True
)

# Explain one validation row (index i) using the 10 most influential features
i = 0
row_to_explain = X_val_processed[i]
exp = explainer.explain_instance(
    row_to_explain,
    ensemble_df.predict_proba,
    num_features=10
)

# Render the explanation as a matplotlib figure on a white background
fig = exp.as_pyplot_figure()
fig.patch.set_facecolor('white')
plt.show()

# Positive-class probabilities for the validation set.
# NOTE(review): ensemble_df was last fitted on X_final_train, which includes
# the validation rows, so this curve is likely optimistic - confirm whether
# a train-only fit is wanted here.
y_pred_proba_val = ensemble_df.predict_proba(X_val_processed)[:, 1]

# Area under the curve, then the curve's operating points
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba_val)

# Plot the ROC curve against the dashed chance diagonal
roc_label = f"Ensemble Model (AUC = {roc_auc:.2f})"
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=roc_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title("ROC Curve for Ensemble Model")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Disable flake8 for this cell
# flake8: noqa

# Transform the raw test data with the already-fitted preprocessor
X_test_processed = preprocessor.transform(test_df)

# Class predictions and positive-class probabilities for the test set
y_pred_ensemble_test = ensemble_df.predict(X_test_processed)
y_pred_proba_ensemble_test = ensemble_df.predict_proba(X_test_processed)[:, 1]

# Two-column submission table: passenger id plus the prediction cast to bool
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_ensemble_test,
}).assign(Transported=lambda frame: frame['Transported'].astype(bool))

# Write the CSV without the index column
submission_df.to_csv('submission_f05.csv', index=False)

# Confirm the file name and show the first rows
submission_notice = ("Submission file 'submission_f05.csv' "
                     "created with the following structure:")
print(submission_notice)
print(submission_df.head())

Submission file 'submission_f05.csv' created with the following structure:
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True

Column	Description
PassengerId	A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
HomePlanet	The planet the passenger departed from, typically their planet of permanent residence.
CryoSleep	Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
Cabin	The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
Destination	The planet the passenger will be debarking to.
Age	The age of the passenger.
VIP	Whether the passenger has paid for special VIP service during the voyage.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck	Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
Name	The first and last names of the passenger.
Transported	Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

Dataset	Description
train.csv	Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
test.csv	Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. The task is to predict the value of Transported for the passengers in this set.
sample_submission.csv	A sample submission file in the correct format.

H2O_cluster_uptime:	01 secs
H2O_cluster_timezone:	Europe/Lisbon
H2O_data_parsing_timezone:	UTC
H2O_cluster_version:	3.46.0.6
H2O_cluster_version_age:	8 days
H2O_cluster_name:	H2O_from_python_ctw01365_ar8wjm
H2O_cluster_total_nodes:	1
H2O_cluster_free_memory:	9 Gb
H2O_cluster_total_cores:	12
H2O_cluster_allowed_cores:	12
H2O_cluster_status:	locked, healthy
H2O_connection_url:	http://127.0.0.1:54321
H2O_connection_proxy:	{"http": null, "https": null}
H2O_internal_security:	False
Python_version:	3.10.0 final

Table of Contents¶

Introduction¶

Dataset Structure¶

Notebook settings¶

Data Loading, Cleaning and Initial Exploration¶

Outliers Analysis¶

Conclusion for Outliers Analysis¶

Conclusion for Outliers Analysis and VIP Feature¶

Data Preparation¶

Strategy for missing data¶

Insights and Approach definition¶

Feature Engineering - TotalSpending¶

Binning Features: Age and TotalSpending¶

EDA¶

Conclusion for Univariate Analysis after data imputation¶

Bivariate Analysis¶

Conclusion over Bivariate Analysis¶

Target Variable vs. Age and CryoSleep¶

Correlation Matrix Analysis¶

Multicollinearity Analysis¶

Statistical Inference and Hypothesis Testing¶

Data Preprocessing¶

Baseline Model¶

Data Modeling¶

Hyperparameter Tuning¶

Final Model Training with Optimized Parameters¶

Ensemble Model Approach with Optimized Classifiers¶

Compare Train and Validation Performance¶

Validation Set Evaluation¶

Training Set Evaluation¶

Summary for Ensemble Model evaluation¶

Cross-Validation Performance¶

Conclusion for Cross-Validation Results:¶

Practicing AutoML: H2O.ai¶

Learning Curves¶

LIME Interpretation¶

AUC-ROC Curves for the Ensemble Model¶

Generating Predictions for Test Set and Preparing Submission File¶

Final Conclusion¶

Suggested Improvements¶

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name	Transported
0	0001_01	Europa	False	B/0/P	TRAPPIST-1e	39.0	False	0.0	0.0	0.0	0.0	0.0	Maham Ofracculy	False
1	0002_01	Earth	False	F/0/S	TRAPPIST-1e	24.0	False	109.0	9.0	25.0	549.0	44.0	Juanna Vines	True
2	0003_01	Europa	False	A/0/S	TRAPPIST-1e	58.0	True	43.0	3576.0	0.0	6715.0	49.0	Altark Susent	False
3	0003_02	Europa	False	A/0/S	TRAPPIST-1e	33.0	False	0.0	1283.0	371.0	3329.0	193.0	Solam Susent	False
4	0004_01	Earth	False	F/1/S	TRAPPIST-1e	16.0	False	303.0	70.0	151.0	565.0	2.0	Willy Santantines	True

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name
0	0013_01	Earth	True	G/3/S	TRAPPIST-1e	27.0	False	0.0	0.0	0.0	0.0	0.0	Nelly Carsoning
1	0018_01	Earth	False	F/4/S	TRAPPIST-1e	19.0	False	0.0	9.0	0.0	2823.0	0.0	Lerome Peckers
2	0019_01	Europa	True	C/0/S	55 Cancri e	31.0	False	0.0	0.0	0.0	0.0	0.0	Sabih Unhearfus
3	0021_01	Europa	False	C/1/S	TRAPPIST-1e	38.0	False	0.0	6652.0	0.0	181.0	585.0	Meratz Caltilter
4	0023_01	Earth	False	F/5/S	TRAPPIST-1e	20.0	False	10.0	0.0	635.0	0.0	0.0	Brence Harperez

	AgeBin	count	percent
0	Adult	4315	49.64
1	Middle Aged	2293	26.38
2	Young	1865	21.45
3	Senior	220	2.53

	SpendingBin	count	percent
0	Non-Spender	3653	42.02
1	High Spender	2173	25.00
2	Medium Spender	2173	25.00
3	Low Spender	694	7.98

	Destination	count	percent
0	TRAPPIST-1e	6097	70.14
1	55 Cancri e	1800	20.71
2	PSO J318.5-22	796	9.16

	HomePlanet	count	percent
0	Earth	4803	55.25
1	Europa	2131	24.51
2	Mars	1759	20.23

Table of Contents¶

Introduction¶

Dataset Structure¶

Notebook settings¶

Data Loading, Cleaning and Initial Exploration¶

Outliers Analysis¶

Conclusion for Outliers Analysis¶

Conclusion for Outliers Analysis and VIP Feature¶

Data Preparation¶

Strategy for missing data¶

Insights and Approach definition¶

Feature Engineering - TotalSpending¶

Binning Features: Age and TotalSpending¶

EDA¶

Conclusion for Univariate Analysis after data imputation¶

Bivariate Analysis¶

Conclusion over Bivariate Analysis¶

Target Variable vs. Age and CryoSleep¶

Correlation Matrix Analysis¶

Multicollinearity Analysis¶

Statistical Inference and Hypothesis Testing¶

Data Preprocessing¶

Baseline Model¶

Data Modeling¶

Hyperparameter Tuning¶

Final Model Training with Optimized Parameters¶

Ensemble Model Approach with Optimized Classifiers¶

Compare Train and Validation Performance¶

Validation Set Evaluation¶

Training Set Evaluation¶

Summary for Ensemble Model evaluation¶

Cross-Validation Performance¶

Conclusion for Cross-Validation Results:¶

Practicing AutoML: H2O.ai¶

Learning Curves¶

LIME Interpretation¶

AUC-ROC Curves for the Ensemble Model¶

Generating Predictions for Test Set and Preparing Submission File¶

Final Conclusion¶

Suggested Improvements¶

Insights and Approach definition¶