Table of Contents¶
- Introduction
- Notebook settings
- Data Loading, Cleaning and Initial Exploration
- Data Preparation
- EDA
- Statistical Inference and Hypothesis Testing
- Data Preprocessing
- Hyperparameter Tuning including Class Weights
- Ensemble Model with Voting Classifier
- Cross-validation with Stratified K-Fold
- Bootstrapping with Bagging
- Learning Curve Analysis - Ensemble Model (Voting Classifier)
- Confusion Matrix for the Ensemble Model (Voting Classifier)
- LIME Interpretation
- AUC-ROC Curves for the Ensemble Model
- Final Conclusion
- Suggested Improvements
Introduction¶
According to the World Health Organization (WHO), stroke is the second leading cause of death globally, responsible for approximately 11% of total deaths. This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.
Dataset Structure¶
| Column | Description | Datatype | Count |
|---|---|---|---|
| gender | Gender: 'Male' 'Female' 'Other' | object | 5110 |
| age | Age | float64 | 5110 |
| hypertension | If someone has hypertension: 0-'No', 1-'Yes' | int64 | 5110 |
| heart_disease | If someone has heart disease: 0-'No', 1-'Yes' | int64 | 5110 |
| ever_married | If someone is married or not: 'Yes' 'No' | object | 5110 |
| work_type | Type of work: 'Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked' | object | 5110 |
| Residence_type | Type of residence: 'Urban' or 'Rural' | object | 5110 |
| avg_glucose_level | Glucose level average | float64 | 5110 |
| bmi | Body Mass Index: an estimate of body fat based on height and weight measurements | float64 | 4909 |
| smoking_status | Smoking status: 'formerly smoked' 'never smoked' 'smokes' 'Unknown' | object | 5110 |
Target variable:
| Column | Description | Datatype | Count |
|---|---|---|---|
| stroke | If someone had stroke or not: 0-'No', 1-'Yes' | int64 | 5110 |
Notebook settings¶
from assets.utils.functions import *
# Standard libraries
import warnings
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from IPython.display import display
# Model explanation tools
import lime
import lime.lime_tabular
# Data inspection and EDA
import sidetable as stb
from skimpy import skim
# Statistical analysis and hypothesis testing
from scipy import stats
from scipy.stats import norm, ttest_ind, chi2_contingency, t, zscore
import statsmodels.api as sm
import statsmodels.stats.api as sms_stats
from statsmodels.stats.weightstats import CompareMeans, DescrStatsW
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Machine learning and modeling
from sklearn.model_selection import (train_test_split, cross_val_score,
learning_curve, cross_val_predict,
StratifiedKFold, GridSearchCV,
RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.exceptions import DataConversionWarning
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (OneHotEncoder, LabelEncoder,
StandardScaler, MinMaxScaler)
from sklearn.ensemble import (RandomForestClassifier,
GradientBoostingClassifier,
VotingClassifier, BaggingClassifier)
from sklearn.metrics import (classification_report, roc_auc_score,
f1_score, make_scorer, accuracy_score,
confusion_matrix, ConfusionMatrixDisplay,
roc_curve)
from sklearn.inspection import permutation_importance
from sklearn.utils import resample, shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Resampling techniques and pipelines (imbalanced data)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
# Categorical encoding
from category_encoders import TargetEncoder
# Warnings
warnings.filterwarnings('ignore')
%load_ext pycodestyle_magic
# %reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on --max_line_length 79
%matplotlib inline
Data Loading, Cleaning and Initial Exploration¶
main_df = pd.read_csv("assets/data/healthcare-dataset-stroke-data.csv")
main_df.head()
| id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
main_df.stb.freq(['stroke'], cum_cols=False).round(2)
| stroke | count | percent | |
|---|---|---|---|
| 0 | 0 | 4861 | 95.13 |
| 1 | 1 | 249 | 4.87 |
# Drop column = 'id'
main_df.drop(columns='id', inplace=True)
main_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 5110 non-null object 1 age 5110 non-null float64 2 hypertension 5110 non-null int64 3 heart_disease 5110 non-null int64 4 ever_married 5110 non-null object 5 work_type 5110 non-null object 6 Residence_type 5110 non-null object 7 avg_glucose_level 5110 non-null float64 8 bmi 4909 non-null float64 9 smoking_status 5110 non-null object 10 stroke 5110 non-null int64 dtypes: float64(3), int64(3), object(5) memory usage: 439.3+ KB
skim(main_df)
╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮ │ Data Summary Data Types │ │ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │ │ ┃ dataframe ┃ Values ┃ ┃ Column Type ┃ Count ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │ │ │ Number of rows │ 5110 │ │ string │ 5 │ │ │ │ Number of columns │ 11 │ │ float64 │ 3 │ │ │ └───────────────────┴────────┘ │ int64 │ 3 │ │ │ └─────────────┴───────┘ │ │ number │ │ ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ mean ┃ sd ┃ p0 ┃ p25 ┃ p50 ┃ p75 ┃ p100 ┃ hist ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━┩ │ │ │ age │ 0 │ 0 │ 43.23 │ 22.61 │ 0.08 │ 25 │ 45 │ 61 │ 82 │ ▅▆▇▇▇▆ │ │ │ │ hypertension │ 0 │ 0 │ 0.09746 │ 0.2966 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ heart_disease │ 0 │ 0 │ 0.05401 │ 0.2261 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ │ avg_glucose_level │ 0 │ 0 │ 106.1 │ 45.28 │ 55.12 │ 77.25 │ 91.88 │ 114.1 │ 271.7 │ ▇▅▁▁▁ │ │ │ │ bmi │ 201 │ 3.93 │ 28.89 │ 7.854 │ 10.3 │ 23.5 │ 28.1 │ 33.1 │ 97.6 │ ▅▇▁ │ │ │ │ stroke │ 0 │ 0 │ 0.04873 │ 0.2153 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ └─────────────────────┴──────┴───────┴──────────┴─────────┴────────┴────────┴───────┴───────┴───────┴────────┘ │ │ string │ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ words per row ┃ total words ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩ │ │ │ gender │ 0 │ 0 │ 1 │ 5110 │ │ │ │ ever_married │ 0 │ 0 │ 1 │ 5110 │ │ │ │ work_type │ 0 │ 0 │ 1 │ 5110 │ │ │ │ Residence_type │ 0 │ 0 │ 1 │ 5110 │ │ │ │ smoking_status │ 0 │ 0 │ 1.5 │ 7887 │ │ │ └───────────────────────────────┴────────┴────────────┴─────────────────────────────┴────────────────────────┘ │ ╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯
Strategy for BMI missing data: Median imputation¶
I decided to go with median imputation, since the median is robust to outliers and represents the central tendency well. Given the slight skew, it won't distort the distribution significantly, and it is a good general-purpose strategy for the small share of missing data we have here (201 missing values, about 3.9%).
# Handling missing values in the 'BMI' column by filling with the median value
bmi_median = main_df['bmi'].median()
# Assign the median to the missing BMI values
main_df['bmi'] = main_df['bmi'].fillna(bmi_median)
# Check if there are missing values
missing_values = main_df.isnull().sum()
print("Missing values:")
print(missing_values)
Missing values: gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 Residence_type 0 avg_glucose_level 0 bmi 0 smoking_status 0 stroke 0 dtype: int64
check_blank_or_whitespace(main_df)
Count of empty strings or single spaces per column: gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 Residence_type 0 avg_glucose_level 0 bmi 0 smoking_status 0 stroke 0 dtype: int64
check_distinct_values(main_df)
Distinct values for 'gender': ['Male' 'Female' 'Other'] Distinct values for 'ever_married': ['Yes' 'No'] Distinct values for 'work_type': ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked'] Distinct values for 'Residence_type': ['Urban' 'Rural'] Distinct values for 'smoking_status': ['formerly smoked' 'never smoked' 'smokes' 'Unknown']
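Both check_blank_or_whitespace and check_distinct_values come from the project's assets.utils.functions module (pulled in by the star import in the settings cell). A minimal sketch of what they presumably do, inferred from their printed output — the exact implementations are assumptions:
def check_blank_or_whitespace(df):
    # Count cells that are empty strings or a single space, per column
    blanks = df.apply(
        lambda col: col.isin(['', ' ']).sum() if col.dtype == object else 0)
    print("Count of empty strings or single spaces per column:")
    print(blanks)

def check_distinct_values(df):
    # Print the unique values of every string column
    for col in df.select_dtypes(include='object').columns:
        print(f"Distinct values for '{col}': {df[col].unique()}")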
Gender Analysis¶
main_df['gender'].value_counts()
gender Female 2994 Male 2115 Other 1 Name: count, dtype: int64
Since there is just one record for gender 'Other', I decided to drop it from the dataset.
main_df.drop(main_df[main_df.gender == 'Other'].index, inplace=True)
main_df.reset_index(drop=True, inplace=True)
Outliers Analysis¶
I want to visualize potential outliers for the features that might present such values. Among the numerical features, I won't treat age, hypertension, or heart disease as having outliers: all age values seem reasonable, and the other two are binary.
# Numerical features to inspect for outliers
numerical_features = ['avg_glucose_level', 'bmi']
# Function to detect outliers using the IQR method
def count_outliers(feature):
Q1 = main_df[feature].quantile(0.25)
Q3 = main_df[feature].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = main_df[(
main_df[feature] < lower_bound) | (
main_df[feature] > upper_bound)]
return outliers.shape[0]
# Plot outliers for each numerical feature using boxplots
plt.figure(figsize=(15, 10))
# Loop through each feature, create a boxplot, and print outlier count
for i, feature in enumerate(numerical_features, 1):
# Count the number of outliers
outliers_count = count_outliers(feature)
# Create a boxplot
plt.subplot(2, 3, i)
sns.boxplot(x=main_df[feature])
plt.title(f"Boxplot of {feature} \n Outliers: {outliers_count}")
plt.tight_layout()
plt.show()
Winsorizing (Capping) Outliers¶
For features like BMI and average glucose level, capping outliers is an effective way to handle extreme values without losing data. By capping, we replace values above or below a threshold with the threshold itself, reducing the influence of extreme data points. This can improve model performance, reduce overfitting, aid interpretability, and balance the data distribution.
# Apply capping for 'avg_glucose_level' and 'bmi'
main_df = cap_outliers(main_df, 'avg_glucose_level')
main_df = cap_outliers(main_df, 'bmi')
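The cap_outliers helper also lives in assets.utils.functions and isn't shown in this notebook. A minimal sketch of an IQR-based capping function consistent with the boxplot fences above (the actual implementation may differ):
def cap_outliers(df, feature, factor=1.5):
    # Winsorize: clip values outside the IQR fences to the fence values
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    df[feature] = df[feature].clip(lower=lower, upper=upper)
    return df
With factor=1.5 this reproduces the 169.365 cap on avg_glucose_level that shows up in the encoded data later in the notebook.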
Data Preparation¶
Grouping Data¶
Binning is useful for further EDA: grouping continuous features into meaningful ranges makes it easier to understand the data and spot patterns.
BMI Common Ranges:
- Underweight: Below 18.5
- Normal weight: 18.5 - 24.9
- Overweight: 25.0 - 29.9
- Obese: 30.0 and above
# Step 1: Define the BMI bins and labels
bmi_bins = [0, 18.5, 24.9, 29.9, float('inf')]
bmi_labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']
# Step 2: Create a new column in the dataframe for categorized BMI
main_df['bmi_cat'] = pd.cut(main_df['bmi'], bins=bmi_bins,
labels=bmi_labels, right=False)
# Step 3: Calculate the distribution of each BMI category
bmi_dist = main_df['bmi_cat'].value_counts(normalize=True).sort_index()
# Display the distribution of the bmi levels
print(bmi_dist)
bmi_cat Underweight 0.065962 Normal weight 0.237816 Overweight 0.315326 Obesity 0.380896 Name: proportion, dtype: float64
Common Glucose Level Ranges:
- Low: Below 70 mg/dL
- Normal: 70-99 mg/dL
- Prediabetes: 100-125 mg/dL
- Diabetes: 126 mg/dL and above
# Step 1: glucose level bins and labels
bins = [0, 70, 99, 125, float('inf')] # Use 'inf' for an upper bound
labels = ['Low', 'Normal', 'Prediabetes', 'Diabetes']
# Step 2: New column in the dataframe for categorized glucose levels
main_df['avg_glucose_level_cat'] = pd.cut(main_df['avg_glucose_level'],
bins=bins, labels=labels,
right=False)
# Step 3: Distribution of each glucose level category
glucose_dist = main_df['avg_glucose_level_cat'].value_counts().sort_index()
print(glucose_dist)
avg_glucose_level_cat Low 754 Normal 2315 Prediabetes 1041 Diabetes 999 Name: count, dtype: int64
# Step 1: Define age group bins and labels
age_bins = [0, 2, 12, 18, 35, 60, float('inf')] # Use 'inf' for an upper bound
age_labels = ['Infant', 'Child', 'Adolescent', 'Young Adults',
'Middle Aged Adults', 'Old Aged Adults']
# Step 2: Create a new column in the dataframe for categorized age groups
main_df['age_group_cat'] = pd.cut(main_df['age'],
bins=age_bins, labels=age_labels,
right=False)
# Step 3: Calculate the distribution of each age group category
age_group_dist = main_df['age_group_cat'].value_counts().sort_index()
# Display the distribution of the new age groups
print(age_group_dist)
age_group_cat Infant 120 Child 423 Adolescent 313 Young Adults 988 Middle Aged Adults 1889 Old Aged Adults 1376 Name: count, dtype: int64
# Dataframe info and checks
print(main_df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5109 entries, 0 to 5108 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 5109 non-null object 1 age 5109 non-null float64 2 hypertension 5109 non-null int64 3 heart_disease 5109 non-null int64 4 ever_married 5109 non-null object 5 work_type 5109 non-null object 6 Residence_type 5109 non-null object 7 avg_glucose_level 5109 non-null float64 8 bmi 5109 non-null float64 9 smoking_status 5109 non-null object 10 stroke 5109 non-null int64 11 bmi_cat 5109 non-null category 12 avg_glucose_level_cat 5109 non-null category 13 age_group_cat 5109 non-null category dtypes: category(3), float64(3), int64(3), object(5) memory usage: 454.8+ KB None
EDA¶
Univariate Analysis¶
The goal of univariate analysis is to explore each feature on its own: different plot types visualize the distributions and surface patterns in the data.
# Plot distributions of all features ('id' was already dropped above)
univariate_analysis(main_df)
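univariate_analysis is another helper from assets.utils.functions. A rough sketch of the plotting loop it presumably wraps (the panel layout and plot choices are assumptions):
def univariate_analysis(df):
    # Bar charts for categorical columns, histograms for numeric ones
    cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)
    num_cols = list(df.select_dtypes(include='number').columns)
    cols = cat_cols + num_cols
    n_rows = (len(cols) + 2) // 3
    fig, axes = plt.subplots(n_rows, 3, figsize=(18, 4 * n_rows))
    for ax, col in zip(axes.ravel(), cols):
        if col in cat_cols:
            sns.countplot(data=df, x=col, ax=ax)
        else:
            sns.histplot(data=df, x=col, kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
    for ax in axes.ravel()[len(cols):]:
        ax.set_visible(False)  # hide unused panels
    plt.tight_layout()
    plt.show()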
Insights from Univariate Plots:¶
- Gender Distribution:
- The dataset has a higher proportion of females (58.6%) compared to males (41.4%). The single record for “Other” gender was already dropped above.
- Age Distribution:
- The age distribution appears uniform, with most individuals in the range of 20 to 80 years old. This balanced distribution should help the model generalize well across different age groups.
- Hypertension Distribution:
- A large proportion of individuals do not have hypertension (90.3%), with only 9.7% reporting hypertension. This imbalance might be an important feature to focus on, as it could significantly impact stroke prediction.
- Heart Disease Distribution:
- Most individuals do not have heart disease (94.6%), with only 5.4% having the condition. Like hypertension, this imbalance could make heart disease a critical indicator for stroke risk.
- Marital Status Distribution:
- The majority of individuals are married (65.6%). Marital status could have implications on lifestyle factors that may indirectly affect stroke risk.
- Work Type Distribution:
- Most individuals work in the private sector (57.6%), while a smaller percentage are self-employed (16%) or work in government jobs (12.9%). The “children” category accounts for 13.4%, while “never worked” is a minimal category (0.4%).
- Residence Type Distribution:
- The distribution between urban and rural populations is nearly equal, with a slight tilt toward urban areas (50.8%).
- Average Glucose Level Distribution (after capping):
- The glucose levels have been capped, and we see a smoother distribution between 60-140 mg/dL. Some individuals still sit at the upper cap (about 169 mg/dL), but these extremes are now controlled, which reduces skewness and the impact of outliers.
- BMI Distribution (after capping):
- After capping BMI, the distribution appears more normalized, with most individuals falling within the 20-35 range. Extreme values beyond 45 have been capped, preventing those extreme points from skewing the model’s performance.
- Smoking Status Distribution:
- There is a significant portion of “unknown” smoking status (30.7%). However, the majority of individuals have “never smoked” (37.9%), and smaller portions fall into “formerly smoked” (17.3%) and “smokes” (15.4%) categories.
- Stroke Distribution:
- There is a heavy imbalance between the no-stroke group (95.1%) and the stroke group (4.9%). This imbalance will need to be addressed in the model through techniques like resampling or adjusting class weights.
- BMI Categories Distribution:
- The distribution of BMI categories shows that the largest portion of the population falls under “Obesity” (38.1%) and “Overweight” (31.5%), while fewer individuals are in the “Normal weight” (23.8%) or “Underweight” (6.6%) categories. Capping helped smooth the extreme BMI values.
- Average Glucose Level Categories Distribution:
- The majority of the population falls in the “Normal” range (45.3%), while the rest are distributed across “Low” (14.8%), “Prediabetes” (20.4%), and “Diabetes” (19.6%) categories. These categories could serve as useful features for stroke prediction.
Overall, after capping the BMI and glucose levels, the distributions appear more normalized, helping prevent the impact of extreme values on model performance. The insights from this univariate analysis will help guide feature engineering and model improvement strategies.
Now it's important to run a bivariate analysis to understand the relationship between the features and the target variable, stroke.
features_stroke(main_df)
Insights from the Features vs. Stroke Plots:¶
- Gender vs Stroke:
- Both males and females have a similar proportion of stroke occurrences, with slightly more males experiencing strokes. (The single “Other” record was removed earlier, so it does not appear here.)
- Age vs Stroke:
- Stroke occurrences increase with age, with the median age for stroke cases higher than for non-stroke cases. There is a clear trend showing older individuals are more likely to have a stroke, as expected.
- Hypertension vs Stroke:
- Individuals with hypertension are more likely to experience strokes. While most of the dataset does not have hypertension, those with hypertension show a higher proportion of strokes.
- Heart Disease vs Stroke:
- The presence of heart disease is strongly associated with stroke. A higher proportion of individuals with heart disease experience strokes compared to those without heart disease.
- Ever Married vs Stroke:
- There is a slight trend indicating that married individuals might be more prone to strokes. However, the difference is not as pronounced.
- Work Type vs Stroke:
- People in the private sector or self-employed seem to have more stroke occurrences compared to other work types. Those who have never worked and government employees show fewer strokes, potentially due to lifestyle or stress factors related to different work types.
- Residence Type vs Stroke:
- There is no significant difference in stroke occurrence between urban and rural residents, indicating that residential location may not be a strong determinant of stroke.
- Average Glucose Level vs Stroke:
- Individuals with higher average glucose levels tend to have more strokes. The median glucose level is notably higher for individuals with stroke, highlighting that high glucose is an important factor in stroke risk.
- BMI vs Stroke:
- Individuals with higher BMI values seem to have a higher stroke risk. The median BMI for stroke cases is higher, indicating that obesity is likely a risk factor for stroke.
- Smoking Status vs Stroke:
- Individuals who formerly smoked or currently smoke show a higher risk of stroke. Interestingly, the “unknown” category also shows a non-negligible number of strokes, which could suggest missing data or unreported smoking habits are still relevant to stroke risk.
- Average Glucose Level Categories vs Stroke:
- A clear trend emerges where individuals with “Prediabetes” or “Diabetes” categories show a higher proportion of strokes. This emphasizes the importance of glucose control in preventing strokes.
- BMI Categories vs Stroke:
- The risk of stroke increases with higher BMI categories. Individuals in the “Obesity” category have the highest stroke risk, followed by those in the “Overweight” category, further reinforcing the link between obesity and stroke risk.
Conclusion:
Key Risk Factors: age, hypertension, heart disease, high glucose levels, and higher BMI are strong indicators of stroke risk. Smoking status also plays a role, with smokers and former smokers showing a higher tendency for strokes. Lifestyle and Health Conditions: Factors such as smoking status and marital status show potential associations but are less conclusive.
Hypertension analysis with other features¶
Now I want to explore what kind of impact hypertension has on other features. I will analyze the relationship between hypertension and other features to understand how hypertension affects the likelihood of stroke.
# Create a figure and axes for 3 subplots
fig, axes = plt.subplots(1, 3, figsize=(24, 6))
# Plot for hypertension VS age VS gender
sns.boxplot(data=main_df, x='hypertension', y='age', hue='gender', ax=axes[0])
axes[0].set_title('Hypertension vs Age by Gender')
# Plot for hypertension VS avg_glucose_level VS gender
sns.boxplot(data=main_df, x='hypertension', y='avg_glucose_level',
hue='gender', ax=axes[1])
axes[1].set_title('Hypertension vs Avg Glucose Level by Gender')
# Plot for hypertension VS bmi VS gender
sns.boxplot(data=main_df, x='hypertension', y='bmi', hue='gender', ax=axes[2])
axes[2].set_title('Hypertension vs BMI by Gender')
# Adjust layout to prevent overlapping
plt.tight_layout()
plt.show()
multiple_distribution(
main_df,
group_by='hypertension',
hue='heart_disease',
column='gender',
legend_loc=(1, 1.1)
)
multiple_distribution(
main_df,
group_by='hypertension',
hue='heart_disease',
column='stroke',
legend_loc=(1, 1.1)
)
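multiple_distribution likewise comes from assets.utils.functions. A plausible sketch, assuming it draws one countplot of group_by split by hue for each value of column (the percentage annotations in the real plots are omitted here):
def multiple_distribution(df, group_by, hue, column, legend_loc=(1, 1)):
    # One panel per value of `column`, counting `hue` within `group_by`
    values = df[column].unique()
    fig, axes = plt.subplots(1, len(values), figsize=(7 * len(values), 5))
    for ax, val in zip(np.atleast_1d(axes), values):
        subset = df[df[column] == val]
        sns.countplot(data=subset, x=group_by, hue=hue, ax=ax)
        ax.set_title(f"{column} = {val}")
        ax.legend(title=hue, bbox_to_anchor=legend_loc)
    plt.tight_layout()
    plt.show()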
Insights from the Hypertension Plots:¶
Hypertension vs Age by Gender:
- Median Age Difference: Individuals with hypertension are generally older compared to those without hypertension, across all genders.
- Age Distribution: The age range is more concentrated between 40 and 60 for hypertensive individuals, while non-hypertensive individuals have a broader age distribution, especially in the younger age group.
Hypertension vs Avg Glucose Level by Gender:
- Higher Glucose Levels: Individuals with hypertension tend to have higher average glucose levels across all genders, indicating a possible correlation between high glucose levels and hypertension.
- Outliers: There are significant outliers in the non-hypertensive group, suggesting some individuals without hypertension still have high glucose levels.
Hypertension vs BMI by Gender:
- Similar BMI Distribution: The BMI distribution is relatively similar across both hypertensive and non-hypertensive groups, though hypertensive individuals have slightly higher median BMI values.
- Outliers in BMI: There are numerous outliers in both groups, especially among non-hypertensive individuals, indicating a wide range of BMI values.
Heart Disease and Hypertension:
- The presence of heart disease is more prevalent in individuals with hypertension than in those without, regardless of stroke status.
- For individuals without a stroke (stroke = 0), the percentage with heart disease is higher in those with hypertension (11.8%) than in those without (4.0%).
- For individuals with a stroke (stroke = 1), the percentage with heart disease is also higher in those with hypertension (19.7%) than in those without (18.6%).
Impact of Stroke:
- Overall, heart disease is significantly more common in stroke patients than in non-stroke patients, for both hypertension categories.
Relative Comparison:
- The difference in heart disease prevalence between hypertensive and non-hypertensive groups is more pronounced in individuals without a stroke.
Conclusion:
Age, heart disease, and average glucose level are significant indicators of hypertension across different genders. Individuals with hypertension are generally older and have higher average glucose levels, indicating a potential link between these factors. Additionally, heart disease is more prevalent among hypertensive individuals, especially those who have suffered a stroke. BMI, on the other hand, shows a more varied distribution, suggesting that while it is related to hypertension, it may not be as strong an indicator as the other factors mentioned. These findings highlight the importance of monitoring glucose levels and cardiovascular health in managing hypertension, particularly in older populations.
Smoking Status analysis with other features¶
multiple_distribution(
main_df,
group_by='hypertension',
hue='smoking_status',
column='stroke',
legend_loc=(1, 1.1)
)
Interestingly, among individuals with hypertension, the smoking-status pattern looks essentially the same whether or not they had a stroke.
# Step 1: Create a crosstab of smoking status, hypertension, and heart disease
crosstab_df = pd.crosstab(
index=[main_df['smoking_status'], main_df['hypertension']],
columns=main_df['heart_disease'],
normalize='index'
)
# Step 2: Create a heatmap from the crosstab
plt.figure(figsize=(10, 6))
sns.heatmap(crosstab_df, annot=True, cmap='Blues', fmt=".1%",
cbar_kws={'label': 'Percentage'})
plt.title('Heatmap of Smoking Status, Hypertension, and Heart Disease')
plt.xlabel('Heart Disease')
plt.ylabel('Smoking Status and Hypertension')
plt.show()
Insights from the Smoking Status Plots:¶
- Hypertension and Smoking:
  - Among individuals with hypertension and no stroke, the majority never smoked (46.3%), followed by formerly smoked (23.4%).
  - In hypertensive individuals with a stroke, the percentage of those who never smoked is still the highest (48.5%), indicating smoking may not be the primary factor for strokes in hypertensive individuals.
- Non-Hypertensive Individuals:
  - Those without hypertension but with a stroke have higher percentages of individuals who formerly smoked (28.8%) and currently smoke (16.7%), suggesting smoking might play a role in stroke risk for non-hypertensive individuals.
- Stroke Influence:
  - For individuals with hypertension, smoking status shows relatively higher percentages of never smoked and formerly smoked compared to those without hypertension, regardless of stroke status.
Insights from the Second Plot (Smoking Status, Hypertension, and Heart Disease):
- Heart Disease and Hypertension Correlation:
- The risk of heart disease is notably higher in individuals who are hypertensive and smoke. For example, 16% of hypertensive smokers have heart disease compared to 6.6% of non-hypertensive smokers.
- The highest risk group for heart disease is hypertensive individuals who have formerly smoked (17.5%).
- Lower Heart Disease Prevalence:
- Non-hypertensive individuals across all smoking statuses have lower percentages of heart disease, indicating hypertension is a stronger predictor for heart disease than smoking alone.
- Unknown Smoking Status:
- Individuals with unknown smoking status but without hypertension show the lowest prevalence of heart disease (2.9%), highlighting that hypertension remains a critical factor.
Overall, these visualizations emphasize the interplay between hypertension, smoking status, and both stroke and heart disease risk, suggesting that hypertension is a stronger risk factor for heart disease, while smoking impacts stroke risk more significantly in non-hypertensive individuals.
bivariate_features(main_df,
group_by='avg_glucose_level_cat',
compare_by='hypertension')
bivariate_features(main_df,
group_by='work_type',
compare_by='avg_glucose_level_cat')
bivariate_features(main_df,
group_by='work_type',
compare_by='hypertension')
Correlation Matrix¶
# Create a copy of the dataframe to avoid modifying the original
corr_data = main_df.copy()
# Encoding categorical features for correlation analysis
encoder = LabelEncoder()
categorical_cols = corr_data.select_dtypes(
include=["object", "category"]).columns
# Encode categorical columns into numerical values
for col in categorical_cols:
corr_data[col] = encoder.fit_transform(corr_data[col])
# Compute correlation matrix
corr_matrix = corr_data.corr()
# Generate mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Set up the matplotlib figure
plt.figure(figsize=(16, 13))
# Draw the heatmap with the mask
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', square=True)
plt.title("Correlation Heatmap")
plt.show()
Insights from the Correlation Matrix:¶
Correlation coefficient (r):
Values range from -1 to +1.
- +1: Perfect positive correlation, meaning as one feature increases, the other increases.
- -1: Perfect negative correlation, meaning as one feature increases, the other decreases.
- 0: No correlation, meaning there is no linear relationship between the features.
age and hypertension have a correlation of 0.28, indicating a weak-to-moderate positive relationship: older people tend to have more hypertension.
Ever_married and age show a high positive correlation of 0.68, which is quite intuitive since older individuals are more likely to be married.
BMI and work_type have a negative correlation of -0.32, suggesting that certain work types might be linked to lower BMI values.
Most features show small to moderate correlations, with the highest being between age and marital status. None of the correlations are strong enough to indicate significant multicollinearity, but age, hypertension, heart disease, and marital status appear to have some influence on each other and on the target variable (stroke).
Multicollinearity Analysis¶
# Calculate VIF for each feature in corr_data
X_vif = corr_data.drop(columns=['stroke'])
vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(
X_vif.values, i) for i in range(X_vif.shape[1])]
# Display VIF results
print(vif_data)
Feature VIF 0 gender 1.732442 1 age 10.805309 2 hypertension 1.217095 3 heart_disease 1.166223 4 ever_married 5.715051 5 work_type 4.890263 6 Residence_type 1.995980 7 avg_glucose_level 10.433977 8 bmi 16.321572 9 smoking_status 3.125287 10 bmi_cat 2.871162 11 avg_glucose_level_cat 3.663102 12 age_group_cat 8.661315
Insights: The VIF values can be interpreted as follows:
- VIF < 5: Low to moderate multicollinearity. This is generally acceptable.
- VIF between 5 and 10: Moderate multicollinearity, which may require further investigation.
- VIF > 10: High multicollinearity, signaling problematic relationships between variables that could affect model stability.
Features like age (10.81), bmi (16.32), and avg_glucose_level (10.43) show high VIF values, indicating potential multicollinearity among them. (VIF for feature i is 1 / (1 − R²ᵢ), where R²ᵢ comes from regressing feature i on all the other features.) However, since the upcoming preprocessing steps will help reduce multicollinearity, I decided not to drop any features based on the VIF values.
Statistical Inference and Hypothesis Testing¶
Since I want to test multiple hypotheses, I will run a test for each feature, as follows (a sketch of the test helpers appears after this list):
- Continuous variables: T-tests
- Categorical variables: Chi-Square tests
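Both test helpers come from assets.utils.functions. A minimal sketch matching the printed output below — Welch's t-test with a statsmodels confidence interval, and a chi-square test of independence; the exact implementations are assumptions:
def perform_ttests(df, feature, alpha=0.05):
    # Welch's t-test comparing the feature mean between stroke groups
    group1 = df.loc[df['stroke'] == 1, feature]
    group0 = df.loc[df['stroke'] == 0, feature]
    t_stat, p_value = ttest_ind(group1, group0, equal_var=False)
    cm = CompareMeans(DescrStatsW(group1), DescrStatsW(group0))
    ci_low, ci_high = cm.tconfint_diff(alpha=alpha, usevar='unequal')
    print(f"Feature: {feature}")
    print(f"T-Statistic: {t_stat:.4f}, P-Value: {p_value}")
    print("95% Confidence Interval for the Difference in Means:",
          (ci_low, ci_high))
    if p_value < alpha:
        print(f"Reject the null hypothesis: Significant difference in "
              f"{feature} between stroke and no-stroke groups.")
    else:
        print(f"Fail to reject the null hypothesis: No significant "
              f"difference in {feature} between stroke and no-stroke groups.")

def perform_chi2_test(df, feature, alpha=0.05):
    # Chi-square test of independence between the feature and stroke
    contingency = pd.crosstab(df[feature], df['stroke'])
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency)
    print(f"Feature: {feature}")
    print(f"Chi-Square Statistic: {chi2_stat:.4f}, P-Value: {p_value}")
    if p_value < alpha:
        print(f"Reject the null hypothesis: Significant association "
              f"between {feature} and stroke.")
    else:
        print(f"Fail to reject the null hypothesis: No significant "
              f"association between {feature} and stroke.")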
main_df['stroke'] = main_df['stroke'].astype(int)
# Continuous features for T-Test
continuous_features = ['avg_glucose_level', 'bmi', 'age',
'heart_disease', 'hypertension']
for feature in continuous_features:
perform_ttests(main_df, feature)
# Categorical features for Chi-Square Test
categorical_features = ['work_type', 'smoking_status',
'ever_married', 'Residence_type', 'gender']
for feature in categorical_features:
perform_chi2_test(main_df, feature)
Feature: avg_glucose_level T-Statistic: 6.6415, P-Value: 1.766534321786102e-10 95% Confidence Interval for the Difference in Means: (12.558552333081456, 23.142887178049982) Reject the null hypothesis: Significant difference in avg_glucose_level between stroke and no-stroke groups. Feature: bmi T-Statistic: 3.7795, P-Value: 0.00019067641466155664 95% Confidence Interval for the Difference in Means: (0.6761611400158061, 2.145547422629734) Reject the null hypothesis: Significant difference in bmi between stroke and no-stroke groups. Feature: age T-Statistic: 29.6819, P-Value: 2.175773269747846e-95 95% Confidence Interval for the Difference in Means: (24.046577639007694, 27.460145351720612) Reject the null hypothesis: Significant difference in age between stroke and no-stroke groups. Feature: heart_disease T-Statistic: 5.6578, P-Value: 4.1031523007226445e-08 95% Confidence Interval for the Difference in Means: (0.0923369828292136, 0.19093437420385695) Reject the null hypothesis: Significant difference in heart_disease between stroke and no-stroke groups. Feature: hypertension T-Statistic: 6.2202, P-Value: 1.985812780520689e-09 95% Confidence Interval for the Difference in Means: (0.1203992598303807, 0.23194344431955238) Reject the null hypothesis: Significant difference in hypertension between stroke and no-stroke groups. Feature: work_type Chi-Square Statistic: 49.1591, P-Value: 5.40903546949726e-10 Reject the null hypothesis: Significant association between work_type and stroke. Feature: smoking_status Chi-Square Statistic: 29.2257, P-Value: 2.007704175610833e-06 Reject the null hypothesis: Significant association between smoking_status and stroke. Feature: ever_married Chi-Square Statistic: 58.8678, P-Value: 1.686285619167346e-14 Reject the null hypothesis: Significant association between ever_married and stroke. Feature: Residence_type Chi-Square Statistic: 1.0750, P-Value: 0.29982523877153633 Fail to reject the null hypothesis: No significant association between Residence_type and stroke. Feature: gender Chi-Square Statistic: 0.3400, P-Value: 0.5598277580669416 Fail to reject the null hypothesis: No significant association between gender and stroke.
Since Residence_type and gender show no significant association with the target variable, we could drop them. However, we will keep them for now and see how the model performs.
Data Preprocessing¶
# Drop unneeded columns
columns_to_drop = ['avg_glucose_level_cat', 'bmi_cat', 'age_group_cat']
main_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
# Encode the categorical features with explicit ordinal mappings
encoded_df = main_df.copy()
encoded_df.ever_married = main_df.ever_married.map(
{'No': 0, 'Yes': 1})
encoded_df.smoking_status = main_df.smoking_status.map(
{'never smoked': 0, 'formerly smoked': 1,
'smokes': 2, 'Unknown': 3})
encoded_df.work_type = main_df.work_type.map(
{'Private': 0, 'Self-employed': 1, 'children': 2,
'Govt_job': 3, 'Never_worked': 4})
encoded_df.Residence_type = main_df.Residence_type.map(
{'Rural': 0, 'Urban': 1})
encoded_df.gender = main_df.gender.map(
{'Male': 0, 'Female': 1})
# Preview the encoded dataframe
encoded_df.head()
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 67.0 | 0 | 1 | 1 | 0 | 1 | 169.365 | 36.6 | 1 | 1 |
| 1 | 1 | 61.0 | 0 | 0 | 1 | 1 | 0 | 169.365 | 28.1 | 0 | 1 |
| 2 | 0 | 80.0 | 0 | 1 | 1 | 0 | 0 | 105.920 | 32.5 | 0 | 1 |
| 3 | 1 | 49.0 | 0 | 0 | 1 | 0 | 1 | 169.365 | 34.4 | 2 | 1 |
| 4 | 1 | 79.0 | 1 | 0 | 1 | 1 | 0 | 169.365 | 24.0 | 0 | 1 |
X = encoded_df.drop('stroke', axis=1)
y = encoded_df['stroke']
# Split the data - avoid data leakage
train_x, test_x, train_y, test_y = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)
# Fit imputation on the training data only
# (bmi was already filled above, so this is a defensive no-op here)
imputer = SimpleImputer(strategy='mean')
train_x['bmi'] = imputer.fit_transform(train_x[['bmi']])
# Apply the same imputation on test data
test_x['bmi'] = imputer.transform(test_x[['bmi']])
# Label Encoding
le = LabelEncoder()
train_x['gender'] = le.fit_transform(train_x['gender'])
# Apply the same encoding on test data
test_x['gender'] = le.transform(test_x['gender'])
# Scale the training data
scaler = StandardScaler()
train_x[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(
train_x[['age', 'avg_glucose_level', 'bmi']])
# Apply the scaling to the test data (do not fit again)
test_x[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(
test_x[['age', 'avg_glucose_level', 'bmi']])
# Apply SMOTE to only the training data
smote = SMOTE(random_state=42)
balanced_train_x, balanced_train_y = smote.fit_resample(train_x, train_y)
Baseline Model¶
# Train model
model = RandomForestClassifier(random_state=42)
model.fit(balanced_train_x, balanced_train_y)
# Evaluate on the test set (unseen data)
y_pred = model.predict(test_x)
# Print evaluation metrics
print("Classification Report:\n", classification_report(test_y, y_pred))
print("ROC-AUC Score:",
roc_auc_score(test_y, model.predict_proba(test_x)[:, 1]))
Classification Report:
precision recall f1-score support
0 0.96 0.92 0.94 972
1 0.13 0.22 0.16 50
accuracy 0.89 1022
macro avg 0.54 0.57 0.55 1022
weighted avg 0.92 0.89 0.90 1022
ROC-AUC Score: 0.7478086419753087
Baseline Results Conclusion:¶
The baseline model demonstrates strong performance for class 0 (non-stroke), with 96% precision and 92% recall, indicating that the model is very good at identifying non-stroke cases. However, it struggles significantly with class 1 (stroke), showing only 13% precision and 22% recall, which means it has a high false positive rate and misses many true stroke cases.
The overall accuracy is 89%, but this is mainly driven by the dominant class 0. The macro average (0.55 f1-score) and the ROC-AUC score of 0.748 reflect the model’s limitations in handling the minority class (stroke cases). This imbalance suggests that further work on class balancing and model improvement is needed to better capture stroke predictions.
Below I will adjust the class weights to see if that improves model performance, while also including additional models to test; each one is evaluated with the evaluate_model helper sketched next.
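The evaluate_model helper is also from assets.utils.functions; a minimal sketch consistent with the printed reports (assumed, not the actual implementation):
def evaluate_model(model, train_x, train_y, test_x, test_y, model_name):
    # Fit on the (resampled) training data, report on the untouched test set
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)
    print(f"{model_name} Results:\n",
          classification_report(test_y, y_pred))
    print(f"{model_name} ROC-AUC Score:",
          roc_auc_score(test_y, model.predict_proba(test_x)[:, 1]))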
models = {
'CatBoost': CatBoostClassifier(
auto_class_weights='Balanced', iterations=500,
depth=6, learning_rate=0.1, silent=True),
'Gradient Boosting': GradientBoostingClassifier(
n_estimators=300, max_depth=10, learning_rate=0.2,
subsample=0.7, random_state=42 # No class_weight
),
'XGBoost': XGBClassifier(
n_estimators=300, max_depth=10, learning_rate=0.2,
colsample_bytree=0.8, subsample=0.7,
scale_pos_weight=10, # Balance classes
random_state=42
),
'LightGBM': LGBMClassifier(
n_estimators=300, max_depth=6, learning_rate=0.1,
class_weight='balanced', # Automatically balances classes
subsample=0.7, verbosity=-1, random_state=42
),
'Random Forest': RandomForestClassifier(
n_estimators=200, max_depth=10, min_samples_split=10,
min_samples_leaf=4, class_weight={0: 1, 1: 5},
random_state=42
)
}
# Evaluate each model
for model_name, model in models.items():
evaluate_model(model, balanced_train_x,
balanced_train_y, test_x, test_y, model_name)
CatBoost Results:
precision recall f1-score support
0 0.95 0.96 0.96 972
1 0.11 0.10 0.10 50
accuracy 0.92 1022
macro avg 0.53 0.53 0.53 1022
weighted avg 0.91 0.92 0.91 1022
CatBoost ROC-AUC Score: 0.7602880658436214
Gradient Boosting Results:
precision recall f1-score support
0 0.96 0.96 0.96 972
1 0.15 0.12 0.13 50
accuracy 0.92 1022
macro avg 0.55 0.54 0.55 1022
weighted avg 0.92 0.92 0.92 1022
Gradient Boosting ROC-AUC Score: 0.7665020576131687
XGBoost Results:
precision recall f1-score support
0 0.96 0.90 0.93 972
1 0.11 0.24 0.15 50
accuracy 0.86 1022
macro avg 0.53 0.57 0.54 1022
weighted avg 0.92 0.86 0.89 1022
XGBoost ROC-AUC Score: 0.7423045267489711
LightGBM Results:
precision recall f1-score support
0 0.96 0.96 0.96 972
1 0.15 0.14 0.15 50
accuracy 0.92 1022
macro avg 0.55 0.55 0.55 1022
weighted avg 0.92 0.92 0.92 1022
LightGBM ROC-AUC Score: 0.7653703703703704
Random Forest Results:
precision recall f1-score support
0 0.99 0.65 0.78 972
1 0.11 0.82 0.19 50
accuracy 0.66 1022
macro avg 0.55 0.73 0.49 1022
weighted avg 0.94 0.66 0.75 1022
Random Forest ROC-AUC Score: 0.7678395061728395
Summary:¶
While all models perform very well on the majority class (stroke=0), there is a consistent struggle to effectively handle the minority class (stroke=1). Random Forest stands out with high recall for the minority class but sacrifices overall accuracy. Other models, such as CatBoost, Gradient Boosting, and LightGBM, balance overall accuracy well but fail to significantly improve minority class detection.
Hyperparameter Tuning including Class Weights¶
# Hyperparameter grids for CatBoost
catboost_param_grid = {
'iterations': [200, 300, 500],
'depth': [6, 8, 10],
'learning_rate': [0.01, 0.1, 0.2],
'l2_leaf_reg': [1, 3, 5],
'auto_class_weights': ['Balanced'] # Keep class balancing
}
# Hyperparameter grids for Gradient Boosting
gb_param_grid = {
'n_estimators': [200, 300, 500],
'max_depth': [6, 8, 10],
'learning_rate': [0.01, 0.1, 0.2],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Hyperparameter grids for XGBoost
xgb_param_grid = {
'n_estimators': [200, 300, 500],
'max_depth': [6, 8, 10],
'learning_rate': [0.01, 0.1, 0.2],
'scale_pos_weight': [10, 20, 30], # Adjust for class imbalance
'subsample': [0.7, 0.8, 1.0],
'colsample_bytree': [0.7, 0.8, 1.0]
}
# Hyperparameter grids for Random Forest
rf_param_grid = {
'n_estimators': [200, 300, 500],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'class_weight': [{0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 20}]
}
# Hyperparameter grids for LightGBM
lgbm_param_grid = {
'n_estimators': [200, 300, 500],
'max_depth': [6, 8, 10],
'learning_rate': [0.01, 0.1, 0.2],
'class_weight': ['balanced'],
'subsample': [0.7, 0.8, 1.0]
}
# Define the models
catboost_model = CatBoostClassifier(
auto_class_weights='Balanced', silent=True, random_state=42)
xgb_model = XGBClassifier(
use_label_encoder=False, random_state=42, scale_pos_weight=20,
eval_metric='logloss', verbosity=0)
lgbm_model = LGBMClassifier(
class_weight='balanced', random_state=42, verbosity=-1)
rf_model = RandomForestClassifier(class_weight='balanced',
random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
# RandomizedSearchCV for CatBoost
catboost_search = RandomizedSearchCV(
catboost_model, param_distributions=catboost_param_grid,
n_iter=10, scoring='precision', cv=3, verbose=0,
random_state=42, n_jobs=-1)
# RandomizedSearchCV for XGBoost
xgb_search = RandomizedSearchCV(
xgb_model, param_distributions=xgb_param_grid,
n_iter=10, scoring='precision', cv=3, verbose=0,
random_state=42, n_jobs=-1)
# RandomizedSearchCV for Random Forest
rf_search = RandomizedSearchCV(
rf_model, param_distributions=rf_param_grid, n_iter=10,
scoring='precision', cv=3, verbose=0, random_state=42, n_jobs=-1)
# RandomizedSearchCV for LightGBM
lgbm_search = RandomizedSearchCV(
lgbm_model, param_distributions=lgbm_param_grid, n_iter=10,
scoring='precision', cv=3, verbose=0, random_state=42, n_jobs=-1)
# RandomizedSearchCV for Gradient Boosting
gb_search = RandomizedSearchCV(
gb_model, param_distributions=gb_param_grid, n_iter=10,
scoring='precision', cv=3, verbose=0, random_state=42, n_jobs=-1)
# Fit the searches to the training data
catboost_search.fit(balanced_train_x, balanced_train_y)
xgb_search.fit(balanced_train_x, balanced_train_y)
rf_search.fit(balanced_train_x, balanced_train_y)
lgbm_search.fit(balanced_train_x, balanced_train_y)
gb_search.fit(balanced_train_x, balanced_train_y)
# Get Hyperparameters
print("Best CatBoost Parameters:", catboost_search.best_params_)
print("Best XGBoost Parameters:", xgb_search.best_params_)
print("Best Random Forest Parameters:", rf_search.best_params_)
print("Best LightGBM Parameters:", lgbm_search.best_params_)
print("Best Gradient Boosting Parameters:", gb_search.best_params_)
Best CatBoost Parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6, 'auto_class_weights': 'Balanced'}
Best XGBoost Parameters: {'subsample': 0.7, 'scale_pos_weight': 10, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
Best Random Forest Parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'class_weight': {0: 1, 1: 10}}
Best LightGBM Parameters: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.1, 'class_weight': 'balanced'}
Best Gradient Boosting Parameters: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10, 'learning_rate': 0.1}
Now that RandomizedSearchCV has found the best hyperparameters for each model, I will retrain the models with those hyperparameters and class weights to see if performance improves.
# Train and evaluate CatBoost
best_catboost = CatBoostClassifier(
**catboost_search.best_params_, random_state=42, silent=True)
best_catboost.fit(balanced_train_x, balanced_train_y)
y_pred_catboost = best_catboost.predict(test_x)
print("CatBoost Results after Hyperparameter:\n",
classification_report(test_y, y_pred_catboost))
print("CatBoost ROC-AUC after Hyperparameter:",
roc_auc_score(test_y, best_catboost.predict_proba(test_x)[:, 1]))
# Train and evaluate XGBoost
best_xgb = XGBClassifier(
**xgb_search.best_params_, random_state=42, use_label_encoder=False,
eval_metric='logloss', verbosity=0)
best_xgb.fit(balanced_train_x, balanced_train_y)
y_pred_xgb = best_xgb.predict(test_x)
print("XGBoost Results after Hyperparameter:\n",
classification_report(test_y, y_pred_xgb))
print("XGBoost ROC-AUC after Hyperparameter:",
roc_auc_score(test_y, best_xgb.predict_proba(test_x)[:, 1]))
# Train and evaluate Random Forest
best_rf = RandomForestClassifier(
**rf_search.best_params_, random_state=42)
best_rf.fit(balanced_train_x, balanced_train_y)
y_pred_rf = best_rf.predict(test_x)
print("Random Forest Results after Hyperparameter:\n",
classification_report(test_y, y_pred_rf))
print("Random Forest ROC-AUC after Hyperparameter:",
roc_auc_score(test_y, best_rf.predict_proba(test_x)[:, 1]))
# Train and evaluate LightGBM
best_lgbm = LGBMClassifier(
**lgbm_search.best_params_, random_state=42, verbosity=-1)
best_lgbm.fit(balanced_train_x, balanced_train_y)
y_pred_lgbm = best_lgbm.predict(test_x)
print("LightGBM Results after Hyperparameter:\n",
classification_report(test_y, y_pred_lgbm))
print("LightGBM ROC-AUC after Hyperparameter:",
roc_auc_score(test_y, best_lgbm.predict_proba(test_x)[:, 1]))
# Train and evaluate Gradient Boosting
best_gb = GradientBoostingClassifier(
**gb_search.best_params_, random_state=42)
best_gb.fit(balanced_train_x, balanced_train_y)
y_pred_gb = best_gb.predict(test_x)
print("Gradient Boosting Results after Hyperparameter:\n",
classification_report(test_y, y_pred_gb))
print("Gradient Boosting ROC-AUC after Hyperparameter",
roc_auc_score(test_y, best_gb.predict_proba(test_x)[:, 1]))
CatBoost Results after Hyperparameter:
precision recall f1-score support
0 0.96 0.95 0.96 972
1 0.15 0.16 0.16 50
accuracy 0.91 1022
macro avg 0.55 0.56 0.56 1022
weighted avg 0.92 0.91 0.92 1022
CatBoost ROC-AUC after Hyperparameter: 0.7675308641975309
XGBoost Results after Hyperparameter:
precision recall f1-score support
0 0.96 0.90 0.93 972
1 0.11 0.24 0.15 50
accuracy 0.86 1022
macro avg 0.53 0.57 0.54 1022
weighted avg 0.92 0.86 0.89 1022
XGBoost ROC-AUC after Hyperparameter: 0.7423045267489711
Random Forest Results after Hyperparameter:
precision recall f1-score support
0 0.96 0.87 0.91 972
1 0.12 0.36 0.18 50
accuracy 0.84 1022
macro avg 0.54 0.61 0.55 1022
weighted avg 0.92 0.84 0.88 1022
Random Forest ROC-AUC after Hyperparameter: 0.7509156378600823
LightGBM Results after Hyperparameter:
precision recall f1-score support
0 0.96 0.96 0.96 972
1 0.15 0.14 0.15 50
accuracy 0.92 1022
macro avg 0.55 0.55 0.55 1022
weighted avg 0.92 0.92 0.92 1022
LightGBM ROC-AUC after Hyperparameter: 0.7653703703703704
Gradient Boosting Results after Hyperparameter:
precision recall f1-score support
0 0.95 0.96 0.96 972
1 0.13 0.12 0.12 50
accuracy 0.92 1022
macro avg 0.54 0.54 0.54 1022
weighted avg 0.91 0.92 0.92 1022
Gradient Boosting ROC-AUC after Hyperparameter 0.7502880658436214
Conclusion after Hyperparameter Tuning:¶
After hyperparameter tuning, all models show consistent performance, with high precision, recall, and accuracy for the majority class (class 0). However, precision and recall for the minority class (class 1) remain relatively low across all models, reflecting the challenge of imbalanced data. While tuning maintained high accuracy and ROC-AUC scores, further work is needed to boost class 1 (stroke) performance, possibly through more advanced balancing techniques or feature engineering. Next, I will check whether an ensemble model can improve performance.
Ensemble Model with Voting Classifier¶
ensemble_ml = VotingClassifier(estimators=[
('catboost', best_catboost),
('xgb', best_xgb),
('rf', best_rf),
('lgbm', best_lgbm),
('gb', best_gb)
], voting='soft', weights=[2, 1, 1, 1, 1])
ensemble_ml.fit(balanced_train_x, balanced_train_y)
y_pred_voting = ensemble_ml.predict(test_x)
print("Ensemble (Voting Classifier) Results:\n",
classification_report(test_y, y_pred_voting))
Ensemble (Voting Classifier) Results:
precision recall f1-score support
0 0.96 0.95 0.95 972
1 0.13 0.14 0.14 50
accuracy 0.91 1022
macro avg 0.55 0.55 0.55 1022
weighted avg 0.92 0.91 0.91 1022
Conclusion:¶
The ensemble model, which combines the strengths of multiple classifiers (CatBoost, XGBoost, Random Forest, LightGBM, and Gradient Boosting) using soft voting, yields strong overall performance, particularly for the majority class (class 0). The accuracy remains high at 0.91, with weighted average precision, recall, and f1-score around 0.91. To further improve precision and recall for the minority class (class 1), I will adjust the decision threshold of the ensemble model.
# Get predicted probabilities
y_pred_proba_voting = ensemble_ml.predict_proba(
test_x)[:, 1] # Probabilities for class 1
# Threshold (try values below the default 0.5, e.g., 0.3 or 0.25)
threshold = 0.28
y_pred_adjusted = (
y_pred_proba_voting >= threshold).astype(int)
# Evaluate the adjusted predictions
print("Ensemble (Voting Classifier) with Adjusted Threshold Results:\n",
classification_report(test_y, y_pred_adjusted))
Ensemble (Voting Classifier) with Adjusted Threshold Results:
precision recall f1-score support
0 0.96 0.88 0.92 972
1 0.13 0.34 0.19 50
accuracy 0.86 1022
macro avg 0.55 0.61 0.55 1022
weighted avg 0.92 0.86 0.89 1022
Conclusion:¶
After adjusting the threshold to 0.28 (I tried other values, but 0.28 achieved the best trade-off) for the ensemble model (Voting Classifier), the results show a notable trade-off between precision and recall for the minority class (stroke=1). By lowering the threshold, the model becomes more sensitive to stroke cases: recall for class 1 increased to 0.34 from the previous 0.14. However, this comes at the cost of precision, which remains low at 0.13.
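The 0.28 value came from manual trial and error. A small sweep like the sketch below (an illustration, not the original search) makes that tuning reproducible by scoring class-1 F1 and recall across candidate thresholds; in practice it should run on a validation split rather than the test set, to avoid tuning on test data:
from sklearn.metrics import recall_score  # not imported in the settings cell

# Score candidate thresholds on class-1 F1 and recall
for thr in np.arange(0.20, 0.51, 0.02):
    preds = (y_pred_proba_voting >= thr).astype(int)
    print(f"threshold={thr:.2f}  "
          f"F1(class 1)={f1_score(test_y, preds):.3f}  "
          f"recall(class 1)={recall_score(test_y, preds):.3f}")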
Cross-validation with Stratified K-Fold¶
# StratifiedKFold to maintain the same class distribution across folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# List to hold F1-scores for each fold
f1_scores = []
# Custom cross-validation loop
for train_index, val_index in skf.split(balanced_train_x, balanced_train_y):
# Split data into train and validation sets
X_train, X_val = balanced_train_x.iloc[
train_index], balanced_train_x.iloc[val_index]
y_train, y_val = balanced_train_y.iloc[
train_index], balanced_train_y.iloc[val_index]
# Fit the voting classifier on the current fold's training data
ensemble_ml.fit(X_train, y_train)
# Get predicted probabilities on the validation set
y_pred_proba_val = ensemble_ml.predict_proba(X_val)[:, 1]
# Apply the adjusted threshold
y_pred_adjusted_val = (y_pred_proba_val >= threshold).astype(int)
# Calculate F1-score with adjusted threshold and store it
f1 = f1_score(y_val, y_pred_adjusted_val)
f1_scores.append(f1)
# Convert the list to a numpy array for easier statistics
f1_scores = np.array(f1_scores)
# Output the F1-scores and mean
print(f"Custom Cross-Validation F1 Scores: {f1_scores}")
print(f"Mean F1 Score with Threshold Adjustment: {np.mean(f1_scores)}")
Custom Cross-Validation F1 Scores: [0.93567961 0.93967093 0.92882992 0.93268648 0.94067278] Mean F1 Score with Threshold Adjustment: 0.9355079438193711
Conclusion:¶
The custom cross-validation with StratifiedKFold and the adjusted threshold produced consistent F1-scores across all folds, ranging from 0.9288 to 0.9407, with a mean of 0.9355. Note, however, that these folds are drawn from the SMOTE-balanced training data, so the scores are optimistic relative to the imbalanced test set; they mainly show that the voting ensemble with the adjusted threshold behaves stably across different subsets of the (resampled) data.
Bootstrapping with Bagging¶
Now, since I have the cross-validation results, I will apply bootstrapping with bagging to further stabilize the model’s performance and reduce variance. By using the BaggingClassifier, I can train multiple instances of the VotingClassifier on bootstrapped samples of the training data. This approach helps mitigate overfitting, enhances robustness, and provides a more reliable ensemble model. The adjusted threshold will again be applied to fine-tune the precision-recall trade-off for the minority class, ensuring a balanced evaluation of the final model.
# Wrap the VotingClassifier in a BaggingClassifier to apply bootstrapping
# (note: `base_estimator` was renamed to `estimator` in scikit-learn 1.2+)
bagging_clf = BaggingClassifier(base_estimator=ensemble_ml,
                                n_estimators=10,  # bootstrap iterations
                                random_state=42,
                                n_jobs=-1,
                                bootstrap=True)
# Fit the bagging ensemble model on the balanced training data
bagging_clf.fit(balanced_train_x, balanced_train_y)
# Get predicted probabilities for the test set
y_pred_proba_bagging = bagging_clf.predict_proba(test_x)[:, 1]
# Adjust the threshold
threshold = 0.28
y_pred_adjusted_bagging = (y_pred_proba_bagging >= threshold).astype(int)
# Evaluate the adjusted predictions for the bagging ensemble
print("Bagging (Voting Classifier) with Adjusted Threshold Results:\n",
classification_report(test_y, y_pred_adjusted_bagging))
# Evaluate the ROC-AUC score for the bagging ensemble
roc_auc = roc_auc_score(test_y, y_pred_proba_bagging)
print("Bagging Ensemble ROC-AUC:", roc_auc)
Bagging (Voting Classifier) with Adjusted Threshold Results:
precision recall f1-score support
0 0.97 0.86 0.91 972
1 0.13 0.40 0.19 50
accuracy 0.83 1022
macro avg 0.55 0.63 0.55 1022
weighted avg 0.92 0.83 0.87 1022
Bagging Ensemble ROC-AUC: 0.7741975308641975
Conclusion¶
The results indicate that applying bagging with the VotingClassifier led to a noticeable improvement in the recall for the minority class (stroke=1). Before bagging, the recall was 0.34, but after incorporating bagging, the recall increased to 0.40. This suggests that the model is now better at identifying true positive cases (strokes) while maintaining an acceptable precision and F1-score. Although the overall accuracy slightly decreased to 0.83, the trade-off is worthwhile as the model’s ability to detect strokes (recall) improved, which is crucial for addressing the minority class in this imbalanced dataset. Additionally, the ROC-AUC score of 0.774 indicates a strong performance in distinguishing between the two classes.
Cross Validation with Threshold Adjustment after Bootstrap and Bagging¶
Now, since I have the bootstrap and bagging results, I will apply cross-validation with threshold adjustment to confirm that the model's performance is consistent and generalizes across different subsets of the data. Here, cross_val_predict splits the data into multiple folds, collects one out-of-fold prediction per sample, and the adjusted threshold is then applied to those pooled predictions.
# Number of folds for cross-validation
cv_folds = 5
# Perform cross-validation predictions using the bagging classifier
y_pred_proba_cv = cross_val_predict(
bagging_clf, balanced_train_x,
balanced_train_y, cv=cv_folds,
method='predict_proba')[:, 1] # probability for class 1
# Adjust the threshold during cross-validation
threshold = 0.28
y_pred_adjusted_cv = (y_pred_proba_cv >= threshold).astype(int)
# cross_val_predict yields one out-of-fold prediction per sample, so a
# single pooled F1-score is computed over all folds at once
f1_pooled = f1_score(balanced_train_y, y_pred_adjusted_cv)
print(f"Pooled Out-of-Fold F1 Score with Threshold Adjustment: {f1_pooled}")
Pooled Out-of-Fold F1 Score with Threshold Adjustment: 0.9244851258581235
Conclusion¶
The pooled out-of-fold F1-score of 0.924 indicates strong, stable performance of the bagging classifier with the adjusted threshold on the resampled training data. Note that this is a single aggregate score over all out-of-fold predictions rather than five independent per-fold scores; a genuine per-fold breakdown is sketched below. Overall, the model balances precision and recall effectively, and the threshold adjustment contributed to the high F1-score.
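For reference, a true per-fold breakdown would refit the model on each fold. A minimal sketch, assuming `balanced_train_x`/`balanced_train_y` are pandas objects (use plain array indexing otherwise) and reusing the same 0.28 threshold; fold scores will differ slightly from the pooled value:
from sklearn.base import clone
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_f1 = []
for train_idx, val_idx in skf.split(balanced_train_x, balanced_train_y):
    # Refit a fresh copy of the bagging ensemble on this fold's training part
    model = clone(bagging_clf)
    model.fit(balanced_train_x.iloc[train_idx], balanced_train_y.iloc[train_idx])
    # Score the held-out part with the same adjusted threshold
    proba = model.predict_proba(balanced_train_x.iloc[val_idx])[:, 1]
    preds = (proba >= 0.28).astype(int)
    fold_f1.append(f1_score(balanced_train_y.iloc[val_idx], preds))
print("Per-fold F1:", np.round(fold_f1, 4))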
Overfitting Check¶
Now, since I have the cross-validation results, I will run an overfitting check to verify that the model generalizes to unseen data. By evaluating performance on both the training set and the test set, I can assess whether the model has overfitted: if training performance is significantly better than test performance, the model has learned patterns specific to the training data that may not generalize to new data. This step is crucial for judging the balance between model complexity and generalization.
# Check training performance
y_train_pred = ensemble_ml.predict(balanced_train_x)
print("Training Set Performance:\n",
classification_report(balanced_train_y, y_train_pred))
# Check test performance
y_test_pred = ensemble_ml.predict(test_x)
print("Test Set Performance:\n",
classification_report(test_y, y_test_pred))
Training Set Performance:
precision recall f1-score support
0 0.99 0.99 0.99 3888
1 0.99 0.99 0.99 3888
accuracy 0.99 7776
macro avg 0.99 0.99 0.99 7776
weighted avg 0.99 0.99 0.99 7776
Test Set Performance:
precision recall f1-score support
0 0.96 0.95 0.95 972
1 0.12 0.14 0.13 50
accuracy 0.91 1022
macro avg 0.54 0.54 0.54 1022
weighted avg 0.91 0.91 0.91 1022
Conclusion¶
The model demonstrates near-perfect performance on the training set, with an accuracy of 99%, indicating that it has likely overfitted to the training data. This is evident because the test set performance shows a significant drop in metrics, especially for class 1 (stroke). The precision, recall, and F1-score for class 1 are notably lower on the test set, with a recall of only 14% and an F1-score of 13%, while the overall accuracy is still relatively high at 91%.
This suggests that while the model fits the training data almost perfectly, it struggles to generalize to unseen data, particularly for the minority class (stroke). Overfitting is likely, and further adjustments such as regularization or hyperparameter tuning may be necessary to improve generalization.
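One concrete way to attack the overfitting would be to constrain the tree-based members of the ensemble. The sketch below is an assumption-laden illustration, not tuning that was actually run: it presumes the VotingClassifier contains a random forest registered under the name 'rf' (check ensemble_ml.named_estimators_ for the real names).
# Limit tree depth and leaf size of the forest member to curb memorization;
# the 'rf' step name is an assumption about how ensemble_ml was built
param_grid = {
    'rf__max_depth': [5, 10, 15],
    'rf__min_samples_leaf': [5, 10, 20],
}
grid = GridSearchCV(ensemble_ml, param_grid, scoring='f1', cv=5, n_jobs=-1)
grid.fit(balanced_train_x, balanced_train_y)
print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)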
Learning Curve Analysis - Ensemble Model (Voting Classifier)¶
The learning curve provides insights into how the model’s F1 score evolves as the training set size increases. The model is trained using different subsets of the training data, and its performance is evaluated on both the training set and cross-validation folds using F1 score as the evaluation metric.
# Generate learning curve data
train_sizes, train_scores, test_scores = learning_curve(
ensemble_ml, balanced_train_x, balanced_train_y, cv=5,
scoring='f1', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
# Calculate mean and standard deviation for the training and test scores
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# Plot learning curve
plt.figure()
plt.title("Learning Curve (Voting Classifier)")
plt.xlabel("Training Examples")
plt.ylabel("F1 Score")
# Plot the training and test scores
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training F1 score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation F1 score")
plt.legend(loc="best")
plt.show()
Conclusion¶
The learning curve shows the performance of the Voting Classifier as more training data is added:
- The training F1 score (red line) remains high and consistent, close to 1.0, indicating that the model fits the training data very well.
- The cross-validation F1 score (green line) improves steadily as the amount of training data increases, which suggests that the model is generalizing better as more data is used.
- The green shaded area represents the variability of the cross-validation F1 score across different folds. As the training data size increases, this variability reduces, indicating more stable performance on unseen data.
Overall, the model appears to be learning well, and the gap between training and validation scores suggests that, while the model fits the data well, some generalization improvements could be made.
Confusion Matrix for the Ensemble Model (Voting Classifier)¶
The confusion matrix is a valuable tool for understanding how well a model performs on classification tasks by showing the number of correct and incorrect predictions for each class. It provides a detailed breakdown of the model’s performance on both the positive and negative classes (in this case, stroke or no stroke).
# Get predictions for the test set using the ensemble model
y_pred_voting = ensemble_ml.predict(test_x)
# Generate confusion matrix
conf_matrix = confusion_matrix(test_y, y_pred_voting)
# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for Ensemble Model")
plt.show()
Conclusion¶
- True Negatives (927): The model correctly classified 927 out of 972 instances of class 0 (no stroke).
- False Positives (45): The model incorrectly classified 45 instances of class 0 as class 1 (stroke).
- False Negatives (43): The model incorrectly classified 43 instances of class 1 as class 0 (no stroke).
- True Positives (7): The model correctly classified 7 out of 50 instances of class 1 (stroke).
The confusion matrix highlights the challenges of identifying the minority class (stroke = 1), where the model has a relatively low true positive rate (recall). This suggests that while the model is quite good at identifying negative cases (no stroke), there is still room for improvement in detecting positive cases (stroke).
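These four counts reproduce the headline metrics directly; a quick arithmetic check:
tn, fp, fn, tp = 927, 45, 43, 7
recall = tp / (tp + fn)                      # 7 / 50   = 0.14 (stroke sensitivity)
precision = tp / (tp + fp)                   # 7 / 52   ≈ 0.13
specificity = tn / (tn + fp)                 # 927/972  ≈ 0.95
accuracy = (tp + tn) / (tn + fp + fn + tp)   # 934/1022 ≈ 0.91
print(recall, precision, specificity, accuracy)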
LIME Interpretation¶
LIME is used to explain the decision of an ensemble voting classifier for stroke prediction. It breaks down the contribution of each feature to a specific prediction, providing insight into how the model arrived at its decision for an individual data point.
# Before scaling, save a copy of the unscaled training and test data
unscaled_train_x = train_x.copy()
unscaled_test_x = test_x.copy()
# Scale the training data
scaler = StandardScaler()
train_x[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(
train_x[['age', 'avg_glucose_level', 'bmi']])
test_x[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(
test_x[['age', 'avg_glucose_level', 'bmi']])
# Train your model using the scaled data
ensemble_ml.fit(train_x, train_y)
# Create the LIME explainer using the **unscaled** training data
explainer = lime.lime_tabular.LimeTabularExplainer(
training_data=np.array(unscaled_train_x),
feature_names=unscaled_train_x.columns,
class_names=['No Stroke', 'Stroke'],
mode='classification'
)
# Select a test instance (unscaled) to explain, e.g., the first test sample
test_instance_unscaled = unscaled_test_x.iloc[0].values.reshape(1, -1)
# LIME perturbs the unscaled instance, so the perturbed samples must be
# scaled before reaching the model, which was trained on scaled data
def predict_fn(data):
    df = pd.DataFrame(data, columns=unscaled_train_x.columns)
    df[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(
        df[['age', 'avg_glucose_level', 'bmi']])
    return ensemble_ml.predict_proba(df)
# Generate the explanation for the unscaled test instance
explanation = explainer.explain_instance(
    test_instance_unscaled[0], predict_fn)
# Plot the LIME explanation
fig = explanation.as_pyplot_figure()
plt.title('LIME Explanation for Stroke Prediction', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
Conclusion¶
The LIME explanation visualizes the contribution of individual features to the model’s stroke prediction for a specific instance:
- Green bars represent features that push the prediction towards 'Stroke' (class 1) in LIME's default colour scheme.
- Red bars represent features that push the prediction towards 'No Stroke' (class 0).
- The length of each bar indicates the magnitude of the feature's contribution to the prediction.
Overall conclusion
- Age and BMI had the largest negative influence on predicting a stroke.
- Hypertension contributed positively towards predicting a stroke.
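The same weights can be read numerically via LIME's as_list() method, which removes any ambiguity about the colour coding:
# (condition, weight) pairs, sorted by absolute contribution;
# positive weights push towards 'Stroke', negative towards 'No Stroke'
for feature_rule, weight in explanation.as_list():
    direction = 'Stroke' if weight > 0 else 'No Stroke'
    print(f"{feature_rule:40s} {weight:+.4f} -> {direction}")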
AUC-ROC Curves for the Ensemble Model¶
The ROC curve is a graphical representation of the performance of a classification model at different threshold levels. It plots the True Positive Rate (TPR) against the False Positive Rate (FPR).
# Get the predicted probabilities
# for the test set from the ensemble model
y_pred_proba = ensemble_ml.predict_proba(
test_x)[:, 1] # Probabilities for class 1 (Stroke)
# Calculate the FPR, TPR, and threshold values for the ROC curve
fpr, tpr, thresholds = roc_curve(test_y, y_pred_proba)
# Calculate the AUC score
roc_auc = roc_auc_score(test_y, y_pred_proba)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='orange',
label=f'Ensemble ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--') # Baseline
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Ensemble Model')
plt.legend(loc='lower right')
plt.show()
Conclusion:¶
The AUC-ROC curve provides insights into the overall performance of the ensemble model:
- The AUC score is 0.80, which indicates that the ensemble model has a good ability to distinguish between stroke and non-stroke cases.
- The ROC curve moves away from the diagonal (random chance), suggesting that the model has a reasonable performance in terms of balancing true positives and false positives.
The orange curve shows how the model performs across different thresholds, and the closer it is to the top-left corner, the better the model is at distinguishing between classes.
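Because roc_curve already returns the candidate thresholds, the cutoff maximizing Youden's J statistic (TPR - FPR) can be read straight from the arrays computed above; a short sketch:
# Youden's J = TPR - FPR; its maximum marks the threshold with the best
# trade-off between sensitivity and specificity on this test set
j_scores = tpr - fpr
best_idx = np.argmax(j_scores)
print(f"Best threshold by Youden's J: {thresholds[best_idx]:.3f} "
      f"(TPR = {tpr[best_idx]:.2f}, FPR = {fpr[best_idx]:.2f})")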
Model Deployment: FastAPI¶
The trained ensemble is serialized with joblib so that it can later be served from a FastAPI application.
import joblib
# Alias the trained ensemble under its deployment name
fco_model_01 = ensemble_ml
# Save the trained ensemble model to disk
joblib.dump(fco_model_01, 'fco_model_01.pkl')
['fco_model_01.pkl']
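A minimal serving sketch follows, illustrative rather than the project's actual deployment code. It assumes requests arrive already encoded exactly as the columns of train_x, that the fitted StandardScaler has also been dumped (scaling happens outside the estimator, so the model alone is not enough), and that the field list shown is a placeholder for the full schema.
import joblib
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
model = joblib.load('fco_model_01.pkl')
scaler = joblib.load('scaler.pkl')  # hypothetical file: dump the fitted scaler too

class PatientFeatures(BaseModel):
    # Illustrative subset; the real schema must mirror train_x's encoded columns
    age: float
    hypertension: int
    heart_disease: int
    avg_glucose_level: float
    bmi: float

@app.post('/predict')
def predict(features: PatientFeatures):
    df = pd.DataFrame([features.dict()])
    df[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(
        df[['age', 'avg_glucose_level', 'bmi']])
    proba = model.predict_proba(df)[0, 1]
    # Apply the same 0.28 decision threshold used during evaluation
    return {'stroke_probability': float(proba),
            'stroke_predicted': bool(proba >= 0.28)}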
Final Conclusion¶
Overall, this project demonstrated the challenges and intricacies of working with imbalanced datasets in a medical setting, where correctly identifying rare events (such as strokes) is critical. Recall for stroke prediction improved, but further gains could come from more advanced resampling techniques, feature engineering, and possibly deep learning methods. The results show that a combination of model selection, hyperparameter tuning, threshold adjustment, and ensemble learning can meaningfully raise minority-class recall, even though stroke detection on unseen data remains the hardest part of the problem.
Suggested Improvements¶
- Advanced Feature Engineering:
- Explore creating new features based on domain knowledge, such as interactions between age, hypertension, and BMI. This could improve the model’s ability to detect complex patterns related to stroke risk.
- Class Weight Adjustment:
- Further explore the use of class weights in models like Random Forest, XGBoost, and LightGBM to emphasize the minority class (stroke=1). Class-weighting strategies could improve recall and F1-scores for stroke predictions by handling the imbalance more directly (see the sketch after this list).
- Resampling Techniques:
- Test different resampling strategies like ADASYN, SMOTE-ENN, or Random Over/Under-sampling to balance the dataset. Combining oversampling with undersampling (e.g., SMOTE-Tomek) might yield better performance for the minority class; this combination is also sketched below.
- Threshold Optimization Techniques:
- Explore techniques such as Optimal Cutoff Point Analysis or Youden's Index to find the threshold that best balances sensitivity and specificity for stroke prediction (a Youden's J sketch appears in the ROC section above). This could help fine-tune the model's decision-making.
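As an illustration of the SMOTE-Tomek and class-weight suggestions above, a minimal sketch using the imbalanced-learn package (an added dependency, not imported elsewhere in this notebook), applied to the training split:
from collections import Counter
from imblearn.combine import SMOTETomek

# Oversample the minority class with SMOTE, then remove ambiguous
# majority/minority pairs with Tomek links
smote_tomek = SMOTETomek(random_state=42)
resampled_x, resampled_y = smote_tomek.fit_resample(train_x, train_y)
print('Class distribution after SMOTE-Tomek:', Counter(resampled_y))

# Class weighting is an alternative that avoids resampling entirely
weighted_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
weighted_rf.fit(train_x, train_y)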