Table of Contents¶
- Introduction
- Notebook settings
- Data Loading, Cleaning and Initial Exploration
- Data Preparation
- EDA
- Statistical Inference and Hypothesis Testing
- Model Set-Up, Testing and Evaluation
- Applying Hyperparameter Tuning
- Model Set-Up, Testing and Evaluation - After Hyperparameter Tuning
- Comparison of Cross Validation: Before and After Hyperparameter Tuning
- Final Model Selection and Evaluation
- Suggested Improvements
Introduction¶
The goal of this analysis is to predict whether a customer will purchase travel insurance based on their demographic and behavioral data. This analysis will involve data cleaning, exploratory data analysis (EDA), statistical inference, and building supervised machine learning models.
About Dataset
Context
- A tour & travels company is offering a travel insurance package to its customers.
- The new insurance package also includes Covid cover.
- The company wants to know which customers would be interested in buying it, based on its database history.
- The insurance was offered to some of the customers in 2019, and the given data has been extracted from the performance/sales of the package during that period.
- The data covers almost 2000 previous customers, and the task is to build an intelligent model that can predict whether a customer will be interested in buying the travel insurance package, based on the parameters given below.
Dataset Structure¶
| Column | Description | Datatype | Count |
|---|---|---|---|
| Age | Age of the customer | int64 | 1987 |
| Employment Type | The sector in which the customer is employed | object | 1987 |
| GraduateOrNot | Whether the customer is a college graduate or not | object | 1987 |
| AnnualIncome | The yearly income of the customer in Indian Rupees [rounded to the nearest 50 thousand Rupees] | int64 | 1987 |
| FamilyMembers | Number of members in the customer's family | int64 | 1987 |
| ChronicDiseases | Whether the customer suffers from a major disease or condition like diabetes, high blood pressure, or asthma | int64 | 1987 |
| FrequentFlyer | Derived from the customer's history of booking air tickets on at least 4 different instances in the last 2 years [2017-2019] | object | 1987 |
| EverTravelledAbroad | Has the customer ever travelled to a foreign country [not necessarily using the company's services] | object | 1987 |
Target variable:
| Column | Description | Datatype | Count |
|---|---|---|---|
| TravelInsurance | Did the customer buy the travel insurance package during the introductory offering held in 2019 | int64 | 1987 |
Approach Overview¶
Target Variable (TravelInsurance): The target variable is categorical, indicating whether the customer bought travel insurance or not. It has two distinct values, 0 = No and 1 = Yes, which corresponds to a binary classification problem.
Nature of the Problem: The problem involves determining which customers belong to the “purchased” category versus the “not purchased” category, which clearly fits into a classification problem.
Summary
Approach: Classification
Model Type: Binary classification (since the target variable has two possible outcomes: 0 or 1).
Classification algorithms such as Logistic Regression, Decision Trees, Random Forests, and K-Nearest Neighbors (KNN) could be used to approach this.
Notebook settings¶
from assets.utils.functions import *
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sidetable as stb
from skimpy import skim
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.proportion import proportion_confint
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV)
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score, roc_curve)
import warnings
%load_ext pycodestyle_magic
%pycodestyle_on
%flake8_on --max_line_length 79
%matplotlib inline
# Suppress the specific FutureWarning (Sidetable)
warnings.filterwarnings("ignore", category=FutureWarning)
Data Loading, Cleaning and Initial Exploration¶
travel_df = pd.read_csv("assets/data/TravelInsurancePrediction.csv")
travel_df.head()
| Unnamed: 0 | Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
| 1 | 1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
| 2 | 2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
| 3 | 3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
| 4 | 4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
skim(travel_df)
skimpy summary
Data Summary: 1987 rows, 10 columns. Data Types: int64 (6), string (4).
number:
| column_name | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 0 | 0 | 993 | 573.7 | 0 | 496.5 | 993 | 1490 | 1986 |
| Age | 0 | 0 | 29.65 | 2.913 | 25 | 28 | 29 | 32 | 35 |
| AnnualIncome | 0 | 0 | 932800 | 376900 | 300000 | 600000 | 900000 | 1250000 | 1800000 |
| FamilyMembers | 0 | 0 | 4.753 | 1.61 | 2 | 4 | 5 | 6 | 9 |
| ChronicDiseases | 0 | 0 | 0.2778 | 0.448 | 0 | 0 | 0 | 1 | 1 |
| TravelInsurance | 0 | 0 | 0.3573 | 0.4793 | 0 | 0 | 0 | 1 | 1 |
string:
| column_name | NA | NA % | words per row | total words |
|---|---|---|---|---|
| Employment Type | 0 | 0 | 2.7 | 5391 |
| GraduateOrNot | 0 | 0 | 1 | 1987 |
| FrequentFlyer | 0 | 0 | 1 | 1987 |
| EverTravelledAbroad | 0 | 0 | 1 | 1987 |
# Drop irrelevant columns
travel_df.drop(columns=["Unnamed: 0"], inplace=True)
# Dataframe info and checks
print(travel_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Age                  1987 non-null   int64
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64
 4   FamilyMembers        1987 non-null   int64
 5   ChronicDiseases      1987 non-null   int64
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64
dtypes: int64(5), object(4)
memory usage: 139.8+ KB
None
check_blank_or_whitespace(travel_df)
Count of empty strings or single spaces per column:
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64
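check_blank_or_whitespace comes from the project's assets.utils.functions; only the name and output appear in the source, so the body below is a hypothetical sketch of what such a helper presumably does:

# Hypothetical sketch of the imported helper: count values that are empty
# strings or whitespace-only strings in each column.
def check_blank_or_whitespace(df):
    counts = df.apply(
        lambda col: col.astype(str).str.fullmatch(r'\s*').sum()
        if col.dtype == object else 0)
    print("Count of empty strings or single spaces per column:")
    print(counts)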
travel_df.nunique()
Age                    11
Employment Type         2
GraduateOrNot           2
AnnualIncome           30
FamilyMembers           8
ChronicDiseases         2
FrequentFlyer           2
EverTravelledAbroad     2
TravelInsurance         2
dtype: int64
# Check Numerical features statistics
print("Summary Stats (Numerical):")
travel_df.describe(include=np.number).T.round(2)
Summary Stats (Numerical):
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 1987.0 | 29.65 | 2.91 | 25.0 | 28.0 | 29.0 | 32.0 | 35.0 |
| AnnualIncome | 1987.0 | 932762.96 | 376855.68 | 300000.0 | 600000.0 | 900000.0 | 1250000.0 | 1800000.0 |
| FamilyMembers | 1987.0 | 4.75 | 1.61 | 2.0 | 4.0 | 5.0 | 6.0 | 9.0 |
| ChronicDiseases | 1987.0 | 0.28 | 0.45 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| TravelInsurance | 1987.0 | 0.36 | 0.48 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
# Check Object/categorical features statistics
print("Summary Stats (Categorical):")
travel_df.describe(include="object").T.round(2)
Summary Stats (Categorical):
| count | unique | top | freq | |
|---|---|---|---|---|
| Employment Type | 1987 | 2 | Private Sector/Self Employed | 1417 |
| GraduateOrNot | 1987 | 2 | Yes | 1692 |
| FrequentFlyer | 1987 | 2 | No | 1570 |
| EverTravelledAbroad | 1987 | 2 | No | 1607 |
travel_df.stb.freq(['TravelInsurance'], cum_cols=False)
| TravelInsurance | count | percent | |
|---|---|---|---|
| 0 | 0 | 1277 | 64.26774 |
| 1 | 1 | 710 | 35.73226 |
A first look at the target variable shows that a majority of people chose not to buy travel insurance, although a significant portion, over a third, did opt to purchase it.
Checking for duplicates¶
# Prints only subsequent occurrences of duplicates, ignoring the first one
print(f"\nNumber of duplicate rows: {travel_df.duplicated().sum()}")
Number of duplicate rows: 738
We identified 738 fully duplicated rows in the dataset. Duplicates may represent either different customers with identical characteristics or the same customer appearing multiple times. Handling them carelessly could bias the model, particularly given the small dataset size (1,987 records): repeated rows can inflate apparent patterns, encourage overfitting, and hurt generalization on unseen data.
Removing duplicates based on the demographic and behavioral features (e.g., Age, Employment Type, Income, Family Members) would reduce redundancy and let the model train on more diverse, meaningful data, but at the cost of shrinking an already small dataset.
To decide whether the duplicates should be removed, we analyze the frequency of duplicate occurrences at various thresholds, as per the code results below (a sketch of the counting logic follows the code):
# Check duplicates by ages and count_thresholds
features_to_check = ['Age', 'Employment Type', 'GraduateOrNot', 'AnnualIncome',
'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer',
'EverTravelledAbroad']
count_thresholds = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
duplicate_occurrences(travel_df, features_to_check, count_thresholds,
suppress_output=True)
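duplicate_occurrences is another project helper; its exact semantics are not shown in the source, so the following is an assumed sketch of the counting step it presumably performs before plotting:

# Hypothetical sketch: for each threshold t, count the redundant rows
# (every occurrence after the first) in feature combinations that repeat
# at least t times.
def duplicate_occurrences_sketch(df, features, thresholds):
    sizes = df.groupby(features)[features[0]].transform('size')
    flagged = df.duplicated(subset=features, keep='first')
    for t in thresholds:
        dup_rows = int((flagged & (sizes >= t)).sum())
        print(f"threshold {t}: {dup_rows} rows "
              f"({100 * dup_rows / len(df):.2f}% of the dataset)")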
Decision Regarding Duplicates¶
The above plot shows the impact of different thresholds on the removal of duplicate records within the dataset. The dataset contains 1,987 records, and as the threshold increases, the number of duplicates decreases significantly, as illustrated by both the red line (indicating the count of duplicates) and the blue line (indicating the percentage of the dataset affected).
- At a lower threshold (e.g., threshold 2), removing duplicates would result in a 44.59% reduction in the dataset, eliminating 886 records.
- By the time the threshold reaches 9, the number of duplicates removed drops to 46 records, impacting only 2.32% of the dataset.
Given that the dataset is relatively small (only 1,987 records), removing a significant portion of it might reduce the diversity of the data and negatively impact the model's generalizability. While removing duplicates could potentially help improve the performance of the machine learning model by eliminating redundant or noisy data, the gains from such an enhancement may not be substantial enough to justify the loss of valuable data points.
Furthermore, as the percentage of duplicates decreases significantly at higher thresholds, the improvement in model performance from duplicate removal is likely to diminish. With only a small percentage of duplicates at higher thresholds, the impact on the model’s performance might be minimal.
In conclusion, while duplicate removal might have some benefit, in this particular case, the small size of the dataset and the diminishing returns at higher thresholds suggest that retaining duplicates is a reasonable decision. This ensures that the dataset remains as robust and diverse as possible, preserving valuable information.
Data Preparation¶
Grouping:¶
Since binning the continuous variables helps the analysis, I decided to group each of them into three categories:
- Age: Grouped into 3 distinct bins, as 25-27, 28-30, and 31-35 for easier comparison across age ranges.
- Annual Income: Divided into 3 distinct bins, as 300K-800K, 800K-1.3M, and 1.3M-1.8M income brackets.
- Family Members: Grouped into 3 distinct bins, as 2-3, 4-5, and 6-9 members to simplify family size analysis.
This grouping will be used in the EDA section to provide a clearer picture of the data distribution and relationships between variables.
# Note: with right=False, pd.cut excludes the final edge, so the upper
# bounds are set just past the observed maxima (35, 1.8M, 9); otherwise
# the rows holding those maximum values would fall into NaN.
# Group Age
age_bins = [25, 28, 31, 36]  # Age groups
age_labels = ['25-27', '28-30', '31-35']  # Labels
travel_df['AgeGroup'] = pd.cut(travel_df['Age'],
                               bins=age_bins,
                               labels=age_labels,
                               right=False)
# Group AnnualIncome
income_bins = [300000, 800000, 1300000, 1800001]  # Income groups
income_labels = ['300K-800K', '800K-1.3M', '1.3M-1.8M']  # Labels
travel_df['IncomeGroup'] = pd.cut(travel_df['AnnualIncome'],
                                  bins=income_bins,
                                  labels=income_labels,
                                  right=False)
# Group FamilyMembers
family_bins = [2, 4, 6, 10]  # Family groups
family_labels = ['2-3', '4-5', '6-9']  # Labels
travel_df['FamilyGroup'] = pd.cut(travel_df['FamilyMembers'],
                                  bins=family_bins,
                                  labels=family_labels,
                                  right=False)
# Check the new grouped columns
print(travel_df.head().to_string())
   Age               Employment Type GraduateOrNot  AnnualIncome  FamilyMembers  ChronicDiseases FrequentFlyer EverTravelledAbroad  TravelInsurance AgeGroup IncomeGroup FamilyGroup
0   31             Government Sector           Yes        400000              6                1            No                  No                0    31-35   300K-800K         6-9
1   31  Private Sector/Self Employed           Yes       1250000              7                0            No                  No                0    31-35   800K-1.3M         6-9
2   34  Private Sector/Self Employed           Yes        500000              4                1            No                  No                1    31-35   300K-800K         4-5
3   28  Private Sector/Self Employed           Yes        700000              3                1            No                  No                0    28-30   300K-800K         2-3
4   28  Private Sector/Self Employed           Yes        700000              8                1           Yes                  No                0    28-30   300K-800K         6-9
Dtype changes - object to category¶
Categorical dtypes are more memory-efficient than object dtypes, because each value is stored as a small integer code pointing into a lookup table of unique categories. Accordingly, we convert the object columns to category where appropriate, reducing the DataFrame's memory usage; a quick comparison follows the dtype check below.
# Convert categorical columns to 'category' dtype
categorical_columns = ['Employment Type', 'GraduateOrNot',
'FrequentFlyer', 'EverTravelledAbroad']
travel_df[categorical_columns] = travel_df[
categorical_columns].astype('category')
# Verify changes
print(travel_df.dtypes)
Age                       int64
Employment Type        category
GraduateOrNot          category
AnnualIncome              int64
FamilyMembers             int64
ChronicDiseases           int64
FrequentFlyer          category
EverTravelledAbroad    category
TravelInsurance           int64
AgeGroup               category
IncomeGroup            category
FamilyGroup            category
dtype: object
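As a quick, illustrative check of the saving (not part of the original notebook), we can compare the deep memory usage of the converted columns against their object equivalents:

# Illustrative memory comparison: object strings vs. category codes
obj_bytes = (travel_df[categorical_columns].astype(object)
             .memory_usage(deep=True).sum())
cat_bytes = travel_df[categorical_columns].memory_usage(deep=True).sum()
print(f"object dtype:   {obj_bytes} bytes")
print(f"category dtype: {cat_bytes} bytes")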
EDA¶
plot_univariate(travel_df)
Summary¶
The dataset shows a diverse range of distributions across the features, with some features (such as Age, AnnualIncome, and FamilyMembers) exhibiting multimodal or skewed distributions. Other categorical features (like Employment Type, GraduateOrNot, and TravelInsurance) show more pronounced differences between categories, with the majority of the population concentrated in one category. This information helps in understanding the underlying population and identifying trends that may be relevant for further analysis.
plot_hist_categorical(travel_df)
Summary¶
- GraduateOrNot:
- A higher percentage of graduates (Yes) purchased travel insurance (30.7%) compared to non-graduates (No, 5.0%).
- However, the majority of both graduates and non-graduates did not purchase travel insurance.
- FrequentFlyer:
- Frequent flyers (Yes) are more likely to purchase travel insurance (12.0%) compared to those who are not frequent flyers (No, 9.0%).
- Still, a significant portion of frequent flyers did not purchase travel insurance.
- EverTravelledAbroad:
- Individuals who have traveled abroad are more likely to purchase travel insurance (15.0%) compared to those who have not (4.1%).
- Despite this, the majority of people, both those who have and haven’t traveled abroad, did not purchase insurance.
- ChronicDiseases:
- People without chronic diseases are more likely to purchase travel insurance (25.4%) compared to those with chronic diseases (10.3%).
- However, again, a large portion of both groups did not purchase insurance.
Across all categories, a consistent trend emerges: those with more travel-related experience or fewer health concerns (graduates, frequent flyers, those who have traveled abroad, and those without chronic diseases) are somewhat more likely to purchase travel insurance, while the overall share of purchasers remains relatively low in every group, indicating that other factors are probably also influencing the decision. Note that the percentages quoted above are shares of the full dataset rather than within-group rates, so they partly reflect how large each group is; the quick within-group check below gives a cleaner comparison of propensity.
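A minimal sketch of that within-group check (an illustrative addition, reusing the column names above):

# Within-group purchase rates: the share of each category that bought
# insurance, which controls for differing group sizes.
for col in ['GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad',
            'ChronicDiseases']:
    print(travel_df.groupby(col, observed=True)['TravelInsurance']
          .mean().round(3), '\n')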
Analysing grouped features¶
# Grouped pie plots
pieplot_group(travel_df, ['AgeGroup', 'IncomeGroup', 'FamilyGroup'],
['Age Group Distribution', 'Income Group Distribution',
'Family Group Distribution'])
Summary¶
The dataset primarily consists of:
- individuals aged 28-35 years,
- incomes in the range of 800K-1.3M, and
- family sizes of 4-5 members.
This distribution helps to define the central characteristics of the population being analyzed.
Now we want to see how the target variable (TravelInsurance) is distributed across these grouped features. This will help us understand the relationship between the target variable and the demographic/behavioral features.
Analysing grouped features with target variable(TravelInsurance)¶
# Grouped columns and target variable
grouped_columns = ['AgeGroup', 'IncomeGroup', 'FamilyGroup']
target_column = 'TravelInsurance'
# Bivariate relationships between grouped features and TravelInsurance
bivariate_countplots(travel_df, grouped_columns, target_column)
Summary¶
Age Group vs. Travel Insurance Insights: Individuals in the older age groups (28-30 and 31-35) are slightly more likely not to purchase travel insurance, while the younger group (25-27) is more evenly split between purchasing and not purchasing.
Income Group vs. Travel Insurance Insights: The highest income group (1.3M-1.8M) tends to purchase travel insurance more often, while the lower income group (300K-800K) is less likely to do so; the middle-income group (800K-1.3M) follows the lower group's trend.
Family Group vs. Travel Insurance Insights: Larger families (6-9 members) seem more inclined to purchase travel insurance, while smaller families (2-3 members) are less likely to do so; the middle group (4-5 members) likewise tends not to purchase.
Outlier Detection and Visualization¶
# Continuous columns for boxplots
continuous_columns = ['Age', 'AnnualIncome', 'FamilyMembers']
# Create subplots for boxplots of continuous columns
fig, axes = plt.subplots(1, len(continuous_columns), figsize=(18, 6))
for i, col in enumerate(continuous_columns):
sns.boxplot(y=travel_df[col], ax=axes[i], color='skyblue')
axes[i].set_title(f'Boxplot for {col}')
plt.tight_layout()
plt.show()
Conclusion¶
- Outliers: None of the variables display outliers, as no points fall beyond the whiskers (a numeric cross-check follows below).
- Distribution: All three variables appear roughly symmetrical across the quartiles (IQR).
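A short numeric confirmation of that boxplot reading (an illustrative addition, reusing continuous_columns from above):

# Count points outside the 1.5*IQR whiskers for each continuous column
for col in continuous_columns:
    q1, q3 = travel_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((travel_df[col] < lower) | (travel_df[col] > upper)).sum()
    print(f"{col}: {n_out} points outside [{lower}, {upper}]")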
Encoding and Correlation Analysis¶
The next step is to create a new DataFrame with the categorical variables encoded as numbers. Encoding is a vital preprocessing step in preparing categorical variables for machine learning: it transforms the dataset into a numerical format suitable for ML algorithms while preserving the categorical information, enabling efficient and accurate model training.
# Encoding categorical columns into numerical
encoded_df = travel_df.copy()
encoded_df = encoded_df.drop(columns=['AgeGroup', 'IncomeGroup',
'FamilyGroup'])
# 'Yes'/'No' columns map directly to 1/0
yes_no_columns = ['GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']
for col in yes_no_columns:
    encoded_df[col] = encoded_df[col].map({'Yes': 1, 'No': 0}).astype(int)
# ChronicDiseases is already stored as 0/1 integers, so no mapping is
# needed (mapping it with 'Yes'/'No' keys would produce NaN)
encoded_df['Employment Type'] = encoded_df[
    'Employment Type'].map({'Government Sector': 0,
                            'Private Sector/Self Employed': 1}).astype(int)
# Pairplot to visualize relationships
sns.pairplot(encoded_df, hue='TravelInsurance')
plt.show()
Highlight points:
Age x EverTravelledAbroad:
- From age 28 onward, all customers who have traveled abroad purchased the insurance.
Employment Type x FrequentFlyer:
- Among private-sector/self-employed customers, only those who are not frequent flyers did not purchase the insurance. This suggests that frequent travelers recognize the value of travel insurance and are more likely to purchase it. The same pattern appears for EverTravelledAbroad with Employment Type: private-sector customers who have not traveled abroad did not purchase the insurance. The logic behind this could be that people who have traveled internationally are more aware of the risks involved and of the benefits of having travel insurance, making them more likely to purchase it.
AnnualIncome:
- In the pairplot, there is some visible separation between those who purchased travel insurance (1 in orange) and those who did not (0 in blue). Specifically, higher income levels seem to have a higher proportion of individuals who bought travel insurance. This indicates that income might be a factor influencing the decision to buy travel insurance.
Association with Travel Insurance: These features directly relate to behaviors and factors that logically influence the need for travel insurance:
- Income affects one's ability to purchase additional services like insurance.
- FrequentFlyer status might indicate more travel, hence a higher perceived need for insurance.
- EverTravelledAbroad implies exposure to international travel, where insurance is often considered more essential.
encoded_df.head()
| Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 0 | 1 | 400000 | 6 | 1 | 0 | 0 | 0 |
| 1 | 31 | 1 | 1 | 1250000 | 7 | 0 | 0 | 0 | 0 |
| 2 | 34 | 1 | 1 | 500000 | 4 | 1 | 0 | 0 | 1 |
| 3 | 28 | 1 | 1 | 700000 | 3 | 1 | 0 | 0 | 0 |
| 4 | 28 | 1 | 1 | 700000 | 8 | 1 | 1 | 0 | 0 |
Correlation Analysis¶
# Correlation matrix and heatmap
corr_matrix = encoded_df.corr(method='spearman')
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, mask=mask,
annot=True, fmt=".2%",
cmap='coolwarm', square=True, linewidths=.5)
plt.title('Correlation Matrix')
plt.show()
Multicollinearity Check (VIF)¶
# Checking for multicollinearity using VIF
vif_result = calculate_vif(encoded_df.drop(columns='TravelInsurance'))
print(vif_result)
               Feature       VIF
0                  Age  1.015369
1      Employment Type  1.191133
2        GraduateOrNot  1.050756
3         AnnualIncome  1.570125
4        FamilyMembers  1.002788
5      ChronicDiseases  1.004206
6        FrequentFlyer  1.171943
7  EverTravelledAbroad  1.334016
Summary:
Overall, all VIF values are well below the threshold of 5, indicating that multicollinearity is not a significant issue in this dataset.
Each feature is relatively independent, allowing me to include them in the analysis without major concerns about redundancy or inflated variance.
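calculate_vif is imported from the project's utils; a plausible sketch of it, built on the statsmodels function already imported in the settings cell (only the name and output format come from the source, the body is an assumption):

# Hypothetical reimplementation of calculate_vif
from statsmodels.tools.tools import add_constant

def calculate_vif(X):
    Xc = add_constant(X)  # include an intercept, as VIF convention expects
    vifs = [variance_inflation_factor(Xc.values, i)
            for i in range(1, Xc.shape[1])]  # skip the constant column
    return pd.DataFrame({'Feature': X.columns, 'VIF': vifs})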
Statistical Inference and Hypothesis Testing¶
# Hypothesis Testing - T-Test on Annual Income
t_stat, p_value = ttest_ind(
travel_df[travel_df['TravelInsurance'] == 1]['AnnualIncome'],
travel_df[travel_df['TravelInsurance'] == 0]['AnnualIncome']
)
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")
if p_value < 0.05:
print("Reject the null hypothesis - significant difference in annual "
"income between the two groups.")
else:
print("Fail to reject the null hypothesis - no significant difference in "
"annual income between the two groups.")
T-Statistic: 19.257785253687544, P-Value: 6.63501776035329e-76
Reject the null hypothesis - significant difference in annual income between the two groups.
# Hypothesis Testing - Chi-Square on Frequent Flyer status on Travel Insurance
contingency_table_ff = pd.crosstab(
travel_df['FrequentFlyer'], travel_df['TravelInsurance'])
chi2_ff, p_value_ff, dof_ff, expected_ff = chi2_contingency(
contingency_table_ff)
print(f"Chi-Square Statistic for Frequent Flyer: {chi2_ff}, "
f"P-Value: {p_value_ff}")
if p_value_ff < 0.05:
print("Reject the null hypothesis - significant association between "
"Frequent Flyer status and Travel Insurance.")
else:
print("Fail to reject the null hypothesis - no significant association "
"between Frequent Flyer status and Travel Insurance.")
Chi-Square Statistic for Frequent Flyer: 105.85723074203977, P-Value: 7.924360415064537e-25
Reject the null hypothesis - significant association between Frequent Flyer status and Travel Insurance.
# Hypothesis Testing - T-Test on 'Age'
t_stat_age, p_value_age = ttest_ind(
travel_df[travel_df['TravelInsurance'] == 1]['Age'],
travel_df[travel_df['TravelInsurance'] == 0]['Age']
)
print(f"T-Statistic for Age: {t_stat_age}, P-Value: {p_value_age}")
if p_value_age < 0.05:
print("Reject the null hypothesis - significant difference in Age between "
"the two groups.")
else:
print("Fail to reject the null hypothesis - no significant difference in "
"Age between the two groups.")
T-Statistic for Age: 2.7255058475523968, P-Value: 0.006476683978749601
Reject the null hypothesis - significant difference in Age between the two groups.
Hypothesis Testing Analysis
Annual Income:
- The T-test shows a highly significant difference in annual income between customers who purchased travel insurance and those who did not, indicating that income plays a crucial role in the decision to buy travel insurance.
Frequent Flyer Status:
- The Chi-Square test reveals a strong association between frequent flyer status and the likelihood of purchasing travel insurance. Frequent flyers are significantly more likely to purchase insurance, highlighting the impact of travel habits.
Age:
- The T-test for age shows a significant difference between the two groups, suggesting that age is also an influential factor in the decision to purchase travel insurance, albeit less pronounced than income or frequent flyer status.
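Two hedged notes worth adding here: scipy's ttest_ind assumes equal variances by default (passing equal_var=False gives Welch's test as a robustness check), and p-values alone do not convey the magnitude of a difference. A quick effect-size sketch for the income difference (illustrative, not part of the original analysis):

# Cohen's d for AnnualIncome between buyers and non-buyers
buyers = travel_df.loc[travel_df['TravelInsurance'] == 1, 'AnnualIncome']
others = travel_df.loc[travel_df['TravelInsurance'] == 0, 'AnnualIncome']
pooled_sd = np.sqrt(
    ((len(buyers) - 1) * buyers.std(ddof=1) ** 2 +
     (len(others) - 1) * others.std(ddof=1) ** 2)
    / (len(buyers) + len(others) - 2))
cohens_d = (buyers.mean() - others.mean()) / pooled_sd
print(f"Cohen's d for AnnualIncome: {cohens_d:.2f}")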
Model Set-Up, Testing and Evaluation¶
Train-Test Split and Standardization¶
# Train-Test Split
X = encoded_df.drop(columns='TravelInsurance')
y = encoded_df['TravelInsurance']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
# Standardizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Model Training and Evaluation¶
# Initialize models
log_reg = LogisticRegression(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
knn_model = KNeighborsClassifier()
# Train models
log_reg.fit(X_train_scaled, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
knn_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred_logreg_train = log_reg.predict(X_train_scaled)
y_pred_dt_train = dt_model.predict(X_train)
y_pred_rf_train = rf_model.predict(X_train)
y_pred_knn_train = knn_model.predict(X_train_scaled)
# Predictions on test set
y_pred_logreg_test = log_reg.predict(X_test_scaled)
y_pred_dt_test = dt_model.predict(X_test)
y_pred_rf_test = rf_model.predict(X_test)
y_pred_knn_test = knn_model.predict(X_test_scaled)
# Evaluate models using classification reports
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_logreg_test))
print("Decision Tree Performance:")
print(classification_report(y_test, y_pred_dt_test))
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf_test))
print("K-Nearest Neighbors Performance:")
print(classification_report(y_test, y_pred_knn_test))
Logistic Regression Performance:
precision recall f1-score support
0 0.77 0.92 0.84 257
1 0.77 0.50 0.60 141
accuracy 0.77 398
macro avg 0.77 0.71 0.72 398
weighted avg 0.77 0.77 0.75 398
Decision Tree Performance:
precision recall f1-score support
0 0.83 0.89 0.86 257
1 0.78 0.66 0.71 141
accuracy 0.81 398
macro avg 0.80 0.78 0.79 398
weighted avg 0.81 0.81 0.81 398
Random Forest Performance:
precision recall f1-score support
0 0.82 0.91 0.86 257
1 0.79 0.65 0.71 141
accuracy 0.81 398
macro avg 0.81 0.78 0.79 398
weighted avg 0.81 0.81 0.81 398
K-Nearest Neighbors Performance:
precision recall f1-score support
0 0.78 0.89 0.83 257
1 0.74 0.55 0.63 141
accuracy 0.77 398
macro avg 0.76 0.72 0.73 398
weighted avg 0.77 0.77 0.76 398
Overview:
Logistic Regression:
- Accuracy: 77%
- Overall: The model is more effective at identifying non-buyers than buyers, resulting in a slight imbalance in prediction accuracy across classes.
Decision Tree:
- Accuracy: 81%
- Overall: The Decision Tree model performs better overall, especially in classifying non-buyers, with a slight improvement in buyer identification compared to Logistic Regression.
Random Forest:
- Accuracy: 81%
- Overall: Random Forest offers a good balance between precision and recall, particularly strong in identifying non-buyers, with consistent performance across both classes.
K-Nearest Neighbors:
- Accuracy: 77%
- Overall: K-Nearest Neighbors performs similarly to Logistic Regression, with better precision for non-buyers but challenges in classifying buyers.
Summary:
- Best Model: The Decision Tree and Random Forest models are the most balanced, both achieving 81% accuracy and better F1-scores for class 1 than Logistic Regression and KNN.
- Challenges: All models struggle with classifying buyers (class 1), particularly in recall, which suggests further tuning or different modeling techniques might be needed to improve performance for this group.
Evaluating Model Generalization Through Training and Test Accuracy¶
In this part, I wanted to calculate and compare the training accuracy and test accuracy for four different machine learning models: Logistic Regression, Decision Tree, Random Forest, and K-Nearest Neighbors.
Training Accuracy: measures how well the model performs on the data it was trained on. A high training accuracy suggests that the model has learned the patterns in the training data well.
Test Accuracy: evaluates the model's performance on unseen data (the test set). This accuracy provides insight into how well the model generalizes to new, unseen examples.
# Calculate and print training and test accuracy
train_accuracy_logreg = accuracy_score(y_train, y_pred_logreg_train)
test_accuracy_logreg = accuracy_score(y_test, y_pred_logreg_test)
train_accuracy_dt = accuracy_score(y_train, y_pred_dt_train)
test_accuracy_dt = accuracy_score(y_test, y_pred_dt_test)
train_accuracy_rf = accuracy_score(y_train, y_pred_rf_train)
test_accuracy_rf = accuracy_score(y_test, y_pred_rf_test)
train_accuracy_knn = accuracy_score(y_train, y_pred_knn_train)
test_accuracy_knn = accuracy_score(y_test, y_pred_knn_test)
print(f"Logistic Regression - Training Accuracy: {train_accuracy_logreg:.4f}, "
f"Test Accuracy: {test_accuracy_logreg:.4f}")
print(f"Decision Tree - Training Accuracy: {train_accuracy_dt:.4f}, "
f"Test Accuracy: {test_accuracy_dt:.4f}")
print(f"Random Forest - Training Accuracy: {train_accuracy_rf:.4f}, "
f"Test Accuracy: {test_accuracy_rf:.4f}")
print(f"K-Nearest Neighbors - Training Accuracy: {train_accuracy_knn:.4f}, "
f"Test Accuracy: {test_accuracy_knn:.4f}")
Logistic Regression - Training Accuracy: 0.7753, Test Accuracy: 0.7688
Decision Tree - Training Accuracy: 0.9226, Test Accuracy: 0.8116
Random Forest - Training Accuracy: 0.9226, Test Accuracy: 0.8141
K-Nearest Neighbors - Training Accuracy: 0.8408, Test Accuracy: 0.7714
Overview
Logistic Regression:
- Insight: The model is well-balanced, with similar training and test accuracy, indicating good generalization.
Decision Tree:
- Insight: The model shows signs of overfitting, with high training accuracy but a noticeable drop in test accuracy.
Random Forest:
- Insight: Similar to the Decision Tree, though slightly better test accuracy. Potential overfitting is less pronounced due to ensemble averaging.
K-Nearest Neighbors:
- Insight: The model is relatively balanced but shows a slight overfit, with training accuracy somewhat higher than test accuracy.
Summary:
- Best Generalization: Logistic Regression and K-Nearest Neighbors.
- Potential Overfitting: Decision Tree and Random Forest models exhibit overfitting, though Random Forest is better at mitigating it.
Even though some models show potential overfitting, I decided to keep all of them in the analysis and, at the end, compare the overfitting models against their adjusted counterparts.
Confusion Matrices and ROC Curves¶
# Confusion matrices
print("Confusion Matrix - Logistic Regression:")
print(confusion_matrix(y_test, y_pred_logreg_test))
print("Confusion Matrix - Decision Tree:")
print(confusion_matrix(y_test, y_pred_dt_test))
print("Confusion Matrix - Random Forest:")
print(confusion_matrix(y_test, y_pred_rf_test))
print("Confusion Matrix - K-Nearest Neighbors:")
print(confusion_matrix(y_test, y_pred_knn_test))
# ROC Curves and AUC Scores
fpr_logreg, tpr_logreg, _ = roc_curve(
y_test, log_reg.predict_proba(X_test_scaled)[:, 1])
fpr_dt, tpr_dt, _ = roc_curve(
y_test, dt_model.predict_proba(X_test)[:, 1])
fpr_rf, tpr_rf, _ = roc_curve(
y_test, rf_model.predict_proba(X_test)[:, 1])
fpr_knn, tpr_knn, _ = roc_curve(
y_test, knn_model.predict_proba(X_test_scaled)[:, 1])
roc_auc_logreg = roc_auc_score(
y_test, log_reg.predict_proba(X_test_scaled)[:, 1])
roc_auc_dt = roc_auc_score(
y_test, dt_model.predict_proba(X_test)[:, 1])
roc_auc_rf = roc_auc_score(
y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc_knn = roc_auc_score(
y_test, knn_model.predict_proba(X_test_scaled)[:, 1])
Confusion Matrix - Logistic Regression:
[[236  21]
 [ 71  70]]
Confusion Matrix - Decision Tree:
[[230  27]
 [ 48  93]]
Confusion Matrix - Random Forest:
[[233  24]
 [ 50  91]]
Confusion Matrix - K-Nearest Neighbors:
[[230  27]
 [ 64  77]]
Summary:
- Logistic Regression:
- True Negatives (TN): 236
- False Positives (FP): 21
- False Negatives (FN): 71
- True Positives (TP): 70
Insight: The model correctly classifies most non-buyers (TN) but struggles with correctly identifying buyers (TP), resulting in a higher number of false negatives.
- Decision Tree:
- True Negatives (TN): 230
- False Positives (FP): 27
- False Negatives (FN): 48
- True Positives (TP): 93
Insight: The Decision Tree improves buyer identification (higher TP) compared to Logistic Regression, though it slightly increases the number of false positives.
- Random Forest:
- True Negatives (TN): 233
- False Positives (FP): 24
- False Negatives (FN): 50
- True Positives (TP): 91
Insight: Random Forest offers a good balance between identifying buyers and non-buyers, reducing both false positives and false negatives compared to the Decision Tree.
- K-Nearest Neighbors:
- True Negatives (TN): 230
- False Positives (FP): 27
- False Negatives (FN): 64
- True Positives (TP): 77
Insight: K-Nearest Neighbors performs similarly to Logistic Regression but slightly better at identifying buyers, though it still has a significant number of false negatives.
Overall Summary:
- Best for Identifying Buyers (TP): Decision Tree and Random Forest models.
- Best for Reducing False Negatives (FN): Decision Tree and Random Forest, with a trade-off of slightly higher false positives.
- Balanced Performance: Random Forest provides the best balance between minimizing both false positives and false negatives.
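These counts map directly onto the earlier classification reports. For class 1, $\text{precision} = \frac{TP}{TP + FP}$ and $\text{recall} = \frac{TP}{TP + FN}$; for the Random Forest, $\frac{91}{91 + 24} \approx 0.79$ and $\frac{91}{91 + 50} \approx 0.65$, matching its reported 0.79 precision and 0.65 recall for buyers.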
# Plot ROC Curves
plt.figure(figsize=(10, 6))
plt.plot(fpr_logreg, tpr_logreg,
label=f'Logistic Regression (AUC = {roc_auc_logreg:.4f})',
color='orange')
plt.plot(fpr_dt, tpr_dt,
label=f'Decision Tree (AUC = {roc_auc_dt:.4f})',
color='blue')
plt.plot(fpr_rf, tpr_rf,
label=f'Random Forest (AUC = {roc_auc_rf:.4f})',
color='green')
plt.plot(fpr_knn, tpr_knn,
label=f'K-Nearest Neighbors (AUC = {roc_auc_knn:.4f})',
color='purple')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.show()
Insights from the ROC Curve Comparison:
- AUC (Area Under the ROC Curve): Measures the ability of a model to distinguish between classes. A higher AUC value indicates better model performance in distinguishing between positive and negative classes.
- ROC (Receiver Operating Characteristic): Plots the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings.
Analysis:
- Random Forest has the highest AUC (0.7890), indicating it has the best overall performance in distinguishing between classes.
- Decision Tree follows with an AUC of 0.7663, showing it is also relatively effective, though slightly less so than Random Forest.
- K-Nearest Neighbors (KNN) has an AUC of 0.7418, performing better than Logistic Regression but still lagging behind the tree-based models.
- Logistic Regression has the lowest AUC (0.7368), suggesting it is the least effective at differentiating between the classes in this comparison.
Summary:
- Best Model: Random Forest, due to its superior AUC, making it the most reliable model in this comparison.
- Tree-Based Models: Both Decision Tree and Random Forest outperform KNN and Logistic Regression, indicating that tree-based approaches might be more suited for this dataset.
- KNN and Logistic Regression perform similarly, with Logistic Regression slightly underperforming in terms of AUC.
Confidence Intervals for Model Accuracy¶
# Calculate confidence intervals for accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn_test)
n = len(y_test) # Sample size
ci_logreg = proportion_confint(
int(accuracy_logreg * n), n, alpha=0.05, method='normal')
ci_dt = proportion_confint(
int(accuracy_dt * n), n, alpha=0.05, method='normal')
ci_rf = proportion_confint(
int(accuracy_rf * n), n, alpha=0.05, method='normal')
ci_knn = proportion_confint(
int(accuracy_knn * n), n, alpha=0.05, method='normal')
print(f"Logistic Regression Accuracy: {accuracy_logreg:.4f}, "
f"Confidence Interval: {ci_logreg}")
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}, "
f"Confidence Interval: {ci_dt}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}, "
f"Confidence Interval: {ci_rf}")
print(f"K-Nearest Neighbors Accuracy: {accuracy_knn:.4f}, "
f"Confidence Interval: {ci_knn}")
Logistic Regression Accuracy: 0.7688, Confidence Interval: (0.7274272714756453, 0.8102611707354099)
Decision Tree Accuracy: 0.8116, Confidence Interval: (0.7731379645262779, 0.8499776133631693)
Random Forest Accuracy: 0.8141, Confidence Interval: (0.7758484890777377, 0.8522922144398501)
K-Nearest Neighbors Accuracy: 0.7714, Confidence Interval: (0.7300982903647009, 0.812615277474495)
Analysis¶
Logistic Regression:
- Insight: The model’s performance is stable, with the true accuracy likely falling between 72.74% and 81.03%. It has a moderate range of uncertainty.
Decision Tree:
- Insight: The model performs well, with a tighter confidence interval, suggesting a more reliable accuracy estimate between 77.31% and 84.99%.
Random Forest:
- Insight: Similar to the Decision Tree but slightly better accuracy, with a reliable confidence interval indicating that the model is consistently accurate.
K-Nearest Neighbors:
- Insight: The model shows decent performance with a slightly wider confidence interval, reflecting some variability in accuracy but still within an acceptable range.
Summary:
- Highest Accuracy: Both the Decision Tree and Random Forest models have the highest accuracy, around 81%, with confidence intervals suggesting consistent and reliable performance.
- Logistic Regression and K-Nearest Neighbors: These models have slightly lower accuracy (around 77%), with broader confidence intervals indicating more variability in performance.
- Overall Reliability: The Decision Tree and Random Forest models provide the most reliable accuracy estimates, making them preferable for this classification task.
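For reference, the normal-approximation ("Wald") interval that proportion_confint computes with method='normal' is

$$\hat{p} \pm z_{1-\alpha/2}\,\sqrt{\frac{\hat{p}\,(1-\hat{p})}{n}}$$

For the Random Forest, $\hat{p} = 324/398 \approx 0.8141$ and $n = 398$, so with $z_{0.975} \approx 1.96$ the margin is $1.96\sqrt{0.8141 \cdot 0.1859 / 398} \approx 0.038$, reproducing the printed interval of roughly $(0.776, 0.852)$.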
Cross-Validation for More Reliable Performance Estimates¶
# Cross-validation for more reliable performance estimates
cv_logreg = cross_val_score(
log_reg, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_dt = cross_val_score(
dt_model, X_train, y_train, cv=5, scoring='accuracy')
cv_rf = cross_val_score(
rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_knn = cross_val_score(
knn_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
# Print the mean accuracy and standard deviation for each model
print(f"Logistic Regression Accuracy: Mean = {cv_logreg.mean():.4f}, "
f"Std = {cv_logreg.std():.4f}")
print(f"Decision Tree Accuracy: Mean = {cv_dt.mean():.4f}, "
f"Std = {cv_dt.std():.4f}")
print(f"Random Forest Accuracy: Mean = {cv_rf.mean():.4f}, "
f"Std = {cv_rf.std():.4f}")
print(f"K-Nearest Neighbors Accuracy: Mean = {cv_knn.mean():.4f}, "
f"Std = {cv_knn.std():.4f}")
Logistic Regression Accuracy: Mean = 0.7671, Std = 0.0219
Decision Tree Accuracy: Mean = 0.7552, Std = 0.0148
Random Forest Accuracy: Mean = 0.7785, Std = 0.0129
K-Nearest Neighbors Accuracy: Mean = 0.7842, Std = 0.0150
Analysis
Logistic Regression:
- Insight: Shows consistent performance with moderate variability, indicating stable but slightly lower accuracy compared to other models.
Decision Tree:
- Insight: Slightly lower accuracy and low variability, suggesting the model is stable but less accurate than other methods.
Random Forest:
- Insight: Offers better accuracy and low variability, making it a reliable model with good generalization.
K-Nearest Neighbors (KNN):
- Insight: KNN achieves the highest accuracy among the models, with low variability, indicating strong and consistent performance.
Conclusion:
- Best Performer: K-Nearest Neighbors (KNN) with the highest mean accuracy.
- Reliable Choices: Random Forest and KNN both show strong and consistent performance, making them preferable for this classification task.
- Logistic Regression and Decision Tree: Although slightly less accurate, these models are still stable and may be useful depending on the specific context or requirements.
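One refinement worth noting (an illustrative addition, not in the original run): X_train_scaled was produced by fitting the scaler on all of X_train, so each CV fold sees scaling statistics derived partly from its own validation rows. Wrapping the scaler and model in a Pipeline keeps every fold self-contained:

# Leak-free CV: the scaler is re-fit inside each training fold
from sklearn.pipeline import make_pipeline

cv_logreg_pipe = cross_val_score(
    make_pipeline(StandardScaler(), LogisticRegression(random_state=42)),
    X_train, y_train, cv=5, scoring='accuracy')
print(f"Logistic Regression (pipeline) CV accuracy: "
      f"Mean = {cv_logreg_pipe.mean():.4f}")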
Overfitting and Underfitting Analysis¶
# Interpretation of Overfitting/Underfitting
check_overfit_underfit(train_accuracy_logreg,
test_accuracy_logreg,
"Logistic Regression")
check_overfit_underfit(train_accuracy_dt,
test_accuracy_dt,
"Decision Tree")
check_overfit_underfit(train_accuracy_rf,
test_accuracy_rf,
"Random Forest")
check_overfit_underfit(train_accuracy_knn,
test_accuracy_knn,
"K-Nearest Neighbors")
Logistic Regression: Model is likely well-balanced.
Decision Tree: Potential overfitting detected.
Random Forest: Potential overfitting detected.
K-Nearest Neighbors: Potential overfitting detected.
Analysis
Logistic Regression:
- Insight: The model is well-balanced, indicating it generalizes well to unseen data without overfitting or underfitting.
Decision Tree:
- Insight: Overfitting is detected, suggesting the model is capturing noise from the training data, leading to reduced performance on the test data.
Random Forest:
- Insight: Despite being an ensemble model, overfitting is detected, which may occur if the trees are too deep or if too many trees are used.
K-Nearest Neighbors (KNN):
- Insight: Overfitting is detected, which could result from using too few neighbors (small k) or the model being sensitive to noise in the data.
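check_overfit_underfit is a project helper; a minimal sketch consistent with the printed verdicts above, assuming a simple train-test gap rule (the 5-point threshold is a guess, not the original value):

# Hypothetical sketch of the overfit/underfit heuristic
def check_overfit_underfit(train_acc, test_acc, model_name,
                           gap_threshold=0.05, low_acc=0.70):
    if train_acc < low_acc and test_acc < low_acc:
        print(f"{model_name}: Potential underfitting detected.")
    elif train_acc - test_acc > gap_threshold:
        print(f"{model_name}: Potential overfitting detected.")
    else:
        print(f"{model_name}: Model is likely well-balanced.")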
Applying Hyperparameter Tuning¶
Since Logistic Regression already presented a well-balanced performance, I decided to apply hyperparameter tuning to the Decision Tree, Random Forest, and K-Nearest Neighbors models to improve their performance and mitigate overfitting.
Hyperparameter Tuning - Decision Tree¶
param_grid_dt = {
'max_depth': [3, 5, 7, 10],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
dt_model = DecisionTreeClassifier(random_state=42)
# Grid Search for Decision Tree
grid_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)
print(f"Best Hyperparameters for Decision Tree: {grid_dt.best_params_}")
Best Hyperparameters for Decision Tree: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Hyperparameter Tuning - Random Forest¶
param_grid_rf = {
'n_estimators': [100, 150, 200, 250, 300],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2']
}
rf_model = RandomForestClassifier(random_state=42)
# Grid Search for Random Forest
grid_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
print(f"Best Hyperparameters for Random Forest: {grid_rf.best_params_}")
Best Hyperparameters for Random Forest: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}
Hyperparameter Tuning - K-Nearest Neighbors (KNN)¶
param_grid_knn = {
'n_neighbors': [5, 10, 20],
'weights': ['uniform', 'distance']
}
knn_model = KNeighborsClassifier()
# Grid Search for KNN
grid_knn = GridSearchCV(knn_model, param_grid_knn, cv=5, scoring='accuracy')
grid_knn.fit(X_train_scaled, y_train)
print(f"Best Hyperparameters for K-Nearest Neighbors: {grid_knn.best_params_}")
Best Hyperparameters for K-Nearest Neighbors: {'n_neighbors': 20, 'weights': 'uniform'}
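Since GridSearchCV refits the winning configuration on the full training set by default, the tuned models could also be taken directly from the search objects; the notebook instead constructs them by hand below, with settings that differ slightly from the printed optima (see the comments there):

# Alternative: use the refit best models directly
best_dt = grid_dt.best_estimator_
best_rf = grid_rf.best_estimator_
best_knn = grid_knn.best_estimator_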
# Initialize adjusted models
# Note: these hand-set values differ slightly from the grid-search optima
# printed above (grid_dt.best_params_ chose max_depth=3;
# grid_rf.best_params_ chose min_samples_split=10, min_samples_leaf=2,
# max_features='sqrt'); the results reported below correspond to the
# settings used here.
dt_model_adj = DecisionTreeClassifier(min_samples_split=10,
                                      min_samples_leaf=1,
                                      random_state=42)
rf_model_adj = RandomForestClassifier(n_estimators=150,
                                      max_depth=10,
                                      min_samples_split=2,
                                      random_state=42)
knn_model_adj = KNeighborsClassifier(n_neighbors=20,
                                     weights='uniform')
# Train the adjusted models
dt_model_adj.fit(X_train, y_train)
rf_model_adj.fit(X_train, y_train)
knn_model_adj.fit(X_train_scaled, y_train)
# Predictions on the test set with the adjusted models
y_pred_dt_adj_test = dt_model_adj.predict(X_test)
y_pred_rf_adj_test = rf_model_adj.predict(X_test)
y_pred_knn_adj_test = knn_model_adj.predict(X_test_scaled)
# Adjusted Model Evaluation
print("Decision Tree (Adjusted) Performance:")
print(classification_report(y_test, y_pred_dt_adj_test))
print("Random Forest (Adjusted) Performance:")
print(classification_report(y_test, y_pred_rf_adj_test))
print("K-Nearest Neighbors (Adjusted) Performance:")
print(classification_report(y_test, y_pred_knn_adj_test))
Decision Tree (Adjusted) Performance:
precision recall f1-score support
0 0.83 0.91 0.87 257
1 0.81 0.67 0.73 141
accuracy 0.83 398
macro avg 0.82 0.79 0.80 398
weighted avg 0.83 0.83 0.82 398
Random Forest (Adjusted) Performance:
precision recall f1-score support
0 0.81 0.99 0.89 257
1 0.98 0.57 0.72 141
accuracy 0.84 398
macro avg 0.89 0.78 0.81 398
weighted avg 0.87 0.84 0.83 398
K-Nearest Neighbors (Adjusted) Performance:
precision recall f1-score support
0 0.79 0.94 0.86 257
1 0.83 0.54 0.65 141
accuracy 0.80 398
macro avg 0.81 0.74 0.75 398
weighted avg 0.80 0.80 0.78 398
Comparison of Model Performance: Before and After Hyperparameter Tuning
- Hyperparameter tuning generally improved model performance, especially in achieving a better balance between precision and recall for the different classes.
- Random Forest and Decision Tree models showed slight improvements, but both still show potential signs of favoring the majority class.
Model Set-Up, Testing and Evaluation - After Hyperparameter Tuning¶
# Training set predictions for adjusted models
y_train_pred_dt_adj = dt_model_adj.predict(X_train)
y_train_pred_rf_adj = rf_model_adj.predict(X_train)
y_train_pred_knn_adj = knn_model_adj.predict(X_train_scaled)
# Calculate training and test accuracies for adjusted models
train_accuracy_dt_adj = accuracy_score(y_train, y_train_pred_dt_adj)
test_accuracy_dt_adj = accuracy_score(y_test, y_pred_dt_adj_test)
train_accuracy_rf_adj = accuracy_score(y_train, y_train_pred_rf_adj)
test_accuracy_rf_adj = accuracy_score(y_test, y_pred_rf_adj_test)
train_accuracy_knn_adj = accuracy_score(y_train, y_train_pred_knn_adj)
test_accuracy_knn_adj = accuracy_score(y_test, y_pred_knn_adj_test)
# Print training and test accuracies for adjusted models
print(
"Decision Tree (Adjusted) - Training Accuracy: "
f"{train_accuracy_dt_adj:.4f}, Test Accuracy: {test_accuracy_dt_adj:.4f}"
)
print(
"Random Forest (Adjusted) - Training Accuracy: "
f"{train_accuracy_rf_adj:.4f}, Test Accuracy: {test_accuracy_rf_adj:.4f}"
)
print(
"K-Nearest Neighbors (Adjusted) - Training Accuracy: "
f"{train_accuracy_knn_adj:.4f}, Test Accuracy: {test_accuracy_knn_adj:.4f}"
)
Decision Tree (Adjusted) - Training Accuracy: 0.8722, Test Accuracy: 0.8266
Random Forest (Adjusted) - Training Accuracy: 0.8697, Test Accuracy: 0.8442
K-Nearest Neighbors (Adjusted) - Training Accuracy: 0.7974, Test Accuracy: 0.7965
Comparison of Model Accuracy: Before and After Hyperparameter Tuning
After hyperparameter tuning, all models show increased test accuracy, with Random Forest performing the best at 84.42%.
Overfitting Concerns: The remaining gap between training and test accuracy for the Decision Tree (~4.6 points) and Random Forest (~2.6 points) is now small, so the pronounced overfitting seen before tuning appears largely mitigated, although both models still fit the training data slightly better than unseen data.
K-Nearest Neighbors: While the accuracy of KNN has not changed drastically, the tuning has maintained a good balance between training and test accuracy, reducing overfitting risk.
The Random Forest (Adjusted) model, with the highest test accuracy, is the best performer after tuning, suggesting it is the most reliable model for generalizing to unseen data in this context.
# Interpretation of Overfitting/Underfitting for Adjusted Models
check_overfit_underfit(train_accuracy_dt_adj,
test_accuracy_dt_adj,
"Decision Tree (Adjusted)")
check_overfit_underfit(train_accuracy_rf_adj,
test_accuracy_rf_adj,
"Random Forest (Adjusted)")
check_overfit_underfit(train_accuracy_knn_adj,
test_accuracy_knn_adj,
"K-Nearest Neighbors (Adjusted)")
Decision Tree (Adjusted): Model is likely well-balanced.
Random Forest (Adjusted): Model is likely well-balanced.
K-Nearest Neighbors (Adjusted): Model is likely well-balanced.
Confusion Matrices and ROC Curves for Adjusted Models¶
# Confusion matrices for adjusted models
print("Confusion Matrix - Decision Tree (Adjusted):")
print(confusion_matrix(y_test, y_pred_dt_adj_test))
print("Confusion Matrix - Random Forest (Adjusted):")
print(confusion_matrix(y_test, y_pred_rf_adj_test))
print("Confusion Matrix - K-Nearest Neighbors (Adjusted):")
print(confusion_matrix(y_test, y_pred_knn_adj_test))
Confusion Matrix - Decision Tree (Adjusted):
[[235  22]
 [ 47  94]]
Confusion Matrix - Random Forest (Adjusted):
[[255   2]
 [ 60  81]]
Confusion Matrix - K-Nearest Neighbors (Adjusted):
[[241  16]
 [ 65  76]]
Comparison of Confusion Matrices: Before and After Hyperparameter Tuning¶
Before Hyperparameter Tuning:
Logistic Regression: Correctly identified 236 (TN) instances of class 0 and 70 (TP) of class 1, with 71 false negatives.
Decision Tree: Correctly identified 230 (TN) instances of class 0 and 93 (TP) of class 1, with 48 false negatives.
Random Forest: Correctly identified 233 (TN) instances of class 0 and 91 (TP) of class 1, with 50 false negatives.
K-Nearest Neighbors: Correctly identified 230 (TN) instances of class 0 and 77 (TP) of class 1, with 64 false negatives.
After Hyperparameter Tuning:
Decision Tree (Adjusted): Slight improvement, correctly identifying more instances of class 0 (235 TN) and slightly more of class 1 (94 TP).
Random Forest (Adjusted): Marked improvement in identifying class 0 (255 TN, only 2 FP), but at the cost of more false negatives (60), indicating a shift toward very high precision for class 1 at the expense of its recall.
K-Nearest Neighbors (Adjusted): Some improvement in correctly identifying class 0 (241 TN), but it still struggles with class 1, similar to its performance before tuning.
Summary:
Improvement in Class 0 Detection: After tuning, all models, particularly the Random Forest, have become more effective at identifying class 0, with more true negatives and fewer false positives.
Class 1 Challenges: The adjusted models still face challenges in accurately detecting class 1, especially the Random Forest, which, despite improving in overall accuracy, has a trade-off in class 1 recall.
Balanced Performance: The Decision Tree (Adjusted) shows a more balanced improvement across both classes, suggesting that tuning has helped reduce some overfitting while maintaining good accuracy.
Overall, Random Forest improved the most in detecting the majority class (class 0) but still needs attention for better class 1 prediction.
The Decision Tree remains a strong, balanced performer after tuning.
# ROC Curves and AUC Scores for adjusted models
fpr_dt_adj, tpr_dt_adj, _ = roc_curve(
y_test, dt_model_adj.predict_proba(X_test)[:, 1])
fpr_rf_adj, tpr_rf_adj, _ = roc_curve(
y_test, rf_model_adj.predict_proba(X_test)[:, 1])
fpr_knn_adj, tpr_knn_adj, _ = roc_curve(
y_test, knn_model_adj.predict_proba(X_test_scaled)[:, 1])
roc_auc_dt_adj = roc_auc_score(
y_test, dt_model_adj.predict_proba(X_test)[:, 1])
roc_auc_rf_adj = roc_auc_score(
y_test, rf_model_adj.predict_proba(X_test)[:, 1])
roc_auc_knn_adj = roc_auc_score(
y_test, knn_model_adj.predict_proba(X_test_scaled)[:, 1])
# Plot ROC Curves for adjusted models
plt.figure(figsize=(10, 6))
plt.plot(fpr_dt_adj, tpr_dt_adj,
label=f'Decision Tree (Adjusted) AUC = {roc_auc_dt_adj:.4f}',
color='blue')
plt.plot(fpr_rf_adj, tpr_rf_adj,
label=f'Random Forest (Adjusted) AUC = {roc_auc_rf_adj:.4f}',
color='green')
plt.plot(fpr_knn_adj, tpr_knn_adj,
label=f'KNN (Adjusted) AUC = {roc_auc_knn_adj:.4f}',
color='purple')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison - Adjusted Models')
plt.legend()
plt.show()
Comparison of ROC Curve: Before and After Hyperparameter Tuning¶
Before Hyperparameter Tuning:
- Logistic Regression: AUC of 0.7368
- Decision Tree: AUC of 0.7663
- Random Forest: AUC of 0.7890
- KNN: AUC of 0.7418
After Hyperparameter Tuning:
- Decision Tree: AUC of 0.7862
- Random Forest: AUC of 0.7904
- KNN: AUC of 0.7506
Overall Improvement:
- After hyperparameter tuning, the AUC scores of all three tuned models improved, indicating better performance and an enhanced ability to distinguish between classes.
- Hyperparameter tuning has effectively enhanced the performance of the Decision Tree and KNN models, with the Random Forest continuing to perform strongly, confirming its robustness.
# Calculate accuracy for adjusted models
accuracy_dt_adj = accuracy_score(y_test, y_pred_dt_adj_test)
accuracy_rf_adj = accuracy_score(y_test, y_pred_rf_adj_test)
accuracy_knn_adj = accuracy_score(y_test, y_pred_knn_adj_test)
# Sample size
n = len(y_test)
# Calculate confidence intervals for adjusted models
ci_dt_adj = proportion_confint(
int(accuracy_dt_adj * n), n, alpha=0.05, method='normal')
ci_rf_adj = proportion_confint(
int(accuracy_rf_adj * n), n, alpha=0.05, method='normal')
ci_knn_adj = proportion_confint(
int(accuracy_knn_adj * n), n, alpha=0.05, method='normal')
# Print accuracy and confidence intervals for adjusted models
print(
f"Decision Tree (Adjusted) Accuracy: {accuracy_dt_adj:.4f}, "
f"Confidence Interval: {ci_dt_adj}"
)
print(
f"Random Forest (Adjusted) Accuracy: {accuracy_rf_adj:.4f}, "
f"Confidence Interval: {ci_rf_adj}"
)
print(
f"K-Nearest Neighbors (Adjusted) Accuracy: {accuracy_knn_adj:.4f}, "
f"Confidence Interval: {ci_knn_adj}"
)
Decision Tree (Adjusted) Accuracy: 0.8266, Confidence Interval: (0.7894414703490236, 0.8638248613092678)
Random Forest (Adjusted) Accuracy: 0.8442, Confidence Interval: (0.8085933087552651, 0.8798489023000112)
K-Nearest Neighbors (Adjusted) Accuracy: 0.7965, Confidence Interval: (0.7569279371614691, 0.8360368869591338)
Comparison of Accuracy Confidence Intervals: Before and After Hyperparameter Tuning¶
Before Hyperparameter Tuning:
- Logistic Regression Accuracy: 0.7688, Confidence Interval: (0.7274, 0.8103)
- Decision Tree Accuracy: 0.8116, Confidence Interval: (0.7731, 0.8500)
- Random Forest Accuracy: 0.8141, Confidence Interval: (0.7758, 0.8523)
- K-Nearest Neighbors Accuracy: 0.7714, Confidence Interval: (0.7301, 0.8126)
After Hyperparameter Tuning:
- Decision Tree (Adjusted) Accuracy: 0.8266, Confidence Interval: (0.7894, 0.8638)
- Random Forest (Adjusted) Accuracy: 0.8442, Confidence Interval: (0.8086, 0.8798)
- K-Nearest Neighbors (Adjusted) Accuracy: 0.7965, Confidence Interval: (0.7569, 0.8360)
Conclusion:
Hyperparameter tuning has positively impacted the accuracy and reliability of the models, especially for Decision Tree and Random Forest, where both accuracy and confidence intervals improved. KNN also showed a marked improvement, though it still lags behind the tree-based models.
Cross-validation for more reliable performance estimates - Adjusted Models¶
cv_dt_adj = cross_val_score(
dt_model_adj, X_train, y_train, cv=5, scoring='accuracy')
cv_rf_adj = cross_val_score(
rf_model_adj, X_train, y_train, cv=5, scoring='accuracy')
cv_knn_adj = cross_val_score(
knn_model_adj, X_train_scaled, y_train, cv=5, scoring='accuracy')
# Print the mean accuracy and standard deviation for each adjusted model
print(
f"Decision Tree (Adjusted) Accuracy: Mean = "
f"{cv_dt_adj.mean():.4f}, Std = {cv_dt_adj.std():.4f}"
)
print(
f"Random Forest (Adjusted) Accuracy: Mean = "
f"{cv_rf_adj.mean():.4f}, Std = {cv_rf_adj.std():.4f}"
)
print(
"K-Nearest Neighbors (Adjusted) Accuracy: Mean = "
f"{cv_knn_adj.mean():.4f}, Std = {cv_knn_adj.std():.4f}"
)
Decision Tree (Adjusted) Accuracy: Mean = 0.7829, Std = 0.0135
Random Forest (Adjusted) Accuracy: Mean = 0.8225, Std = 0.0141
K-Nearest Neighbors (Adjusted) Accuracy: Mean = 0.7873, Std = 0.0105
Comparison of Cross Validation: Before and After Hyperparameter Tuning¶
Before Hyperparameter Tuning:
- Logistic Regression Accuracy: Mean = 0.7671, Std = 0.0219
- Decision Tree Accuracy: Mean = 0.7552, Std = 0.0148
- Random Forest Accuracy: Mean = 0.7785, Std = 0.0129
- K-Nearest Neighbors Accuracy: Mean = 0.7842, Std = 0.0150
After Hyperparameter Tuning:
- Decision Tree (Adjusted) Accuracy: Mean = 0.7829, Std = 0.0135
- Random Forest (Adjusted) Accuracy: Mean = 0.8225, Std = 0.0141
- K-Nearest Neighbors (Adjusted) Accuracy: Mean = 0.7873, Std = 0.0105
Conclusion: Hyperparameter tuning has improved the cross-validation accuracy of all three tuned models, with the Decision Tree and Random Forest showing the most noticeable gains. The standard deviations for the Decision Tree and KNN also decreased, indicating more consistent performance across folds; the Random Forest's fold-to-fold variability rose only marginally.
Final Model Selection and Evaluation¶
- Based on the results from various performance metrics, hyperparameter tuning, and cross-validation, we can conclude that the Random Forest model is the best-performing model for this specific dataset.
Accuracy and AUC:
After hyperparameter tuning, the Random Forest model achieved the highest test accuracy (0.8442) and the highest AUC score (0.7904). These metrics indicate that the model has a strong ability to distinguish between the two classes in the dataset.
Consistency and Robustness:
The Random Forest model also demonstrated improved consistency after hyperparameter tuning, as evidenced by its higher mean accuracy (0.8225) and low standard deviation (0.0141) in cross-validation results. This suggests that the model generalizes well across different subsets of data.
Balanced Performance:
Despite the potential for overfitting indicated in earlier results, the hyperparameter-tuned Random Forest balances high overall accuracy with very high precision for the minority class (0.98), although its recall for that class is more moderate (0.57); it identifies buyers conservatively rather than simply favoring the majority class.
Suggested Improvements:¶
Further Hyperparameter Tuning:
Continue to refine the hyperparameters of the Random Forest model to achieve an even better balance between precision and recall for both classes.
Feature Engineering:
Explore additional features or transformations that could enhance the model's predictive power, such as interaction terms, polynomial features, or domain-specific variables.
Handling Class Imbalance:
The target variable (TravelInsurance) is moderately imbalanced: around 64.27% of the records are labeled 0 (no insurance) and 35.73% are labeled 1 (purchased insurance). Techniques such as SMOTE (Synthetic Minority Over-sampling Technique), oversampling, undersampling, or class weighting could be employed to ensure that the model doesn't favor the majority class; a minimal class-weighting sketch follows below.
Address Overfitting:
More advanced techniques like feature selection, regularization, or ensemble methods could be used to further reduce overfitting in the Random Forest model.
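A minimal sketch of the class-weighting suggestion (illustrative only; the parameters reuse the tuned settings from above):

# Re-fit the tuned Random Forest with class weights that offset the
# roughly 64/36 imbalance in TravelInsurance
rf_balanced = RandomForestClassifier(n_estimators=150,
                                     max_depth=10,
                                     class_weight='balanced',
                                     random_state=42)
rf_balanced.fit(X_train, y_train)
print(classification_report(y_test, rf_balanced.predict(X_test)))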