Table of Contents¶
- Introduction
- Notebook settings
- Data Loading
- Optimize Memory Usage
- EDA
- Application_train (train_df) Dataset
- Statistical Inference
- Bureau Data Analysis
- Bureau Balance Data Analysis
- Previous Application Data Analysis
- Installments Payments Data Analysis
- POS_CASH_balance Data Analysis
- Credit Card Balance Data Analysis
- Main conclusions from EDA
- Suggestions for EDA Improvement
Reference to reduce memory usage function used: https://www.kaggle.com/rinnqd/reduce-memory-usage
Introduction¶
This exploratory data analysis (EDA) focuses on the Home Credit Default Risk competition hosted on Kaggle. The primary objective of this competition is to predict the likelihood of a client defaulting on a loan based on a rich dataset provided by Home Credit, a financial institution. By accurately predicting credit risk, Home Credit aims to enhance financial inclusion by providing fair and responsible access to credit for underserved populations.
The dataset includes a variety of features such as demographic information, financial data, and historical loan records, making it an excellent opportunity to explore data preprocessing, feature engineering, and machine learning techniques. This EDA aims to uncover key patterns, relationships, and potential predictors that can assist in building an effective model.
Notebook settings¶
from assets.utils.functions import *
%load_ext pycodestyle_magic
# %reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on --max_line_length 79
%matplotlib inline
# Standard libraries
from IPython.display import display
import warnings
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from statsmodels.stats.outliers_influence import variance_inflation_factor
import phik
from phik import resources, report
from scipy.stats import ttest_ind
# from IPython.core.display import display, HTML
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# for 100% jupyter notebook cell width
# display(HTML("<style>.container { width:100% !important; }</style>"))
Data Loading¶
# Loading datasets
train_df = pd.read_csv(
'../analytical/assets/data/application_train.csv')
test_df = pd.read_csv(
'../analytical/assets/data/application_test.csv')
bureau_df = pd.read_csv(
'../analytical/assets/data/bureau.csv')
bureau_bal_df = pd.read_csv(
'../analytical/assets/data/bureau_balance.csv')
credit_card_bal_df = pd.read_csv(
'../analytical/assets/data/credit_card_balance.csv')
installments_pay_df = pd.read_csv(
'../analytical/assets/data/installments_payments.csv')
poscash_bal_df = pd.read_csv(
'../analytical/assets/data/pos_cash_balance.csv')
prev_app_df = pd.read_csv(
'../analytical/assets/data/previous_application.csv')
# List of train and test datasets
train_test_df_list = {
"train_df": train_df,
"test_df": test_df
}
# List of all datasets
all_df_dict = {
"train_df": train_df,
"test_df": test_df,
"bureau_df": bureau_df,
"bureau_bal_df": bureau_bal_df,
"credit_card_bal_df": credit_card_bal_df,
"installments_pay_df": installments_pay_df,
"poscash_bal_df": poscash_bal_df,
"prev_app_df": prev_app_df
}
# Displaying the shape of each dataset
print("application_train :", train_df.shape)
print("application_test :", test_df.shape)
print("bureau :", bureau_df.shape)
print("bureau_balance :", bureau_bal_df.shape)
print("credit_card_balance :", credit_card_bal_df.shape)
print("installments_payments :", installments_pay_df.shape)
print("pos_cash_balance :", poscash_bal_df.shape)
print("previous_application :", prev_app_df.shape)
application_train : (307511, 122)
application_test : (48744, 121)
bureau : (1716428, 17)
bureau_balance : (27299925, 3)
credit_card_balance : (3840312, 23)
installments_payments : (13605401, 8)
pos_cash_balance : (10001358, 8)
previous_application : (1670214, 37)
Optimize Memory Usage:¶
- One technique for handling large dataframes is to reduce a DataFrame's memory usage by downcasting numerical columns to smaller, more memory-efficient data types. However, this optimization can affect precision, especially for floating-point numbers.
Since some features can be stored in smaller data types, we will use the following function to reduce the memory usage of the dataframes and standardize the feature data types, keeping them consistent across the training and testing datasets.
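The imported `memory_usage_improve` helper is not shown in this notebook; based on the linked Kaggle recipe and the dtype pattern in the output below (32-bit floats/ints, category strings), it likely resembles the following sketch. The function name and exact behavior here are assumptions, not the actual helper:

```python
import pandas as pd


def memory_usage_improve_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast 64-bit numerics to 32-bit and convert object columns
    to category. A hypothetical reconstruction of the imported helper;
    the real one also prints a per-dtype usage report."""
    before_mb = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif dtype == 'int64':
            df[col] = df[col].astype('int32')
        elif dtype == 'object':
            df[col] = df[col].astype('category')
    after_mb = df.memory_usage(deep=True).sum() / 1024 ** 2
    print(f"Memory reduced by: {before_mb - after_mb:.2f} MB "
          f"({100 * (before_mb - after_mb) / before_mb:.2f}%)")
    return df
```

Note that modifying `df` in place is what lets the loop over `all_df_dict` below work without reassignment.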
# Apply the function to all DataFrames
for name, df in all_df_dict.items():
print(f"\n--- Optimizing {name} DataFrame ---\n")
memory_usage_improve(df)
--- Optimizing train_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 152.50 MB
Memory usage of int64 columns: 96.19 MB
Memory usage of object columns: 288.00 MB
Total memory usage: 536.69 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 76.25 MB
Memory usage of int32 columns: 48.10 MB
Memory usage of category columns: 4.71 MB
Total memory usage: 129.05 MB
Memory reduced by: 407.64 MB (75.95%)
--- Optimizing test_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 24.17 MB
Memory usage of int64 columns: 14.88 MB
Memory usage of object columns: 45.69 MB
Total memory usage: 84.74 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 12.09 MB
Memory usage of int32 columns: 7.44 MB
Memory usage of category columns: 0.76 MB
Total memory usage: 20.28 MB
Memory reduced by: 64.45 MB (76.06%)
--- Optimizing bureau_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 104.76 MB
Memory usage of int64 columns: 78.57 MB
Memory usage of object columns: 328.78 MB
Total memory usage: 512.11 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 52.38 MB
Memory usage of int32 columns: 39.29 MB
Memory usage of category columns: 4.91 MB
Total memory usage: 96.58 MB
Memory reduced by: 415.53 MB (81.14%)
--- Optimizing bureau_bal_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 0.00 MB
Memory usage of int64 columns: 416.56 MB
Memory usage of object columns: 1510.04 MB
Total memory usage: 1926.61 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 0.00 MB
Memory usage of int32 columns: 208.28 MB
Memory usage of category columns: 26.04 MB
Total memory usage: 234.32 MB
Memory reduced by: 1692.29 MB (87.84%)
--- Optimizing credit_card_bal_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 439.49 MB
Memory usage of int64 columns: 205.09 MB
Memory usage of object columns: 231.10 MB
Total memory usage: 875.69 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 219.74 MB
Memory usage of int32 columns: 102.55 MB
Memory usage of category columns: 3.66 MB
Total memory usage: 325.96 MB
Memory reduced by: 549.73 MB (62.78%)
--- Optimizing installments_pay_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 519.00 MB
Memory usage of int64 columns: 311.40 MB
Memory usage of object columns: 0.00 MB
Total memory usage: 830.41 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 259.50 MB
Memory usage of int32 columns: 155.70 MB
Memory usage of category columns: 0.00 MB
Total memory usage: 415.20 MB
Memory reduced by: 415.20 MB (50.00%)
--- Optimizing poscash_bal_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 152.61 MB
Memory usage of int64 columns: 381.52 MB
Memory usage of object columns: 603.12 MB
Total memory usage: 1137.25 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 76.30 MB
Memory usage of int32 columns: 190.76 MB
Memory usage of category columns: 9.54 MB
Total memory usage: 276.60 MB
Memory reduced by: 860.65 MB (75.68%)
--- Optimizing prev_app_df DataFrame ---
Memory Usage Before Optimization:
Memory usage of float64 columns: 191.14 MB
Memory usage of int64 columns: 76.46 MB
Memory usage of object columns: 1633.03 MB
Total memory usage: 1900.63 MB
Memory Usage After Optimization:
Memory usage of float32 columns: 95.57 MB
Memory usage of int32 columns: 38.23 MB
Memory usage of category columns: 25.50 MB
Total memory usage: 159.30 MB
Memory reduced by: 1741.33 MB (91.62%)
The memory optimization process has significantly reduced the memory footprint of the dataset. Here's a breakdown of the improvements:
| Dataset | Memory Before (MB) | Memory After (MB) | Reduction (MB) | Reduction (%) |
|---|---|---|---|---|
| train_df | 536.69 | 129.05 | 407.64 | 75.95% |
| test_df | 84.74 | 20.28 | 64.45 | 76.06% |
| bureau_df | 512.11 | 96.58 | 415.53 | 81.14% |
| bureau_bal_df | 1926.61 | 234.32 | 1692.29 | 87.84% |
| credit_card_bal_df | 875.69 | 325.96 | 549.73 | 62.78% |
| installments_pay_df | 830.41 | 415.20 | 415.20 | 50.00% |
| poscash_bal_df | 1137.25 | 276.60 | 860.65 | 75.68% |
| prev_app_df | 1900.63 | 159.30 | 1741.33 | 91.62% |
Overall Improvement:
Across all eight datasets, total memory usage dropped from roughly 7,804 MB to roughly 1,657 MB, a ~79% reduction; the training dataset alone went from 536.69 MB to 129.05 MB (~76%).
EDA¶
print('-'*80)
print(f'The shape of application_train.csv is: {train_df.shape}')
print('-'*80)
print(
    f"Number of duplicate values in application_train: "
    f"{train_df.duplicated().sum()}"
)
print('-'*80)
display(train_df.head())
--------------------------------------------------------------------------------
The shape of application_train.csv is: (307511, 122)
--------------------------------------------------------------------------------
Number of duplicate values in application_train: 0
--------------------------------------------------------------------------------
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | 
FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
print('-'*80)
print(f'The shape of application_test.csv is: {test_df.shape}')
print('-'*80)
print(
    f"Number of duplicate values in application_test: "
    f"{test_df.duplicated().sum()}"
)
print('-'*80)
display(test_df.head())
--------------------------------------------------------------------------------
The shape of application_test.csv is: (48744, 121)
--------------------------------------------------------------------------------
Number of duplicate values in application_test: 0
--------------------------------------------------------------------------------
| SK_ID_CURR | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | 
FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | Cash loans | F | N | Y | 0 | 135000.0 | 568800.0 | 20560.5 | 450000.0 | Unaccompanied | Working | Higher education | Married | House / apartment | 0.018850 | -19241 | -2329 | -5170.0 | -812 | NaN | 1 | 1 | 0 | 1 | 0 | 1 | NaN | 2.0 | 2 | 2 | TUESDAY | 18 | 0 | 0 | 0 | 0 | 0 | 0 | Kindergarten | 0.752614 | 0.789654 | 0.159520 | 0.0660 | 0.0590 | 0.9732 | NaN | NaN | NaN | 0.1379 | 0.125 | NaN | NaN | NaN | 0.0505 | NaN | NaN | 0.0672 | 0.0612 | 0.9732 | NaN | NaN | NaN | 0.1379 | 0.125 | NaN | NaN | NaN | 0.0526 | NaN | NaN | 0.0666 | 0.0590 | 0.9732 | NaN | NaN | NaN | 0.1379 | 0.125 | NaN | NaN | NaN | 0.0514 | NaN | NaN | NaN | block of flats | 0.0392 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -1740.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100005 | Cash loans | M | N | Y | 0 | 99000.0 | 222768.0 | 17370.0 | 180000.0 | Unaccompanied | Working | Secondary / secondary special | Married | House / apartment | 0.035792 | -18064 | -4469 | -9118.0 | -1623 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Low-skill Laborers | 2.0 | 2 | 2 | FRIDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Self-employed | 0.564990 | 0.291656 | 0.432962 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 2 | 100013 | Cash loans | M | Y | Y | 0 | 202500.0 | 663264.0 | 69777.0 | 630000.0 | NaN | Working | Higher education | Married | House / apartment | 0.019101 | -20038 | -4458 | -2175.0 | -3503 | 5.0 | 1 | 1 | 0 | 1 | 0 | 0 | Drivers | 2.0 | 2 | 2 | MONDAY | 14 | 0 | 0 | 0 | 0 | 0 | 0 | Transport: type 3 | NaN | 0.699787 | 0.610991 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -856.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 |
| 3 | 100028 | Cash loans | F | N | Y | 2 | 315000.0 | 1575000.0 | 49018.5 | 1575000.0 | Unaccompanied | Working | Secondary / secondary special | Married | House / apartment | 0.026392 | -13976 | -1866 | -2000.0 | -4208 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Sales staff | 4.0 | 2 | 2 | WEDNESDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.525734 | 0.509677 | 0.612704 | 0.3052 | 0.1974 | 0.9970 | 0.9592 | 0.1165 | 0.32 | 0.2759 | 0.375 | 0.0417 | 0.2042 | 0.2404 | 0.3673 | 0.0386 | 0.08 | 0.3109 | 0.2049 | 0.9970 | 0.9608 | 0.1176 | 0.3222 | 0.2759 | 0.375 | 0.0417 | 0.2089 | 0.2626 | 0.3827 | 0.0389 | 0.0847 | 0.3081 | 0.1974 | 0.9970 | 0.9597 | 0.1173 | 0.32 | 0.2759 | 0.375 | 0.0417 | 0.2078 | 0.2446 | 0.3739 | 0.0388 | 0.0817 | reg oper account | block of flats | 0.3700 | Panel | No | 0.0 | 0.0 | 0.0 | 0.0 | -1805.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 4 | 100038 | Cash loans | M | Y | N | 1 | 180000.0 | 625500.0 | 32067.0 | 625500.0 | Unaccompanied | Working | Secondary / secondary special | Married | House / apartment | 0.010032 | -13040 | -2191 | -4000.0 | -4262 | 16.0 | 1 | 1 | 1 | 1 | 0 | 0 | NaN | 3.0 | 2 | 2 | FRIDAY | 5 | 0 | 0 | 0 | 0 | 1 | 1 | Business Entity Type 3 | 0.202145 | 0.425687 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -821.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
Summary and Insights:
application_train.csv:
- The application_train.csv dataset consists of 307,511 records and 122 features.
- These features represent various personal and financial details about customers, such as age, income, loan type, and housing characteristics.
- The dataset contains 307,511 unique loan applications, identified by the SK_ID_CURR column.
- The TARGET column indicates whether a customer defaulted on their loan:
- 0 signifies a non-defaulter.
- 1 signifies a defaulter.
- Importantly, there are no duplicate entries in this dataset.
application_test.csv:
- The application_test.csv dataset includes 48,744 records and 121 features.
- These features are identical to those in application_train.csv, except for the absence of the TARGET column, which needs to be predicted.
- The dataset contains 48,744 unique loan applications, also identified by the SK_ID_CURR column.
- Like the training data, there are no duplicate entries in this dataset.
Null values (Nan) analysis¶
I want to check for missing values in the dataset. I will compute the percentage of missing values in each column and then visualize them with a bar plot. Since the dataset is large, I will display only the columns with a missing percentage greater than 0; to do that, I will build a dataframe of the features and their null percentages and plot it.
# NaN values in the training dataset
view_percent_nan(
null_df(train_df), 'Application Train Dataset', grid=True)
Number of columns having NaN values: 67 columns
# NaN values in the test dataset
view_percent_nan(null_df(test_df), 'Application Test Dataset', grid=True)
Number of columns having NaN values: 64 columns
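The `null_df` and `view_percent_nan` helpers come from `assets.utils.functions` and are not shown here; the core computation is presumably along these lines (a sketch, not the actual helper):

```python
import pandas as pd


def null_percentages(df: pd.DataFrame) -> pd.DataFrame:
    """Percentage of NaN values per column, keeping only columns
    that actually contain NaNs, sorted descending."""
    pct = df.isnull().mean() * 100
    pct = pct[pct > 0].sort_values(ascending=False)
    print(f"Number of columns having NaN values: {len(pct)} columns")
    return pct.rename('percent_nan').to_frame()
```

The resulting frame is what a bar plot like the one above would be drawn from.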
Conclusion for Missing Values in application_train.csv and application_test.csv
application_train.csv Observations:
- Out of the 122 features, 67 columns contain missing values.
- Several columns, such as those related to COMMONAREA and NONLIVINGAPARTMENTS, have close to 70% missing data.
- Most of the columns with >50% missing values are associated with apartment statistics. This suggests that these data points were either optional during data entry or unavailable for many borrowers.
- Dropping these columns entirely would result in a significant loss of information. Hence, appropriate techniques, such as imputation or feature engineering, will need to be applied to handle the missing values effectively.
application_test.csv Observations:
- Out of the 121 features, 64 columns contain missing values, which is comparable to the application_train dataset.
- The percentage of missing values in the test set closely mirrors that of the training set. This similarity in distributions between the training and test datasets indicates a consistent data collection process.
- Columns with high missing percentages in the test set are also primarily related to apartment statistics, aligning with the patterns observed in the training data.
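As noted above, dropping the high-NaN columns would lose information. A baseline imputation sketch is shown below (median for numeric columns, mode for categoricals); the per-feature strategies here are illustrative, not the notebook's final choice:

```python
import pandas as pd


def simple_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Median-impute numeric columns and mode-impute the rest.
    A baseline sketch only; targeted strategies per feature group
    (e.g. apartment statistics) would likely do better."""
    out = df.copy()
    for col in out.columns:
        if pd.api.types.is_numeric_dtype(out[col]):
            out[col] = out[col].fillna(out[col].median())
        else:
            mode = out[col].mode()
            if not mode.empty:  # guard against all-NaN columns
                out[col] = out[col].fillna(mode.iloc[0])
    return out
```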
Distribution of Target Variable¶
I will now visualize the distribution of the target variable in the training dataset. The target variable represents whether a customer defaulted on their loan (1) or not (0). Understanding the distribution of the target variable is crucial for assessing the class balance and potential biases in the dataset.
train_df['TARGET'] = train_df['TARGET'].astype(int)
plot_target_distribution(train_df)
The TARGET variable shows the status of the loan, where:
- 0: indicates that the loan was repaid on time (no payment issues), also known as non-default or
- 1: indicates the client had payment difficulties (payment issues or did not repay on time), also known as a default.
As we can see, the TARGET variable is highly imbalanced: about 92% of the loans were repaid on time and 8% were not.
Downsampling or oversampling will be needed before model training; it will be applied at the modelling stage later on.
For datasets with an imbalance in class distribution, certain algorithms that are sensitive to imbalances cannot process the data effectively without adjustments. Similarly, the choice of performance metrics must be carefully considered.
Metrics like Accuracy are often unsuitable for imbalanced datasets, as they tend to favor the majority class, leading to misleading results. Instead, alternative metrics such as ROC-AUC, Log-Loss, F1-Score, and Confusion Matrix are better suited for evaluating model performance.
Another critical point is that the number of individuals who actually default is very small, and they often exhibit distinct behavioral patterns. In scenarios like fraud detection, default prediction, or anomaly detection, these outliers play a vital role. They must be retained and analyzed carefully, as they may hold the key to distinguishing between defaulters and non-defaulters.
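The class shares quoted above come straight from `value_counts`; a self-contained sketch, with a toy series standing in for `train_df['TARGET']`:

```python
import pandas as pd


def class_shares(target: pd.Series) -> pd.Series:
    """Percentage share of each class, most frequent first."""
    return (target.value_counts(normalize=True) * 100).round(2)


# Toy stand-in for train_df['TARGET'] with the ~92/8 split seen above
toy_target = pd.Series([0] * 92 + [1] * 8)
print(class_shares(toy_target))
```

In the notebook itself this would simply be `class_shares(train_df['TARGET'])`.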
Correlation Analysis: Numerical and Categorical Features¶
Since this is a large dataset, I want to ensure the most relevant features are selected for model training. To achieve this, I will conduct correlation analysis to identify the relationships between the features and the target variable. This analysis will help identify the most important features that influence the target variable.
A heatmap will be used to visualize the Phi-K correlation coefficient between each pair of features in the dataset. Phi-K measures the strength of association between two variables, including categorical ones. Unlike Pearson's correlation, it ranges from 0 to 1, where 0 indicates no association and 1 indicates complete association; it carries no sign or direction.
The Phi-K coefficient plays a role analogous to the standard correlation coefficient but works with categorical, ordinal, and interval variables alike. It helps us identify whether one categorical feature is associated with another, which makes it particularly useful for understanding relationships between categorical variables.
Phi-K Correlation advantages:
- Captures both linear and non-linear relationships.
- Handles skewed and non-normal data effectively.
- Ideal for data with mixed types or categorical variables.
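Phi-K itself is computed via the `phik` library imported earlier (e.g. `train_df.phik_matrix()`). As a classical point of comparison for categorical association, Cramér's V can be hand-rolled in a few lines; this is purely illustrative and not used by the notebook:

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


def cramers_v(x: pd.Series, y: pd.Series) -> float:
    """Cramér's V between two categorical series; 0 = no association,
    1 = complete association (like Phi-K, it has no sign)."""
    table = pd.crosstab(x, y)
    chi2 = chi2_contingency(table, correction=False)[0]
    n = table.to_numpy().sum()
    r, k = table.shape
    return float(np.sqrt(chi2 / (n * (min(r, k) - 1))))


# A column compared with itself is perfectly associated:
a = pd.Series(['x', 'y'] * 50)
print(cramers_v(a, a))  # 1.0
```

Unlike Cramér's V, Phi-K also captures non-linear dependence between binned interval variables, which is why it is preferred here.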
Correlation Analysis: Categorical Features¶
# Categorical columns (object type and boolean-like integers)
categorical_columns_auto = list(
train_df.select_dtypes(
include=['object', 'category']).columns)
# Add boolean-like numerical features explicitly
flag_columns = [
'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY'
]
# Add FLAG_DOCUMENT_2 to FLAG_DOCUMENT_21 explicitly
document_flags = ['FLAG_DOCUMENT_' + str(i) for i in range(2, 22)]
# Combine all
categorical_columns = list(set(
categorical_columns_auto + flag_columns + document_flags))
# Report any listed columns missing from the dataset,
# then keep only the columns actually present
missing_cols = [
    col for col in categorical_columns if col not in train_df.columns]
print(f"Columns in the list but missing from the dataset: {missing_cols}")
categorical_columns = [
    col for col in categorical_columns if col in train_df.columns]
Columns in the list but missing from the dataset: []
# Categorical columns and target
categorical_columns_with_target = categorical_columns + ['TARGET']
# Display top correlations
view_categ_corr_matrix(
data=train_df,
categorical_columns=categorical_columns_with_target,
figsize=(20, 20),
mask_upper=True,
show_target_top_corr=True,
target_top_columns=10
)
Categories with highest correlation value with Target:
| | Column Name | Phik-Correlation |
|---|---|---|
| 0 | OCCUPATION_TYPE | 0.102846 |
| 40 | ORGANIZATION_TYPE | 0.089164 |
| 16 | NAME_INCOME_TYPE | 0.084831 |
| 3 | REG_CITY_NOT_WORK_CITY | 0.079946 |
| 20 | FLAG_EMP_PHONE | 0.072087 |
| 48 | REG_CITY_NOT_LIVE_CITY | 0.069588 |
| 43 | FLAG_DOCUMENT_3 | 0.069525 |
| 22 | NAME_FAMILY_STATUS | 0.056043 |
| 17 | NAME_HOUSING_TYPE | 0.051107 |
| 19 | LIVE_CITY_NOT_WORK_CITY | 0.050956 |
Insights
The Phi-K correlation heatmap and summary table reveal the relationships between categorical features and the target variable (TARGET). Notably, the correlations between most features and TARGET are weak, as the Phi-K values are relatively low. This suggests that none of the categorical features have a strong direct influence on the target variable.
Key Findings:
- Top Features with the Highest Correlation with TARGET:
- OCCUPATION_TYPE: 0.1028 – This feature shows the strongest correlation with the target, albeit still weak.
- ORGANIZATION_TYPE: 0.0892 – Indicates some level of association with the target.
- NAME_INCOME_TYPE: 0.0848 – Represents another notable feature linked to income categories.
- REG_CITY_NOT_WORK_CITY: 0.0799 – Indicates some spatial dynamics that might slightly affect the target.
- FLAG_EMP_PHONE: 0.0721 – Suggests a weak relationship related to employment verification.
- Additional Noteworthy Features:
- REG_CITY_NOT_LIVE_CITY: 0.0696 – Highlights slight geographic differences between living and registration locations.
- FLAG_DOCUMENT_3: 0.0695 – Indicates weak association with documentation completeness.
- NAME_FAMILY_STATUS: 0.0560 – Correlates modestly with family status categories.
- NAME_HOUSING_TYPE: 0.0511 – Housing-related categories show a small influence on the target.
- LIVE_CITY_NOT_WORK_CITY: 0.0510 – Reflects weak spatial influences on the target.
General Observation:
- No strong correlations were observed between the target variable and any categorical feature, as all Phi-K values are below 0.2. This indicates that categorical features individually do not strongly determine the target.
Correlation Analysis: Numerical Features¶
Now, I will create a heatmap to show the correlation between all numeric features in the dataset. This heatmap will help us identify numeric features that are strongly correlated with each other and also highlight those that have a strong correlation with the target variable.
# Columns to drop
columns_to_drop = ['SK_ID_CURR'] + list(
set(categorical_columns_with_target) - set(['TARGET']))
view_num_corr_matrix(
data=train_df,
columns_to_drop=columns_to_drop,
figsize=(25, 23),
mask_upper=True,
linewidth=0.1,
fontsize=10,
cmap='coolwarm'
)
# Select numerical features
numerical_columns = train_df.select_dtypes(
include=['float32', 'int32']).columns.tolist()
# Initialize the correlation matrix for numerical features
corr_mat = CorrelationMatrix(
data=train_df, target='TARGET',
numerical_columns=numerical_columns, min_unique_values=10)
top_corr_numerical = corr_mat.target_top_corr(target_top_columns=10)
print("Top Phi-K Correlations for Numerical Features:")
display(top_corr_numerical)
Top Phi-K Correlations for Numerical Features:
| | Column Name | Phik-Correlation |
|---|---|---|
| 16 | EXT_SOURCE_3 | 0.247680 |
| 14 | EXT_SOURCE_1 | 0.217846 |
| 15 | EXT_SOURCE_2 | 0.213965 |
| 7 | DAYS_BIRTH | 0.102395 |
| 63 | DAYS_LAST_PHONE_CHANGE | 0.073215 |
| 8 | DAYS_EMPLOYED | 0.072094 |
| 10 | DAYS_ID_PUBLISH | 0.067769 |
| 5 | AMT_GOODS_PRICE | 0.059093 |
| 24 | FLOORSMAX_AVG | 0.058743 |
| 52 | FLOORSMAX_MEDI | 0.058445 |
Insights
- The heatmap effectively illustrates the degree of correlation between each feature and all others in the dataset.
- A majority of the heatmap displays a blue color, indicating very low correlation values. This suggests that most features are not strongly correlated with one another.
- In contrast, we observe some distinct patterns in the center of the heatmap with contrasting shades, signifying higher correlations. These correlations are primarily seen among features related to apartment statistics.
- Upon examining the features in application_train, it is evident that apartment-related statistics (e.g., mean, median, and mode) are naturally correlated, as they represent similar attributes. Additionally, features within the same category, such as means (e.g., Number of Elevators, Living Area, Non-Living Area, Basement Area), also exhibit high correlations with one another.
- Certain feature pairs, such as AMT_GOODS_PRICE and AMT_CREDIT or DAYS_EMPLOYED and DAYS_BIRTH, show strong correlations, which might require attention during model building.
- Highly correlated features can increase the complexity of the model without providing significant additional value. Therefore, inter-correlated features may need to be removed to improve model efficiency.
- Notably, the EXT_SOURCE features show strong correlations with the target variable. These features are likely to be highly significant for the classification task and should be prioritized in feature selection.
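Building on the last two points, one simple way to shortlist redundant features is to scan the upper triangle of an absolute Pearson correlation matrix and flag one member of each pair above a threshold. The frame and the 0.9 cutoff below are illustrative assumptions, not the notebook's actual pipeline:

```python
import numpy as np
import pandas as pd

# Toy frame with one near-duplicate pair (column names are illustrative only)
rng = np.random.default_rng(42)
x = rng.normal(size=300)
toy = pd.DataFrame({
    'AMT_CREDIT': x,
    'AMT_GOODS_PRICE': 0.98 * x + rng.normal(scale=0.05, size=300),
    'EXT_SOURCE_2': rng.normal(size=300),
})

# Keep only the upper triangle so each pair is inspected once,
# then flag one column from every pair above the threshold
corr = toy.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [c for c in upper.columns if (upper[c] > 0.9).any()]
print(to_drop)
```

Dropping the flagged columns removes most of the pairwise redundancy while keeping one representative of each correlated group.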
Multicollinearity Analysis¶
The goal here is to examine the presence of multicollinearity among the numerical features in the dataset. Multicollinearity occurs when two or more independent variables are highly correlated, leading to redundancy in the information provided by the features. This can negatively impact the model's performance, as it may introduce noise and instability in the predictions.
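For intuition, the VIF of feature i equals 1 / (1 - R^2_i), where R^2_i comes from regressing feature i on all the other features (plus an intercept). The synthetic data below is an assumption for illustration; the check simply verifies the formula against statsmodels:

```python
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(1)
X = rng.normal(size=(500, 3))
# Make column 2 nearly a linear combination of columns 0 and 1
X[:, 2] = 0.8 * X[:, 0] + 0.2 * X[:, 1] + rng.normal(scale=0.3, size=500)

# Manual VIF: regress column 2 on the others (with intercept), then 1/(1 - R^2)
A = np.column_stack([X[:, :2], np.ones(len(X))])
coef, *_ = np.linalg.lstsq(A, X[:, 2], rcond=None)
resid = X[:, 2] - A @ coef
r2 = 1 - np.sum(resid ** 2) / np.sum((X[:, 2] - X[:, 2].mean()) ** 2)
manual_vif = 1.0 / (1.0 - r2)

# statsmodels equivalent (note the appended constant column)
sm_vif = variance_inflation_factor(np.column_stack([X, np.ones(len(X))]), 2)
print(manual_vif, sm_vif)  # the two values agree
```

This is why a constant column is appended before calling `variance_inflation_factor` in the cell below: without it, the implied regressions have no intercept and the VIFs are inflated.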
# Exclude irrelevant and non-numeric columns
columns_to_exclude = ['SK_ID_CURR', 'TARGET']
columns_for_vif = [
col for col in train_df.columns if col not in columns_to_exclude]
# Create a copy of the dataset for VIF analysis
multicollinearity_data = train_df[columns_for_vif].copy()
# Encode categorical variables to numeric
categorical_cols = multicollinearity_data.select_dtypes(
include=['object', 'category']).columns
for col in categorical_cols:
multicollinearity_data[col] = multicollinearity_data[col].astype(
'category').cat.codes
# Handle missing and infinite values
multicollinearity_data.fillna(
multicollinearity_data.mean(), inplace=True)
multicollinearity_data.replace(
[float('inf'), -float('inf')], 0, inplace=True)
# Add a constant column for VIF calculation
multicollinearity_data = multicollinearity_data.assign(CONSTANT=1)
# Calculate Variance Inflation Factor (VIF)
vif_data = pd.DataFrame()
vif_data['Feature'] = multicollinearity_data.columns
vif_data['VIF'] = [variance_inflation_factor(
multicollinearity_data.values, i) for i in range(
multicollinearity_data.shape[1])]
# Drop the constant column from the results
vif_data = vif_data[vif_data['Feature'] != 'CONSTANT']
# Sort and display the VIF results
vif_data = vif_data.sort_values(by='VIF', ascending=False)
print(vif_data)
| | Feature | VIF |
|---|---|---|
| 21 | FLAG_EMP_PHONE | 2209.718472 |
| 16 | DAYS_EMPLOYED | 2189.441347 |
| 45 | YEARS_BUILD_AVG | 379.598773 |
| 73 | YEARS_BUILD_MEDI | 378.368117 |
| 91 | OBS_60_CNT_SOCIAL_CIRCLE | 332.441132 |
| 89 | OBS_30_CNT_SOCIAL_CIRCLE | 331.921964 |
| 75 | ELEVATORS_MEDI | 300.852974 |
| 76 | ENTRANCES_MEDI | 292.604097 |
| 70 | APARTMENTS_MEDI | 289.053368 |
| 77 | FLOORSMAX_MEDI | 285.926160 |
| 78 | FLOORSMIN_MEDI | 279.862892 |
| 48 | ENTRANCES_AVG | 258.957539 |
| 42 | APARTMENTS_AVG | 258.902094 |
| 47 | ELEVATORS_AVG | 242.028826 |
| 49 | FLOORSMAX_AVG | 240.232505 |
| 50 | FLOORSMIN_AVG | 229.106694 |
| 81 | LIVINGAREA_MEDI | 221.029095 |
| 53 | LIVINGAREA_AVG | 196.001510 |
| 80 | LIVINGAPARTMENTS_MEDI | 181.425466 |
| 52 | LIVINGAPARTMENTS_AVG | 160.537111 |
| 74 | COMMONAREA_MEDI | 154.317179 |
| 46 | COMMONAREA_AVG | 135.833277 |
| 71 | BASEMENTAREA_MEDI | 124.030316 |
| 44 | YEARS_BEGINEXPLUATATION_AVG | 107.424140 |
| 43 | BASEMENTAREA_AVG | 104.370801 |
| 79 | LANDAREA_MEDI | 88.002448 |
| 82 | NONLIVINGAPARTMENTS_MEDI | 85.449105 |
| 72 | YEARS_BEGINEXPLUATATION_MEDI | 83.078401 |
| 83 | NONLIVINGAREA_MEDI | 79.161674 |
| 56 | APARTMENTS_MODE | 75.210034 |
| 61 | ELEVATORS_MODE | 74.520567 |
| 63 | FLOORSMAX_MODE | 65.565445 |
| 64 | FLOORSMIN_MODE | 63.225511 |
| 51 | LANDAREA_AVG | 61.305289 |
| 54 | NONLIVINGAPARTMENTS_AVG | 61.292899 |
| 59 | YEARS_BUILD_MODE | 60.782848 |
| 55 | NONLIVINGAREA_AVG | 58.414368 |
| 62 | ENTRANCES_MODE | 53.177391 |
| 67 | LIVINGAREA_MODE | 53.175891 |
| 8 | AMT_GOODS_PRICE | 42.896141 |
| 6 | AMT_CREDIT | 41.926529 |
| 57 | BASEMENTAREA_MODE | 39.809246 |
| 66 | LIVINGAPARTMENTS_MODE | 39.095080 |
| 60 | COMMONAREA_MODE | 34.314145 |
| 65 | LANDAREA_MODE | 34.237400 |
| 68 | NONLIVINGAPARTMENTS_MODE | 25.961802 |
| 69 | NONLIVINGAREA_MODE | 25.930602 |
| 58 | YEARS_BEGINEXPLUATATION_MODE | 18.900677 |
| 27 | CNT_FAM_MEMBERS | 17.760169 |
| 4 | CNT_CHILDREN | 13.257755 |
| 95 | FLAG_DOCUMENT_3 | 12.266187 |
| 28 | REGION_RATING_CLIENT | 10.989874 |
| 29 | REGION_RATING_CLIENT_W_CITY | 10.872970 |
| 33 | REG_REGION_NOT_WORK_REGION | 9.075078 |
| 86 | TOTALAREA_MODE | 7.987570 |
| 36 | REG_CITY_NOT_WORK_CITY | 7.573843 |
| 34 | LIVE_REGION_NOT_WORK_REGION | 7.357019 |
| 85 | HOUSETYPE_MODE | 6.643642 |
| 88 | EMERGENCYSTATE_MODE | 6.382314 |
| 98 | FLAG_DOCUMENT_6 | 6.212168 |
| 37 | LIVE_CITY_NOT_WORK_CITY | 6.037850 |
| 0 | NAME_CONTRACT_TYPE | 5.376454 |
| 100 | FLAG_DOCUMENT_8 | 5.293834 |
| 87 | WALLSMATERIAL_MODE | 5.087571 |
| 12 | NAME_FAMILY_STATUS | 4.109616 |
| 90 | DEF_30_CNT_SOCIAL_CIRCLE | 4.066447 |
| 92 | DEF_60_CNT_SOCIAL_CIRCLE | 3.871220 |
| 7 | AMT_ANNUITY | 2.784039 |
| 15 | DAYS_BIRTH | 2.448739 |
| 35 | REG_CITY_NOT_LIVE_CITY | 2.446805 |
| 32 | REG_REGION_NOT_LIVE_REGION | 2.436091 |
| 84 | FONDKAPREMONT_MODE | 1.830200 |
| 38 | ORGANIZATION_TYPE | 1.717919 |
| 97 | FLAG_DOCUMENT_5 | 1.624040 |
| 14 | REGION_POPULATION_RELATIVE | 1.557676 |
| 26 | OCCUPATION_TYPE | 1.411083 |
| 1 | CODE_GENDER | 1.279183 |
| 22 | FLAG_WORK_PHONE | 1.264659 |
| 39 | EXT_SOURCE_1 | 1.250308 |
| 2 | FLAG_OWN_CAR | 1.234432 |
| 40 | EXT_SOURCE_2 | 1.210609 |
| 101 | FLAG_DOCUMENT_9 | 1.202234 |
| 10 | NAME_INCOME_TYPE | 1.189643 |
| 17 | DAYS_REGISTRATION | 1.182372 |
| 11 | NAME_EDUCATION_TYPE | 1.159386 |
| 103 | FLAG_DOCUMENT_11 | 1.158173 |
| 24 | FLAG_PHONE | 1.157811 |
| 18 | DAYS_ID_PUBLISH | 1.148993 |
| 31 | HOUR_APPR_PROCESS_START | 1.142273 |
| 13 | NAME_HOUSING_TYPE | 1.127115 |
| 3 | FLAG_OWN_REALTY | 1.121518 |
| 115 | AMT_REQ_CREDIT_BUREAU_DAY | 1.113343 |
| 93 | DAYS_LAST_PHONE_CHANGE | 1.108702 |
| 41 | EXT_SOURCE_3 | 1.072609 |
| 23 | FLAG_CONT_MOBILE | 1.065465 |
| 5 | AMT_INCOME_TOTAL | 1.063118 |
| 114 | AMT_REQ_CREDIT_BUREAU_HOUR | 1.059802 |
| 119 | AMT_REQ_CREDIT_BUREAU_YEAR | 1.058902 |
| 116 | AMT_REQ_CREDIT_BUREAU_WEEK | 1.055091 |
| 108 | FLAG_DOCUMENT_16 | 1.037873 |
| 25 | FLAG_EMAIL | 1.034020 |
| 110 | FLAG_DOCUMENT_18 | 1.032908 |
| 105 | FLAG_DOCUMENT_13 | 1.032336 |
| 19 | OWN_CAR_AGE | 1.022424 |
| 106 | FLAG_DOCUMENT_14 | 1.021144 |
| 112 | FLAG_DOCUMENT_20 | 1.020985 |
| 9 | NAME_TYPE_SUITE | 1.017852 |
| 117 | AMT_REQ_CREDIT_BUREAU_MON | 1.015819 |
| 107 | FLAG_DOCUMENT_15 | 1.008876 |
| 118 | AMT_REQ_CREDIT_BUREAU_QRT | 1.008262 |
| 113 | FLAG_DOCUMENT_21 | 1.005291 |
| 99 | FLAG_DOCUMENT_7 | 1.004723 |
| 96 | FLAG_DOCUMENT_4 | 1.003694 |
| 111 | FLAG_DOCUMENT_19 | 1.003514 |
| 94 | FLAG_DOCUMENT_2 | 1.003042 |
| 109 | FLAG_DOCUMENT_17 | 1.002282 |
| 30 | WEEKDAY_APPR_PROCESS_START | 1.001028 |
| 104 | FLAG_DOCUMENT_12 | 1.000682 |
| 20 | FLAG_MOBIL | 1.000566 |
| 102 | FLAG_DOCUMENT_10 | 1.000539 |
Insights
- As noted in the correlation matrix above, the heatmap shows distinct high-correlation patterns among the apartment-statistics features. The same pattern appears here: near-perfectly collinear groups such as APARTMENTS_AVG, APARTMENTS_MEDI, and APARTMENTS_MODE produce very large VIF values. These features measure essentially the same attribute in slightly different ways, indicating redundancy in the dataset.
Possible actions
These highly correlated features can be addressed by dropping variables with extreme VIF (Variance Inflation Factor) values or by applying dimensionality reduction such as PCA (Principal Component Analysis) to reduce multicollinearity and improve model stability.
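A minimal sketch of VIF-based pruning under these assumptions (the 10.0 threshold and toy frame are hypothetical; this step is not part of the notebook's pipeline):

```python
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor


def prune_by_vif(df, threshold=10.0):
    """Iteratively drop the feature with the highest VIF until every
    remaining VIF is below `threshold` (numeric, NaN-free frame assumed)."""
    cols = list(df.columns)
    while True:
        # Append a constant so each implied regression has an intercept
        X = np.column_stack([df[cols].values, np.ones(len(df))])
        vifs = [variance_inflation_factor(X, i) for i in range(len(cols))]
        worst = int(np.argmax(vifs))
        if vifs[worst] < threshold:
            return cols
        cols.pop(worst)


# Toy frame: 'c' is an almost exact copy of 'a', so one of the two is pruned
rng = np.random.default_rng(0)
a = rng.normal(size=200)
toy = pd.DataFrame({'a': a,
                    'b': rng.normal(size=200),
                    'c': a + rng.normal(scale=0.01, size=200)})
kept = prune_by_vif(toy)
print(kept)  # 'b' plus one of the near-duplicates
```

Recomputing the VIFs after each drop matters: removing one member of a collinear group usually brings the VIFs of the remaining members back down.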
Application_train: Categorical Variables Analysis¶
Distribution of NAME_CONTRACT_TYPE feature and Analysis
This column provides details about the type of loan associated with the applicant. According to the documentation from Home Credit, there are two loan categories: Revolving Loans and Cash Loans. Brief explanation of the two loan types:
- Cash Loans: Fixed-term loans provided as a lump sum (entire loan amount), repaid in regular installments over a set period.
- Revolving Loans: Credit lines that allow borrowing, repayment, and re-borrowing up to a set limit, such as with credit cards.
print(train_df['NAME_CONTRACT_TYPE'].value_counts())
categ_distr_target(train_df, 'NAME_CONTRACT_TYPE')
NAME_CONTRACT_TYPE
Cash loans         278232
Revolving loans     29279
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'NAME_CONTRACT_TYPE':
Cash loans: 91.65%
Revolving loans: 94.52%
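The percentage table above is produced by the notebook's categ_distr_target helper; a hypothetical re-implementation of just that calculation, on a toy frame, could look like this:

```python
import pandas as pd


def pct_repaid_by_category(df, col):
    """Share of TARGET == 0 (repaid on time) within each category, in percent."""
    return (df.groupby(col)['TARGET']
              .apply(lambda s: (s == 0).mean() * 100)
              .round(2))


# Toy frame standing in for train_df (illustrative values only)
toy = pd.DataFrame({
    'NAME_CONTRACT_TYPE': ['Cash loans'] * 4 + ['Revolving loans'] * 2,
    'TARGET': [0, 0, 0, 1, 0, 0],
})
res = pct_repaid_by_category(toy, 'NAME_CONTRACT_TYPE')
print(res)
```

The same groupby pattern underlies every per-category percentage printed in this section.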
Insights
Revolving loans make up only a small fraction (~10%) of the total number of loans; however, when focusing on applicants who had difficulties repaying, Cash loans show a slightly higher percentage of defaults than Revolving loans.
The distribution of loan types and their associated risk (represented by target = 1) shows distinct differences between Cash loans and Revolving loans:
- Distribution of Loan Types:
- Cash loans make up the majority of the data, accounting for 90.48% of all loans.
- Revolving loans represent a much smaller portion, with only 9.52% of the total loans.
- Risk of Default (target = 1) by Loan Type:
- For Cash loans, 8.35% of the loans are associated with client having payment difficulties, indicating default risk.
- For Revolving loans, the percentage of loans with client having payment difficulties is slightly lower, at 5.48%.
Implication:
- Although Revolving loans have a slightly lower percentage of defaults compared to Cash loans, the risk in both categories is notable.
- The higher proportion of Cash loans in the dataset means that the majority of defaults come from this category, potentially influencing model predictions and risk assessments focused on this loan type.
This analysis highlights the importance of loan type in assessing default risk, with Cash loans being both more prevalent and slightly riskier than Revolving loans.
Distribution of CODE_GENDER feature and Analysis
Here we will analyze the distribution of CODE_GENDER feature and its impact on the target variable.
print(train_df['CODE_GENDER'].value_counts())
categ_distr_target(train_df, 'CODE_GENDER')
CODE_GENDER
F      202448
M      105059
XNA         4
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'CODE_GENDER':
F: 93.00%
M: 89.86%
XNA: 100.00%
Insights
The distribution of the CODE_GENDER feature reveals some notable insights:
- There are 4 rows in the application_train table with the gender labeled as 'XNA', which is not meaningful and can be treated as missing (NaN) values. Since these rows are few in number and belong exclusively to the Non-Defaulter category, they can be safely removed without significant impact.
- From the first subplot, it is evident that the dataset contains more female applicants (65.8%) than male applicants (34.2%).
- Interestingly, the second plot shows that male applicants have a higher default rate (10.14%) compared to female applicants (7.00%), despite accounting for a smaller share of total applicants.
These observations suggest that male applicants exhibit a higher likelihood of defaulting on loans compared to female applicants, considering this data.
Distribution of OCCUPATION_TYPE feature and Analysis
The OCCUPATION_TYPE feature represents the type of occupation or employment category of the applicants. Understanding its distribution provides insights into the employment demographics of the dataset and helps assess whether certain occupations are associated with a higher likelihood of loan default (TARGET = 1). By analyzing this feature, we can identify patterns or trends that may influence loan repayment behavior across different occupational groups. This analysis is critical for understanding how employment type impacts credit risk.
print(train_df['OCCUPATION_TYPE'].value_counts())
categ_distr_target(train_df, 'OCCUPATION_TYPE')
OCCUPATION_TYPE
Laborers                 55186
Sales staff              32102
Core staff               27570
Managers                 21371
Drivers                  18603
High skill tech staff    11380
Accountants               9813
Medicine staff            8537
Security staff            6721
Cooking staff             5946
Cleaning staff            4653
Private service staff     2652
Low-skill Laborers        2093
Waiters/barmen staff      1348
Secretaries               1305
Realty agents              751
HR staff                   563
IT staff                   526
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'OCCUPATION_TYPE':
Accountants: 95.17%
Cleaning staff: 90.39%
Cooking staff: 89.56%
Core staff: 93.70%
Drivers: 88.67%
HR staff: 93.61%
High skill tech staff: 93.84%
IT staff: 93.54%
Laborers: 89.42%
Low-skill Laborers: 82.85%
Managers: 93.79%
Medicine staff: 93.30%
Private service staff: 93.40%
Realty agents: 92.14%
Sales staff: 90.37%
Secretaries: 92.95%
Security staff: 89.26%
Waiters/barmen staff: 88.72%
Insights
- Low-skill Laborers have the highest percentage of defaults (17.15%) despite representing only roughly 1% of the total applicants, followed by Drivers (11.33%) and Waiters/barmen staff (11.28%).
- Occupations such as Accountants, High skill tech staff, and Managers show relatively low default percentages, highlighting their financial reliability.
Conclusion:
- The type of occupation has a noticeable impact on loan repayment behavior. Occupations with higher skills or stable jobs (e.g., Accountants, Managers, Core staff) are less likely to default compared to roles like Low-skill Laborers or Waiters/barmen staff.
- This insight can be valuable in predicting credit risk and designing targeted strategies for different occupational categories.
Distribution of ORGANIZATION_TYPE feature and Analysis
This feature is related to the type of organization where the applicant is employed.
# Subplots setup
fig, axes = plt.subplots(1, 2, figsize=(25, 14))
sns.set(style='whitegrid', font_scale=1.2)
# Spacing between plots
plt.subplots_adjust(wspace=0.4)
# Plot 1: Distribution of ORGANIZATION_TYPE
count_organization = train_df[
'ORGANIZATION_TYPE'].value_counts().sort_values(ascending=False)
sns.barplot(
ax=axes[0],
x=count_organization.values,
y=count_organization.index,
order=count_organization.index,
palette="viridis"
)
axes[0].set_title(
'Total Distribution of ORGANIZATION_TYPE', fontsize=16, pad=20)
axes[0].set_xlabel('Counts', fontsize=14)
axes[0].set_ylabel('ORGANIZATION_TYPE', fontsize=14)
# Plot 2: Percentage of Defaulters for each ORGANIZATION_TYPE
default_counts = train_df[
train_df['TARGET'] == 1]['ORGANIZATION_TYPE'].value_counts()
percentage_default_per_organization = (
default_counts / count_organization * 100).dropna().sort_values(
ascending=False)
sns.barplot(
ax=axes[1],
x=percentage_default_per_organization.values,
y=percentage_default_per_organization.index,
order=percentage_default_per_organization.index,
palette="coolwarm"
)
axes[1].set_title(
'Percentage of TARGET = 1 (did not repay on time)', fontsize=16, pad=20)
axes[1].set_xlabel('Percentage of TARGET = 1', fontsize=14)
axes[1].set_ylabel('ORGANIZATION_TYPE', fontsize=14)
# Show the plots
plt.show()
# Count the total number of unique categories
print(f"Total Number of categories in ORGANIZATION_TYPE: "
f"{len(train_df['ORGANIZATION_TYPE'].unique())}")
Total Number of categories in ORGANIZATION_TYPE: 58
Insights
- The feature ORGANIZATION_TYPE contains a wide variety of categories, showcasing applicants’ employment sectors.
- The most common category is “Business Entity Type 3”, with the highest number of applicants.
- Other significant categories include “Self-employed”, “Medicine”, and “Security” sectors.
- Percentage of Defaulters:
- The defaulter percentage varies significantly across categories, indicating that employment sector plays a role in repayment behavior.
- The top 10 categories with the highest percentage of defaulters are:
- “XNA”: ~15%
- “Transport: type 3”
- “Restaurant”
- “Construction”
- “Trade: type 6”
- “Kindergarten”
- “Mobile”
- “Trade: type 4”
- “Housing”
- “Emergency”
- These categories exhibit defaulter percentages ranging from ~6% to ~15%, with “XNA” standing out as a particularly risky category.
- Conclusion:
- The ORGANIZATION_TYPE feature provides valuable insight into loan repayment risk across different employment sectors.
- Sectors like “XNA”, “Transport”, and “Restaurant” have a notably high percentage of defaulters and might require stricter credit assessment.
- On the other hand, sectors with lower defaulter percentages indicate relatively stable repayment behavior.
Distribution of FLAG_DOCUMENT_3 feature and Analysis
The FLAG_DOCUMENT_3 feature indicates whether a specific document (document 3) was submitted during the loan application process. Since it presented a correlation of 0.069525 with the target variable, it is essential to understand its distribution and potential impact on loan repayment behavior.
print(train_df['FLAG_DOCUMENT_3'].value_counts())
categ_distr_target(train_df, 'FLAG_DOCUMENT_3')
FLAG_DOCUMENT_3
1    218340
0     89171
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'FLAG_DOCUMENT_3':
1: 91.16%
0: 93.82%
Insights
- Submitting FLAG_DOCUMENT_3 appears to be slightly associated with a higher likelihood of default. However, the overall differences in repayment and default rates between the two groups are modest.
- While this feature may have some predictive value for loan default, it is likely not a dominant factor on its own.
Distribution of FLAG_EMP_PHONE feature and Analysis
This feature indicates whether the applicant provided a work phone number during the loan application process (1 = YES, 0 = NO).
print(train_df['FLAG_EMP_PHONE'].value_counts())
categ_distr_target(train_df, 'FLAG_EMP_PHONE')
FLAG_EMP_PHONE
1    252125
0     55386
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'FLAG_EMP_PHONE':
1: 91.34%
0: 94.60%
Insights
- Having a flagged work phone (FLAG_EMP_PHONE = 1) seems to slightly correlate with a higher likelihood of default (TARGET = 1), as the default percentage is higher (8.66%) compared to those without a flagged work phone (5.40%).
- However, the majority of applicants in both categories repaid their loans on time (TARGET = 0), so this feature may have a relatively small impact on predicting loan repayment behavior.
Distribution of NAME_HOUSING_TYPE feature and Analysis
This feature describes the housing arrangement of the client, providing insight into their living situation and potentially their financial commitments. Understanding the housing situation is valuable in evaluating the client’s financial stability and repayment ability, as different housing types may indicate varying levels of financial obligation or disposable income.
print(train_df['NAME_HOUSING_TYPE'].value_counts())
categ_distr_target(train_df, 'NAME_HOUSING_TYPE')
NAME_HOUSING_TYPE
House / apartment      272868
With parents            14840
Municipal apartment     11183
Rented apartment         4881
Office apartment         2617
Co-op apartment          1122
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'NAME_HOUSING_TYPE':
House / apartment: 92.20%
With parents: 88.30%
Municipal apartment: 91.46%
Rented apartment: 87.69%
Office apartment: 93.43%
Co-op apartment: 92.07%
Insights
- Distribution:
- The majority of clients (88.73%) live in their own house or apartment, indicating that homeownership is the predominant housing type.
- Other housing types have significantly smaller shares.
- Default Rate:
- Clients living in rented apartments have the highest default rate (12.31%), followed by those living with parents (11.70%), suggesting these groups may face more financial instability.
- Municipal apartment residents also show a relatively high default rate (8.54%).
- The lowest default rates are observed among clients in office apartments (6.57%) and those in houses or apartments they own (7.80%), indicating greater financial stability for these groups.
This analysis highlights the relationship between housing type and loan repayment behavior, where clients who rent or live with parents exhibit a higher likelihood of default, while homeowners and those with employer-provided housing tend to be more reliable borrowers.
Distribution of NAME_EDUCATION_TYPE feature and Analysis
The NAME_EDUCATION_TYPE feature represents the highest level of education attained by the client. Analyzing this feature can help us understand the relationship between education level and loan repayment behavior. This analysis provides insights into how educational background correlates with default risk, aiding in identifying patterns among different education groups.
print(train_df['NAME_EDUCATION_TYPE'].value_counts())
categ_distr_target(train_df, 'NAME_EDUCATION_TYPE')
NAME_EDUCATION_TYPE
Secondary / secondary special    218391
Higher education                  74863
Incomplete higher                 10277
Lower secondary                    3816
Academic degree                     164
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'NAME_EDUCATION_TYPE':
Secondary / secondary special: 91.06%
Higher education: 94.64%
Incomplete higher: 91.52%
Lower secondary: 89.07%
Academic degree: 98.17%
Insights
- Education level plays a significant role in repayment behavior, with higher education levels generally associated with lower default rates.
- Applicants with a “Lower secondary” education level have the highest risk of defaulting, even though they represent only 1.2% of the total applicants, so they may need extra attention during the credit evaluation process.
- The “Academic degree” category, though small in size, demonstrates the most reliable repayment behavior.
Distribution of REGION_RATING_CLIENT_W_CITY feature and Analysis
The REGION_RATING_CLIENT_W_CITY feature represents a rating of the region where the client resides, taking the city into account. The rating takes the values 1, 2, and 3. Analyzing its distribution and relationship with the target variable (TARGET) can help us understand how regional factors, including the client’s city of residence, influence loan repayment behavior and default risk.
print(train_df['REGION_RATING_CLIENT_W_CITY'].value_counts())
categ_distr_target(train_df, 'REGION_RATING_CLIENT_W_CITY')
REGION_RATING_CLIENT_W_CITY
2    229484
3     43860
1     34167
Name: count, dtype: int64

Percentages of TARGET = 0 (repaid on time) for each category in 'REGION_RATING_CLIENT_W_CITY':
2: 92.08%
3: 88.60%
1: 95.16%
Insights
- Applicants from regions rated 3 tend to have the highest default rates, indicating increased credit risk for these areas.
- Conversely, applicants from regions rated 1 show the lowest default rates, making them comparatively lower-risk borrowers.
- This feature provides useful insights into how regional ratings, when accounting for city-level factors, correlate with repayment behavior and default risk.
Distribution of REG_CITY_NOT_LIVE_CITY, REG_CITY_NOT_WORK_CITY and LIVE_CITY_NOT_WORK_CITY features and Analysis
These features represent flags indicating discrepancies between a client’s addresses at the city level:
- REG_CITY_NOT_LIVE_CITY: Indicates whether the client’s permanent address differs from their contact address.
- REG_CITY_NOT_WORK_CITY: Indicates whether the client’s permanent address differs from their work address.
- LIVE_CITY_NOT_WORK_CITY: Indicates whether the client’s contact address differs from their work address.
Understanding the distribution and relationship of these features with the target variable can provide insights into how address mismatches influence loan repayment behavior. Discrepancies in addresses may indicate lifestyle or employment patterns that could potentially correlate with default risk.
categ_distr_target(train_df, 'REG_CITY_NOT_LIVE_CITY')
categ_distr_target(train_df, 'REG_CITY_NOT_WORK_CITY')
categ_distr_target(train_df, 'LIVE_CITY_NOT_WORK_CITY')
Percentages of TARGET = 0 (repaid on time) for each category in 'REG_CITY_NOT_LIVE_CITY':
0: 92.28%
1: 87.77%

Percentages of TARGET = 0 (repaid on time) for each category in 'REG_CITY_NOT_WORK_CITY':
0: 92.69%
1: 89.39%

Percentages of TARGET = 0 (repaid on time) for each category in 'LIVE_CITY_NOT_WORK_CITY':
0: 92.34%
1: 90.03%
Insights
- Applicants with mismatched addresses, whether between permanent, contact, or work addresses, are generally more likely to default on loans.
- Among the three features, REG_CITY_NOT_LIVE_CITY has the strongest correlation with default, followed by REG_CITY_NOT_WORK_CITY and LIVE_CITY_NOT_WORK_CITY.
- These address features may provide useful signals for credit risk modeling, but their contributions should be combined with other significant factors for more robust predictions.
Visualizing Continuous Variables¶
Continuous variables offer a wealth of information about applicants’ profiles and behaviors. Here, we examine three key variables:
- Age: Captures the age of the applicant in years, providing insights into the age distribution and its potential relationship with repayment behavior.
- Employment Duration: Reflects the number of years the applicant has been employed, a crucial indicator of financial stability and repayment capacity.
- Days Since Document Change: Represents the number of days before the loan application that the applicant updated their identity documents, which may highlight recent administrative changes and potential risk factors.
By visualizing these variables, we aim to uncover patterns and trends that differentiate on-time payers from late payers, helping us better understand their impact on credit risk.
Distribution of Applicant Age
In the dataset, the age of applicants is recorded in days, making it less intuitive for analysis. To enhance interpretability, we will convert age into years, providing a clearer and more meaningful representation.
train_df['AGE_YEARS'] = train_df['DAYS_BIRTH'] * -1 / 365
view_continuous_feat(
data=train_df,
column_name='AGE_YEARS',
plots=['distplot', 'box', 'CDF'],
scale_limits=(0, 70), # Age range
log_scale=False
)
Insights
- Density Plot (Left Panel):
- The density plot reveals that the majority of applicants are in the age range of 30 to 50 years.
- Late payers (Defaulters) tend to have a slightly younger distribution compared to on-time payers (Non-defaulters), with a visible shift in the curve toward the left.
- Box Plot (Middle Panel):
- The median age of on-time payers is higher compared to late payers, indicating that older applicants are generally more reliable in repaying their loans.
- Late payers show a slightly wider interquartile range, reflecting more variation in their age group.
- CDF Plot (Right Panel):
- The cumulative distribution shows that:
- Approximately 50% of late payers are younger than ~35 years, whereas the same percentage for on-time payers is slightly higher at ~40 years.
- The curve for late payers rises more steeply at younger ages, suggesting a concentration of younger defaulters.
Conclusions:
- Age plays a significant role in repayment behavior, with younger applicants showing a higher likelihood of default.
- This analysis emphasizes the importance of considering age as a factor during risk evaluation, as older applicants generally exhibit more stable repayment behavior.
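The CDF reading above can be reproduced directly: the value of the empirical CDF at an age threshold is simply the share of the group below that age. The ages below are made-up toy values, not dataset statistics; the real curve uses train_df['AGE_YEARS'] split by TARGET:

```python
import numpy as np

# Toy ages for the late-payer group (illustrative values only)
ages_late = np.array([24, 28, 31, 33, 36, 41, 47, 52])
threshold = 35

# Empirical CDF evaluated at the threshold
share_younger = (ages_late < threshold).mean()
print(f"{share_younger:.0%} of late payers are younger than {threshold}")
```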
Distribution of Days Employed
The dataset records the employment duration of applicants in days, which can be challenging to interpret directly. To make the analysis more meaningful and intuitive, we will convert the recorded values from days into years. This transformation will provide a clearer representation of the distribution of employment duration and enable better insights into applicants’ employment histories.
# Check percentiles for DAYS_EMPLOYED
view_percentiles(train_df, column_name='DAYS_EMPLOYED')
Percentile values for column: DAYS_EMPLOYED
The 0th percentile value of DAYS_EMPLOYED is -17912.00
The 25th percentile value of DAYS_EMPLOYED is -2760.00
The 50th percentile value of DAYS_EMPLOYED is -1213.00
The 75th percentile value of DAYS_EMPLOYED is -289.00
The 90th percentile value of DAYS_EMPLOYED is 365243.00
The 92th percentile value of DAYS_EMPLOYED is 365243.00
The 94th percentile value of DAYS_EMPLOYED is 365243.00
The 96th percentile value of DAYS_EMPLOYED is 365243.00
The 98th percentile value of DAYS_EMPLOYED is 365243.00
The 100th percentile value of DAYS_EMPLOYED is 365243.00
The DAYS_EMPLOYED feature contains the anomalous value 365243.00. In the Kaggle discussion forum, several comments suggest that 365243 in the DAYS_EMPLOYED column is used as a placeholder for missing or "infinite" values, a data-entry convention indicating that no valid employment duration was recorded for those entries. https://www.kaggle.com/c/home-credit-default-risk/discussion/57248
Here's a breakdown of the comment:
Key Points:
- 365243 as "Infinity" or Missing:
In the dataset, 365243 is used to signify "infinity" or missing data for the days_employed column.
This might mean the person has been employed for an indefinite time or that the data for their employment duration is unavailable.
To handle this, you should treat 365243 as a missing value (NA).
- XNA/XAP as Missing:
Similarly, strings like XNA or XAP in other categorical columns are placeholders for missing values.
These can be replaced with NaN (Not a Number) for better handling during data analysis and machine learning.
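Both conventions described above can be neutralized in one pass. A minimal sketch (the toy frame and the `CODE_GENDER` column here are illustrative; on the real data this would be applied to `train_df` and `test_df`):

```python
import numpy as np
import pandas as pd


def clean_placeholders(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the 365243 sentinel and the XNA/XAP strings with NaN."""
    out = df.copy()
    if 'DAYS_EMPLOYED' in out.columns:
        # 365243 marks "infinite"/missing employment duration
        out['DAYS_EMPLOYED'] = out['DAYS_EMPLOYED'].replace(365243, np.nan)
    # XNA / XAP mark missing values in categorical columns
    out = out.replace({'XNA': np.nan, 'XAP': np.nan})
    return out


toy = pd.DataFrame({
    'DAYS_EMPLOYED': [-2760, 365243, -1213],
    'CODE_GENDER': ['M', 'XNA', 'F'],
})
cleaned = clean_placeholders(toy)
```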
train_df['YEARS_EMPLOYED'] = train_df['DAYS_EMPLOYED'] * -1 / 365
view_continuous_feat(
data=train_df,
column_name='YEARS_EMPLOYED',
plots=['distplot', 'box'],
scale_limits=(0, 50),
log_scale=False
)
Insights
The analysis of the YEARS_EMPLOYED feature reveals several key insights:
- Density Distribution:
- The majority of applicants, regardless of repayment behavior, have less than 10 years of employment. The density peaks within this range, indicating that most individuals have relatively short employment histories.
- Differences Between Groups:
- Late payers (represented by the black curve) tend to have slightly longer employment histories compared to on-time payers (red curve), as observed in the extended tail of the late payers’ distribution.
- However, the overall differences between the two groups remain subtle.
- Box Plot Insights:
- The median employment duration is relatively similar for both on-time and late payers.
- A significant number of outliers, with employment durations exceeding 20 years, are observed in the dataset.
In summary, while employment duration shows a slight trend of longer histories for late payers, the differences are not highly pronounced, and most applicants fall within a similar range of employment experience.
Distribution of DAYS_ID_PUBLISH
This feature represents the number of days before the application that the applicant last changed the identity document used to apply for the loan (time is measured relative to the application date). Analyzing this variable can provide insights into the recency of document updates and their potential impact on loan repayment behavior.
view_continuous_feat(data=train_df,
column_name='DAYS_ID_PUBLISH',
plots=['box', 'CDF'],
figsize=(10, 8))
Insights
The DAYS_ID_PUBLISH feature does not appear to be a strong differentiator between on-time payers and late payers. Both groups show similar trends in their identity document update timing. While this feature might have limited predictive power, it can still provide some context in combination with other variables.
Distribution of EXT_SOURCES
Since these features presented the highest correlation with the target variable, it is essential to understand their distributions and relationships with the target variable. By visualizing the EXT_SOURCE features, we can gain insights into how these external scores influence loan repayment behavior and default risk.
- EXT_SOURCE_1 -> 0.217846
- EXT_SOURCE_2 -> 0.213965
- EXT_SOURCE_3 -> 0.247680
view_continuous_feat(data=train_df,
column_name='EXT_SOURCE_1',
plots=['distplot', 'box'],
figsize=(16, 8))
view_continuous_feat(data=train_df,
column_name='EXT_SOURCE_2',
plots=['distplot', 'box'],
figsize=(16, 8))
view_continuous_feat(data=train_df,
column_name='EXT_SOURCE_3',
plots=['distplot', 'box'],
figsize=(16, 8))
Insights
The EXT_SOURCE variables (EXT_SOURCE_1, EXT_SOURCE_2, and EXT_SOURCE_3) show significant differences between on-time payers and late payers. These variables likely capture external risk scores or creditworthiness assessments.
Since the data is already normalized, the distributions of these features are comparable, allowing for direct comparisons between the groups. The meaning can be understood as:
- 0: Indicates the highest risk score, suggesting a higher likelihood of default or poorer creditworthiness.
- 1: Represents the lowest risk score, indicating a lower likelihood of default and stronger creditworthiness.
Here’s a summary of observations:
1. EXT_SOURCE_1:
- The density plot indicates that late payers (red) have lower EXT_SOURCE_1 values on average compared to on-time payers (green).
- The box plot confirms this observation, showing a clear separation between the two groups, with on-time payers having higher EXT_SOURCE_1 scores overall.
2. EXT_SOURCE_2:
- This variable shows a similar pattern to EXT_SOURCE_1, with late payers tending to have lower values and on-time payers clustering around higher scores.
- The box plot highlights a more noticeable separation between the two groups, suggesting EXT_SOURCE_2 could be an important predictor for default risk.
3. EXT_SOURCE_3:
- Like the other two external sources, EXT_SOURCE_3 shows higher values for on-time payers compared to late payers in both the density plot and the box plot.
- The difference in distributions is particularly evident in the density plot, with late payers concentrated toward lower scores.
Key Insight:
All three EXT_SOURCE variables consistently indicate that higher values are associated with lower default risk. This trend suggests that these variables are likely critical in distinguishing creditworthy applicants from higher-risk ones, making them valuable predictors in any risk modeling process.
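The correlation ranking quoted above can be reproduced with `corrwith`. A minimal sketch, shown on an illustrative toy frame (values are made up; on the real data, pass `train_df`):

```python
import pandas as pd


def ext_source_target_corr(df: pd.DataFrame) -> pd.Series:
    """Absolute Pearson correlation of each external score with the target."""
    cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    return df[cols].corrwith(df['TARGET']).abs().sort_values(ascending=False)


toy = pd.DataFrame({
    'EXT_SOURCE_1': [0.9, 0.8, 0.2, 0.1],
    'EXT_SOURCE_2': [0.5, 0.9, 0.4, 0.6],
    'EXT_SOURCE_3': [0.7, 0.6, 0.3, 0.2],
    'TARGET': [0, 0, 1, 1],
})
ranking = ext_source_target_corr(toy)
```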
Statistical Inference¶
Statistical inference involves drawing conclusions about a population based on sample data. In the context of credit default analysis, it helps identify significant patterns and relationships between various features (e.g., income, loan amount, family size) and default risk through hypothesis testing, confidence intervals, and other statistical techniques. This process provides insights to guide feature engineering and predictive modeling.
train_df['TARGET'] = train_df['TARGET'].astype(int)
valid_income_data = train_df[train_df[
'AMT_INCOME_TOTAL'].notnull()]
defaulters_income = valid_income_data[
valid_income_data['TARGET'] == 1]['AMT_INCOME_TOTAL']
non_defaulters_income = valid_income_data[
valid_income_data['TARGET'] == 0]['AMT_INCOME_TOTAL']
defaulters = train_df[
train_df['TARGET'] == 1]
non_defaulters = train_df[
train_df['TARGET'] == 0]
Hypothesis 1: Do Income Levels Significantly Differ Between Defaulters and Non-Defaulters?¶
Null Hypothesis (H₀): There is no significant difference in income levels between defaulters and non-defaulters. The mean income for defaulters is equal to that of non-defaulters.
Alternative Hypothesis (H₁): There is a significant difference in income levels between defaulters and non-defaulters. The mean income for defaulters is not equal to that of non-defaulters.
income_results = hypothesis_test(
'AMT_INCOME_TOTAL', defaulters, non_defaulters)
print("### Hypothesis 1: Income Levels Influence Default Rates ###")
print(f"T-statistic: {income_results['T-statistic']:.2f}, "
f"P-value: {income_results['P-value']:.4f}")
print(f"Defaulters Mean Income: {income_results['Defaulters Mean']:.2f}")
print(f"Non-Defaulters Mean Income: "
f"{income_results['Non-Defaulters Mean']:.2f}")
print(f"95% Confidence Interval: {income_results['95% CI']}")
print(f"Decision: {income_results['Decision']}\n")
### Hypothesis 1: Income Levels Influence Default Rates ### T-statistic: -2.21, P-value: 0.0272 Defaulters Mean Income: 165611.77 Non-Defaulters Mean Income: 169077.73 95% Confidence Interval: (-12763.362769931764, 5831.425269931764) Decision: Reject the null hypothesis
Insights
Based on the t-test results, the mean income for defaulters ($165,611.77) differs from that of non-defaulters ($169,077.73), with a p-value of 0.0272, below the 0.05 significance level, so the null hypothesis is rejected. Note, however, that the reported 95% confidence interval for the difference in mean income (-$12,763.36 to $5,831.43) contains zero, which is inconsistent with a two-sided test rejecting at the 0.05 level; the interval is likely computed by a different method than the test statistic, so this result should be interpreted with caution. In any case, the absolute difference in means is small, so the practical effect size appears limited.
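The `hypothesis_test` helper used throughout this section is imported from `assets.utils.functions` and is not shown in the notebook. A minimal sketch of what such a helper might look like, built on Welch's t-test (`scipy.stats.ttest_ind`); the real helper additionally returns group means and a 95% confidence interval, which are omitted here:

```python
import numpy as np
from scipy import stats


def hypothesis_test_sketch(sample_a, sample_b, alpha=0.05,
                           alternative='two-sided'):
    """Welch's t-test comparing two samples, with a simple decision rule."""
    a = np.asarray(sample_a, dtype=float)
    b = np.asarray(sample_b, dtype=float)
    t_stat, p_value = stats.ttest_ind(a, b, equal_var=False,
                                      alternative=alternative)
    decision = ('Reject the null hypothesis' if p_value < alpha
                else 'Fail to reject the null hypothesis')
    return {'T-statistic': float(t_stat), 'P-value': float(p_value),
            'Mean A': a.mean(), 'Mean B': b.mean(), 'Decision': decision}


# Two clearly separated samples should reject the null
result = hypothesis_test_sketch(range(100), [x + 50 for x in range(100)])
```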
Hypothesis 2: The Loan Amount Has a Positive Influence on Default Risk¶
Null Hypothesis (H₀): The mean loan amount for defaulters is less than or equal to the mean loan amount for non-defaulters.
Alternative Hypothesis (H₁): The mean loan amount for defaulters is greater than the mean loan amount for non-defaulters.
loan_results = hypothesis_test(
'AMT_CREDIT', defaulters, non_defaulters, alternative='greater')
print("### Hypothesis 2: Loan Amount Affects Default Risk ###")
print(f"T-statistic: {loan_results['T-statistic']:.2f}, "
f"P-value: {loan_results['P-value']:.4f}")
print(f"Defaulters Mean Loan Amount: {loan_results['Defaulters Mean']:.2f}")
print(f"Non-Defaulters Mean Loan Amount: "
f"{loan_results['Non-Defaulters Mean']:.2f}")
print(f"95% Confidence Interval: {loan_results['95% CI']}")
print(f"Decision: {loan_results['Decision']}\n")
### Hypothesis 2: Loan Amount Affects Default Risk ### T-statistic: -16.85, P-value: 1.0000 Defaulters Mean Loan Amount: 557778.50 Non-Defaulters Mean Loan Amount: 602648.31 95% Confidence Interval: (-49432.87600700068, -40306.74899299932) Decision: Fail to reject the null hypothesis
Insights
The sample means run in the opposite direction of the stated hypothesis: the mean loan amount for defaulters is significantly lower than that of non-defaulters, and the 95% confidence interval, which lies entirely below zero, confirms this. Because the test is one-sided (alternative: defaulters' mean is greater), the large negative t-statistic yields a p-value of 1.0000, providing no evidence for the alternative. Therefore, we fail to reject the null hypothesis.
Hypothesis 3: Age Distribution Differs Between Defaulters and Non-Defaulters¶
Null Hypothesis (H₀): The age distribution (in years) of defaulters is the same as that of non-defaulters.
Alternative Hypothesis (H₁): The age distribution (in years) of defaulters is different from that of non-defaulters.
age_results = hypothesis_test(
'DAYS_BIRTH', defaulters, non_defaulters)
print("### Hypothesis 3: Age Distribution Differs ###")
print(f"T-statistic: {age_results['T-statistic']:.2f}, "
f"P-value: {age_results['P-value']:.4f}")
print(f"Defaulters Mean Age (Years): "
f"{-age_results['Defaulters Mean'] / 365:.2f}")
print(f"Non-Defaulters Mean Age (Years): "
f"{-age_results['Non-Defaulters Mean'] / 365:.2f}")
print(f"95% Confidence Interval: "
f"{(-age_results['95% CI'][1] / 365, -age_results['95% CI'][0] / 365)}")
print(f"Decision: {age_results['Decision']}\n")
### Hypothesis 3: Age Distribution Differs ### T-statistic: 43.52, P-value: 0.0000 Defaulters Mean Age (Years): 40.78 Non-Defaulters Mean Age (Years): 44.21 95% Confidence Interval: (-3.5833728693545583, -3.2842891613164795) Decision: Reject the null hypothesis
Insights
The results show a significant difference in the age distribution between defaulters and non-defaulters, as evidenced by a T-statistic of 43.52 and a P-value of 0.0000. The mean age of defaulters is 40.78 years, which is notably younger than the mean age of non-defaulters at 44.21 years. The 95% confidence interval for the difference in mean ages ranges from -3.58 to -3.28 years, further confirming the disparity. Therefore, we reject the null hypothesis and conclude that age distribution significantly differs between the two groups.
Hypothesis 4: Employment Stability Impacts Default Risk¶
Null Hypothesis (H₀): There is no significant difference in the average length of employment (DAYS_EMPLOYED) between defaulters and non-defaulters.
Alternative Hypothesis (H₁): The average length of employment significantly differs between defaulters and non-defaulters.
employment_results = hypothesis_test(
'DAYS_EMPLOYED', defaulters, non_defaulters)
print("### Hypothesis 4: Employment Stability and Default Risk ###")
print(f"T-statistic: {employment_results['T-statistic']:.2f}, "
f"P-value: {employment_results['P-value']:.4f}")
print(f"Defaulters Mean Employment Days: "
f"{-employment_results['Defaulters Mean']:.2f}")
print(f"Non-Defaulters Mean Employment Days: "
f"{-employment_results['Non-Defaulters Mean']:.2f}")
ci = employment_results['95% CI']
print(f"95% Confidence Interval: {(-ci[1], -ci[0])}")
print(f"Decision: {employment_results['Decision']}\n")
### Hypothesis 4: Employment Stability and Default Risk ### T-statistic: -24.94, P-value: 0.0000 Defaulters Mean Employment Days: -42394.68 Non-Defaulters Mean Employment Days: -65696.15 95% Confidence Interval: (21724.5494170607, 24878.391933186253) Decision: Reject the null hypothesis
Insights
The test indicates a significant difference in DAYS_EMPLOYED between defaulters and non-defaulters, and the null hypothesis is rejected based on the t-statistic, p-value, and confidence interval. One caveat: the means are computed on raw DAYS_EMPLOYED, which still contains the 365243 placeholder, so the printed (negated) values are distorted and should not be read as literal employment durations; re-running the test after replacing the placeholder with NaN would give a cleaner estimate. The overall direction is nevertheless consistent with employment stability mattering: applicants with shorter employment histories tend to show a higher likelihood of default.
Hypothesis 5: Family Size and Default Risk Correlation¶
Null Hypothesis (H₀): Family size does not impact default risk; the mean family size of defaulters and non-defaulters is the same.
Alternative Hypothesis (H₁): Family size impacts default risk; the mean family size of defaulters differs from that of non-defaulters.
family_results = hypothesis_test(
'CNT_FAM_MEMBERS', defaulters, non_defaulters)
print("### Hypothesis 5: Family Size Impacts Default Risk ###")
print(f"T-statistic: {family_results['T-statistic']:.2f}, "
f"P-value: {family_results['P-value']:.4f}")
print(f"Defaulters Mean Family Size: "
f"{family_results['Defaulters Mean']:.2f}")
print(f"Non-Defaulters Mean Family Size: "
f"{family_results['Non-Defaulters Mean']:.2f}")
print(f"95% Confidence Interval: {family_results['95% CI']}")
print(f"Decision: {family_results['Decision']}\n")
### Hypothesis 5: Family Size Impacts Default Risk ### T-statistic: 5.16, P-value: 0.0000 Defaulters Mean Family Size: 2.18 Non-Defaulters Mean Family Size: 2.15 95% Confidence Interval: (0.018817774837163313, 0.04341281216845192) Decision: Reject the null hypothesis
Insights
The analysis reveals that defaulters tend to have a slightly larger average family size (2.18) compared to non-defaulters (2.15). The T-statistic of 5.16 and a P-value of 0.0000 indicate strong statistical significance. The 95% confidence interval for the difference in means is (0.0188, 0.0434), which does not include zero. The null hypothesis is rejected, suggesting that family size has a statistically significant, albeit small, impact on default risk.
Bureau Data Analysis¶
Context:
- All clients' previous credits provided by other financial institutions that were reported to the Credit Bureau (for clients who have a loan in our sample).
- For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.
print(f'The shape of bureau.csv is: {bureau_df.shape}')
print(f'Number of unique SK_ID_BUREAU in bureau.csv are: '
f'{len(bureau_df.SK_ID_BUREAU.unique())}')
print(f'Number of unique SK_ID_CURR in bureau.csv are: '
f'{len(bureau_df.SK_ID_CURR.unique())}')
train_unique = set(train_df.SK_ID_CURR.unique())
bureau_unique = set(bureau_df.SK_ID_CURR.unique())
print(f'Number of overlapping SK_ID_CURR in application_train.csv and '
f'bureau.csv are: {len(train_unique.intersection(bureau_unique))}')
test_unique = set(test_df.SK_ID_CURR.unique())
print(f'Number of overlapping SK_ID_CURR in application_test.csv and '
f'bureau.csv are: {len(test_unique.intersection(bureau_unique))}')
print(f'Number of duplicate values in bureau: '
f'{bureau_df.shape[0] - bureau_df.drop_duplicates().shape[0]}')
display(bureau_df.head(5))
The shape of bureau.csv is: (1716428, 17) Number of unique SK_ID_BUREAU in bureau.csv are: 1716428 Number of unique SK_ID_CURR in bureau.csv are: 305811 Number of overlapping SK_ID_CURR in application_train.csv and bureau.csv are: 263491 Number of overlapping SK_ID_CURR in application_test.csv and bureau.csv are: 42320 Number of duplicate values in bureau: 0
| SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 215354 | 5714462 | Closed | currency 1 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.0 | 0.0 | NaN | 0.0 | Consumer credit | -131 | NaN |
| 1 | 215354 | 5714463 | Active | currency 1 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.0 | 171342.0 | NaN | 0.0 | Credit card | -20 | NaN |
| 2 | 215354 | 5714464 | Active | currency 1 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.5 | NaN | NaN | 0.0 | Consumer credit | -16 | NaN |
| 3 | 215354 | 5714465 | Active | currency 1 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.0 | NaN | NaN | 0.0 | Credit card | -16 | NaN |
| 4 | 215354 | 5714466 | Active | currency 1 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.0 | NaN | NaN | 0.0 | Consumer credit | -21 | NaN |
Insights
The bureau.csv dataset contains detailed credit bureau data, with 1,716,428 records and 17 features. Key observations include:
- Unique Identifiers:
- There are 1,716,428 unique SK_ID_BUREAU entries, each representing a unique credit record.
- The dataset covers 305,811 unique SK_ID_CURR, indicating the number of unique applicants in this dataset.
- Overlap with Main Datasets:
- A total of 263,491 applicants in application_train.csv overlap with bureau.csv, indicating their credit records are included in the bureau data.
- Similarly, 42,320 applicants in application_test.csv overlap with bureau.csv, making it a valuable source for analyzing both training and testing datasets.
- Data Quality:
- No duplicate values were found in the bureau.csv dataset, indicating high data integrity.
This analysis confirms that the bureau.csv dataset is a comprehensive and reliable source for augmenting the main datasets with additional credit bureau information.
Next, we check for missing values in the bureau dataset: the percentage of missing values in each column, visualized with a bar plot, as was done for the application_train and application_test datasets.
nan_df_bureau = null_df(bureau_df)
view_percent_nan(nan_df_bureau, 'bureau', grid=True)
Number of columns having NaN values: 7 columns
Insights
Total of 7 features out of 17 have missing values in the bureau dataset. The columns with the highest missing percentages are:
- AMT_ANNUITY: ~70% missing values.
- AMT_CREDIT_MAX_OVERDUE: ~65% missing values.
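The `null_df` helper is a project utility and is not shown; computing per-column missing percentages is essentially a one-liner, sketched here on an illustrative toy frame:

```python
import numpy as np
import pandas as pd


def null_percentages(df: pd.DataFrame) -> pd.Series:
    """Percentage of missing values per column, highest first (zeros dropped)."""
    pct = df.isnull().mean() * 100
    return pct[pct > 0].sort_values(ascending=False)


toy = pd.DataFrame({
    'AMT_ANNUITY': [np.nan, np.nan, np.nan, 100.0],
    'AMT_CREDIT_SUM': [1.0, 2.0, np.nan, 4.0],
    'SK_ID_BUREAU': [1, 2, 3, 4],
})
res = null_percentages(toy)
```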
Merging Bureau Data with Application Data¶
# Merge the filtered train_df with bureau_df
bureau_merged = train_df[['SK_ID_CURR', 'TARGET']].merge(
bureau_df, on='SK_ID_CURR', how='left')
# Verify the merge
print("Merging completed. Shape of bureau_merged:", bureau_merged.shape)
print(bureau_merged['TARGET'].value_counts(dropna=False))
Merging completed. Shape of bureau_merged: (1509345, 18) TARGET 0 1390368 1 118977 Name: count, dtype: int64
Correlation Analysis: Categorical Features¶
bureau_merged['TARGET'] = bureau_merged['TARGET'].astype(int)
# Define categorical columns and target
bureau_categorical_cols = ['TARGET', 'CREDIT_ACTIVE',
'CREDIT_CURRENCY', 'CREDIT_TYPE']
# Display top correlations
view_categ_corr_matrix(
data=bureau_merged,
categorical_columns=bureau_categorical_cols,
figsize=(6, 6),
mask_upper=True,
show_target_top_corr=True
)
Categories with highest correlation value with Target:
| Column Name | Phik-Correlation | |
|---|---|---|
| 1 | CREDIT_ACTIVE | 0.064481 |
| 3 | CREDIT_TYPE | 0.049954 |
| 2 | CREDIT_CURRENCY | 0.004993 |
Insights
- CREDIT_ACTIVE:
- This feature shows the strongest correlation with the TARGET variable. It suggests that the current status of the credit (whether active, closed, or other) may provide useful information about the likelihood of repayment issues.
- CREDIT_TYPE:
- The type of credit (e.g., mortgage, car loan, etc.) also exhibits a moderate correlation with the TARGET, indicating its potential relevance for predicting loan defaults.
- CREDIT_CURRENCY:
- This feature shows minimal correlation with the TARGET, suggesting that the currency in which the credit was issued does not significantly influence repayment behavior.
These findings can guide feature selection, highlighting CREDIT_ACTIVE and CREDIT_TYPE as more predictive features for modeling.
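The phik scores above come from the project's plotting helper. As a cross-check, a closely related association measure between a categorical column and the binary target is Cramér's V, computed from a chi-squared contingency table. A sketch (not the notebook's actual method):

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


def cramers_v(x: pd.Series, y: pd.Series) -> float:
    """Cramér's V association between two categorical series (0 to 1)."""
    table = pd.crosstab(x, y)
    chi2 = chi2_contingency(table, correction=False)[0]
    n = table.to_numpy().sum()
    r, k = table.shape
    return float(np.sqrt(chi2 / (n * (min(r, k) - 1))))
```

For example, `cramers_v(bureau_merged['CREDIT_ACTIVE'], bureau_merged['TARGET'])` would give a comparable 0-to-1 association score.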
Correlation Analysis: Numerical Features¶
# Columns to drop
columns_to_drop = ['SK_ID_CURR', 'SK_ID_BUREAU',
'CREDIT_ACTIVE', 'CREDIT_CURRENCY',
'CREDIT_TYPE']
view_num_corr_matrix(
data=bureau_merged,
columns_to_drop=columns_to_drop,
figsize=(10, 8),
mask_upper=True,
linewidth=0.1,
fontsize=10,
cmap='coolwarm'
)
# Select numerical columns dynamically, excluding IDs
numerical_columns_bureau = [
col for col in bureau_merged.columns
if bureau_merged[col].dtype in [
'float32', 'int32'] and col not in [
'SK_ID_CURR', 'SK_ID_BUREAU']
]
# Ensure important columns are included
key_features = ['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE',
'CNT_CREDIT_PROLONG', 'DAYS_CREDIT_UPDATE']
numerical_columns_bureau = list(set(
numerical_columns_bureau + key_features))
# Initialize correlation matrix
corr_mat_bureau = CorrelationMatrix(
data=bureau_merged,
target='TARGET',
numerical_columns=numerical_columns_bureau,
min_unique_values=10
)
# Compute top correlated numerical features
top_corr_numerical_bureau = corr_mat_bureau.target_top_corr(
target_top_columns=10)
# Display results
print("Top Phi-K Correlations for Numerical Features in Bureau:")
display(top_corr_numerical_bureau)
Top Phi-K Correlations for Numerical Features in Bureau:
| Column Name | Phik-Correlation | |
|---|---|---|
| 11 | DAYS_CREDIT | 0.088648 |
| 0 | DAYS_CREDIT_ENDDATE | 0.018997 |
| 5 | AMT_CREDIT_SUM_OVERDUE | 0.005654 |
| 7 | AMT_CREDIT_SUM_LIMIT | 0.005107 |
| 10 | AMT_CREDIT_MAX_OVERDUE | 0.004280 |
| 4 | CNT_CREDIT_PROLONG | 0.003862 |
| 3 | CREDIT_DAY_OVERDUE | 0.002528 |
| 8 | DAYS_CREDIT_UPDATE | 0.002158 |
| 2 | AMT_CREDIT_SUM_DEBT | 0.001695 |
| 9 | AMT_CREDIT_SUM | 0.000670 |
Insights
- Key Correlations with the Target Variable:
- DAYS_CREDIT has the highest correlation with the target (0.088648), indicating it plays a significant role in distinguishing between late payers and on-time payers. It suggests that the number of days since the last credit application is an important predictor.
- Other features, such as DAYS_CREDIT_ENDDATE (0.018997), show some level of association but are less impactful.
- Low Correlation for Other Features:
- Features such as AMT_CREDIT_SUM, AMT_CREDIT_SUM_DEBT, and AMT_ANNUITY have extremely low correlation values, indicating they have minimal direct association with the target variable.
- AMT_CREDIT_SUM_OVERDUE and AMT_CREDIT_SUM_LIMIT have slightly higher correlations, but they remain relatively weak predictors.
While DAYS_CREDIT stands out as the most relevant feature, most of the numerical variables in the bureau dataset have low correlations with the target variable. This indicates limited predictive power for these features individually, and they may need to be engineered further or used in combination to add value to a predictive model.
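One common way to combine these weak individual features, as suggested above, is to aggregate the bureau rows per applicant before merging with the application data. A sketch using pandas named aggregation (the aggregate column names here are illustrative, not from the notebook):

```python
import pandas as pd


def aggregate_bureau(bureau: pd.DataFrame) -> pd.DataFrame:
    """Collapse bureau rows to one row per applicant with summary statistics."""
    return bureau.groupby('SK_ID_CURR').agg(
        BUREAU_LOAN_COUNT=('SK_ID_BUREAU', 'count'),
        DAYS_CREDIT_MIN=('DAYS_CREDIT', 'min'),
        DAYS_CREDIT_MEAN=('DAYS_CREDIT', 'mean'),
        AMT_CREDIT_SUM_TOTAL=('AMT_CREDIT_SUM', 'sum'),
    ).reset_index()


# Toy frame: applicant 1 has two bureau credits, applicant 2 has one
toy = pd.DataFrame({
    'SK_ID_CURR': [1, 1, 2],
    'SK_ID_BUREAU': [10, 11, 12],
    'DAYS_CREDIT': [-500, -200, -1000],
    'AMT_CREDIT_SUM': [1000.0, 2000.0, 500.0],
})
feats = aggregate_bureau(toy)
```

The resulting one-row-per-applicant frame can then be merged onto `train_df` via `SK_ID_CURR`.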
Bureau: Categorical Variables Analysis¶
The goal here is to analyze the Bureau categorical features and their relationship with the target variable.
Distribution of CREDIT_ACTIVE feature and Analysis
This feature is related to the status of the Credit Bureau (CB) reported credits. It provides insights into the current status of the credit, such as whether it is active, closed, or in another state. Analyzing this feature can help us understand how the credit status influences loan repayment behavior and default risk.
unique_values_counts = bureau_merged['CREDIT_ACTIVE'].value_counts()
print(unique_values_counts)
categ_distr_target(bureau_merged, 'CREDIT_ACTIVE')
CREDIT_ACTIVE Closed 917733 Active 541919 Sold 5653 Bad debt 20 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'CREDIT_ACTIVE': Closed: 93.07% Active: 90.71% Sold: 89.90% Bad debt: 80.00%
Insights
- Bad debt accounts stand out with the highest percentage of non-repayment (20%), indicating that these accounts may represent higher-risk borrowers; note, however, that this is by far the smallest category (only 20 records), so the estimate is unstable.
- Sold accounts also show a relatively higher proportion of non-repayment compared to Closed and Active accounts; like Bad debt, this is a small category (5,653 records).
- Credit risk appears more concentrated in Bad debt and Sold accounts, suggesting areas for closer monitoring or policy adjustments.
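The per-category repayment percentages reported above can be computed with a single groupby, sketched here on an illustrative toy frame:

```python
import pandas as pd


def repaid_pct_by_category(df: pd.DataFrame, col: str) -> pd.Series:
    """Percentage of TARGET == 0 (repaid on time) within each category."""
    return (1 - df.groupby(col)['TARGET'].mean()) * 100


toy = pd.DataFrame({
    'CREDIT_ACTIVE': ['Closed', 'Closed', 'Active', 'Bad debt'],
    'TARGET': [0, 0, 1, 1],
})
pcts = repaid_pct_by_category(toy, 'CREDIT_ACTIVE')
```

On the merged data, `repaid_pct_by_category(bureau_merged, 'CREDIT_ACTIVE')` would reproduce the figures printed by `categ_distr_target`.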
Bureau: Numerical Variables Analysis¶
Distribution of DAYS_CREDIT feature and Analysis This feature represents how many days before the current application the client applied for Credit Bureau credit. It provides insight into the recency of the credit application and its potential impact on loan repayment behavior. Since this is recorded in days, we convert it to years for better interpretability.
bureau_merged['YEARS_CREDIT'] = bureau_merged['DAYS_CREDIT'] / -365
view_continuous_feat(
data=bureau_merged,
column_name='YEARS_CREDIT',
plots=['distplot', 'box'],
figsize=(15, 8),
log_scale=False
)
Insights
- Shorter credit histories are more strongly associated with non-repayment, as observed in the higher density of late payers at lower credit years. Late payers tend to have more recent credit applications compared to on-time payers.
- A robust credit history may act as a positive indicator for on-time repayment and creditworthiness.
Distribution of DAYS_CREDIT_ENDDATE feature and Analysis This feature records the remaining duration of the Credit Bureau credit (in days) at the time of the Home Credit application. Analyzing it can help us understand how the remaining credit duration influences loan repayment behavior and default risk.
# Check percentiles for DAYS_CREDIT_ENDDATE
view_percentiles(bureau_merged, column_name='DAYS_CREDIT_ENDDATE')
Percentile values for column: DAYS_CREDIT_ENDDATE The 0th percentile value of DAYS_CREDIT_ENDDATE is -42060.00 The 25th percentile value of DAYS_CREDIT_ENDDATE is -1144.00 The 50th percentile value of DAYS_CREDIT_ENDDATE is -334.00 The 75th percentile value of DAYS_CREDIT_ENDDATE is 473.00 The 90th percentile value of DAYS_CREDIT_ENDDATE is 1332.00 The 92th percentile value of DAYS_CREDIT_ENDDATE is 1503.00 The 94th percentile value of DAYS_CREDIT_ENDDATE is 1693.00 The 96th percentile value of DAYS_CREDIT_ENDDATE is 6258.00 The 98th percentile value of DAYS_CREDIT_ENDDATE is 27473.00 The 100th percentile value of DAYS_CREDIT_ENDDATE is 31199.00
view_continuous_feat(
data=bureau_merged,
column_name='DAYS_CREDIT_ENDDATE',
plots=['box'],
figsize=(8, 6),
log_scale=False
)
Insights
- The majority of the data clusters around 0 days, with a symmetrical distribution of outliers in both directions. Positive values suggest loans yet to end, while negative values indicate past loans.
- The overall trends between On-time Payers and Late Payers are similar, with no significant difference in the central tendencies or variability.
- Possible Actions:
- These extreme values (like -42,060 days, roughly 115 years) will need to be carefully addressed during the data preprocessing stage, as they could distort analysis and model predictions.
- Potential strategies include removing or capping such extreme values to maintain the integrity of the dataset.
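Capping, as suggested, can be done with percentile-based clipping (winsorizing). A minimal sketch; the 1st/99th quantile bounds are illustrative choices, not values from the notebook:

```python
import pandas as pd


def cap_extremes(s: pd.Series, lower_q: float = 0.01,
                 upper_q: float = 0.99) -> pd.Series:
    """Clip a series at the given quantiles to tame extreme outliers."""
    lo, hi = s.quantile(lower_q), s.quantile(upper_q)
    return s.clip(lower=lo, upper=hi)


# Toy series echoing the percentile spread seen in DAYS_CREDIT_ENDDATE
s = pd.Series([-42060.0, -1144.0, -334.0, 473.0, 31199.0])
capped = cap_extremes(s)
```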
Distribution of DAYS_ENDDATE_FACT feature and Analysis This feature represents the number of days since the CB credit ended at the time of the Home Credit application (recorded only for closed credits). Analyzing it can help us understand how the elapsed time since a credit ended influences loan repayment behavior and default risk.
# Check percentiles for DAYS_ENDDATE_FACT
view_percentiles(bureau_merged, column_name='DAYS_ENDDATE_FACT')
Percentile values for column: DAYS_ENDDATE_FACT The 0th percentile value of DAYS_ENDDATE_FACT is -42023.00 The 25th percentile value of DAYS_ENDDATE_FACT is -1503.00 The 50th percentile value of DAYS_ENDDATE_FACT is -900.00 The 75th percentile value of DAYS_ENDDATE_FACT is -427.00 The 90th percentile value of DAYS_ENDDATE_FACT is -177.00 The 92th percentile value of DAYS_ENDDATE_FACT is -145.00 The 94th percentile value of DAYS_ENDDATE_FACT is -111.00 The 96th percentile value of DAYS_ENDDATE_FACT is -78.00 The 98th percentile value of DAYS_ENDDATE_FACT is -43.00 The 100th percentile value of DAYS_ENDDATE_FACT is 0.00
view_continuous_feat(
data=bureau_merged,
column_name='DAYS_ENDDATE_FACT',
plots=['box'],
figsize=(8, 6),
log_scale=False,
scale_limits=[-40000, 0]
)
Insights
- Extreme Values:
- The 0th percentile value is -42,023 days (~115 years), which seems erroneous and unrealistic. This might indicate a data error or an inherited record. These extreme values should be removed during preprocessing to ensure accurate analysis.
- Box-Plot Observations:
- Defaulters (Target = 1): Tend to have fewer days since their credit ended, implying that their previous loans were closed more recently.
- Non-Defaulters (Target = 0): Generally have a longer duration since their credit ended, suggesting better financial behavior over time.
- Percentile Insights:
- Most values for this column lie within a realistic range:
- Median (50th percentile): -900 days (~2.5 years).
- 75th percentile: -427 days (~1.2 years).
- The higher percentiles (closer to 0) suggest that some loans ended very recently.
- Possible Actions:
- The discrepancy in loan closure timelines between defaulters and non-defaulters could be a key indicator in understanding repayment behavior.
- Cleaning the extreme values will make the analysis and models more robust.
Distribution of DAYS_CREDIT_UPDATE feature and Analysis This feature records how many days before the loan application the last information about the Credit Bureau credit was received.
# Check percentiles for DAYS_CREDIT_UPDATE
view_percentiles(bureau_merged, column_name='DAYS_CREDIT_UPDATE')
Percentile values for column: DAYS_CREDIT_UPDATE The 0th percentile value of DAYS_CREDIT_UPDATE is -41947.00 The 25th percentile value of DAYS_CREDIT_UPDATE is -904.00 The 50th percentile value of DAYS_CREDIT_UPDATE is -406.00 The 75th percentile value of DAYS_CREDIT_UPDATE is -33.00 The 90th percentile value of DAYS_CREDIT_UPDATE is -13.00 The 92th percentile value of DAYS_CREDIT_UPDATE is -11.00 The 94th percentile value of DAYS_CREDIT_UPDATE is -9.00 The 96th percentile value of DAYS_CREDIT_UPDATE is -7.00 The 98th percentile value of DAYS_CREDIT_UPDATE is -5.00 The 100th percentile value of DAYS_CREDIT_UPDATE is 372.00
view_continuous_feat(
data=bureau_merged,
column_name='DAYS_CREDIT_UPDATE',
plots=['box'],
figsize=(8, 6),
log_scale=False,
scale_limits=[-40000, 400]
)
Insights
- Extreme Values:
- The 0th percentile value is -41,947 days (~115 years), which is clearly erroneous. This value will need to be removed during the preprocessing stage to maintain the integrity of the analysis.
- Box-Plot Observations:
- Defaulters (Target = 1): Tend to have a smaller number of days since the Credit Bureau information was updated, indicating that their credit information might have been updated more recently compared to Non-Defaulters.
- Non-Defaulters (Target = 0): Have larger median and upper percentile values, suggesting that their credit information tends to remain stable for longer periods.
- Percentile Insights:
- Beyond the extreme 0th percentile, most of the values appear reasonable:
- Median (50th percentile): -406 days (~1.1 years).
- 75th percentile: -33 days (~1 month).
- A small percentage of records have recent updates (e.g., percentiles closer to zero), reflecting regular credit checks or updates.
- Possible Actions:
- The discrepancy in update timelines between defaulters and non-defaulters can provide valuable insights into borrower behavior.
- Cleaning the extreme erroneous value will enhance the data quality and enable more accurate predictions.
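The cleanup suggested above can be sketched with a small helper. This is a minimal sketch on toy data; the helper name and the -20,000-day cutoff are illustrative assumptions, not values fixed by this notebook:

```python
import numpy as np
import pandas as pd


def drop_extreme_days(df, column, lower_bound_days):
    """Set values older than lower_bound_days to NaN (assumed cutoff)."""
    out = df.copy()
    out.loc[out[column] < lower_bound_days, column] = np.nan
    return out


# Toy example: -41947 (~115 years) is clearly erroneous.
toy = pd.DataFrame({'DAYS_CREDIT_UPDATE': [-41947, -904, -406, -33]})
cleaned = drop_extreme_days(toy, 'DAYS_CREDIT_UPDATE', -20000)
print(cleaned['DAYS_CREDIT_UPDATE'].tolist())
```

Marking the extremes as NaN rather than dropping rows keeps the rest of each record available for aggregation.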
Bureau Balance Data Analysis¶
Context:
Monthly balances of previous credits in Credit Bureau.
- This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e. the table has (# loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.
print(f'The shape of bureau_balance.csv is: {bureau_bal_df.shape}')
# duplicated() returns a boolean mask; sum it to count duplicate rows
dup_count = bureau_bal_df.duplicated().sum()
print(f'Number of duplicate values in bureau_balance: {dup_count}')
display(bureau_bal_df.head(5))
The shape of bureau_balance.csv is: (27299925, 3) Number of duplicate values in bureau_balance: 0
| SK_ID_BUREAU | MONTHS_BALANCE | STATUS | |
|---|---|---|---|
| 0 | 5715448 | 0 | C |
| 1 | 5715448 | -1 | C |
| 2 | 5715448 | -2 | C |
| 3 | 5715448 | -3 | C |
| 4 | 5715448 | -4 | C |
print(f'Number of unique SK_ID_BUREAU in bureau_balance.csv are: '
      f'{len(bureau_bal_df.SK_ID_BUREAU.unique())}')
print(f'Number of unique values for STATUS are: '
f'{len(bureau_bal_df.STATUS.unique())}')
print(f'Unique values of STATUS are:\n{bureau_bal_df.STATUS.unique()}')
print(f'Max number of months for Months Balance: '
f'{np.abs(bureau_bal_df.MONTHS_BALANCE.min())}')
Number of unique SK_ID_BUREAU in bureau_balance.csv are: 817395 Number of unique values for STATUS are: 8 Unique values of STATUS are: ['C', '0', 'X', '1', '2', '3', '5', '4'] Categories (8, object): ['0', '1', '2', '3', '4', '5', 'C', 'X'] Max number of months for Months Balance: 96
# NaN values in the bureau_balance dataset
view_percent_nan(null_df(
bureau_bal_df), 'bureau_balance', grid=True)
The dataframe bureau_balance does not contain any NaN values.
Insights
- Dataset Overview:
- The bureau_balance dataset has a total of 27,299,925 rows and 3 columns.
- There are no duplicate rows, ensuring data integrity.
- Unique Identifiers:
- The column SK_ID_BUREAU contains 817,395 unique identifiers, linking each monthly record to a specific previous credit reported to the Credit Bureau.
- The STATUS column has 8 unique values, representing different credit status categories.
- STATUS Values:
- The unique STATUS categories are: ['C', '0', 'X', '1', '2', '3', '5', '4'].
- These represent various credit conditions, such as closed accounts ('C'), no overdue ('0'), unknown status ('X'), and increasing levels of delinquency ('1' through '5').
- Months Balance:
- The dataset spans a maximum of 96 months, providing historical insight into borrowers' credit activity over the last 8 years.
- Missing Values:
- The dataset contains no missing values, ensuring complete data coverage for all records.
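Because each SK_ID_BUREAU carries up to 96 monthly STATUS rows, a natural next step is to roll the monthly history up into per-credit features. A minimal sketch on toy data; the column names match bureau_balance, the '1'-'5' delinquency reading follows the insight above, and the derived feature names are assumptions:

```python
import pandas as pd

# Toy stand-in for bureau_bal_df; column names match the real table.
toy = pd.DataFrame({
    'SK_ID_BUREAU': [1, 1, 1, 2, 2],
    'MONTHS_BALANCE': [0, -1, -2, 0, -1],
    'STATUS': ['C', '0', '1', '2', 'X'],
})

# STATUS '1'-'5' encode increasing delinquency; flag those months.
dpd_statuses = ['1', '2', '3', '4', '5']
agg = (
    toy.assign(IS_DPD=toy['STATUS'].isin(dpd_statuses).astype(int))
       .groupby('SK_ID_BUREAU')
       .agg(MONTHS_TRACKED=('MONTHS_BALANCE', 'size'),
            MONTHS_DPD=('IS_DPD', 'sum'))
       .reset_index()
)
print(agg)
```

Aggregates like these can later be joined back to bureau.csv via SK_ID_BUREAU and then to the application data via SK_ID_CURR.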
Previous Application Data Analysis¶
Context:
- All previous applications for Home Credit loans of clients who have loans in our sample.
- There is one row for each previous application related to loans in our data sample.
print(f'The shape of previous_application is: {prev_app_df.shape}')
print(f'Number of unique SK_ID_PREV in previous_application are: '
      f'{len(prev_app_df.SK_ID_PREV.unique())}')
print(f'Number of unique SK_ID_CURR in previous_application are: '
f'{len(prev_app_df.SK_ID_CURR.unique())}')
train_unique = set(train_df.SK_ID_CURR.unique())
prev_app_unique = set(prev_app_df.SK_ID_CURR.unique())
overlap_train = len(train_unique.intersection(prev_app_unique))
print(f'Number of overlapping SK_ID_CURR in application_train.csv and '
f'previous_application are: {overlap_train}')
test_unique = set(test_df.SK_ID_CURR.unique())
overlap_test = len(test_unique.intersection(prev_app_unique))
print(f'Number of overlapping SK_ID_CURR in application_test.csv and '
f'previous_application are: {overlap_test}')
duplicates = prev_app_df.shape[0] - prev_app_df.drop_duplicates().shape[0]
print(f'Number of duplicate values in previous_application: {duplicates}')
display(prev_app_df.head())
The shape of previous_application is: (1670214, 37) Number of unique SK_ID_PREV in previous_application are: 1670214 Number of unique SK_ID_CURR in previous_application are: 338857 Number of overlapping SK_ID_CURR in application_train.csv and previous_application are: 291057 Number of overlapping SK_ID_CURR in application_test.csv and previous_application are: 47800 Number of duplicate values in previous_application: 0
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430054 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615234 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735352 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335938 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.394531 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
Insights
- Dataset Overview:
- The previous_application dataset contains 1,670,214 rows and 37 columns.
- There are no duplicate rows, ensuring clean and reliable data for analysis.
- Unique Identifiers:
- The column SK_ID_PREV has 1,670,214 unique entries, one for each previous application record.
- The column SK_ID_CURR has 338,857 unique values, corresponding to individual applicants.
- Overlap with Training and Test Data:
- 291,057 applicants (SK_ID_CURR) overlap with the application_train.csv dataset.
- 47,800 applicants overlap with the application_test.csv dataset.
This dataset provides a detailed record of previous loan applications, offering valuable insights into the applicants’ credit and loan histories. It serves as an essential source for understanding historical loan behaviors, which can be leveraged to enhance predictions for new applications.
Previous Application: Null Values analysis¶
# NaN values in the training dataset
view_percent_nan(null_df(
prev_app_df), 'Previous Applications Dataset', grid=True)
Number of columns having NaN values: 16 columns
Insights
- Extent of Missing Values:
- A total of 16 columns in the prev_app_df dataset contain missing (NaN) values.
- Features with High Missing Values:
- Columns such as RATE_INTEREST_PRIVILEGED and RATE_INTEREST_PRIMARY have nearly 100% missing values, indicating these features may not be usable for analysis unless imputed or dropped, though they will still be examined before a final decision is made.
- Insights from Visualization:
- The chart provides a clear overview of the percentage of missing values for each column, helping prioritize which features need attention during preprocessing.
- Possible Actions:
- Columns with excessive NaN values (e.g., >80%) may need to be removed or handled carefully.
- For columns with moderate missing values, consider imputation strategies based on the data distribution or domain-specific knowledge.
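The drop-above-a-threshold rule suggested above can be sketched as follows; the 80% cutoff and the helper name are illustrative assumptions:

```python
import numpy as np
import pandas as pd


def drop_mostly_missing(df, threshold=0.8):
    """Split off columns whose NaN share exceeds the assumed threshold."""
    nan_share = df.isna().mean()
    dropped = sorted(nan_share[nan_share > threshold].index)
    return df.drop(columns=dropped), dropped


# Toy data: one column 90% missing, one 20% missing.
toy = pd.DataFrame({
    'RATE_INTEREST_PRIMARY': [np.nan] * 9 + [0.18],
    'AMT_ANNUITY': [1000.0] * 8 + [np.nan, np.nan],
})
kept, dropped = drop_mostly_missing(toy, threshold=0.8)
print(dropped)  # ['RATE_INTEREST_PRIMARY']
```

Returning the list of dropped columns keeps the preprocessing decision auditable and makes it easy to apply the same column set to the test data.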
Merging Previous Application with Application Data Target¶
# Merge the filtered train_df with prev_app_df
prev_app_merged = train_df[['SK_ID_CURR', 'TARGET']].merge(
prev_app_df, on='SK_ID_CURR', how='left')
# Verify the merge
print("Merging completed. Shape of prev_app_merged:", prev_app_merged.shape)
print(prev_app_merged['TARGET'].value_counts(dropna=False))
Merging completed. Shape of prev_app_merged: (1430155, 38) TARGET 0 1306815 1 123340 Name: count, dtype: int64
Correlation Analysis: Categorical Features¶
prev_app_merged['TARGET'] = prev_app_merged['TARGET'].astype(int)
# Define categorical columns and include additional relevant features
cat_cols_prev_app = ['TARGET'] + prev_app_merged.dtypes[
prev_app_merged.dtypes == 'category'].index.tolist() + [
'NFLAG_INSURED_ON_APPROVAL']
view_categ_corr_matrix(
data=prev_app_merged,
categorical_columns=cat_cols_prev_app,
figsize=(12, 10),
mask_upper=True,
show_target_top_corr=True
)
Categories with highest correlation value with Target:
| Column Name | Phik-Correlation | |
|---|---|---|
| 5 | NAME_CONTRACT_STATUS | 0.088266 |
| 16 | PRODUCT_COMBINATION | 0.063839 |
| 7 | CODE_REJECT_REASON | 0.062771 |
| 1 | NAME_CONTRACT_TYPE | 0.050859 |
| 13 | CHANNEL_TYPE | 0.050302 |
| 10 | NAME_GOODS_CATEGORY | 0.042951 |
| 4 | NAME_CASH_LOAN_PURPOSE | 0.040305 |
| 6 | NAME_PAYMENT_TYPE | 0.039752 |
| 14 | NAME_SELLER_INDUSTRY | 0.038077 |
| 15 | NAME_YIELD_GROUP | 0.034626 |
Insights
- Strong Correlation with TARGET:
- NAME_CONTRACT_STATUS (Phi-K: 0.088), PRODUCT_COMBINATION (Phi-K: 0.064), and CODE_REJECT_REASON (Phi-K: 0.063) show the highest correlations with loan repayment behavior. These features require further investigation as they may significantly influence the likelihood of default.
- Inter-feature Relationships:
- PRODUCT_COMBINATION is closely linked to features like NAME_CONTRACT_TYPE, NAME_PRODUCT_TYPE, and NAME_PORTFOLIO, indicating shared relevance in loan structuring.
- NAME_GOODS_CATEGORY is strongly associated with NAME_SELLER_INDUSTRY, suggesting the type of goods and seller industry may jointly impact credit outcomes.
- Notable Features:
- CHANNEL_TYPE and NAME_CASH_LOAN_PURPOSE also show moderate correlations with TARGET, implying their potential role in loan repayment analysis.
Correlation Analysis: Numerical Features¶
num_cols_prev_app = prev_app_merged.select_dtypes(
include=['float32', 'float64', 'int32', 'int64']).columns.tolist()
num_cols_prev_app = [
col for col in num_cols_prev_app if col not in [
'SK_ID_CURR', 'SK_ID_PREV']
]
corr_mat_prev = CorrelationMatrix(
data=prev_app_merged,
target='TARGET',
numerical_columns=num_cols_prev_app,
min_unique_values=10
)
corr_mat_prev.plot_correlation_matrix(figsize=(14, 12))
top_corr_numerical = corr_mat_prev.target_top_corr(target_top_columns=10)
print("Top Phi-K Correlations for Numerical Features:")
display(top_corr_numerical)
Top Phi-K Correlations for Numerical Features:
| Column Name | Phik-Correlation | |
|---|---|---|
| 11 | CNT_PAYMENT | 0.056639 |
| 9 | DAYS_DECISION | 0.053695 |
| 12 | DAYS_FIRST_DRAWING | 0.048990 |
| 6 | RATE_DOWN_PAYMENT | 0.039592 |
| 5 | HOUR_APPR_PROCESS_START | 0.038121 |
| 8 | RATE_INTEREST_PRIVILEGED | 0.028204 |
| 14 | DAYS_LAST_DUE_1ST_VERSION | 0.027877 |
| 15 | DAYS_LAST_DUE | 0.027321 |
| 16 | DAYS_TERMINATION | 0.026480 |
| 0 | AMT_ANNUITY | 0.013808 |
Insights
- Highly Correlated Feature Pairs:
- Strong correlations are observed between:
- DAYS_TERMINATION and DAYS_LAST_DUE
- AMT_CREDIT, AMT_APPLICATION, and AMT_GOODS_PRICE
- AMT_CREDIT and AMT_ANNUITY
- These relationships suggest redundancy, which may require feature selection or combination.
- Correlation with Target:
- Features like CNT_PAYMENT and DAYS_DECISION show the highest correlation with the TARGET, but these correlations are weak, indicating limited direct relationships with the target variable.
- Key Insights for Modeling:
- Limited correlation with the target implies that complex, non-linear models may better capture patterns in the data.
- Highly correlated feature pairs could introduce multicollinearity and might need to be addressed during feature engineering.
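Pairs such as AMT_CREDIT/AMT_GOODS_PRICE can be flagged programmatically before feature selection. A minimal sketch using plain Pearson correlation on toy data; the 0.9 threshold is an assumed cutoff:

```python
import numpy as np
import pandas as pd


def correlated_pairs(df, threshold=0.9):
    """List numeric column pairs with |Pearson r| above threshold."""
    corr = df.corr(numeric_only=True).abs()
    # Keep only the upper triangle so each pair is reported once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    return [(a, b, round(float(upper.loc[a, b]), 3))
            for a in upper.index for b in upper.columns
            if upper.loc[a, b] > threshold]


# Toy data: AMT_CREDIT and AMT_GOODS_PRICE move almost in lockstep.
toy = pd.DataFrame({
    'AMT_CREDIT': [100.0, 200.0, 300.0, 400.0],
    'AMT_GOODS_PRICE': [90.0, 210.0, 290.0, 410.0],
    'CNT_PAYMENT': [12.0, 6.0, 24.0, 10.0],
})
pairs = correlated_pairs(toy, threshold=0.9)
print(pairs)
```

Dropping (or combining) one column from each flagged pair is a common way to reduce the multicollinearity noted above.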
Previous Application: Categorical Variables Analysis¶
The goal here is to analyze the Bureau categorical features and their relationship with the target variable.
Distribution of NAME_CONTRACT_TYPE feature and Analysis
The NAME_CONTRACT_TYPE feature represents the type of contract associated with previous applications, such as cash loans, consumer loans (POS), and others. Analyzing its distribution helps in understanding the preference for various product types among applicants and their potential correlation with loan repayment behavior.
unique_values_counts = prev_app_merged['NAME_CONTRACT_TYPE'].value_counts()
print(unique_values_counts)
categ_distr_target(prev_app_merged, 'NAME_CONTRACT_TYPE')
NAME_CONTRACT_TYPE Cash loans 626764 Consumer loans 625256 Revolving loans 161368 XNA 313 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'NAME_CONTRACT_TYPE': Cash loans: 90.87% Consumer loans: 92.29% Revolving loans: 89.53% XNA: 79.87%
Insights
The NAME_CONTRACT_TYPE distribution reveals that the majority of previous loans are either Cash Loans (44.33%) or Consumer Loans (44.23%). A smaller proportion consists of Revolving Loans (11.41%) and a negligible number of loans labeled as XNA (0.02%).
- Default Rates:
- Loans categorized as XNA have the highest default rate (20.13%), though their sample size is minimal.
- Revolving Loans follow with a default rate of 10.47%.
- Cash Loans and Consumer Loans exhibit relatively lower default rates, at 9.13% and 7.71% respectively.
- Implications:
- The higher default rate for Revolving Loans and XNA loans suggests they may require stricter risk assessment.
- The low default rate for Consumer Loans highlights their relatively better repayment behavior.
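The per-category percentages reported above come from the project-specific helper categ_distr_target; the underlying computation is a plain groupby on TARGET. A minimal sketch on toy data (the numbers are illustrative, not the real rates):

```python
import pandas as pd

# Toy stand-in for prev_app_merged; values are illustrative only.
toy = pd.DataFrame({
    'NAME_CONTRACT_TYPE': ['Cash loans', 'Cash loans', 'Consumer loans',
                           'Consumer loans', 'Revolving loans'],
    'TARGET': [1, 0, 0, 0, 1],
})

# Default rate per category: share of TARGET = 1 within each group.
rates = (
    toy.groupby('NAME_CONTRACT_TYPE')['TARGET']
       .mean()
       .mul(100)
       .round(2)
       .sort_values(ascending=False)
)
print(rates)
```

The repayment percentage reported by the helper is simply 100 minus this default rate.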
Distribution of NAME_CONTRACT_STATUS feature and Analysis
The NAME_CONTRACT_STATUS feature represents the contract status during the month.
unique_values_counts = prev_app_merged['NAME_CONTRACT_STATUS'].value_counts()
print(unique_values_counts)
categ_distr_target(prev_app_merged, 'NAME_CONTRACT_STATUS')
NAME_CONTRACT_STATUS Approved 886099 Canceled 259441 Refused 245390 Unused offer 22771 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'NAME_CONTRACT_STATUS': Approved: 92.41% Canceled: 90.83% Refused: 88.00% Unused offer: 91.75%
Insights
- Loan Approval Trends:
- Approved loans dominate with 62.68% of all loans.
- Canceled and Refused loans each account for around 18%.
- Unused offers are the least common, making up only 1.61% of all loans.
- Default Rates:
- Loans with a Refused status show the highest default rate at 12%, likely due to profiles being inadequate for approval.
- Loans with a Canceled status have a default rate of 9.17%.
- The Approved loans exhibit the lowest default rate of 7.59%, aligning with logical expectations given their approval status.
These observations suggest that past contract statuses can provide valuable insights into repayment behavior, with “Refused” loans being a significant indicator of higher default likelihood.
Distribution of CODE_REJECT_REASON feature and Analysis
This feature represents the reason for rejection of the previous application.
unique_values_counts = prev_app_merged['CODE_REJECT_REASON'].value_counts()
print(unique_values_counts)
categ_distr_target(prev_app_merged, 'CODE_REJECT_REASON')
CODE_REJECT_REASON XAP 1145533 HC 145984 LIMIT 47773 SCO 32636 CLIENT 22771 SCOFR 10875 XNA 4378 VERIF 3079 SYSTEM 672 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'CODE_REJECT_REASON': XAP: 92.05% HC: 88.27% LIMIT: 87.44% SCO: 90.00% CLIENT: 91.75% SCOFR: 79.07% XNA: 89.90% VERIF: 90.74% SYSTEM: 93.75%
Insights
- Distribution of Rejection Reasons:
- The most common value is XAP, accounting for roughly 81% of all records; in this dataset XAP generally marks applications that were not actually refused.
- Other notable reasons include HC (10.33%) and LIMIT (3.38%).
- Less common reasons include SCOFR, VERIF, and SYSTEM, which together represent a very small fraction of the dataset.
- Default Rates by Rejection Reason:
- Applicants rejected for the SCOFR reason show the highest default rate at 20.93%, indicating a strong association between this rejection reason and defaults.
- Other reasons with relatively higher default rates include LIMIT (12.56%) and HC (11.73%).
- XAP, the most common rejection reason, has a low default rate of 7.95%, suggesting that while frequent, it does not heavily associate with defaults.
- The lowest default rate is observed for the SYSTEM reason, at just 6.25%.
These findings indicate that certain rejection reasons, particularly SCOFR, LIMIT, and HC, may be potential risk indicators of default and warrant further investigation.
Distribution of CHANNEL_TYPE feature and Analysis
unique_values_counts = prev_app_merged['CHANNEL_TYPE'].value_counts()
print(unique_values_counts)
categ_distr_target(prev_app_merged, 'CHANNEL_TYPE')
CHANNEL_TYPE Credit and cash offices 600342 Country-wide 423062 Stone 183508 Regional / Local 92472 Contact center 58891 AP+ (Cash loan) 49624 Channel of corporate sales 5396 Car dealer 406 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'CHANNEL_TYPE': Credit and cash offices: 90.99% Country-wide: 91.74% Stone: 92.63% Regional / Local: 92.39% Contact center: 89.73% AP+ (Cash loan): 87.17% Channel of corporate sales: 93.62% Car dealer: 94.83%
Insights
- Most Common Channels:
- The majority of previous applications were through Credit and Cash Offices (42.47%) and Country-wide (29.93%).
- Other channels, such as Stone (12.98%) and Regional/Local (6.54%), had relatively fewer applications.
- The least-used channels were Car Dealer (0.03%) and Channel of Corporate Sales (0.38%).
- Default Rates by Channel:
- The highest default rate was seen in the AP+ (Cash loan) channel, with 12.83% of applicants defaulting.
- Contact Center and Credit and Cash Offices also showed relatively higher default rates at 10.27% and 9.01%, respectively.
- The Car Dealer channel had the lowest default rate at 5.17%, indicating better creditworthiness in this category.
These insights indicate that certain channels like AP+ (Cash loan) and Contact Center are associated with higher default risks, whereas Car Dealer and Channel of Corporate Sales show relatively safer applicant profiles.
Distribution of PRODUCT_COMBINATION feature and Analysis
unique_values_counts = prev_app_merged['PRODUCT_COMBINATION'].value_counts()
print(unique_values_counts)
categ_distr_target(prev_app_merged, 'PRODUCT_COMBINATION')
PRODUCT_COMBINATION Cash 235101 POS household with interest 226464 POS mobile with interest 190850 Cash X-Sell: middle 120036 Cash X-Sell: low 110599 Card Street 94594 POS industry with interest 83508 POS household without interest 71519 Card X-Sell 66774 Cash Street: high 51699 Cash X-Sell: high 50802 Cash Street: middle 29934 Cash Street: low 28593 POS other with interest 20309 POS mobile without interest 19890 POS industry without interest 10621 POS others without interest 2095 Name: count, dtype: int64 Percentages of TARGET = 0 (repaid on time) for each category in 'PRODUCT_COMBINATION': Cash: 90.46% POS household with interest: 92.27% POS mobile with interest: 91.19% Cash X-Sell: middle: 92.16% Cash X-Sell: low: 93.42% Card Street: 88.71% POS industry with interest: 93.70% POS household without interest: 93.36% Card X-Sell: 90.70% Cash Street: high: 88.64% Cash X-Sell: high: 88.51% Cash Street: middle: 88.38% Cash Street: low: 89.90% POS other with interest: 91.87% POS mobile without interest: 92.12% POS industry without interest: 95.36% POS others without interest: 92.79%
Insights
- Most Common Product Combinations:
- The three most common product combinations are Cash (16.63%), POS Household with Interest (16.02%), and POS Mobile with Interest (13.50%), accounting for about 46% of all applications.
- Other notable combinations include Cash X-Sell: Middle (8.49%) and Cash X-Sell: Low (7.83%), with lower representation for categories like POS Others Without Interest (0.15%) and POS Industry Without Interest (0.75%).
- Default Rates by Product Combination:
- Highest Default Rates:
- Categories like Cash Street: High (11.36%), Cash X-Sell: High (11.49%), Card Street (11.29%), and Cash Street: Middle (11.62%) show the highest percentage of defaulters.
- Lowest Default Rates:
- The POS Industry Without Interest category exhibits the lowest default rate at 4.64%, indicating safer credit profiles.
- Other combinations such as Cash X-Sell: Low (6.58%) and POS Household Without Interest (6.64%) also demonstrate relatively low default rates.
Observations and Insights:
- Cash-related categories (like Cash Street and Cash X-Sell) show a higher likelihood of defaults compared to POS-related categories.
- POS combinations without interest are generally associated with lower default rates, likely reflecting more stable repayment behaviors.
Previous Application: Numerical Variables Analysis¶
Distribution of DAYS_DECISION feature and Analysis
This feature indicates, relative to the current application, when the decision about the previous application was made.
view_continuous_feat(
data=prev_app_merged,
column_name='DAYS_DECISION',
plots=['distplot', 'box'],
figsize=(15, 8),
log_scale=False,
)
Insights
- Distribution Insights:
- The distribution of DAYS_DECISION shows that decisions for both defaulters and non-defaulters cluster closer to recent times, with the peak around 0 to -500 days.
- Non-defaulters have slightly more spread-out decision timelines compared to defaulters.
- Observations:
- Defaulters: The decision on previous applications for defaulters tends to be made more recently than for non-defaulters.
- Non-Defaulters: Non-defaulters generally had decisions on their previous applications made further back in time, implying potentially better credit profiles.
This behavior may indicate that recent application decisions could serve as a subtle indicator of financial instability or repayment challenges.
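The defaulter vs. non-defaulter comparison above can be quantified with a simple group summary. A minimal sketch on toy data (values are illustrative, not the real distribution):

```python
import pandas as pd

# Toy stand-in for prev_app_merged; values are illustrative only.
toy = pd.DataFrame({
    'TARGET': [1, 1, 1, 0, 0, 0],
    'DAYS_DECISION': [-100, -250, -400, -600, -900, -1500],
})

# Summarise decision recency for defaulters vs. non-defaulters.
summary = toy.groupby('TARGET')['DAYS_DECISION'].agg(['median', 'mean'])
print(summary)
```

In this toy data, defaulters (TARGET = 1) show the more recent median decision (-250 vs. -900 days), mirroring the pattern described above.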
Distribution of DAYS_FIRST_DRAWING feature and Analysis
# Check percentiles for DAYS_FIRST_DRAWING
view_percentiles(prev_app_merged, column_name='DAYS_FIRST_DRAWING')
view_continuous_feat(
data=prev_app_merged,
column_name='DAYS_FIRST_DRAWING',
plots=['box'],
figsize=(8, 6),
log_scale=False,
scale_limits=[-3000, 0]
)
Percentile values for column: DAYS_FIRST_DRAWING The 0th percentile value of DAYS_FIRST_DRAWING is -2922.00 The 25th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 50th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 75th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 90th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 92th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 94th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 96th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 98th percentile value of DAYS_FIRST_DRAWING is 365243.00 The 100th percentile value of DAYS_FIRST_DRAWING is 365243.00
Insights
- Erroneous Values: The majority of values in DAYS_FIRST_DRAWING are the placeholder 365243 (from the 25th percentile upward). As already mentioned for DAYS_EMPLOYED, 365243 is used in this dataset to signify "infinity" or missing data. These points will need to be removed in the data preprocessing stage.
- Distribution: After excluding erroneous values, Defaulters tend to have their first drawing on previous credit more recently compared to Non-Defaulters. This is evident from the box plot, where Defaulters show lower median and 75th percentile values than Non-Defaulters.
Possible Actions:
- Remove erroneous values (e.g., 365243) during preprocessing.
- Further analyze valid data for more robust insights.
Distribution of DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE, and DAYS_TERMINATION features and Analysis
# Check percentiles for the remaining DAYS_* columns
view_percentiles(prev_app_merged, column_name='DAYS_FIRST_DUE')
print('-'*80)
view_percentiles(prev_app_merged, column_name='DAYS_LAST_DUE_1ST_VERSION')
print('-'*80)
view_percentiles(prev_app_merged, column_name='DAYS_LAST_DUE')
print('-'*80)
view_percentiles(prev_app_merged, column_name='DAYS_TERMINATION')
Percentile values for column: DAYS_FIRST_DUE The 0th percentile value of DAYS_FIRST_DUE is -2892.00 The 25th percentile value of DAYS_FIRST_DUE is -1630.00 The 50th percentile value of DAYS_FIRST_DUE is -825.00 The 75th percentile value of DAYS_FIRST_DUE is -408.00 The 90th percentile value of DAYS_FIRST_DUE is -173.00 The 92th percentile value of DAYS_FIRST_DUE is -137.00 The 94th percentile value of DAYS_FIRST_DUE is -86.00 The 96th percentile value of DAYS_FIRST_DUE is -4.00 The 98th percentile value of DAYS_FIRST_DUE is 365243.00 The 100th percentile value of DAYS_FIRST_DUE is 365243.00 -------------------------------------------------------------------------------- Percentile values for column: DAYS_LAST_DUE_1ST_VERSION The 0th percentile value of DAYS_LAST_DUE_1ST_VERSION is -2801.00 The 25th percentile value of DAYS_LAST_DUE_1ST_VERSION is -1244.00 The 50th percentile value of DAYS_LAST_DUE_1ST_VERSION is -358.00 The 75th percentile value of DAYS_LAST_DUE_1ST_VERSION is 135.00 The 90th percentile value of DAYS_LAST_DUE_1ST_VERSION is 1374.00 The 92th percentile value of DAYS_LAST_DUE_1ST_VERSION is 365243.00 The 94th percentile value of DAYS_LAST_DUE_1ST_VERSION is 365243.00 The 96th percentile value of DAYS_LAST_DUE_1ST_VERSION is 365243.00 The 98th percentile value of DAYS_LAST_DUE_1ST_VERSION is 365243.00 The 100th percentile value of DAYS_LAST_DUE_1ST_VERSION is 365243.00 -------------------------------------------------------------------------------- Percentile values for column: DAYS_LAST_DUE The 0th percentile value of DAYS_LAST_DUE is -2889.00 The 25th percentile value of DAYS_LAST_DUE is -1316.00 The 50th percentile value of DAYS_LAST_DUE is -534.00 The 75th percentile value of DAYS_LAST_DUE is -73.00 The 90th percentile value of DAYS_LAST_DUE is 365243.00 The 92th percentile value of DAYS_LAST_DUE is 365243.00 The 94th percentile value of DAYS_LAST_DUE is 365243.00 The 96th percentile value of DAYS_LAST_DUE is 365243.00 The 98th percentile value of DAYS_LAST_DUE is 365243.00 The 100th percentile value of DAYS_LAST_DUE is 365243.00 -------------------------------------------------------------------------------- Percentile values for column: DAYS_TERMINATION The 0th percentile value of DAYS_TERMINATION is -2874.00 The 25th percentile value of DAYS_TERMINATION is -1269.00 The 50th percentile value of DAYS_TERMINATION is -494.00 The 75th percentile value of DAYS_TERMINATION is -42.00 The 90th percentile value of DAYS_TERMINATION is 365243.00 The 92th percentile value of DAYS_TERMINATION is 365243.00 The 94th percentile value of DAYS_TERMINATION is 365243.00 The 96th percentile value of DAYS_TERMINATION is 365243.00 The 98th percentile value of DAYS_TERMINATION is 365243.00 The 100th percentile value of DAYS_TERMINATION is 365243.00
Insights
- Erroneous Values:
- Across all the DAYS_* columns (DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE, DAYS_TERMINATION), we observe the presence of erroneous values like 365243. These are evident in the 98th percentile and above in most cases.
- Such values likely represent missing or placeholder data and should be addressed during preprocessing.
- Distribution Insights:
- For valid values, these columns show logical ranges where earlier days correspond to earlier events (e.g., due dates, termination dates). Defaulters generally have earlier due and termination dates compared to Non-Defaulters.
- Removing erroneous values will provide a clearer analysis and better input for modeling.
Possible Actions:
- Preprocessing: Replace or drop the erroneous values (e.g., 365243) to ensure the data is clean for analysis and modeling.
- Analysis Post-Cleanup: Re-evaluate these columns after handling erroneous values to extract meaningful patterns related to TARGET.
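The preprocessing step suggested above, treating 365243 as missing, can be sketched with a small helper; the helper name is an assumption:

```python
import numpy as np
import pandas as pd

DAYS_COLS = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
             'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
             'DAYS_TERMINATION']


def clean_days_placeholder(df, columns=DAYS_COLS, placeholder=365243):
    """Replace the 365243 placeholder with NaN in the given columns."""
    out = df.copy()
    present = [c for c in columns if c in out.columns]
    out[present] = out[present].replace(placeholder, np.nan)
    return out


# Toy example with two of the affected columns.
toy = pd.DataFrame({'DAYS_FIRST_DUE': [-134.0, 365243.0],
                    'DAYS_TERMINATION': [365243.0, -37.0]})
cleaned = clean_days_placeholder(toy)
print(cleaned.isna().sum().tolist())  # [1, 1]
```

Replacing the placeholder with NaN (rather than dropping rows) preserves the valid fields of each application for later aggregation.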
Install Payments Data Analysis¶
Context:
This table represents the repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
- There is a) one row for every payment that was made, plus b) one row for each missed payment.
- One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.
print(f'The shape of installments_payments is: {installments_pay_df.shape}')
print(f'Number of unique SK_ID_PREV in installments_payments are: '
      f'{len(installments_pay_df.SK_ID_PREV.unique())}')
print(f'Number of unique SK_ID_CURR in installments_payments are: '
      f'{len(installments_pay_df.SK_ID_CURR.unique())}')
overlap_train = len(set(train_df.SK_ID_CURR.unique()).intersection(
    set(installments_pay_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_train.csv and '
      f'installments_payments are: {overlap_train}')
overlap_test = len(set(test_df.SK_ID_CURR.unique()).intersection(
    set(installments_pay_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_test.csv and '
      f'installments_payments are: {overlap_test}')
# duplicated() returns a boolean mask; sum it to count duplicate rows
duplicates = installments_pay_df.duplicated().sum()
print(f'Number of duplicate values in installments_payments: {duplicates}')
display(installments_pay_df.head())
The shape of installments_payments is: (13605401, 8) Number of unique SK_ID_PREV in installments_payments are: 997752 Number of unique SK_ID_CURR in installments_payments are: 339587 Number of overlapping SK_ID_CURR in application_train.csv and installments_payments are: 291643 Number of overlapping SK_ID_CURR in application_test.csv and installments_payments are: 47944 Number of duplicate values in installments_payments: 0
Insights
- Dataset Summary:
- The installments_payments table contains 13.6 million rows representing installment histories for previous loans.
- There are 997,752 unique previous loans (SK_ID_PREV) across 339,587 unique applicants (SK_ID_CURR).
- Overlap with Training and Test Data:
- 291,643 applicants from application_train.csv had previous loans.
- 47,944 applicants from application_test.csv had previous loans.
- No duplicate rows are present in the dataset.
- Key Observations:
- Nearly all applicants in the training and test datasets have installment histories in the installments_payments table, indicating a rich source of historical repayment information.
- This table provides 8 unique features, primarily related to installment statistics, offering crucial insights into repayment behaviors on previous loans.
- Conclusion:
- The table offers significant value for feature engineering, especially in assessing applicant repayment patterns and their correlations with current loan default probabilities.
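The repayment-pattern features hinted at above can be sketched by rolling the table up to one row per applicant. This is a minimal, hypothetical example: the aggregate names are illustrative, and only the standard installments_payments columns are assumed.

```python
import pandas as pd


def aggregate_installments_per_applicant(inst: pd.DataFrame) -> pd.DataFrame:
    """Roll installment history up to one row per SK_ID_CURR.

    The aggregate names are illustrative, not the notebook's features.
    """
    return inst.groupby('SK_ID_CURR').agg(
        N_INSTALLMENTS=('SK_ID_PREV', 'count'),
        N_PREV_LOANS=('SK_ID_PREV', 'nunique'),
        MEAN_AMT_PAYMENT=('AMT_PAYMENT', 'mean'),
        MAX_AMT_INSTALMENT=('AMT_INSTALMENT', 'max'),
    ).reset_index()
```

The resulting frame can then be merged back onto train_df via SK_ID_CURR, the same key used for the TARGET merge in this section.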
Install Payments: Null Values analysis¶
# NaN values in the installments_payments dataset
view_percent_nan(null_df(
installments_pay_df), 'Install Payments Dataset', grid=True)
Number of columns having NaN values: 2 columns
Insights
- Out of the 8 columns in the installments_payments table, only 2 columns contain NaN values.
- The proportion of missing values in these columns is minimal, at just 0.02%, which is negligible and unlikely to pose any significant concern.
Merging Install Payments with Application Data Target¶
# Merge the filtered train_df with installments_pay_df
inst_pay_merged = train_df[
['SK_ID_CURR', 'TARGET']].merge(
installments_pay_df, on='SK_ID_CURR', how='left')
# Convert 'TARGET' to numeric
inst_pay_merged['TARGET'] = pd.to_numeric(
inst_pay_merged['TARGET']).round().astype(int)
# Verify the merge
print("Merging completed. Shape of inst_pay_merged:", inst_pay_merged.shape)
print(inst_pay_merged['TARGET'].value_counts(dropna=False))
Merging completed. Shape of inst_pay_merged: (11607460, 9)
TARGET
0    10726258
1      881202
Name: count, dtype: int64
Correlation Analysis: Numerical Features¶
# Select numerical columns for the correlation matrix
num_cols_inst_app = inst_pay_merged.select_dtypes(
include=['float32', 'float64', 'int32', 'int64']).columns.tolist()
num_cols_inst_app = [
col for col in num_cols_inst_app if col not in [
'SK_ID_CURR', 'SK_ID_PREV']]
# Initialize and plot the correlation matrix
corr_mat_inst = CorrelationMatrix(
data=inst_pay_merged,
target='TARGET',
numerical_columns=num_cols_inst_app,
min_unique_values=10
)
# Plot the correlation matrix
corr_mat_inst.plot_correlation_matrix(figsize=(8, 6))
top_corr_numerical = corr_mat_inst.target_top_corr(target_top_columns=10)
print("Top Phi-K Correlations for Numerical Features:")
display(top_corr_numerical)
Top Phi-K Correlations for Numerical Features:
| Column Name | Phik-Correlation | |
|---|---|---|
| 3 | DAYS_ENTRY_PAYMENT | 0.047231 |
| 2 | DAYS_INSTALMENT | 0.046815 |
| 1 | NUM_INSTALMENT_NUMBER | 0.022993 |
| 4 | AMT_INSTALMENT | 0.004125 |
| 5 | AMT_PAYMENT | 0.003084 |
| 0 | NUM_INSTALMENT_VERSION | 0.002198 |
Insights
- Feature Correlations:
- There are two pairs of highly correlated features:
- DAYS_INSTALMENT and DAYS_ENTRY_PAYMENT
- AMT_INSTALMENT and AMT_PAYMENT
- These correlations are logical since they represent expected relationships:
- When the installment was due vs. when it was paid.
- The amount due vs. the amount paid.
- Impact on Feature Engineering:
- These correlated features can be leveraged to derive new, uncorrelated features (e.g., differences or ratios) for more robust predictive modeling.
- Relationship with Target:
- The features do not exhibit significant correlation with the target variable (TARGET), suggesting no strong linear relationship between them.
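As a sketch of the difference/ratio idea above (assuming the merged frame's column names; the derived feature names are illustrative, not the notebook's):

```python
import numpy as np
import pandas as pd


def add_repayment_features(inst: pd.DataFrame) -> pd.DataFrame:
    """Derive de-correlated repayment features from the correlated pairs."""
    out = inst.copy()
    # Positive delay means the payment landed after the due date.
    out['PAYMENT_DELAY'] = out['DAYS_ENTRY_PAYMENT'] - out['DAYS_INSTALMENT']
    # Shortfall between what was due and what was actually paid.
    out['PAYMENT_SHORTFALL'] = out['AMT_INSTALMENT'] - out['AMT_PAYMENT']
    # Fraction paid, guarding against zero-amount installments.
    out['PAYMENT_RATIO'] = (
        out['AMT_PAYMENT'] / out['AMT_INSTALMENT'].replace(0, np.nan))
    return out
```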
Correlation Analysis: Continuous Features¶
First, we will group the data by the SK_ID_PREV field and calculate the mean for each group, resulting in one averaged row for each of the client's previous loans. This will help us analyze the relationship between the continuous features and the target variable.
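A minimal sketch of that grouping (assuming the inst_pay_merged frame built earlier; only numeric columns are averaged):

```python
import pandas as pd


def average_per_previous_loan(inst_pay_merged: pd.DataFrame) -> pd.DataFrame:
    """Return one averaged row per previous loan (SK_ID_PREV)."""
    return (
        inst_pay_merged
        .groupby(['SK_ID_PREV', 'SK_ID_CURR', 'TARGET'], as_index=False)
        .mean(numeric_only=True)
    )
```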
Distribution of DAYS_INSTALMENT feature and Analysis¶
view_continuous_feat(
data=inst_pay_merged,
column_name='DAYS_INSTALMENT',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
- Defaulters (TARGET = 1) tend to have more recent installment due dates (DAYS_INSTALMENT closer to 0) compared to non-defaulters (TARGET = 0).
- Non-defaulters generally have installments due farther back in time compared to defaulters, as seen from their median and interquartile range.
- This suggests that overdue or recent due dates might be indicative of higher default risk, which could be explored further for feature engineering.
Distribution of DAYS_ENTRY_PAYMENT feature and Analysis¶
view_continuous_feat(
data=inst_pay_merged,
column_name='DAYS_ENTRY_PAYMENT',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
- Defaulters (TARGET = 1) tend to have more recent payment dates (DAYS_ENTRY_PAYMENT closer to 0) compared to non-defaulters (TARGET = 0).
- Non-defaulters generally have their payments recorded further back in time compared to defaulters, as seen from their median and interquartile range.
- This suggests that the recency of payment might correlate with default risk, potentially indicating payment delays or financial instability among defaulters.
- The presence of outliers for non-defaulters with very distant payment dates should also be investigated further.
POS_CASH_balance Data Analysis¶
Context:
Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
- This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.
print(f'The shape of poscash_bal_df is: {poscash_bal_df.shape}')
print(f'Number of unique SK_ID_PREV in poscash_bal_df are: '
f'{len(poscash_bal_df.SK_ID_PREV.unique())}')
print(f'Number of unique SK_ID_CURR in poscash_bal_df are: '
f'{len(poscash_bal_df.SK_ID_CURR.unique())}')
overlap_train = len(set(train_df.SK_ID_CURR.unique()).intersection(
set(poscash_bal_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_train.csv and '
f'poscash_bal_df are: {overlap_train}')
overlap_test = len(set(test_df.SK_ID_CURR.unique()).intersection(
set(poscash_bal_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_test.csv and '
f'poscash_bal_df are: {overlap_test}')
duplicates = poscash_bal_df.duplicated().sum()
print(f'Number of duplicate values in poscash_bal_df: {duplicates}')
display(poscash_bal_df.head())
The shape of poscash_bal_df is: (10001358, 8)
Number of unique SK_ID_PREV in poscash_bal_df are: 936325
Number of unique SK_ID_CURR in poscash_bal_df are: 337252
Number of overlapping SK_ID_CURR in application_train.csv and poscash_bal_df are: 289444
Number of overlapping SK_ID_CURR in application_test.csv and poscash_bal_df are: 47808
Number of duplicate values in poscash_bal_df: 0
Insights
- The poscash_bal_df dataset contains 10,001,358 records across 8 features.
- It includes 936,325 unique previous loans (SK_ID_PREV) and 337,252 unique clients (SK_ID_CURR).
- Out of these, 289,444 clients overlap with the training dataset and 47,808 clients overlap with the test dataset.
- There are no duplicate records in the dataset.
POS_CASH_balance: Null Values analysis¶
# NaN values in the POS_CASH_balance
view_percent_nan(null_df(
poscash_bal_df), 'POS_CASH_balance Dataset', grid=True)
Number of columns having NaN values: 2 columns
Insights
- The POS_CASH_balance dataset has two columns (CNT_INSTALMENT_FUTURE and CNT_INSTALMENT) containing NaN values.
- Both columns show a similar proportion of missing values, approximately 25%.
- Although significant, this level of missing data can potentially be imputed or managed during preprocessing to retain useful insights from the dataset.
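One way the imputation could be handled (an assumption, not this notebook's final strategy): fill gaps within each previous loan's month-ordered history first, then fall back to the column median for loans with no observed value at all.

```python
import pandas as pd


def impute_pos_cash(pos: pd.DataFrame) -> pd.DataFrame:
    """Fill CNT_INSTALMENT* gaps per loan, then by column median."""
    out = pos.sort_values(['SK_ID_PREV', 'MONTHS_BALANCE']).copy()
    for col in ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']:
        # Propagate known values within each previous loan's history.
        out[col] = out.groupby('SK_ID_PREV')[col].transform(
            lambda s: s.ffill().bfill())
        # Loans with no observed value at all fall back to the median.
        out[col] = out[col].fillna(out[col].median())
    return out
```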
Merging POS_CASH_balance with Application Data Target¶
# Merge the filtered train_df with pos_cash_bal_df
pos_cash_merged = train_df[
['SK_ID_CURR', 'TARGET']].merge(
poscash_bal_df, on='SK_ID_CURR', how='left')
# Verify the merge
print("Merging completed. Shape of pos_cash_merged:", pos_cash_merged.shape)
print(pos_cash_merged['TARGET'].value_counts(dropna=False))
pos_cash_merged['TARGET'] = pos_cash_merged['TARGET'].astype(int)
Merging completed. Shape of pos_cash_merged: (8561442, 9)
TARGET
0    7931652
1     629790
Name: count, dtype: int64
Correlation Analysis: Numerical Features¶
# Select numerical columns for the correlation matrix
num_cols_pos_cash = pos_cash_merged.select_dtypes(
include=['float32', 'float64', 'int32', 'int64']).columns.tolist()
num_cols_pos_cash = [
col for col in num_cols_pos_cash if col not in [
'SK_ID_CURR', 'SK_ID_PREV']
]
# Initialize and plot the correlation matrix
corr_mat_pos_cash = CorrelationMatrix(
data=pos_cash_merged,
target='TARGET',
numerical_columns=num_cols_pos_cash,
min_unique_values=10
)
# Plot the correlation matrix
corr_mat_pos_cash.plot_correlation_matrix(figsize=(8, 6))
top_corr_numerical = corr_mat_pos_cash.target_top_corr(target_top_columns=10)
print("Top Phi-K Correlations for Numerical Features:")
display(top_corr_numerical)
Top Phi-K Correlations for Numerical Features:
| Column Name | Phik-Correlation | |
|---|---|---|
| 2 | CNT_INSTALMENT_FUTURE | 0.033194 |
| 1 | CNT_INSTALMENT | 0.030947 |
| 0 | MONTHS_BALANCE | 0.027383 |
| 3 | SK_DPD | 0.012773 |
| 4 | SK_DPD_DEF | 0.010539 |
Insights
The heatmap displays the correlation among numerical features in the pos_cash_merged dataset.
- A notable pair of moderately correlated features is CNT_INSTALMENT and CNT_INSTALMENT_FUTURE, which is logical as these features represent past and future installment counts, respectively.
- The correlation of features with the target variable (TARGET) is quite low, indicating that these features do not have a strong linear relationship with the likelihood of repayment or default.
Correlation Analysis: Continuous Features¶
Distribution of CNT_INSTALMENT_FUTURE feature and Analysis¶
view_continuous_feat(
data=pos_cash_merged,
column_name='CNT_INSTALMENT_FUTURE',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
The box plot above illustrates the distribution of CNT_INSTALMENT_FUTURE (future installment counts) for both defaulters (TARGET = 1) and non-defaulters (TARGET = 0).
- Defaulters (TARGET = 1) generally have a slightly lower median number of future installments compared to non-defaulters.
- Non-defaulters exhibit a wider range of future installment counts, with more outliers on the higher end.
- This distribution suggests that defaulters are less likely to have a large number of future installments, possibly indicating shorter or fewer ongoing loans. However, the difference between the two groups is not very pronounced.
Credit Card Balance Data Analysis¶
Context:
- Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
- This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * #of months where we have some history observable for the previous credit card) rows.
print(f'The shape of credit_card_bal_df is: {credit_card_bal_df.shape}')
print(f'Number of unique SK_ID_PREV in credit_card_bal_df are: '
f'{len(credit_card_bal_df.SK_ID_PREV.unique())}')
print(f'Number of unique SK_ID_CURR in credit_card_bal_df are: '
f'{len(credit_card_bal_df.SK_ID_CURR.unique())}')
overlap_train = len(set(train_df.SK_ID_CURR.unique()).intersection(
set(credit_card_bal_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_train.csv and '
f'credit_card_bal_df are: {overlap_train}')
overlap_test = len(set(test_df.SK_ID_CURR.unique()).intersection(
set(credit_card_bal_df.SK_ID_CURR.unique())))
print(f'Number of overlapping SK_ID_CURR in application_test.csv and '
f'credit_card_bal_df are: {overlap_test}')
duplicates = credit_card_bal_df.duplicated().sum()
print(f'Number of duplicate values in credit_card_bal_df: {duplicates}')
display(credit_card_bal_df.head())
The shape of credit_card_bal_df is: (3840312, 23)
Number of unique SK_ID_PREV in credit_card_bal_df are: 104307
Number of unique SK_ID_CURR in credit_card_bal_df are: 103558
Number of overlapping SK_ID_CURR in application_train.csv and credit_card_bal_df are: 86905
Number of overlapping SK_ID_CURR in application_test.csv and credit_card_bal_df are: 16653
Number of duplicate values in credit_card_bal_df: 0
| SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | AMT_PAYMENT_TOTAL_CURRENT | AMT_RECEIVABLE_PRINCIPAL | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2562384 | 378907 | -6 | 56.970001 | 135000 | 0.0 | 877.5 | 0.0 | 877.5 | 1700.324951 | 1800.0 | 1800.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 1 | 0.0 | 1.0 | 35.0 | Active | 0 | 0 |
| 1 | 2582071 | 363914 | -1 | 63975.554688 | 45000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 2250.000000 | 2250.0 | 2250.0 | 60175.078125 | 64875.554688 | 64875.554688 | 1.0 | 1 | 0.0 | 0.0 | 69.0 | Active | 0 | 0 |
| 2 | 1740877 | 371185 | -7 | 31815.224609 | 450000 | 0.0 | 0.0 | 0.0 | 0.0 | 2250.000000 | 2250.0 | 2250.0 | 26926.425781 | 31460.085938 | 31460.085938 | 0.0 | 0 | 0.0 | 0.0 | 30.0 | Active | 0 | 0 |
| 3 | 1389973 | 337855 | -4 | 236572.109375 | 225000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 11795.759766 | 11925.0 | 11925.0 | 224949.281250 | 233048.968750 | 233048.968750 | 1.0 | 1 | 0.0 | 0.0 | 10.0 | Active | 0 | 0 |
| 4 | 1891521 | 126868 | -1 | 453919.468750 | 450000 | 0.0 | 11547.0 | 0.0 | 11547.0 | 22924.890625 | 27000.0 | 27000.0 | 443044.406250 | 453919.468750 | 453919.468750 | 0.0 | 1 | 0.0 | 1.0 | 101.0 | Active | 0 | 0 |
Insights
The credit_card_bal_df dataset contains 3.84M rows and 23 columns. It includes information for 104k unique previous loans (SK_ID_PREV) across 103.5k unique clients (SK_ID_CURR).
- Of these clients, 86.9k overlap with the training dataset, and 16.6k overlap with the test dataset.
- There are no duplicate rows in the dataset, ensuring data integrity for analysis.
Credit Card Balance: Null Values analysis¶
# NaN values in the credit_card_bal_df dataset
view_percent_nan(null_df(
credit_card_bal_df), 'Credit Card Balance Dataset', grid=True)
Number of columns having NaN values: 9 columns
Insights
The credit_card_bal_df dataset contains 9 columns with NaN values out of 23 total features.
- Most of the features with NaN values (7 columns) are related to drawing amounts and counts, with approximately 20% missing values.
- The remaining 2 columns are associated with installment statistics, having a slightly lower proportion of missing values.
- Overall, the percentage of NaN values in this dataset is relatively lower compared to other datasets analyzed so far, making it more manageable for preprocessing.
Merging Credit Card Balance with Application Data Target¶
# Merge the filtered train_df with credit_card_bal_df
credit_card_merged = train_df[
['SK_ID_CURR', 'TARGET']].merge(
credit_card_bal_df, on='SK_ID_CURR', how='left')
# Verify the merge
print(
"Merging completed. Shape of credit_card_merged:", credit_card_merged.shape
)
print(credit_card_merged['TARGET'].value_counts(dropna=False))
credit_card_merged['TARGET'] = credit_card_merged['TARGET'].astype(int)
Merging completed. Shape of credit_card_merged: (3448571, 24)
TARGET
0    3201279
1     247292
Name: count, dtype: int64
# Select numerical columns for the correlation matrix
num_cols_credit_card = credit_card_merged.select_dtypes(
include=['float32', 'float64', 'int32', 'int64']).columns.tolist()
num_cols_credit_card = [
col for col in num_cols_credit_card if col not in [
'SK_ID_CURR', 'SK_ID_PREV']
]
# Initialize and plot the correlation matrix
corr_mat_credit_card = CorrelationMatrix(
data=credit_card_merged,
target='TARGET',
numerical_columns=num_cols_credit_card,
min_unique_values=10
)
# Plot the correlation matrix
corr_mat_credit_card.plot_correlation_matrix(figsize=(15, 13))
top_corr_numerical = corr_mat_credit_card.target_top_corr(
target_top_columns=10)
print("Top Phi-K Correlations for Numerical Features:")
display(top_corr_numerical)
Top Phi-K Correlations for Numerical Features:
| Column Name | Phik-Correlation | |
|---|---|---|
| 1 | AMT_BALANCE | 0.059838 |
| 11 | AMT_RECIVABLE | 0.059311 |
| 12 | AMT_TOTAL_RECEIVABLE | 0.059288 |
| 10 | AMT_RECEIVABLE_PRINCIPAL | 0.058895 |
| 0 | MONTHS_BALANCE | 0.050330 |
| 7 | AMT_INST_MIN_REGULARITY | 0.042174 |
| 17 | CNT_INSTALMENT_MATURE_CUM | 0.037519 |
| 13 | CNT_DRAWINGS_ATM_CURRENT | 0.030052 |
| 2 | AMT_CREDIT_LIMIT_ACTUAL | 0.028752 |
| 14 | CNT_DRAWINGS_CURRENT | 0.027841 |
Insights
- A few groups of highly correlated features are identified:
- AMT_RECEIVABLE_PRINCIPAL, AMT_RECIVABLE, AMT_TOTAL_RECEIVABLE, and AMT_BALANCE: These are closely related and reflect similar financial measures.
- AMT_PAYMENT_TOTAL_CURRENT and AMT_PAYMENT_CURRENT: These features are also strongly correlated as they represent similar payment metrics.
- These correlations are logical, as the features describe similar aspects of credit card usage and repayment.
- Correlation with TARGET is minimal, indicating a weak linear relationship between these features and the likelihood of repayment issues.
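Beyond dropping near-duplicates, the balance/limit columns suggest a derived feature. A hypothetical sketch (the UTILIZATION name is illustrative; only the AMT_BALANCE and AMT_CREDIT_LIMIT_ACTUAL columns shown above are assumed):

```python
import numpy as np
import pandas as pd


def add_utilization(cc: pd.DataFrame) -> pd.DataFrame:
    """Monthly credit utilization: balance relative to the actual limit."""
    out = cc.copy()
    # A zero limit would divide to infinity, so treat it as missing.
    limit = out['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
    out['UTILIZATION'] = out['AMT_BALANCE'] / limit
    return out
```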
Correlation Analysis: Continuous Features¶
Distribution of AMT_BALANCE feature and Analysis¶
This feature represents the balance on the credit card at the end of the month, i.e. the balance during the month of the previous credit.
view_continuous_feat(
data=credit_card_merged,
column_name='AMT_BALANCE',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
The box plot shows the distribution of AMT_BALANCE across defaulters (TARGET = 1) and non-defaulters (TARGET = 0):
- Defaulters (TARGET = 1) tend to have slightly higher balances on average compared to non-defaulters, as seen by the elevated median value in the red box plot.
- Outliers are present in both groups, with some individuals having exceptionally high balances, especially in the non-defaulter category (TARGET = 0).
- Despite some variation, there is no drastic difference in the overall distribution, suggesting that AMT_BALANCE alone may not strongly distinguish defaulters from non-defaulters.
Distribution of AMT_TOTAL_RECEIVABLE feature and Analysis¶
This feature represents the total amount receivable on the previous credit.
view_continuous_feat(
data=credit_card_merged,
column_name='AMT_TOTAL_RECEIVABLE',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
The box plot illustrates the distribution of AMT_TOTAL_RECEIVABLE for defaulters (TARGET = 1) and non-defaulters (TARGET = 0):
- Defaulters (TARGET = 1) have a slightly higher median value for AMT_TOTAL_RECEIVABLE compared to non-defaulters, indicating they tend to have more outstanding amounts on average.
- Outliers are observed in both groups, with significantly higher values in some cases, particularly for non-defaulters (TARGET = 0).
- The overall distribution shows substantial overlap between the two groups, suggesting that while AMT_TOTAL_RECEIVABLE may provide some insight, it alone may not be a strong distinguishing factor between defaulters and non-defaulters.
Distribution of CNT_INSTALMENT_MATURE_CUM feature and Analysis¶
This feature represents the number of paid installments on the previous credit.
view_continuous_feat(
data=credit_card_merged,
column_name='CNT_INSTALMENT_MATURE_CUM',
plots=['box'],
figsize=(8, 6),
log_scale=False,
)
Insights
The box plot depicts the distribution of CNT_INSTALMENT_MATURE_CUM for defaulters (TARGET = 1) and non-defaulters (TARGET = 0):
- Non-defaulters (TARGET = 0) tend to have slightly higher cumulative matured installments compared to defaulters. The median value for non-defaulters is higher, suggesting they typically have more completed installments.
- Outliers are present in both groups, with significantly high values observed, but the spread and range are similar for both defaulters and non-defaulters.
- The overlap in the distributions indicates that CNT_INSTALMENT_MATURE_CUM may have limited discriminative power between defaulters and non-defaulters on its own. However, it could still provide value in combination with other features.
Main conclusions from EDA¶
Based on the comprehensive EDA performed, we can derive several key insights and actionable steps for the dataset:
- Data Integration and Merging:
- The dataset consists of multiple tables containing detailed information about past loans, credit card balances, installment payments, and more. These will need to be merged systematically to ensure the combined data is meaningful and maintains the relationships between key features.
- Categorical Variables:
- Certain categorical features, such as NAME_CONTRACT_TYPE, NAME_CONTRACT_STATUS, and PRODUCT_COMBINATION, exhibit strong discriminatory power between defaulters and non-defaulters. These features will be particularly valuable for classification tasks and model development.
- Erroneous and Outlier Data:
- A few continuous numerical variables, such as DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, and DAYS_TERMINATION, contain erroneous or unrealistic values that need to be handled appropriately.
- While addressing outliers, it is important to remember that defaulters often exhibit abnormal behaviors. Thus, removing outliers indiscriminately may result in losing crucial information.
- Feature Correlation:
- The correlation heatmaps revealed sets of highly correlated features, such as AMT_BALANCE, AMT_RECIVABLE, and AMT_TOTAL_RECEIVABLE. These features, while providing similar information, may need dimensionality reduction techniques to avoid redundancy without losing predictive power.
- Class Imbalance:
- The dataset is imbalanced, with significantly fewer defaulters compared to non-defaulters. Special care, such as oversampling, undersampling, or class-weight adjustments, will be necessary during model training to ensure fair performance.
- Behavioral Patterns:
- Defaulters exhibit distinct behavioral patterns, such as more recent credit activity (e.g., DAYS_DECISION) and different installment counts (e.g., CNT_INSTALMENT_FUTURE). These patterns will be key to constructing features for predictive models.
- Handling Missing Values:
- While missing values are present in several tables, they are generally localized to a small number of features. Strategies such as imputation or feature-specific handling will be necessary but not overly complex.
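For the class-imbalance point above, one hedged option is a class-weight adjustment. The sketch below reimplements the common "balanced" heuristic, n_samples / (n_classes * class_count); the helper name is illustrative and no particular model is assumed:

```python
import numpy as np


def balanced_class_weights(y: np.ndarray) -> dict:
    """Weights mirroring scikit-learn's 'balanced' heuristic."""
    classes, counts = np.unique(y, return_counts=True)
    n_samples = y.size
    return {cls: n_samples / (len(classes) * cnt)
            for cls, cnt in zip(classes, counts)}
```

With the heavily skewed TARGET counts seen in the merges above, defaulters would receive a weight roughly an order of magnitude larger than non-defaulters, which a model's class-weight parameter can consume directly.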
Next Steps:
With these insights, we are now ready to transition to the Data Cleaning and Feature Engineering phase. This can involve:
- Handling missing and erroneous values.
- Creating derived features from correlated and redundant variables.
- Encoding categorical variables effectively.
- Addressing class imbalance to ensure robust model performance.
- Preparing the dataset for model building and evaluation.
Suggestions for EDA Improvement¶
- Advanced Visualizations
- Use interactive visualizations to allow stakeholders to drill down into EDA insights.
- Include comparative boxplots, swarm plots, and violin plots to better represent distributions for key variables.
- Feature Redundancy
- Address highly correlated numerical variables identified in the EDA:
- For instance, AMT_TOTAL_RECEIVABLE and AMT_BALANCE.
- Identify redundant features using VIF (Variance Inflation Factor) analysis and drop them to reduce multicollinearity.
- Segmentation Analysis
- Segment applicants by meaningful categories, such as income brackets, employment type, or age groups, to identify specific risk profiles.
- Use clustering algorithms (e.g., K-Means, DBSCAN) on numerical features to discover hidden applicant segments.
- Exploring Temporal Dynamics
- Explore trends in DAYS_INSTALMENT, DAYS_LAST_DUE, and similar time-based variables to identify temporal patterns among defaulters and non-defaulters.
- Segment data by application date ranges to uncover seasonal or economic trends influencing default rates.
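The VIF screen suggested under Feature Redundancy can be sketched with the statsmodels helper already imported in the notebook settings (the vif_table name is illustrative; dropping NaN rows first is an assumption about preprocessing):

```python
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor


def vif_table(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Rank features by Variance Inflation Factor (higher = more redundant)."""
    X = df[cols].dropna().astype(float)
    # Add an intercept column so each VIF is computed against a constant.
    X = X.assign(const=1.0)
    vifs = [variance_inflation_factor(X.values, i)
            for i in range(len(cols))]
    return (pd.DataFrame({'feature': cols, 'VIF': vifs})
            .sort_values('VIF', ascending=False))
```

Features with VIF above a chosen threshold (10 is a common rule of thumb), such as one of the near-duplicate receivable/balance columns, would then be candidates for dropping.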