from assets.utils.functions_feat_eng import *

# Libraries
# ----------------------
# Data Manipulation Libraries
# ----------------------
import pandas as pd
import numpy as np
from scipy.stats import randint, uniform

# ----------------------
# Visualization Libraries
# ----------------------
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable

# ----------------------
# Utility Libraries
# ----------------------
import os
import gc
import pickle
import warnings
from datetime import datetime
import sys
from contextlib import redirect_stdout, redirect_stderr

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# ----------------------
# Jupyter Notebook Customization
# ----------------------
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# ----------------------
# Scikit-learn Libraries
# ----------------------
# Preprocessing
from sklearn.preprocessing import StandardScaler

# Model Selection and Validation
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


# Metrics
from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    roc_curve,
    confusion_matrix
)

# Algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score


# ----------------------
# Advanced Machine Learning Libraries
# ----------------------
# Bayesian Optimization
from bayes_opt import BayesianOptimization

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import DMatrix, train
from xgboost import XGBClassifier

# LightGBM
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor

############################################
# Bureau and Bureau Balance Data Processing
############################################

class preprocess_bureau_balance_and_bureau:
    '''
    Preprocess the tables bureau_balance and bureau.
    Contains 4 member functions:
        1. init method
        2. preprocess_bureau_balance method
        3. preprocess_bureau method
        4. main method
    '''
    
    def __init__(self, file_directory='', verbose=True, dump_to_pickle=False):
        '''
        This function is used to initialize the class members 
        
        Inputs:
            self
            file_directory: Path, str, default = ''
                The path where the file exists. Include a '/' at the end of the path in input
            verbose: bool, default = True
                Whether to enable verbosity or not
            dump_to_pickle: bool, default = False
                Whether to pickle the final preprocessed table or not
                
        Returns:
            None
        '''
        
        self.file_directory = file_directory
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle
        self.start = datetime.now()
        
    def preprocess_bureau_balance(self):
        '''
        Function to preprocess bureau_balance table.
        This function first loads the table into memory, does some feature engineering, and finally
        aggregates the data over SK_ID_BUREAU
        
        Inputs:
            self
            
        Returns:
            preprocessed and aggregated bureau_balance table.
        '''
        
        if self.verbose:
            print('#######################################################')
            print('#          Pre-processing bureau_balance.csv          #')
            print('#######################################################')
            print("\nLoading the DataFrame, bureau_balance.csv, into memory...")

        # Construct the dynamic file path using `file_directory`
        file_path = os.path.join(self.file_directory, 'bureau_balance.csv')
        
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        bureau_balance = pd.read_csv(file_path)

        if self.verbose:
            print("Loaded bureau_balance.csv")
            print(f"Time Taken to load = {datetime.now() - self.start}")
            print("Starting Data Cleaning and Feature Engineering...")

        # As we saw from EDA, bureau_balance has a variable called STATUS, which describes about the status of loan.
        # it has 7 labels, we will label encode them so we give C as 0, and rest increasing
        # Also we will give X the benefit of doubt and keep it as middle value
        dict_for_status = { 'C': 0, '0': 1, '1': 2, '2': 3, 'X': 4, '3': 5, '4': 6, '5': 7}
        bureau_balance['STATUS'] = bureau_balance['STATUS'].map(dict_for_status)

        # Weighing the status with the months_balance
        # Converting months to positive
        bureau_balance['MONTHS_BALANCE'] = np.abs(bureau_balance['MONTHS_BALANCE'])
        bureau_balance['WEIGHTED_STATUS'] = bureau_balance.STATUS / (bureau_balance.MONTHS_BALANCE + 1)

        # Sorting the bureau_balance in ascending order of month and by the bureau SK_ID
        # this is done so as to make the rolling exponential average easily for previous months till current month
        bureau_balance = bureau_balance.sort_values(by=['SK_ID_BUREAU', 'MONTHS_BALANCE'], ascending=[0, 0])
        # We will do exponential weighted average on the encoded status
        # this is because if a person had a bad status 2 years ago, it should be given less weight age today
        # we keep the latent variable alpha = 0.8 doing this for both weighted status and the status itself
        bureau_balance['EXP_WEIGHTED_STATUS'] = bureau_balance.groupby('SK_ID_BUREAU')['WEIGHTED_STATUS'].transform(lambda x: x.ewm(alpha = 0.8).mean())
        bureau_balance['EXP_ENCODED_STATUS'] = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].transform(lambda x: x.ewm(alpha = 0.8).mean())    

        if self.verbose:
            print(f"Total Time Elapsed = {datetime.now() - self.start}")

        #we can see that these datapoints are for 96 months i.e. 8 years.
        #so we will extract the means, and exponential averages for each year separately
        #first we convert month to year
        bureau_balance['MONTHS_BALANCE'] = bureau_balance['MONTHS_BALANCE'] // 12

        # Defining our aggregations
        aggregations_basic = {
            'MONTHS_BALANCE' : ['mean','max'],
            'STATUS' : ['mean','max','first'],
            'WEIGHTED_STATUS' : ['mean','sum','first'],
            'EXP_ENCODED_STATUS' : ['last'],
            'EXP_WEIGHTED_STATUS' : ['last']}

        # Finding aggregates for each year too
        aggregations_for_year = {
            'STATUS' : ['mean','max','last','first'],
            'WEIGHTED_STATUS' : ['mean','max', 'first','last'],
            'EXP_WEIGHTED_STATUS' : ['last'],
            'EXP_ENCODED_STATUS' : ['last'] }

        # Aggregating over whole dataset first
        aggregated_bureau_balance = bureau_balance.groupby(['SK_ID_BUREAU']).agg(aggregations_basic)
        aggregated_bureau_balance.columns = ['_'.join(ele).upper() for ele in aggregated_bureau_balance.columns]

        # Aggregating some of the features separately for latest 2 years
        aggregated_bureau_years = pd.DataFrame()
        for year in range(2):
            year_group = bureau_balance[bureau_balance['MONTHS_BALANCE'] == year].groupby('SK_ID_BUREAU').agg(aggregations_for_year)
            year_group.columns = ['_'.join(ele).upper() + '_YEAR_' + str(year) for ele in year_group.columns]

            if year == 0:
                aggregated_bureau_years = year_group
            else:
                aggregated_bureau_years = aggregated_bureau_years.merge(year_group, on = 'SK_ID_BUREAU', how = 'outer')

        # Aggregating for rest of the years
        aggregated_bureau_rest_years = bureau_balance[bureau_balance.MONTHS_BALANCE > year].groupby(['SK_ID_BUREAU']).agg(aggregations_for_year)
        aggregated_bureau_rest_years.columns = ['_'.join(ele).upper() + '_YEAR_REST' for ele in aggregated_bureau_rest_years.columns]

        # Merging with rest of the years
        aggregated_bureau_years = aggregated_bureau_years.merge(aggregated_bureau_rest_years, on = 'SK_ID_BUREAU', how = 'outer')
        aggregated_bureau_balance = aggregated_bureau_balance.merge(aggregated_bureau_years, on = 'SK_ID_BUREAU', how = 'inner')

        # Filling the missing values obtained after aggregations with 0
        aggregated_bureau_balance.fillna(0, inplace = True)

        if self.verbose:
            print('Done preprocessing bureau_balance.')
            print(f"Initial Size of bureau_balance: {bureau_balance.shape}")
            print(f'Size of bureau_balance after Pre-Processing, Feature Engineering and Aggregation: {aggregated_bureau_balance.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('Pickling pre-processed bureau_balance to bureau_balance_preprocessed.pkl')
            with open(self.file_directory + 'bureau_balance_preprocessed.pkl', 'wb') as f:
                pickle.dump(aggregated_bureau_balance, f)
            if self.verbose:
                print('Finished.')     
        
        return aggregated_bureau_balance
    
    def preprocess_bureau(self, aggregated_bureau_balance):
        '''
        Function to preprocess the bureau table and merge it with the aggregated bureau_balance table.
        Finally aggregates the data over SK_ID_CURR for it to be merged with application_train table.
        
        Inputs:
            self
            aggregated_bureau_balance: DataFrame of aggregated bureau_balance table
        
        Returns:
            Final preprocessed, merged and aggregated bureau table
        '''
        
        if self.verbose:
            start2 = datetime.now()
            print('\n##############################################')
            print('#          Pre-processing bureau.csv         #')
            print('##############################################')
            print("\nLoading the DataFrame, bureau.csv, into memory...")

        bureau = pd.read_csv(self.file_directory + 'bureau.csv')

        if self.verbose:
            print("Loaded bureau.csv")
            print(f"Time Taken to load = {datetime.now() - start2}")
            print("Starting Data Cleaning and Feature Engineering...")

        # Merging it with aggregated bureau_balance on 'SK_ID_BUREAU'
        bureau_merged = bureau.merge(aggregated_bureau_balance, on = 'SK_ID_BUREAU', how = 'left')

        # Merging it with aggregated bureau_balance on 'SK_ID_BUREAU'
        bureau_merged = bureau.merge(aggregated_bureau_balance, on = 'SK_ID_BUREAU', how = 'left')

        # From the EDA we saw some erroneous values in DAYS Fields, we will remove those
        # there are some loans which ended about very long ago, around 100 years ago.
        # Thus we will only keep those loans which have ended in past 50 years.
        bureau_merged['DAYS_CREDIT_ENDDATE'][bureau_merged['DAYS_CREDIT_ENDDATE'] > -50*365] = np.nan
        bureau_merged['DAYS_ENDDATE_FACT'][bureau_merged['DAYS_ENDDATE_FACT'] > -50*365] = np.nan
        # There is also a feature which tells about the number of days ago the Credit Report Came
        bureau_merged['DAYS_CREDIT_UPDATE'][bureau_merged['DAYS_CREDIT_UPDATE'] > -50*365] = np.nan
        
        # Engineering some features based on domain knowledge
        bureau_merged['CREDIT_DURATION'] = np.abs(bureau_merged['DAYS_CREDIT'] - bureau_merged['DAYS_CREDIT_ENDDATE'])
        bureau_merged['FLAG_OVERDUE_RECENT'] = [0 if ele == 0 else 1 for ele in bureau_merged['CREDIT_DAY_OVERDUE']]
        bureau_merged['MAX_AMT_OVERDUE_DURATION_RATIO'] = bureau_merged['AMT_CREDIT_MAX_OVERDUE'] / (bureau_merged['CREDIT_DURATION'] + 0.00001)
        bureau_merged['CURRENT_AMT_OVERDUE_DURATION_RATIO'] = bureau_merged['AMT_CREDIT_SUM_OVERDUE'] / (bureau_merged['CREDIT_DURATION'] + 0.00001)
        bureau_merged['AMT_OVERDUE_DURATION_LEFT_RATIO'] = bureau_merged['AMT_CREDIT_SUM_OVERDUE'] / (bureau_merged['DAYS_CREDIT_ENDDATE'] + 0.00001)
        bureau_merged['CNT_PROLONGED_MAX_OVERDUE_MUL'] = bureau_merged['CNT_CREDIT_PROLONG'] * bureau_merged['AMT_CREDIT_MAX_OVERDUE']
        bureau_merged['CNT_PROLONGED_DURATION_RATIO'] = bureau_merged['CNT_CREDIT_PROLONG'] / (bureau_merged['CREDIT_DURATION'] + 0.00001)
        bureau_merged['CURRENT_DEBT_TO_CREDIT_RATIO'] = bureau_merged['AMT_CREDIT_SUM_DEBT'] / (bureau_merged['AMT_CREDIT_SUM'] + 0.00001)
        bureau_merged['CURRENT_CREDIT_DEBT_DIFF'] = bureau_merged['AMT_CREDIT_SUM'] - bureau_merged['AMT_CREDIT_SUM_DEBT']
        bureau_merged['AMT_ANNUITY_CREDIT_RATIO'] = bureau_merged['AMT_ANNUITY'] / (bureau_merged['AMT_CREDIT_SUM'] + 0.00001)
        bureau_merged['CREDIT_ENDDATE_UPDATE_DIFF'] = np.abs(bureau_merged['DAYS_CREDIT_UPDATE'] - bureau_merged['DAYS_CREDIT_ENDDATE'])

        # Aggregating the bureau_merged df with respect to 'SK_ID_CURR' so as to merge it with application_train later    
        # Firstly we will aggregate the columns based on the category of CREDIT_ACTIVE
        aggregations_CREDIT_ACTIVE = {
                        'DAYS_CREDIT' : ['mean','min','max','last'],
                        'CREDIT_DAY_OVERDUE' : ['mean','max'],
                        'DAYS_CREDIT_ENDDATE' : ['mean','max'],
                        'DAYS_ENDDATE_FACT' : ['mean','min'],
                        'AMT_CREDIT_MAX_OVERDUE': ['max','sum'],
                        'CNT_CREDIT_PROLONG': ['max','sum'],
                        'AMT_CREDIT_SUM' : ['sum','max'],
                        'AMT_CREDIT_SUM_DEBT': ['sum'],
                        'AMT_CREDIT_SUM_LIMIT': ['max','sum'],
                        'AMT_CREDIT_SUM_OVERDUE': ['max','sum'],
                        'DAYS_CREDIT_UPDATE' : ['mean','min'],
                        'AMT_ANNUITY' : ['mean','sum','max'],
                        'CREDIT_DURATION' : ['max','mean'],
                        'FLAG_OVERDUE_RECENT': ['sum'],
                        'MAX_AMT_OVERDUE_DURATION_RATIO' : ['max','sum'],
                        'CURRENT_AMT_OVERDUE_DURATION_RATIO' : ['max','sum'],
                        'AMT_OVERDUE_DURATION_LEFT_RATIO' : ['max', 'mean'],
                        'CNT_PROLONGED_MAX_OVERDUE_MUL' : ['mean','max'],
                        'CNT_PROLONGED_DURATION_RATIO' : ['mean', 'max'],
                        'CURRENT_DEBT_TO_CREDIT_RATIO' : ['mean', 'min'],
                        'CURRENT_CREDIT_DEBT_DIFF' : ['mean','min'],
                        'AMT_ANNUITY_CREDIT_RATIO' : ['mean','max','min'],
                        'CREDIT_ENDDATE_UPDATE_DIFF' : ['max','min'],
                        'STATUS_MEAN' : ['mean', 'max'],
                        'WEIGHTED_STATUS_MEAN' : ['mean', 'max']
                         }

        # We saw from EDA that the two most common type of CREDIT ACTIVE were 'Closed' and 'Active'.
        # So we will aggregate them two separately and the remaining categories separately.
        categories_to_aggregate_on = ['Closed','Active']
        bureau_merged_aggregated_credit = pd.DataFrame()
        for i, status in enumerate(categories_to_aggregate_on):
            group = bureau_merged[bureau_merged['CREDIT_ACTIVE'] == status].groupby('SK_ID_CURR').agg(aggregations_CREDIT_ACTIVE)
            group.columns = ['_'.join(ele).upper() + '_CREDITACTIVE_' + status.upper() for ele in group.columns]

            if i==0:
                bureau_merged_aggregated_credit = group
            else:
                bureau_merged_aggregated_credit = bureau_merged_aggregated_credit.merge(group, on = 'SK_ID_CURR', how = 'outer')
        # Aggregating for remaining categories
        bureau_merged_aggregated_credit_rest = bureau_merged[(bureau_merged['CREDIT_ACTIVE'] != 'Active') & 
                                                             (bureau_merged['CREDIT_ACTIVE'] != 'Closed')].groupby('SK_ID_CURR').agg(aggregations_CREDIT_ACTIVE)
        bureau_merged_aggregated_credit_rest.columns = ['_'.join(ele).upper() + 'CREDIT_ACTIVE_REST' for ele in bureau_merged_aggregated_credit_rest.columns]

        # Merging with other categories
        bureau_merged_aggregated_credit = bureau_merged_aggregated_credit.merge(bureau_merged_aggregated_credit_rest, on = 'SK_ID_CURR', how = 'outer')

        # Encoding the categorical columns in one-hot form
        currency_ohe = pd.get_dummies(bureau_merged['CREDIT_CURRENCY'], prefix = 'CURRENCY')
        credit_active_ohe = pd.get_dummies(bureau_merged['CREDIT_ACTIVE'], prefix = 'CREDIT_ACTIVE')
        credit_type_ohe = pd.get_dummies(bureau_merged['CREDIT_TYPE'], prefix = 'CREDIT_TYPE')

        # Merging the one-hot encoded columns
        bureau_merged = pd.concat([bureau_merged.drop(['CREDIT_CURRENCY','CREDIT_ACTIVE','CREDIT_TYPE'], axis = 1), 
                                   currency_ohe, credit_active_ohe, credit_type_ohe], axis = 1)

        # Aggregating the bureau_merged over all the columns
        bureau_merged_aggregated = bureau_merged.drop('SK_ID_BUREAU', axis = 1).groupby('SK_ID_CURR').agg('mean')
        bureau_merged_aggregated.columns = [ele + '_MEAN_OVERALL' for ele in bureau_merged_aggregated.columns]
        # Merging it with aggregates over categories
        bureau_merged_aggregated = bureau_merged_aggregated.merge(bureau_merged_aggregated_credit, on = 'SK_ID_CURR', how = 'outer')

        if self.verbose:
            print('Done preprocessing bureau and bureau_balance.')
            print(f"Initial Size of bureau: {bureau.shape}")
            print(f'Size of bureau and bureau_balance after Merging, Pre-Processing, Feature Engineering and Aggregation: {bureau_merged_aggregated.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('Pickling pre-processed bureau and bureau_balance to bureau_merged_preprocessed.pkl')
            with open(self.file_directory + 'bureau_merged_preprocessed.pkl', 'wb') as f:
                pickle.dump(bureau_merged_aggregated, f)
            if self.verbose:
                print('Finished.')
        if self.verbose:
            print('-'*100)

        return bureau_merged_aggregated

    def main(self):
        '''
        Function to be called for complete preprocessing and aggregation of the bureau and bureau_balance tables.
        
        Inputs:
            self
            
        Returns:
            Final preprocessed and merged bureau and bureau_balance tables
        '''
        
        aggregated_bureau_balance = self.preprocess_bureau_balance()
        bureau_merged_aggregated = self.preprocess_bureau(aggregated_bureau_balance)
        return bureau_merged_aggregated

bureau_aggregated = preprocess_bureau_balance_and_bureau(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

bureau_aggregated = bureau_aggregated.main()

#######################################################
#          Pre-processing bureau_balance.csv          #
#######################################################

Loading the DataFrame, bureau_balance.csv, into memory...
Loaded bureau_balance.csv
Time Taken to load = 0:00:02.038645
Starting Data Cleaning and Feature Engineering...
Total Time Elapsed = 0:02:08.406865
Done preprocessing bureau_balance.
Initial Size of bureau_balance: (27299925, 6)
Size of bureau_balance after Pre-Processing, Feature Engineering and Aggregation: (817395, 40)
Total Time Taken = 0:02:12.255969
Pickling pre-processed bureau_balance to bureau_balance_preprocessed.pkl
Finished.

##############################################
#          Pre-processing bureau.csv         #
##############################################

Loading the DataFrame, bureau.csv, into memory...
Loaded bureau.csv
Time Taken to load = 0:00:01.139357
Starting Data Cleaning and Feature Engineering...
Done preprocessing bureau and bureau_balance.
Initial Size of bureau: (1716428, 17)
Size of bureau and bureau_balance after Merging, Pre-Processing, Feature Engineering and Aggregation: (305811, 242)
Total Time Taken = 0:02:18.607898
Pickling pre-processed bureau and bureau_balance to bureau_merged_preprocessed.pkl
Finished.
----------------------------------------------------------------------------------------------------

#######################################
# Previous Application Data Processing
#######################################
class preprocess_previous_application:
    '''
    Preprocess the previous_application table.
    Contains 5 member functions:
        1. __init__ method
        2. load_dataframe method
        3. data_cleaning method
        4. preprocessing_feature_engineering method
        5. main method
    '''
    
    def __init__(self, file_directory='../analytical/assets/data/', verbose=True, dump_to_pickle=False):
        '''
        Initialize the class members.
        
        Inputs:
            file_directory: Path, str, default='../analytical/assets/data/'
                The directory where the files are located. Include '/' at the end.
            verbose: bool, default=True
                Whether to print detailed logs or not.
            dump_to_pickle: bool, default=False
                Whether to pickle the final preprocessed table.
        '''
        self.file_directory = file_directory
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle
    
    def load_dataframe(self):
        '''
        Load the previous_application.csv DataFrame.
        '''
        if self.verbose:
            self.start = datetime.now()
            print('########################################################')
            print('#        Pre-processing previous_application.csv        #')
            print('########################################################')
            print("\nLoading the DataFrame, previous_application.csv, into memory...")

        csv_path = self.file_directory + 'previous_application.csv'
        try:
            self.previous_application = pd.read_csv(csv_path)
            self.initial_shape = self.previous_application.shape

            if self.verbose:
                print(f"Loaded {csv_path}")
                print(f"Time Taken to load = {datetime.now() - self.start}")
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at {csv_path}. Please ensure the file exists at the specified location.")
    
    def data_cleaning(self):
            '''
            Function to clean the data. Removes erroneous points, fills categorical NaNs with 'XNA'.
            
            Inputs:
                self
                
            Returns:
                None
            '''
            if self.verbose:
                start = datetime.now()
                print('Starting Data Cleaning...')
                
            # Sorting the applications from oldest to most recent previous loans for each user
            self.previous_application = self.previous_application.sort_values(by = ['SK_ID_CURR','DAYS_FIRST_DUE'])
            
            # In the EDA we found some erroneous values in DAYS columns, so we will replace them with NaN values
            self.previous_application['DAYS_FIRST_DRAWING'][self.previous_application['DAYS_FIRST_DRAWING'] == 365243.0] = np.nan
            self.previous_application['DAYS_FIRST_DUE'][self.previous_application['DAYS_FIRST_DUE'] == 365243.0] = np.nan
            self.previous_application['DAYS_LAST_DUE_1ST_VERSION'][self.previous_application['DAYS_LAST_DUE_1ST_VERSION'] == 365243.0] = np.nan
            self.previous_application['DAYS_LAST_DUE'][self.previous_application['DAYS_LAST_DUE'] == 365243.0] = np.nan
            self.previous_application['DAYS_TERMINATION'][self.previous_application['DAYS_TERMINATION'] == 365243.0] = np.nan
            # We also see abruptly large value for SELLERPLACE_AREA
            self.previous_application['SELLERPLACE_AREA'][self.previous_application['SELLERPLACE_AREA'] == 4000000] = np.nan
            # Filling the NaN values for categories
            categorical_columns = self.previous_application.dtypes[self.previous_application.dtypes == 'object'].index.tolist()
            self.previous_application[categorical_columns] = self.previous_application[categorical_columns].fillna('XNA')
            
            if self.verbose:
                print("Finished.")
                print(f"Time taken = {datetime.now() - start}")
    
    def preprocessing_feature_engineering(self):
        '''
        Function to do preprocessing such as categorical encoding and feature engineering.
        
        Inputs: 
            self
            
        Returns:
            None
        '''
        
        if self.verbose:
            start = datetime.now()
            print("Performing Preprocessing and Feature Engineering...")

        # Label encoding the categorical variables
        name_contract_dict = {'Approved': 0, 'Refused' : 3, 'Canceled' : 2, 'Unused offer' : 1}
        self.previous_application['NAME_CONTRACT_STATUS'] = self.previous_application['NAME_CONTRACT_STATUS'].map(name_contract_dict)
        yield_group_dict = {'XNA': 0, 'low_action': 1, 'low_normal': 2,'middle': 3, 'high': 4}
        self.previous_application['NAME_YIELD_GROUP'] = self.previous_application['NAME_YIELD_GROUP'].map(yield_group_dict)
        appl_per_contract_last_dict = {'Y':1, 'N':0}
        self.previous_application['FLAG_LAST_APPL_PER_CONTRACT'] = self.previous_application['FLAG_LAST_APPL_PER_CONTRACT'].map(appl_per_contract_last_dict)
        remaining_categorical_columns = self.previous_application.dtypes[self.previous_application.dtypes == 'object'].index.tolist()
        for col in remaining_categorical_columns:
            encoding_dict = dict([(j,i) for i,j in enumerate(self.previous_application[col].unique(),1)])
            self.previous_application[col] = self.previous_application[col].map(encoding_dict)    
        
        # Engineering some features on domain knowledge
        self.previous_application['MISSING_VALUES_TOTAL_PREV'] = self.previous_application.isna().sum(axis = 1)
        self.previous_application['AMT_DECLINED'] = self.previous_application['AMT_APPLICATION'] - self.previous_application['AMT_CREDIT']
        self.previous_application['AMT_CREDIT_GOODS_RATIO'] = self.previous_application['AMT_CREDIT'] / (self.previous_application['AMT_GOODS_PRICE'] + 0.00001)
        self.previous_application['AMT_CREDIT_GOODS_DIFF'] = self.previous_application['AMT_CREDIT'] - self.previous_application['AMT_GOODS_PRICE']
        self.previous_application['AMT_CREDIT_APPLICATION_RATIO'] = self.previous_application['AMT_APPLICATION'] / (self.previous_application['AMT_CREDIT'] + 0.00001)
        self.previous_application['CREDIT_DOWNPAYMENT_RATIO'] = self.previous_application['AMT_DOWN_PAYMENT'] / (self.previous_application['AMT_CREDIT'] + 0.00001)
        self.previous_application['GOOD_DOWNPAYMET_RATIO'] = self.previous_application['AMT_DOWN_PAYMENT'] / (self.previous_application['AMT_GOODS_PRICE'] + 0.00001)
        self.previous_application['INTEREST_DOWNPAYMENT'] = self.previous_application['RATE_DOWN_PAYMENT'] * self.previous_application['AMT_DOWN_PAYMENT']
        self.previous_application['INTEREST_CREDIT'] = self.previous_application['AMT_CREDIT'] * self.previous_application['RATE_INTEREST_PRIMARY']
        self.previous_application['INTEREST_CREDIT_PRIVILEGED'] = self.previous_application['AMT_CREDIT'] * self.previous_application['RATE_INTEREST_PRIVILEGED']
        self.previous_application['APPLICATION_AMT_TO_DECISION_RATIO'] = self.previous_application['AMT_APPLICATION'] / (self.previous_application['DAYS_DECISION'] + 0.00001) * -1
        self.previous_application['AMT_APPLICATION_TO_SELLERPLACE_AREA'] = self.previous_application['AMT_APPLICATION'] / (self.previous_application['SELLERPLACE_AREA'] + 0.00001)
        self.previous_application['ANNUITY'] = self.previous_application['AMT_CREDIT'] / (self.previous_application['CNT_PAYMENT'] + 0.00001)
        self.previous_application['ANNUITY_GOODS'] = self.previous_application['AMT_GOODS_PRICE'] / (self.previous_application['CNT_PAYMENT'] + 0.00001)
        self.previous_application['DAYS_FIRST_LAST_DUE_DIFF' ] = self.previous_application['DAYS_LAST_DUE'] - self.previous_application['DAYS_FIRST_DUE']
        self.previous_application['AMT_CREDIT_HOUR_PROCESS_START'] = self.previous_application['AMT_CREDIT'] * self.previous_application['HOUR_APPR_PROCESS_START']
        self.previous_application['AMT_CREDIT_NFLAG_LAST_APPL_DAY'] = self.previous_application['AMT_CREDIT'] * self.previous_application['NFLAG_LAST_APPL_IN_DAY']
        self.previous_application['AMT_CREDIT_YIELD_GROUP'] = self.previous_application['AMT_CREDIT'] * self.previous_application['NAME_YIELD_GROUP']
        # Reference: https://www.kaggle.com/c/home-credit-default-risk/discussion/64598
        self.previous_application['AMT_INTEREST'] = self.previous_application['CNT_PAYMENT'] * self.previous_application[
                                                'AMT_ANNUITY'] - self.previous_application['AMT_CREDIT'] 
        self.previous_application['INTEREST_SHARE'] = self.previous_application['AMT_INTEREST'] / (self.previous_application[
                                                                                                'AMT_CREDIT'] + 0.00001)
        self.previous_application['INTEREST_RATE'] = 2 * 12 * self.previous_application['AMT_INTEREST'] / (self.previous_application[
                                            'AMT_CREDIT'] * (self.previous_application['CNT_PAYMENT'] + 1))
        
        if self.verbose:
            print("Finished.")
            print(f"Time taken = {datetime.now() - start}")
    
    def aggregations(self):
        '''
        Function to aggregate the previous applications over SK_ID_CURR
        
        Inputs:
            self
        
        Returns:
            aggregated previous_applications
        '''
    
        if self.verbose:
            print("Aggregating previous applications over SK_ID_CURR...")
                
        aggregations_for_previous_application = {
            'MISSING_VALUES_TOTAL_PREV' : ['sum'],
            'NAME_CONTRACT_TYPE' : ['mean','last'],
            'AMT_ANNUITY' : ['mean','sum','max'],
            'AMT_APPLICATION' : ['mean','max','sum'],
            'AMT_CREDIT' : ['mean','max','sum'],
            'AMT_DOWN_PAYMENT' : ['mean','max','sum'],
            'AMT_GOODS_PRICE' : ['mean','max','sum'],
            'WEEKDAY_APPR_PROCESS_START' : ['mean','max','min'],
            'HOUR_APPR_PROCESS_START' : ['mean','max','min'],
            'FLAG_LAST_APPL_PER_CONTRACT' : ['mean','sum'],
            'NFLAG_LAST_APPL_IN_DAY' : ['mean','sum'],
            'RATE_DOWN_PAYMENT' : ['mean','max'],
            'RATE_INTEREST_PRIMARY' : ['mean','max'],
            'RATE_INTEREST_PRIVILEGED' : ['mean','max'],
            'NAME_CASH_LOAN_PURPOSE' : ['mean','last'],
            'NAME_CONTRACT_STATUS' : ['mean','max','last'],
            'DAYS_DECISION' : ['mean','max','min'],
            'NAME_PAYMENT_TYPE' : ['mean', 'last'],
            'CODE_REJECT_REASON' : ['mean','last'],
            'NAME_TYPE_SUITE' : ['mean','last'],
            'NAME_CLIENT_TYPE' : ['mean','last'],
            'NAME_GOODS_CATEGORY' : ['mean','last'],
            'NAME_PORTFOLIO' : ['mean','last'],
            'NAME_PRODUCT_TYPE' : ['mean','last'],
            'CHANNEL_TYPE' : ['mean','last'],
            'SELLERPLACE_AREA' : ['mean','max','min'],
            'NAME_SELLER_INDUSTRY' : ['mean','last'],
            'CNT_PAYMENT' : ['sum','mean','max'],
            'NAME_YIELD_GROUP' : ['mean','last'],
            'PRODUCT_COMBINATION' : ['mean', 'last'],
            'DAYS_FIRST_DRAWING' : ['mean','max'],
            'DAYS_FIRST_DUE' : ['mean','max'],
            'DAYS_LAST_DUE_1ST_VERSION' : ['mean'],
            'DAYS_LAST_DUE' : ['mean'],
            'DAYS_TERMINATION' : ['mean','max'],
            'NFLAG_INSURED_ON_APPROVAL' : ['sum'],
            'AMT_DECLINED' : ['mean','max','sum'],
            'AMT_CREDIT_GOODS_RATIO' : ['mean', 'max', 'min'],
            'AMT_CREDIT_GOODS_DIFF' : ['sum','mean','max', 'min'],
            'AMT_CREDIT_APPLICATION_RATIO' : ['mean','min'],
            'CREDIT_DOWNPAYMENT_RATIO' : ['mean','max'],
            'GOOD_DOWNPAYMET_RATIO' : ['mean','max'],
            'INTEREST_DOWNPAYMENT' : ['mean','sum','max'],
            'INTEREST_CREDIT' : ['mean','sum','max'],
            'INTEREST_CREDIT_PRIVILEGED' : ['mean','sum','max'],
            'APPLICATION_AMT_TO_DECISION_RATIO' : ['mean','min'],
            'AMT_APPLICATION_TO_SELLERPLACE_AREA' : ['mean','max'],
            'ANNUITY' : ['mean','sum','max'],
            'ANNUITY_GOODS' : ['mean','sum','max'],
            'DAYS_FIRST_LAST_DUE_DIFF' : ['mean','max'],
            'AMT_CREDIT_HOUR_PROCESS_START' : ['mean','sum'],
            'AMT_CREDIT_NFLAG_LAST_APPL_DAY' : ['mean','max'],
            'AMT_CREDIT_YIELD_GROUP' : ['mean','sum','min'],
            'AMT_INTEREST' : ['mean','sum','max','min'],
            'INTEREST_SHARE' : ['mean','max','min'],
            'INTEREST_RATE' : ['mean','max','min']
        }

        # Grouping the previous applications over SK_ID_CURR while only taking the latest 5 applications
        group_last_3 = self.previous_application.groupby('SK_ID_CURR').tail(5).groupby('SK_ID_CURR').agg(aggregations_for_previous_application)
        group_last_3.columns = ['_'.join(ele).upper() + '_LAST_5' for ele in group_last_3.columns]
        # Grouping the previous applications over SK_ID_CURR while only taking the first 2 applications
        group_first_3 = self.previous_application.groupby('SK_ID_CURR').head(2).groupby('SK_ID_CURR').agg(aggregations_for_previous_application)
        group_first_3.columns = ['_'.join(ele).upper() + '_FIRST_2' for ele in group_first_3.columns]
        # Grouping the previous applications over SK_ID_CURR while taking all the applications into consideration
        group_all = self.previous_application.groupby('SK_ID_CURR').agg(aggregations_for_previous_application)
        group_all.columns = ['_'.join(ele).upper() + '_ALL' for ele in group_all.columns]

        #merging all the applications
        previous_application_aggregated = group_last_3.merge(group_first_3, on = 'SK_ID_CURR', how = 'outer')
        previous_application_aggregated = previous_application_aggregated.merge(group_all, on = 'SK_ID_CURR', how = 'outer')

        return previous_application_aggregated
    
    def main(self):
        '''
        Function to be called for complete preprocessing and aggregation of previous_application table.
        
        Inputs:
            self
            
        Returns:
            Final pre=processed and aggregated previous_application table.
        '''
        
        # Load the DataFrame
        self.load_dataframe()
        
        # Data cleaning
        self.data_cleaning()
        
        # Preprocessing the categorical features and creating new features
        self.preprocessing_feature_engineering()
        
        # Aggregating data over SK_ID_CURR
        previous_application_aggregated = self.aggregations()
        
        if self.verbose:
            print('Finished aggregations.')
            print(f"Initial Size of previous_application: {self.initial_shape}")
            print(f'Size of previous_application after Pre-Processing, Feature Engineering and Aggregation: {previous_application_aggregated.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('Pickling pre-processed previous_application to previous_application_preprocessed.pkl')
            with open(self.file_directory + 'previous_application_preprocessed.pkl', 'wb') as f:
                pickle.dump(previous_application_aggregated, f)
            if self.verbose:
                print('Finished.')  
        if self.verbose:
            print('-'*100)
                    
        return previous_application_aggregated

previous_aggregated = preprocess_previous_application(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

previous_aggregated = previous_aggregated.main()

print("Processed DataFrame:")
print(previous_aggregated.head())

########################################################
#        Pre-processing previous_application.csv        #
########################################################

Loading the DataFrame, previous_application.csv, into memory...
Loaded ../analytical/assets/data/previous_application.csv
Time Taken to load = 0:00:05.570838
Starting Data Cleaning...
Finished.
Time taken = 0:00:02.134821
Performing Preprocessing and Feature Engineering...
Finished.
Time taken = 0:00:01.412144
Aggregating previous applications over SK_ID_CURR...
Finished aggregations.
Initial Size of previous_application: (1670214, 37)
Size of previous_application after Pre-Processing, Feature Engineering and Aggregation: (338857, 399)
Total Time Taken = 0:00:13.035515
Pickling pre-processed previous_application to previous_application_preprocessed.pkl
Finished.
----------------------------------------------------------------------------------------------------
Processed DataFrame:
            MISSING_VALUES_TOTAL_PREV_SUM_LAST_5  \
SK_ID_CURR                                         
100001                                         3   
100002                                         3   
100003                                        11   
100004                                         3   
100005                                        16   

            NAME_CONTRACT_TYPE_MEAN_LAST_5  NAME_CONTRACT_TYPE_LAST_LAST_5  \
SK_ID_CURR                                                                   
100001                            1.000000                               1   
100002                            1.000000                               1   
100003                            1.333333                               2   
100004                            1.000000                               1   
100005                            1.500000                               2   

            AMT_ANNUITY_MEAN_LAST_5  AMT_ANNUITY_SUM_LAST_5  \
SK_ID_CURR                                                    
100001                     3951.000                3951.000   
100002                     9251.775                9251.775   
100003                    56553.990              169661.970   
100004                     5357.250                5357.250   
100005                     4813.200                4813.200   

            AMT_ANNUITY_MAX_LAST_5  AMT_APPLICATION_MEAN_LAST_5  \
SK_ID_CURR                                                        
100001                    3951.000                     24835.50   
100002                    9251.775                    179055.00   
100003                   98356.995                    435436.50   
100004                    5357.250                     24282.00   
100005                    4813.200                     22308.75   

            AMT_APPLICATION_MAX_LAST_5  AMT_APPLICATION_SUM_LAST_5  \
SK_ID_CURR                                                           
100001                         24835.5                     24835.5   
100002                        179055.0                    179055.0   
100003                        900000.0                   1306309.5   
100004                         24282.0                     24282.0   
100005                         44617.5                     44617.5   

            AMT_CREDIT_MEAN_LAST_5  ...  AMT_INTEREST_MEAN_ALL  \
SK_ID_CURR                          ...                          
100001                    23787.00  ...                7821.00   
100002                   179055.00  ...               42987.60   
100003                   484191.00  ...               65321.55   
100004                    20106.00  ...                1323.00   
100005                    20076.75  ...               17604.90   

            AMT_INTEREST_SUM_ALL  AMT_INTEREST_MAX_ALL  AMT_INTEREST_MIN_ALL  \
SK_ID_CURR                                                                     
100001                   7821.00               7821.00               7821.00   
100002                  42987.60              42987.60              42987.60   
100003                 195964.65             144401.94              12794.22   
100004                   1323.00               1323.00               1323.00   
100005                  17604.90              17604.90              17604.90   

            INTEREST_SHARE_MEAN_ALL  INTEREST_SHARE_MAX_ALL  \
SK_ID_CURR                                                    
100001                     0.328793                0.328793   
100002                     0.240080                0.240080   
100003                     0.146201                0.188002   
100004                     0.065801                0.065801   
100005                     0.438440                0.438440   

            INTEREST_SHARE_MIN_ALL  INTEREST_RATE_MEAN_ALL  \
SK_ID_CURR                                                   
100001                    0.328793                0.876781   
100002                    0.240080                0.230477   
100003                    0.111200                0.328564   
100004                    0.065801                0.315846   
100005                    0.438440                0.809428   

            INTEREST_RATE_MAX_ALL  INTEREST_RATE_MIN_ALL  
SK_ID_CURR                                                
100001                   0.876781               0.876781  
100002                   0.230477               0.230477  
100003                   0.381257               0.257354  
100004                   0.315846               0.315846  
100005                   0.809428               0.809428  

[5 rows x 399 columns]

#######################################
# Installment Payments Data Processing
#######################################

class preprocess_installments_payments:
    '''
    Preprocess the installments_payments table.
    Contains 6 member functions:
        1. __init__ method
        2. load_dataframe method
        3. data_preprocessing_and_feature_engineering method
        4. aggregations_sk_id_prev method
        5. aggregations_sk_id_curr method
        6. main method
    '''
     
    def __init__(self, file_directory='../analytical/assets/data/', verbose=True, dump_to_pickle=False):
        '''
        Initialize the class members.
        
        Inputs:
            file_directory: Path, str, default = '../analytical/assets/data/'
                The path where the file exists. Include a '/' at the end of the path in input.
            verbose: bool, default = True
                Whether to enable verbosity or not.
            dump_to_pickle: bool, default = False
                Whether to pickle the final preprocessed table or not.
                
        Returns:
            None
        '''
        self.file_directory = file_directory
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle
        
    def load_dataframe(self):
        '''
        Load the installments_payments.csv DataFrame.
        
        Inputs:
            self
            
        Returns:
            None
        '''
        if self.verbose:
            self.start = datetime.now()
            print('##########################################################')
            print('#        Pre-processing installments_payments.csv        #')
            print('##########################################################')
            print("\nLoading the DataFrame, installments_payments.csv, into memory...")
        
        # Constructing the file path
        csv_path = self.file_directory + 'installments_payments.csv'
        
        try:
            self.installments_payments = pd.read_csv(csv_path)
            self.initial_shape = self.installments_payments.shape
            
            if self.verbose:
                print(f"Loaded {csv_path}")
                print(f"Time Taken to load = {datetime.now() - self.start}")
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at {csv_path}. Please ensure the file exists at the specified location.")

    def data_preprocessing_and_feature_engineering(self):
        '''
        Function for pre-processing and feature engineering
        
        Inputs:
            self
        
        Returns:
            None
        '''
        
        if self.verbose:
            start = datetime.now()
            print("Starting Data Pre-processing and Feature Engineering...")
        
        # Sorting by SK_ID_PREV and NUM_INSTALMENT_NUMBER
        self.installments_payments = self.installments_payments.sort_values(by = ['SK_ID_CURR','SK_ID_PREV','NUM_INSTALMENT_NUMBER'], ascending = True)
        
        # Getting the total NaN values in the table
        self.installments_payments['MISSING_VALS_TOTAL_INSTAL'] = self.installments_payments.isna().sum(axis = 1)
        # Engineering new features based on some domain based polynomial operations
        self.installments_payments['DAYS_PAYMENT_RATIO'] = self.installments_payments['DAYS_INSTALMENT'] / (self.installments_payments['DAYS_ENTRY_PAYMENT'] + 0.00001)
        self.installments_payments['DAYS_PAYMENT_DIFF'] = self.installments_payments['DAYS_INSTALMENT'] - self.installments_payments['DAYS_ENTRY_PAYMENT']
        self.installments_payments['AMT_PAYMENT_RATIO'] = self.installments_payments['AMT_PAYMENT'] / (self.installments_payments['AMT_INSTALMENT'] + 0.00001)
        self.installments_payments['AMT_PAYMENT_DIFF'] = self.installments_payments['AMT_INSTALMENT'] - self.installments_payments['AMT_PAYMENT']
        self.installments_payments['EXP_DAYS_PAYMENT_RATIO'] = self.installments_payments['DAYS_PAYMENT_RATIO'].transform(lambda x: x.ewm(alpha = 0.5).mean())
        self.installments_payments['EXP_DAYS_PAYMENT_DIFF'] = self.installments_payments['DAYS_PAYMENT_DIFF'].transform(lambda x: x.ewm(alpha = 0.5).mean())
        self.installments_payments['EXP_AMT_PAYMENT_RATIO'] = self.installments_payments['AMT_PAYMENT_RATIO'].transform(lambda x: x.ewm(alpha = 0.5).mean())
        self.installments_payments['EXP_AMT_PAYMENT_DIFF'] = self.installments_payments['AMT_PAYMENT_DIFF'].transform(lambda x: x.ewm(alpha = 0.5).mean())
        
        if self.verbose:
            print("Finished.")
            print(f"Time Taken = {datetime.now() - start}")
    
    def aggregations_sk_id_prev(self):
        '''
        Function for aggregations of installments on previous loans over SK_ID_PREV
        
        Inputs:
            self
        
        Returns:
            installments_payments table aggregated over previous loans
        '''
        
        if self.verbose:
            start = datetime.now()
            print("Performing Aggregations over SK_ID_PREV...")
        
        # Aggregating the data over SK_ID_PREV, i.e. for each previous loan
        overall_aggregations = {
            'MISSING_VALS_TOTAL_INSTAL' : ['sum'],
            'NUM_INSTALMENT_VERSION' : ['mean','sum'],
            'NUM_INSTALMENT_NUMBER' : ['max'],
            'DAYS_INSTALMENT' : ['max','min'],
            'DAYS_ENTRY_PAYMENT' : ['max','min'],
            'AMT_INSTALMENT' : ['mean', 'sum', 'max'],
            'AMT_PAYMENT' : ['mean', 'sum', 'max'],
            'DAYS_PAYMENT_RATIO' : ['mean', 'min','max'],
            'DAYS_PAYMENT_DIFF' : ['mean','min','max'],
            'AMT_PAYMENT_RATIO' : ['mean','min','max'],
            'AMT_PAYMENT_DIFF' : ['mean','min','max'],
            'EXP_DAYS_PAYMENT_RATIO' : ['last'],
            'EXP_DAYS_PAYMENT_DIFF' : ['last'],
            'EXP_AMT_PAYMENT_RATIO' : ['last'],
            'EXP_AMT_PAYMENT_DIFF' : ['last']
        }
        limited_period_aggregations = {
            'NUM_INSTALMENT_VERSION' : ['mean','sum'],
            'AMT_INSTALMENT' : ['mean', 'sum', 'max'],
            'AMT_PAYMENT' : ['mean', 'sum', 'max'],
            'DAYS_PAYMENT_RATIO' : ['mean', 'min','max'],
            'DAYS_PAYMENT_DIFF' : ['mean','min','max'],
            'AMT_PAYMENT_RATIO' : ['mean','min','max'],
            'AMT_PAYMENT_DIFF' : ['mean','min','max'],
            'EXP_DAYS_PAYMENT_RATIO' : ['last'],
            'EXP_DAYS_PAYMENT_DIFF' : ['last'],
            'EXP_AMT_PAYMENT_RATIO' : ['last'],
            'EXP_AMT_PAYMENT_DIFF' : ['last']
        }

        # Aggregating installments_payments over SK_ID_PREV for last 1 year installments
        group_last_1_year = self.installments_payments[self.installments_payments['DAYS_INSTALMENT'] > -365].groupby('SK_ID_PREV').agg(limited_period_aggregations)
        group_last_1_year.columns = ['_'.join(ele).upper() + '_LAST_1_YEAR' for ele in group_last_1_year.columns]
        # Aggregating installments_payments over SK_ID_PREV for first 5 installments
        group_first_5_instalments = self.installments_payments.groupby('SK_ID_PREV', as_index = False).head(5).groupby('SK_ID_PREV').agg(limited_period_aggregations)
        group_first_5_instalments.columns = ['_'.join(ele).upper() + '_FIRST_5_INSTALLMENTS' for ele in group_first_5_instalments.columns]
        # Overall aggregation of installments_payments over SK_ID_PREV
        group_overall = self.installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR'], as_index = False).agg(overall_aggregations)
        group_overall.columns = ['_'.join(ele).upper() for ele in group_overall.columns]
        group_overall.rename(columns = {'SK_ID_PREV_': 'SK_ID_PREV','SK_ID_CURR_' : 'SK_ID_CURR'}, inplace = True)

        # Merging all of the above aggregations together
        installments_payments_agg_prev = group_overall.merge(group_last_1_year, on = 'SK_ID_PREV', how = 'outer')
        installments_payments_agg_prev = installments_payments_agg_prev.merge(group_first_5_instalments, on = 'SK_ID_PREV', how = 'outer')
        
        if self.verbose:
            print("Done.")
            print(f"Time Taken = {datetime.now() - start}")
            
        return installments_payments_agg_prev
    
    def aggregations_sk_id_curr(self, installments_payments_agg_prev):
        '''
        Function to aggregate the installments payments on previous loans over SK_ID_CURR
        
        Inputs:
            self
            installments_payments_agg_prev: DataFrame
                installments payments aggregated over SK_ID_PREV
        
        Returns:
            installments payments aggregated over SK_ID_CURR
        '''
                    
        # Aggregating over SK_ID_CURR
        main_features_aggregations = {
            'MISSING_VALS_TOTAL_INSTAL_SUM' : ['sum'],
            'NUM_INSTALMENT_VERSION_MEAN' : ['mean'],
            'NUM_INSTALMENT_VERSION_SUM' : ['mean'],
            'NUM_INSTALMENT_NUMBER_MAX' : ['mean','sum','max'],
            'AMT_INSTALMENT_MEAN' : ['mean','sum','max'],
            'AMT_INSTALMENT_SUM' : ['mean','sum','max'],
            'AMT_INSTALMENT_MAX' : ['mean'],
            'AMT_PAYMENT_MEAN' : ['mean','sum','max'],
            'AMT_PAYMENT_SUM' : ['mean','sum','max'],
            'AMT_PAYMENT_MAX' : ['mean'],
            'DAYS_PAYMENT_RATIO_MEAN' : ['mean','min','max'],
            'DAYS_PAYMENT_RATIO_MIN' : ['mean','min'],
            'DAYS_PAYMENT_RATIO_MAX' : ['mean','max'],
            'DAYS_PAYMENT_DIFF_MEAN' : ['mean','min','max'],
            'DAYS_PAYMENT_DIFF_MIN' : ['mean','min'],
            'DAYS_PAYMENT_DIFF_MAX' : ['mean','max'],
            'AMT_PAYMENT_RATIO_MEAN' : ['mean', 'min','max'],
            'AMT_PAYMENT_RATIO_MIN' : ['mean','min'],
            'AMT_PAYMENT_RATIO_MAX' : ['mean','max'],
            'AMT_PAYMENT_DIFF_MEAN' : ['mean','min','max'],
            'AMT_PAYMENT_DIFF_MIN' : ['mean','min'],
            'AMT_PAYMENT_DIFF_MAX' : ['mean','max'],
            'EXP_DAYS_PAYMENT_RATIO_LAST' : ['mean'],
            'EXP_DAYS_PAYMENT_DIFF_LAST' : ['mean'],
            'EXP_AMT_PAYMENT_RATIO_LAST' : ['mean'],
            'EXP_AMT_PAYMENT_DIFF_LAST' : ['mean']
        }

        grouped_main_features = installments_payments_agg_prev.groupby('SK_ID_CURR').agg(main_features_aggregations)
        grouped_main_features.columns = ['_'.join(ele).upper() for ele in grouped_main_features.columns]

        # Group remaining ones
        grouped_remaining_features = installments_payments_agg_prev.iloc[:,[1] + list(range(31,len(installments_payments_agg_prev.columns)))].groupby('SK_ID_CURR').mean()

        installments_payments_aggregated = grouped_main_features.merge(grouped_remaining_features, on = 'SK_ID_CURR', how = 'inner')
                
        return installments_payments_aggregated
    
    def main(self):
        '''
        Complete preprocessing and aggregation of installments_payments table.
        
        Inputs:
            self
            
        Returns:
            Final pre-processed and aggregated installments_payments table.
        '''
        self.load_dataframe()
        self.data_preprocessing_and_feature_engineering()
        installments_payments_agg_prev = self.aggregations_sk_id_prev()
        installments_payments_aggregated = self.aggregations_sk_id_curr(installments_payments_agg_prev)
        
        if self.verbose:
            print('Finished preprocessing installments_payments.')
            print(f"Initial Size of installments_payments: {self.initial_shape}")
            print(f'Size after Pre-Processing and Aggregation: {installments_payments_aggregated.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')
        
        if self.dump_to_pickle:
            pickle_path = self.file_directory + 'installments_payments_preprocessed.pkl'
            with open(pickle_path, 'wb') as f:
                pickle.dump(installments_payments_aggregated, f)
            if self.verbose:
                print(f'Pickled pre-processed data to {pickle_path}')
        
        return installments_payments_aggregated

installments_aggregated = preprocess_installments_payments(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

installments_aggregated = installments_aggregated.main()

print("Processed DataFrame:")
print(installments_aggregated.head())

##########################################################
#        Pre-processing installments_payments.csv        #
##########################################################

Loading the DataFrame, installments_payments.csv, into memory...
Loaded ../analytical/assets/data/installments_payments.csv
Time Taken to load = 0:00:04.523316
Starting Data Pre-processing and Feature Engineering...
Finished.
Time Taken = 0:00:31.784566
Performing Aggregations over SK_ID_PREV...
Done.
Time Taken = 0:00:08.273633
Finished preprocessing installments_payments.
Initial Size of installments_payments: (13605401, 8)
Size after Pre-Processing and Aggregation: (339587, 101)
Total Time Taken = 0:00:45.770606
Pickled pre-processed data to ../analytical/assets/data/installments_payments_preprocessed.pkl
Processed DataFrame:
            MISSING_VALS_TOTAL_INSTAL_SUM_SUM  \
SK_ID_CURR                                      
100001                                      0   
100002                                      0   
100003                                      0   
100004                                      0   
100005                                      0   

            NUM_INSTALMENT_VERSION_MEAN_MEAN  NUM_INSTALMENT_VERSION_SUM_MEAN  \
SK_ID_CURR                                                                      
100001                              1.125000                         4.000000   
100002                              1.052632                        20.000000   
100003                              1.047619                         8.666667   
100004                              1.333333                         4.000000   
100005                              1.111111                        10.000000   

            NUM_INSTALMENT_NUMBER_MAX_MEAN  NUM_INSTALMENT_NUMBER_MAX_SUM  \
SK_ID_CURR                                                                  
100001                            4.000000                              8   
100002                           19.000000                             19   
100003                            8.333333                             25   
100004                            3.000000                              3   
100005                            9.000000                              9   

            NUM_INSTALMENT_NUMBER_MAX_MAX  AMT_INSTALMENT_MEAN_MEAN  \
SK_ID_CURR                                                            
100001                                  4               5647.200000   
100002                                 19              11559.247105   
100003                                 12              78558.479286   
100004                                  3               7096.155000   
100005                                  9               6240.205000   

            AMT_INSTALMENT_MEAN_SUM  AMT_INSTALMENT_MEAN_MAX  \
SK_ID_CURR                                                     
100001                 11294.400000              7312.725000   
100002                 11559.247105             11559.247105   
100003                235675.437857            164425.332857   
100004                  7096.155000              7096.155000   
100005                  6240.205000              6240.205000   

            AMT_INSTALMENT_SUM_MEAN  ...  \
SK_ID_CURR                           ...   
100001                   20597.9625  ...   
100002                  219625.6950  ...   
100003                  539621.5500  ...   
100004                   21288.4650  ...   
100005                   56161.8450  ...   

            AMT_PAYMENT_RATIO_MEAN_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                                
100001                                              1.0   
100002                                              1.0   
100003                                              1.0   
100004                                              1.0   
100005                                              1.0   

            AMT_PAYMENT_RATIO_MIN_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                               
100001                                             1.0   
100002                                             1.0   
100003                                             1.0   
100004                                             1.0   
100005                                             1.0   

            AMT_PAYMENT_RATIO_MAX_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                               
100001                                             1.0   
100002                                             1.0   
100003                                             1.0   
100004                                             1.0   
100005                                             1.0   

            AMT_PAYMENT_DIFF_MEAN_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                               
100001                                             0.0   
100002                                             0.0   
100003                                             0.0   
100004                                             0.0   
100005                                             0.0   

            AMT_PAYMENT_DIFF_MIN_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                              
100001                                            0.0   
100002                                            0.0   
100003                                            0.0   
100004                                            0.0   
100005                                            0.0   

            AMT_PAYMENT_DIFF_MAX_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                              
100001                                            0.0   
100002                                            0.0   
100003                                            0.0   
100004                                            0.0   
100005                                            0.0   

            EXP_DAYS_PAYMENT_RATIO_LAST_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                                     
100001                                              0.996151   
100002                                              0.954027   
100003                                              0.987843   
100004                                              0.990915   
100005                                              0.977839   

            EXP_DAYS_PAYMENT_DIFF_LAST_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                                    
100001                                             5.844619   
100002                                            22.591941   
100003                                             8.158240   
100004                                             6.728890   
100005                                            14.960278   

            EXP_AMT_PAYMENT_RATIO_LAST_FIRST_5_INSTALLMENTS  \
SK_ID_CURR                                                    
100001                                                  1.0   
100002                                                  1.0   
100003                                                  1.0   
100004                                                  1.0   
100005                                                  1.0   

            EXP_AMT_PAYMENT_DIFF_LAST_FIRST_5_INSTALLMENTS  
SK_ID_CURR                                                  
100001                                                 0.0  
100002                                                 0.0  
100003                                                 0.0  
100004                                                 0.0  
100005                                                 0.0  

[5 rows x 101 columns]

###################################
# POS_CASH_Balance Data Processing
###################################

class preprocess_POS_CASH_balance:
    '''
    Preprocess the POS_CASH_balance table.
    Contains 6 member functions:
        1. init method
        2. load_dataframe method
        3. data_preprocessing_and_feature_engineering method
        4. aggregations_sk_id_prev method
        5. aggregations_sk_id_curr method
        6. main method
    '''

    def __init__(self, file_directory='../analytical/assets/data/', verbose=True, dump_to_pickle=False):
        '''
        Initialize the class members.
        
        Inputs:
            file_directory: Path, str, default = '../analytical/assets/data/'
                The path where the file exists. Include a '/' at the end of the path in input.
            verbose: bool, default = True
                Whether to enable verbosity or not.
            dump_to_pickle: bool, default = False
                Whether to pickle the final preprocessed table or not.
                
        Returns:
            None
        '''
        self.file_directory = file_directory
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle
    
    def load_dataframe(self):
        '''
        Load the POS_CASH_balance.csv DataFrame.
        
        Inputs:
            self
            
        Returns:
            None
        '''
        if self.verbose:
            self.start = datetime.now()
            print('#########################################################')
            print('#          Pre-processing POS_CASH_balance.csv          #')
            print('#########################################################')
            print("\nLoading the DataFrame, POS_CASH_balance.csv, into memory...")

        # Constructing the file path
        csv_path = self.file_directory + 'POS_CASH_balance.csv'
        
        try:
            self.pos_cash = pd.read_csv(csv_path)
            self.initial_size = self.pos_cash.shape
            
            if self.verbose:
                print(f"Loaded {csv_path}")
                print(f"Time Taken to load = {datetime.now() - self.start}")
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at {csv_path}. Please ensure the file exists at the specified location.")
            
    def data_preprocessing_and_feature_engineering(self):
        '''
        Function to preprocess the table and create new features.
        
        Inputs:
            self
        
        Returns:
            None
        '''
        
        if self.verbose:
            start = datetime.now()
            print("Starting Data Cleaning and Feature Engineering...")

        # Making the MONTHS_BALANCE Positive
        self.pos_cash['MONTHS_BALANCE'] = np.abs(self.pos_cash['MONTHS_BALANCE'])
        # Sorting the DataFrame according to the month of status from oldest to latest, for rolling computations
        self.pos_cash = self.pos_cash.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=False)

        # Computing Exponential Moving Average for some features based on MONTHS_BALANCE
        columns_for_ema = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']
        exp_columns = ['EXP_'+ele for ele in columns_for_ema]
        self.pos_cash[exp_columns] = self.pos_cash.groupby('SK_ID_PREV')[columns_for_ema].transform(lambda x: x.ewm(alpha = 0.6).mean())

        # Creating new features based on Domain Knowledge
        self.pos_cash['SK_DPD_RATIO'] = self.pos_cash['SK_DPD'] / (self.pos_cash['SK_DPD_DEF'] + 0.00001)
        self.pos_cash['TOTAL_TERM'] = self.pos_cash['CNT_INSTALMENT'] + self.pos_cash['CNT_INSTALMENT_FUTURE']
        self.pos_cash['EXP_POS_TOTAL_TERM'] = self.pos_cash['EXP_CNT_INSTALMENT'] + self.pos_cash['EXP_CNT_INSTALMENT_FUTURE']
        
        if self.verbose:
            print("Finished.")
            print(f"Time Taken = {datetime.now() - start}")
            
    def aggregations_sk_id_prev(self):
        '''
        Function to aggregated the POS_CASH_balance rows over SK_ID_PREV
        
        Inputs:
            self
        
        Returns:
            Aggregated POS_CASH_balance table over SK_ID_PREV
        '''
        
        if self.verbose:
            start = datetime.now()
            print("\nAggregations over SK_ID_PREV...")
            
        # Aggregating over SK_ID_PREV
        overall_aggregations = {
            'SK_ID_CURR' : ['first'],
            'MONTHS_BALANCE' : ['max'],
            'CNT_INSTALMENT' : ['mean', 'max','min'],
            'CNT_INSTALMENT_FUTURE' : ['mean','max','min'],
            'SK_DPD' : ['max','sum'],
            'SK_DPD_DEF' : ['max','sum'],
            'EXP_CNT_INSTALMENT' : ['last'],
            'EXP_CNT_INSTALMENT_FUTURE' : ['last'],
            'SK_DPD_RATIO' : ['mean','max'],
            'TOTAL_TERM' : ['mean','max','last'],
            'EXP_POS_TOTAL_TERM' : ['mean'] 
        }
        aggregations_for_year = {
            'CNT_INSTALMENT' : ['mean', 'max','min'],
            'CNT_INSTALMENT_FUTURE' : ['mean','max','min'],
            'SK_DPD' : ['max','sum'],
            'SK_DPD_DEF' : ['max','sum'],
            'EXP_CNT_INSTALMENT' : ['last'],
            'EXP_CNT_INSTALMENT_FUTURE' : ['last'],
            'SK_DPD_RATIO' : ['mean','max'],
            'TOTAL_TERM' : ['mean','max'],
            'EXP_POS_TOTAL_TERM' : ['last'] 
        }
        aggregations_for_categories = {
            'CNT_INSTALMENT' : ['mean', 'max','min'],
            'CNT_INSTALMENT_FUTURE' : ['mean','max','min'],
            'SK_DPD' : ['max','sum'],
            'SK_DPD_DEF' : ['max','sum'],
            'EXP_CNT_INSTALMENT' : ['last'],
            'EXP_CNT_INSTALMENT_FUTURE' : ['last'],
            'SK_DPD_RATIO' : ['mean','max'],
            'TOTAL_TERM' : ['mean','max'],
            'EXP_POS_TOTAL_TERM' : ['last']
        }
        # Performing overall aggregations over SK_ID_PREV
        pos_cash_aggregated_overall = self.pos_cash.groupby('SK_ID_PREV').agg(overall_aggregations)
        pos_cash_aggregated_overall.columns = ['_'.join(ele).upper() for ele in pos_cash_aggregated_overall.columns]
        pos_cash_aggregated_overall.rename(columns = {'SK_ID_CURR_FIRST': 'SK_ID_CURR'}, inplace = True)

        # Yearwise aggregations
        self.pos_cash['YEAR_BALANCE'] = self.pos_cash['MONTHS_BALANCE'] //12
        # Aggregating over SK_ID_PREV for each last 2 years
        pos_cash_aggregated_year = pd.DataFrame()
        for year in range(2):
            group = self.pos_cash[self.pos_cash['YEAR_BALANCE'] == year].groupby('SK_ID_PREV').agg(aggregations_for_year)
            group.columns = ['_'.join(ele).upper() + '_YEAR_' + str(year) for ele in group.columns]
            if year == 0:
                pos_cash_aggregated_year = group
            else:
                pos_cash_aggregated_year = pos_cash_aggregated_year.merge(group, on = 'SK_ID_PREV', how = 'outer')

        # Aggregating over SK_ID_PREV for rest of the years
        pos_cash_aggregated_rest_years = self.pos_cash[self.pos_cash['YEAR_BALANCE'] >= 2].groupby('SK_ID_PREV').agg(aggregations_for_year)
        pos_cash_aggregated_rest_years.columns = ['_'.join(ele).upper() + '_YEAR_REST' for ele in pos_cash_aggregated_rest_years.columns]
        # Merging all the years aggregations
        pos_cash_aggregated_year = pos_cash_aggregated_year.merge(pos_cash_aggregated_rest_years, on = 'SK_ID_PREV', how = 'outer')
        self.pos_cash = self.pos_cash.drop(['YEAR_BALANCE'], axis = 1)

        # Aggregating over SK_ID_PREV for each of NAME_CONTRACT_STATUS categories
        contract_type_categories = ['Active', 'Completed']
        pos_cash_aggregated_contract = pd.DataFrame()
        for i, contract_type in enumerate(contract_type_categories):
            group = self.pos_cash[self.pos_cash['NAME_CONTRACT_STATUS'] == contract_type].groupby('SK_ID_PREV').agg(aggregations_for_categories)
            group.columns = ['_'.join(ele).upper() + '_' + contract_type.upper() for ele in group.columns]
            if i == 0:
                pos_cash_aggregated_contract = group
            else:
                pos_cash_aggregated_contract = pos_cash_aggregated_contract.merge(group, on = 'SK_ID_PREV', how = 'outer')

        pos_cash_aggregated_rest_contract = self.pos_cash[(self.pos_cash['NAME_CONTRACT_STATUS'] != 'Active') & 
                                        (self.pos_cash['NAME_CONTRACT_STATUS'] != 'Completed')].groupby('SK_ID_PREV').agg(aggregations_for_categories)
        pos_cash_aggregated_rest_contract.columns = ['_'.join(ele).upper() + '_REST' for ele in pos_cash_aggregated_rest_contract.columns]
        # Merging the categorical aggregations
        pos_cash_aggregated_contract = pos_cash_aggregated_contract.merge(pos_cash_aggregated_rest_contract, on = 'SK_ID_PREV', how = 'outer')    

        # Merging all the aggregations
        pos_cash_aggregated = pos_cash_aggregated_overall.merge(pos_cash_aggregated_year, on = 'SK_ID_PREV', how = 'outer')
        pos_cash_aggregated = pos_cash_aggregated.merge(pos_cash_aggregated_contract, on = 'SK_ID_PREV', how = 'outer')

        # Onehot encoding the categorical feature NAME_CONTRACT_TYPE
        name_contract_dummies = pd.get_dummies(self.pos_cash['NAME_CONTRACT_STATUS'], prefix='CONTRACT')
        contract_names = name_contract_dummies.columns.tolist()
        # Concatenating one-hot encoded categories with main table
        self.pos_cash = pd.concat([self.pos_cash, name_contract_dummies], axis=1)
        # Aggregating these over SK_ID_PREV as well
        aggregated_cc_contract = self.pos_cash[['SK_ID_PREV'] + contract_names].groupby('SK_ID_PREV').mean()    

        # Merging with the final aggregations
        pos_cash_aggregated = pos_cash_aggregated.merge(aggregated_cc_contract, on = 'SK_ID_PREV', how = 'outer')
                
        if self.verbose:
            print("Finished.")
            print(f"Time Taken = {datetime.now() - start}")
        
        return pos_cash_aggregated
    
    def aggregations_sk_id_curr(self, pos_cash_aggregated):
        '''
        Function to aggregated the aggregateed POS_CASH_balance table over SK_ID_CURR
        
        Inputs:
            self
            pos_cash_aggregated: DataFrame
                aggregated pos_cash table over SK_ID_PREV
                
        Returns:
            pos_cash_balance table aggregated over SK_ID_CURR
        '''
        
        # Aggregating over SK_ID_CURR
        columns_to_aggregate = pos_cash_aggregated.columns[1:]
        # Defining the aggregations to perform
        aggregations_final = {}
        for col in columns_to_aggregate:
            if 'MEAN' in col:
                aggregates = ['mean','sum','max']
            else:
                aggregates = ['mean']
            aggregations_final[col] = aggregates
        pos_cash_aggregated_final = pos_cash_aggregated.groupby('SK_ID_CURR').agg(aggregations_final)
        pos_cash_aggregated_final.columns = ['_'.join(ele).upper() for ele in pos_cash_aggregated_final.columns]
        
        return pos_cash_aggregated_final
    
    def main(self):
        '''
        Complete preprocessing and aggregation of POS_CASH_balance table.
        
        Inputs:
            self
            
        Returns:
            Final pre-processed and aggregated POS_CASH_balance table.
        '''
        # Loading the DataFrame
        self.load_dataframe()

        # Performing data pre-processing and feature engineering
        self.data_preprocessing_and_feature_engineering()

        # Performing aggregations over SK_ID_PREV
        pos_cash_aggregated = self.aggregations_sk_id_prev()

        if pos_cash_aggregated is None:
            print("Error: No data to aggregate over SK_ID_PREV. Exiting.")
            return None

        if self.verbose:
            print("Aggregation over SK_ID_CURR...")

        # Aggregating over SK_ID_CURR
        pos_cash_aggregated_final = self.aggregations_sk_id_curr(pos_cash_aggregated)

        if self.verbose:
            print('Finished preprocessing POS_CASH_balance.')
            print(f"Initial Size of POS_CASH_balance: {self.initial_size}")
            print(f'Size of POS_CASH_balance after Pre-Processing, Feature Engineering, and Aggregation: {pos_cash_aggregated_final.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('Pickling pre-processed POS_CASH_balance to POS_CASH_balance_preprocessed.pkl')
            with open(self.file_directory + 'POS_CASH_balance_preprocessed.pkl', 'wb') as f:
                pickle.dump(pos_cash_aggregated_final, f)
            if self.verbose:
                print('Finished.')

        return pos_cash_aggregated_final

pos_aggregated = preprocess_POS_CASH_balance(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

pos_aggregated = pos_aggregated.main()

if pos_aggregated is not None:
    print("Processed POS_CASH_balance DataFrame:")
    print(pos_aggregated.head())
else:
    print("Processing failed or returned no data.")

#########################################################
#          Pre-processing POS_CASH_balance.csv          #
#########################################################

Loading the DataFrame, POS_CASH_balance.csv, into memory...
Loaded ../analytical/assets/data/POS_CASH_balance.csv
Time Taken to load = 0:00:02.280092
Starting Data Cleaning and Feature Engineering...
Finished.
Time Taken = 0:02:50.643557

Aggregations over SK_ID_PREV...
Finished.
Time Taken = 0:00:12.663423
Aggregation over SK_ID_CURR...
Finished preprocessing POS_CASH_balance.
Initial Size of POS_CASH_balance: (10001358, 8)
Size of POS_CASH_balance after Pre-Processing, Feature Engineering, and Aggregation: (337252, 188)
Total Time Taken = 0:03:09.973096
Pickling pre-processed POS_CASH_balance to POS_CASH_balance_preprocessed.pkl
Finished.
Processed POS_CASH_balance DataFrame:
            MONTHS_BALANCE_MAX_MEAN  CNT_INSTALMENT_MEAN_MEAN  \
SK_ID_CURR                                                      
100001                    76.500000                  4.000000   
100002                    19.000000                 24.000000   
100003                    43.333333                  9.791667   
100004                    27.000000                  3.750000   
100005                    25.000000                 11.700000   

            CNT_INSTALMENT_MEAN_SUM  CNT_INSTALMENT_MEAN_MAX  \
SK_ID_CURR                                                     
100001                        8.000                     4.00   
100002                       24.000                    24.00   
100003                       29.375                    12.00   
100004                        3.750                     3.75   
100005                       11.700                    11.70   

            CNT_INSTALMENT_MAX_MEAN  CNT_INSTALMENT_MIN_MEAN  \
SK_ID_CURR                                                     
100001                          4.0                 4.000000   
100002                         24.0                24.000000   
100003                         10.0                 8.333333   
100004                          4.0                 3.000000   
100005                         12.0                 9.000000   

            CNT_INSTALMENT_FUTURE_MEAN_MEAN  CNT_INSTALMENT_FUTURE_MEAN_SUM  \
SK_ID_CURR                                                                    
100001                             1.375000                            2.75   
100002                            15.000000                           15.00   
100003                             5.666667                           17.00   
100004                             2.250000                            2.25   
100005                             7.200000                            7.20   

            CNT_INSTALMENT_FUTURE_MEAN_MAX  CNT_INSTALMENT_FUTURE_MAX_MEAN  \
SK_ID_CURR                                                                   
100001                               2.000                             3.0   
100002                              15.000                            24.0   
100003                               7.875                            10.0   
100004                               2.250                             4.0   
100005                               7.200                            12.0   

            ...  EXP_POS_TOTAL_TERM_LAST_REST_MEAN  CONTRACT_ACTIVE_MEAN  \
SK_ID_CURR  ...                                                            
100001      ...                                NaN              0.775000   
100002      ...                                NaN              1.000000   
100003      ...                                NaN              0.916667   
100004      ...                                NaN              0.750000   
100005      ...                                NaN              0.818182   

            CONTRACT_AMORTIZED DEBT_MEAN  CONTRACT_APPROVED_MEAN  \
SK_ID_CURR                                                         
100001                               0.0                     0.0   
100002                               0.0                     0.0   
100003                               0.0                     0.0   
100004                               0.0                     0.0   
100005                               0.0                     0.0   

            CONTRACT_CANCELED_MEAN  CONTRACT_COMPLETED_MEAN  \
SK_ID_CURR                                                    
100001                         0.0                 0.225000   
100002                         0.0                 0.000000   
100003                         0.0                 0.083333   
100004                         0.0                 0.250000   
100005                         0.0                 0.090909   

            CONTRACT_DEMAND_MEAN  CONTRACT_RETURNED TO THE STORE_MEAN  \
SK_ID_CURR                                                              
100001                       0.0                                  0.0   
100002                       0.0                                  0.0   
100003                       0.0                                  0.0   
100004                       0.0                                  0.0   
100005                       0.0                                  0.0   

            CONTRACT_SIGNED_MEAN  CONTRACT_XNA_MEAN  
SK_ID_CURR                                           
100001                  0.000000                0.0  
100002                  0.000000                0.0  
100003                  0.000000                0.0  
100004                  0.000000                0.0  
100005                  0.090909                0.0  

[5 rows x 188 columns]

###################################
# Credit Card Balance Data Processing
###################################

class preprocess_credit_card_balance:
    '''
    Preprocess the credit_card_balance table.
    Contains 5 member functions:
        1. __init__ method
        2. load_dataframe method
        3. data_preprocessing_and_feature_engineering method
        4. aggregations method
        5. main method
    '''

    def __init__(self, file_directory='', verbose=True, dump_to_pickle=False):
        '''
        Initialize the class members.
        '''
        self.file_directory = file_directory
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle

    def load_dataframe(self):
        '''
        Load the credit_card_balance.csv DataFrame.
        '''
        if self.verbose:
            self.start = datetime.now()
            print('#########################################################')
            print('#          Pre-processing Credit Card Balance.csv       #')
            print('#########################################################')
            print("\nLoading the DataFrame, credit_card_balance.csv, into memory...")

        # Adjusted loading logic to ensure compatibility
        try:
            self.cc_balance = pd.read_csv(self.file_directory + 'credit_card_balance.csv')
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found in the specified directory: {self.file_directory}") from e

        self.initial_size = self.cc_balance.shape

        if self.verbose:
            print("Loaded credit_card_balance.csv")
            print(f"Time Taken to load = {datetime.now() - self.start}")

    def data_preprocessing_and_feature_engineering(self):
        '''
        Preprocess and create domain-based features.
        '''
        if self.verbose:
            start = datetime.now()
            print("Starting Preprocessing and Feature Engineering...")

        self.cc_balance['AMT_PAYMENT_CURRENT'] = self.cc_balance['AMT_PAYMENT_CURRENT'].where(
            self.cc_balance['AMT_PAYMENT_CURRENT'] <= 4000000, np.nan)

        self.cc_balance['MISSING_VALS_TOTAL_CC'] = self.cc_balance.isna().sum(axis=1)
        self.cc_balance['MONTHS_BALANCE'] = np.abs(self.cc_balance['MONTHS_BALANCE'])
        self.cc_balance = self.cc_balance.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=[1, 0])

        self.cc_balance['AMT_DRAWING_SUM'] = (
            self.cc_balance['AMT_DRAWINGS_ATM_CURRENT'] +
            self.cc_balance['AMT_DRAWINGS_CURRENT'] +
            self.cc_balance['AMT_DRAWINGS_OTHER_CURRENT'] +
            self.cc_balance['AMT_DRAWINGS_POS_CURRENT']
        )
        self.cc_balance['BALANCE_LIMIT_RATIO'] = self.cc_balance['AMT_BALANCE'] / (
                self.cc_balance['AMT_CREDIT_LIMIT_ACTUAL'] + 1e-5)
        self.cc_balance['CNT_DRAWING_SUM'] = (
            self.cc_balance['CNT_DRAWINGS_ATM_CURRENT'] +
            self.cc_balance['CNT_DRAWINGS_CURRENT'] +
            self.cc_balance['CNT_DRAWINGS_OTHER_CURRENT'] +
            self.cc_balance['CNT_DRAWINGS_POS_CURRENT'] +
            self.cc_balance['CNT_INSTALMENT_MATURE_CUM']
        )
        self.cc_balance['MIN_PAYMENT_RATIO'] = self.cc_balance['AMT_PAYMENT_CURRENT'] / (
                self.cc_balance['AMT_INST_MIN_REGULARITY'] + 1e-4)
        self.cc_balance['PAYMENT_MIN_DIFF'] = self.cc_balance['AMT_PAYMENT_CURRENT'] - self.cc_balance[
            'AMT_INST_MIN_REGULARITY']
        self.cc_balance['MIN_PAYMENT_TOTAL_RATIO'] = self.cc_balance['AMT_PAYMENT_TOTAL_CURRENT'] / (
                self.cc_balance['AMT_INST_MIN_REGULARITY'] + 1e-5)
        self.cc_balance['AMT_INTEREST_RECEIVABLE'] = (
                self.cc_balance['AMT_TOTAL_RECEIVABLE'] - self.cc_balance['AMT_RECEIVABLE_PRINCIPAL']
        )
        self.cc_balance['SK_DPD_RATIO'] = self.cc_balance['SK_DPD'] / (self.cc_balance['SK_DPD_DEF'] + 1e-5)

        rolling_columns = [
            'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_RECEIVABLE_PRINCIPAL',
            'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'AMT_DRAWING_SUM',
            'BALANCE_LIMIT_RATIO', 'CNT_DRAWING_SUM', 'MIN_PAYMENT_RATIO',
            'PAYMENT_MIN_DIFF', 'MIN_PAYMENT_TOTAL_RATIO', 'AMT_INTEREST_RECEIVABLE',
            'SK_DPD_RATIO'
        ]
        exp_weighted_columns = ['EXP_' + col for col in rolling_columns]
        self.cc_balance[exp_weighted_columns] = self.cc_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV'])[rolling_columns].transform(
            lambda x: x.ewm(alpha=0.7).mean())

        if self.verbose:
            print("Finished.")
            print(f"Time Taken = {datetime.now() - start}")

    def aggregations(self):
        '''
        Function to perform aggregations of rows of credit_card_balance table, first over SK_ID_PREV,
        and then over SK_ID_CURR
        
        Inputs:
            self
        
        Returns:
            aggregated credit_card_balance table.
        '''

        if self.verbose:
            print("Aggregating the DataFrame, first over SK_ID_PREv, then over SK_ID_CURR")

        # Aggregations over SK_ID_PREV
        overall_aggregations = {
            'SK_ID_CURR' : ['first'],
            'MONTHS_BALANCE': ['max'],
            'AMT_BALANCE' : ['sum','mean','max'],
            'AMT_CREDIT_LIMIT_ACTUAL' : ['sum','mean','max'],
            'AMT_DRAWINGS_ATM_CURRENT' : ['sum','max'],
            'AMT_DRAWINGS_CURRENT' : ['sum','max'],
            'AMT_DRAWINGS_OTHER_CURRENT' : ['sum','max'],
            'AMT_DRAWINGS_POS_CURRENT' : ['sum','max'],
            'AMT_INST_MIN_REGULARITY' : ['mean','min','max'],
            'AMT_PAYMENT_CURRENT' : ['mean','min','max'],
            'AMT_PAYMENT_TOTAL_CURRENT' : ['mean','min','max'],
            'AMT_RECEIVABLE_PRINCIPAL' : ['sum','mean','max'],
            'AMT_RECIVABLE' : ['sum','mean','max'],
            'AMT_TOTAL_RECEIVABLE' : ['sum','mean','max'],
            'CNT_DRAWINGS_ATM_CURRENT' : ['sum','max'],
            'CNT_DRAWINGS_CURRENT' : ['sum','max'],
            'CNT_DRAWINGS_OTHER_CURRENT' : ['sum','max'],
            'CNT_DRAWINGS_POS_CURRENT' : ['sum','max'],
            'CNT_INSTALMENT_MATURE_CUM' : ['sum','max','min'],
            'SK_DPD' : ['sum','max'],
            'SK_DPD_DEF' : ['sum','max'],

            'AMT_DRAWING_SUM' : ['sum','max'],
            'BALANCE_LIMIT_RATIO' : ['mean','max','min'],
            'CNT_DRAWING_SUM' : ['sum','max'],
            'MIN_PAYMENT_RATIO': ['min','mean'],
            'PAYMENT_MIN_DIFF' : ['min','mean'],
            'MIN_PAYMENT_TOTAL_RATIO' : ['min','mean'], 
            'AMT_INTEREST_RECEIVABLE' : ['min','mean'],
            'SK_DPD_RATIO' : ['max','mean'],

            'EXP_AMT_BALANCE' : ['last'],
            'EXP_AMT_CREDIT_LIMIT_ACTUAL' : ['last'],
            'EXP_AMT_RECEIVABLE_PRINCIPAL' : ['last'],
            'EXP_AMT_RECIVABLE' : ['last'],
            'EXP_AMT_TOTAL_RECEIVABLE' : ['last'],
            'EXP_AMT_DRAWING_SUM' : ['last'],
            'EXP_BALANCE_LIMIT_RATIO' : ['last'],
            'EXP_CNT_DRAWING_SUM' : ['last'],
            'EXP_MIN_PAYMENT_RATIO' : ['last'],
            'EXP_PAYMENT_MIN_DIFF' : ['last'],
            'EXP_MIN_PAYMENT_TOTAL_RATIO' : ['last'],
            'EXP_AMT_INTEREST_RECEIVABLE' : ['last'],
            'EXP_SK_DPD_RATIO' : ['last'],
            'MISSING_VALS_TOTAL_CC' : ['sum']
        }
        aggregations_for_categories = {
            'SK_DPD' : ['sum','max'],
            'SK_DPD_DEF' : ['sum','max'],
            'BALANCE_LIMIT_RATIO' : ['mean','max','min'],
            'CNT_DRAWING_SUM' : ['sum','max'],
            'MIN_PAYMENT_RATIO': ['min','mean'],
            'PAYMENT_MIN_DIFF' : ['min','mean'],
            'MIN_PAYMENT_TOTAL_RATIO' : ['min','mean'], 
            'AMT_INTEREST_RECEIVABLE' : ['min','mean'],
            'SK_DPD_RATIO' : ['max','mean'],
            'EXP_AMT_DRAWING_SUM' : ['last'],
            'EXP_BALANCE_LIMIT_RATIO' : ['last'],
            'EXP_CNT_DRAWING_SUM' : ['last'],
            'EXP_MIN_PAYMENT_RATIO' : ['last'],
            'EXP_PAYMENT_MIN_DIFF' : ['last'],
            'EXP_MIN_PAYMENT_TOTAL_RATIO' : ['last'],
            'EXP_AMT_INTEREST_RECEIVABLE' : ['last'],
            'EXP_SK_DPD_RATIO' : ['last']
        }
        aggregations_for_year = {
            'SK_DPD' : ['sum','max'],
            'SK_DPD_DEF' : ['sum','max'],
            'BALANCE_LIMIT_RATIO' : ['mean','max','min'],
            'CNT_DRAWING_SUM' : ['sum','max'],
            'MIN_PAYMENT_RATIO': ['min','mean'],
            'PAYMENT_MIN_DIFF' : ['min','mean'],
            'MIN_PAYMENT_TOTAL_RATIO' : ['min','mean'], 
            'AMT_INTEREST_RECEIVABLE' : ['min','mean'],
            'SK_DPD_RATIO' : ['max','mean'],
            'EXP_AMT_DRAWING_SUM' : ['last'],
            'EXP_BALANCE_LIMIT_RATIO' : ['last'],
            'EXP_CNT_DRAWING_SUM' : ['last'],
            'EXP_MIN_PAYMENT_RATIO' : ['last'],
            'EXP_PAYMENT_MIN_DIFF' : ['last'],
            'EXP_MIN_PAYMENT_TOTAL_RATIO' : ['last'],
            'EXP_AMT_INTEREST_RECEIVABLE' : ['last'],
            'EXP_SK_DPD_RATIO' : ['last']
        }
        # Overall aggregations over SK_ID_PREV for all features
        cc_balance_aggregated_overall = self.cc_balance.groupby('SK_ID_PREV').agg(overall_aggregations)
        cc_balance_aggregated_overall.columns = ['_'.join(ele).upper() for ele in cc_balance_aggregated_overall.columns]
        cc_balance_aggregated_overall.rename(columns = {'SK_ID_CURR_FIRST' : 'SK_ID_CURR'}, inplace = True)

        # Aggregation over SK_ID_PREV for different categories
        contract_status_categories = ['Active','Completed']
        cc_balance_aggregated_categories = pd.DataFrame()
        for i, contract_type in enumerate(contract_status_categories):
            group = self.cc_balance[self.cc_balance['NAME_CONTRACT_STATUS'] == contract_type].groupby('SK_ID_PREV').agg(aggregations_for_categories)
            group.columns = ['_'.join(ele).upper() + '_' + contract_type.upper() for ele in group.columns]
            if i == 0:
                cc_balance_aggregated_categories = group
            else:
                cc_balance_aggregated_categories = cc_balance_aggregated_categories.merge(group, on = 'SK_ID_PREV', how = 'outer')
        # Aggregation over SK_ID_PREV for rest of the categories
        cc_balance_aggregated_categories_rest = self.cc_balance[(self.cc_balance['NAME_CONTRACT_STATUS'] != 'Active') & 
                                        (self.cc_balance.NAME_CONTRACT_STATUS != 'Completed')].groupby('SK_ID_PREV').agg(aggregations_for_categories)
        cc_balance_aggregated_categories_rest.columns = ['_'.join(ele).upper() + '_REST' for ele in cc_balance_aggregated_categories_rest.columns]
        # Merge all the categorical aggregations
        cc_balance_aggregated_categories = cc_balance_aggregated_categories.merge(cc_balance_aggregated_categories_rest, on = 'SK_ID_PREV', how = 'outer')

        # Aggregation  over SK_ID_PREV for different years
        self.cc_balance['YEAR_BALANCE'] = self.cc_balance['MONTHS_BALANCE'] //12
        cc_balance_aggregated_year = pd.DataFrame()
        for year in range(2):
            group = self.cc_balance[self.cc_balance['YEAR_BALANCE'] == year].groupby('SK_ID_PREV').agg(aggregations_for_year)
            group.columns = ['_'.join(ele).upper() + '_YEAR_' + str(year) for ele in group.columns]
            if year == 0:
                cc_balance_aggregated_year = group
            else:
                cc_balance_aggregated_year = cc_balance_aggregated_year.merge(group, on = 'SK_ID_PREV', how = 'outer')
        # Aggregation  over SK_ID_PREV for rest of years
        cc_balance_aggregated_year_rest = self.cc_balance[self.cc_balance['YEAR_BALANCE'] >= 2].groupby('SK_ID_PREV').agg(aggregations_for_year)
        cc_balance_aggregated_year_rest.columns = ['_'.join(ele).upper() + '_YEAR_REST' for ele in cc_balance_aggregated_year_rest.columns]
        # Merge all all the yearwise aggregations
        cc_balance_aggregated_year = cc_balance_aggregated_year.merge(cc_balance_aggregated_year_rest, on = 'SK_ID_PREV', how = 'outer')
        self.cc_balance = self.cc_balance.drop('YEAR_BALANCE', axis = 1)

        # Merge all the aggregations
        cc_aggregated = cc_balance_aggregated_overall.merge(cc_balance_aggregated_categories, on = 'SK_ID_PREV', how = 'outer')
        cc_aggregated = cc_aggregated.merge(cc_balance_aggregated_year, on = 'SK_ID_PREV', how = 'outer')

        # One-hot encoding the categorical column NAME_CONTRACT_STATUS
        name_contract_dummies = pd.get_dummies(self.cc_balance.NAME_CONTRACT_STATUS, prefix='CONTRACT')
        contract_names = name_contract_dummies.columns.tolist()

        # Merge  the one-hot encoded feature with original table
        self.cc_balance = pd.concat([self.cc_balance, name_contract_dummies], axis=1)
        
        # Aggregation over SK_ID_PREV the one-hot encoded columns
        aggregated_cc_contract = self.cc_balance[['SK_ID_PREV'] + contract_names].groupby('SK_ID_PREV').mean()

        # Merge with the aggregated table
        cc_aggregated = cc_aggregated.merge(aggregated_cc_contract, on = 'SK_ID_PREV', how = 'outer')

        # Aggregation on SK_ID_CURR
        # As per the EDA, since most of the SK_ID_CURR had only 1 credit card, so for aggregations, we will simply take the means
        cc_aggregated = cc_aggregated.groupby('SK_ID_CURR', as_index = False).mean()
                    
        return cc_aggregated

    def main(self):
        '''
        Execute the complete preprocessing and aggregation pipeline.
        '''
        self.load_dataframe()
        self.data_preprocessing_and_feature_engineering()
        cc_aggregated = self.aggregations()

        if self.verbose:
            print('Finished preprocessing credit_card_balance.')
            print(f"Initial Size of credit_card_balance: {self.initial_size}")
            print(f'Size of credit_card_balance after Pre-Processing, Feature Engineering and Aggregation: {cc_aggregated.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('\nPickling pre-processed credit_card_balance to credit_card_balance_preprocessed.pkl')
            with open(self.file_directory + 'credit_card_balance_preprocessed.pkl', 'wb') as f:
                pickle.dump(cc_aggregated, f)
            if self.verbose:
                print('Finished.')

        return cc_aggregated

cc_aggregated = preprocess_credit_card_balance(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

cc_aggregated = cc_aggregated.main()

print("Processed credit_card_balance DataFrame:")
print(cc_aggregated.head())

#########################################################
#          Pre-processing Credit Card Balance.csv       #
#########################################################

Loading the DataFrame, credit_card_balance.csv, into memory...
Loaded credit_card_balance.csv
Time Taken to load = 0:00:06.560330
Starting Preprocessing and Feature Engineering...
Finished.
Time Taken = 0:00:47.566696
Aggregating the DataFrame, first over SK_ID_PREv, then over SK_ID_CURR
Finished preprocessing credit_card_balance.
Initial Size of credit_card_balance: (3840312, 23)
Size of credit_card_balance after Pre-Processing, Feature Engineering and Aggregation: (103558, 249)
Total Time Taken = 0:01:03.547317

Pickling pre-processed credit_card_balance to credit_card_balance_preprocessed.pkl
Finished.
Processed credit_card_balance DataFrame:
   SK_ID_CURR  MONTHS_BALANCE_MAX  AMT_BALANCE_SUM  AMT_BALANCE_MEAN  \
0      100006                 6.0            0.000          0.000000   
1      100011                75.0      4031676.225      54482.111149   
2      100013                96.0      1743352.245      18159.919219   
3      100021                18.0            0.000          0.000000   
4      100023                11.0            0.000          0.000000   

   AMT_BALANCE_MAX  AMT_CREDIT_LIMIT_ACTUAL_SUM  AMT_CREDIT_LIMIT_ACTUAL_MEAN  \
0             0.00                    1620000.0                 270000.000000   
1        189000.00                   12150000.0                 164189.189189   
2        161420.22                   12645000.0                 131718.750000   
3             0.00                   11475000.0                 675000.000000   
4             0.00                    1080000.0                 135000.000000   

   AMT_CREDIT_LIMIT_ACTUAL_MAX  AMT_DRAWINGS_ATM_CURRENT_SUM  \
0                     270000.0                           0.0   
1                     180000.0                      180000.0   
2                     157500.0                      571500.0   
3                     675000.0                           0.0   
4                     225000.0                           0.0   

   AMT_DRAWINGS_ATM_CURRENT_MAX  ...  \
0                           NaN  ...   
1                      180000.0  ...   
2                      157500.0  ...   
3                           NaN  ...   
4                           NaN  ...   

   EXP_MIN_PAYMENT_TOTAL_RATIO_LAST_YEAR_REST  \
0                                         NaN   
1                                5.364418e-10   
2                                1.632702e-21   
3                                         NaN   
4                                         NaN   

   EXP_AMT_INTEREST_RECEIVABLE_LAST_YEAR_REST  \
0                                         NaN   
1                               -7.937222e-06   
2                               -2.582366e-10   
3                                         NaN   
4                                         NaN   

   EXP_SK_DPD_RATIO_LAST_YEAR_REST  CONTRACT_Active  CONTRACT_Approved  \
0                              NaN         1.000000                0.0   
1                     0.000000e+00         1.000000                0.0   
2                     2.067998e-24         1.000000                0.0   
3                              NaN         0.411765                0.0   
4                              NaN         1.000000                0.0   

   CONTRACT_Completed  CONTRACT_Demand  CONTRACT_Refused  \
0            0.000000              0.0               0.0   
1            0.000000              0.0               0.0   
2            0.000000              0.0               0.0   
3            0.588235              0.0               0.0   
4            0.000000              0.0               0.0   

   CONTRACT_Sent proposal  CONTRACT_Signed  
0                     0.0              0.0  
1                     0.0              0.0  
2                     0.0              0.0  
3                     0.0              0.0  
4                     0.0              0.0  

[5 rows x 249 columns]

# ##################################################
# # Preprocess application_train and application_test
# ##################################################

class preprocess_application_train_test:
    '''
    Preprocess the application_train and application_test tables.
    Contains 11 member functions:
        1. init method
        2. load_dataframe method
        3. data_cleaning method
        4. ext_source_values_predictor method
        5. numeric_feature_engineering method
        6. neighbors_EXT_SOURCE_feature method
        7. categorical_interaction_features method
        8. response_fit method
        9. response_transform method
        10. cnt_payment_prediction method
        11. main method
    '''
    
    def __init__(self, file_directory = '', verbose = True, dump_to_pickle = False):
        '''
        This function is used to initialize the class members 
        
        Inputs:
            self
            file_directory: Path, str, default = ''
                The path where the file exists. Include a '/' at the end of the path in input
            verbose: bool, default = True
                Whether to enable verbosity or not
            dump_to_pickle: bool, default = False
                Whether to pickle the final preprocessed table or not
                
        Returns:
            None
        '''
        
        self.verbose = verbose
        self.dump_to_pickle = dump_to_pickle
        self.file_directory = file_directory
        
    def load_dataframes(self):
        '''
        Load the application_train.csv and application_test.csv DataFrames.
        '''
        if self.verbose:
            self.start = datetime.now()
            print('#######################################################')
            print('#        Pre-processing application_train.csv         #')
            print('#        Pre-processing application_test.csv          #')
            print('#######################################################')
            print("\nLoading the DataFrames into memory...")

        train_path = self.file_directory + 'application_train.csv'
        test_path = self.file_directory + 'application_test.csv'
        
        # Debugging file paths
        print(f"Loading application_train from: {train_path}")
        print(f"Loading application_test from: {test_path}")

        self.application_train = pd.read_csv(train_path)
        self.application_test = pd.read_csv(test_path)
        
        # Debugging initial shape
        print(f"application_train shape: {self.application_train.shape}")
        print(f"application_test shape: {self.application_test.shape}")
        
        self.initial_shape = self.application_train.shape
        
        if self.verbose:
            print(f"Time Taken to load = {datetime.now() - self.start}")

    def data_cleaning(self):
        '''
        Function to clean the tables, by removing erroneous rows/entries.
        '''
        if self.verbose:
            print("\nPerforming Data Cleaning...")
        
        print(f"Initial shape of application_train: {self.application_train.shape}")
        print(f"Initial shape of application_test: {self.application_test.shape}")
        
        # Dropping FLAG_DOCUMENT columns
        flag_cols_to_drop = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_20']
        self.application_train = self.application_train.drop(flag_cols_to_drop, axis=1)
        self.application_test = self.application_test.drop(flag_cols_to_drop, axis=1)
        print(f"Shape after dropping FLAG_DOCUMENT columns: {self.application_train.shape}")

        # Convert age from days to years
        self.application_train['DAYS_BIRTH'] = self.application_train['DAYS_BIRTH'] * -1 / 365
        self.application_test['DAYS_BIRTH'] = self.application_test['DAYS_BIRTH'] * -1 / 365

        # Handle erroneous DAYS_EMPLOYED values
        self.application_train.loc[self.application_train['DAYS_EMPLOYED'] == 365243, 'DAYS_EMPLOYED'] = np.nan
        self.application_test.loc[self.application_test['DAYS_EMPLOYED'] == 365243, 'DAYS_EMPLOYED'] = np.nan

        # OBS columns with erroneous values
        self.application_train.loc[self.application_train['OBS_30_CNT_SOCIAL_CIRCLE'] > 30, 'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan
        self.application_train.loc[self.application_train['OBS_60_CNT_SOCIAL_CIRCLE'] > 30, 'OBS_60_CNT_SOCIAL_CIRCLE'] = np.nan
        self.application_test.loc[self.application_test['OBS_30_CNT_SOCIAL_CIRCLE'] > 30, 'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan
        self.application_test.loc[self.application_test['OBS_60_CNT_SOCIAL_CIRCLE'] > 30, 'OBS_60_CNT_SOCIAL_CIRCLE'] = np.nan

        print(f"Shape after handling OBS_SOCIAL_CIRCLE: {self.application_train.shape}")

        # Remove rows with 'XNA' in CODE_GENDER
        self.application_train = self.application_train[self.application_train['CODE_GENDER'] != 'XNA']
        print(f"Shape after removing 'XNA' in CODE_GENDER: {self.application_train.shape}")

        # Fill missing categorical values with 'XNA'
        categorical_columns = self.application_train.dtypes[self.application_train.dtypes == 'object'].index.tolist()
        self.application_train[categorical_columns] = self.application_train[categorical_columns].fillna('XNA')
        self.application_test[categorical_columns] = self.application_test[categorical_columns].fillna('XNA')

        # Convert REGION_RATING_CLIENT columns to object type
        self.application_train['REGION_RATING_CLIENT'] = self.application_train['REGION_RATING_CLIENT'].astype('object')
        self.application_train['REGION_RATING_CLIENT_W_CITY'] = self.application_train['REGION_RATING_CLIENT_W_CITY'].astype('object')
        self.application_test['REGION_RATING_CLIENT'] = self.application_test['REGION_RATING_CLIENT'].astype('object')
        self.application_test['REGION_RATING_CLIENT_W_CITY'] = self.application_test['REGION_RATING_CLIENT_W_CITY'].astype('object')

        print(f"Shape after converting REGION_RATING_CLIENT: {self.application_train.shape}")

        # Count missing values
        self.application_train['MISSING_VALS_TOTAL_APP'] = self.application_train.isna().sum(axis=1)
        self.application_test['MISSING_VALS_TOTAL_APP'] = self.application_test.isna().sum(axis=1)

        if self.verbose:
            print("Done with Data Cleaning.")
            
    def ext_source_values_predictor(self):
        '''
        Function to predict the missing values of EXT_SOURCE features
        
        Inputs:
            self
        
        Returns:
            None
        '''
        
        if self.verbose:
            start = datetime.now()
            print("Predicting the missing values of EXT_SOURCE columns...")
            
        # Predicting the EXT_SOURCE missing values
        # Numeric columns for predicting the EXT_SOURCES
        columns_for_modelling = list(set(self.application_test.dtypes[self.application_test.dtypes != 'object'].index.tolist())
                                             - set(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','SK_ID_CURR']))
        with open('columns_for_ext_values_predictor.pkl', 'wb') as f:
            pickle.dump(columns_for_modelling, f)
        
        # XGB Regression model for predicting missing EXT_SOURCE values
        # Predict in the order of least number of missing value columns to max.
        for ext_col in ['EXT_SOURCE_2','EXT_SOURCE_3','EXT_SOURCE_1']:
            # X_model - datapoints which do not have missing values of given column
            # Y_train - values of column trying to predict with non missing values
            # X_train_missing - datapoints in application_train with missing values
            # X_test_missing - datapoints in application_test with missing values
            X_model, X_train_missing, X_test_missing, Y_train = self.application_train[~self.application_train[ext_col].isna()][columns_for_modelling], self.application_train[
                                                                self.application_train[ext_col].isna()][columns_for_modelling], self.application_test[
                                                                self.application_test[ext_col].isna()][columns_for_modelling], self.application_train[
                                                                ext_col][~self.application_train[ext_col].isna()]
            xg = XGBRegressor(n_estimators = 1000, max_depth = 3, learning_rate = 0.1, n_jobs = -1, random_state = 59)
            xg.fit(X_model, Y_train)

            # Dump the model to pickle file
            with open(f'nan_{ext_col}_xgbr_model.pkl', 'wb') as f:
                pickle.dump(xg, f)

            self.application_train[ext_col][self.application_train[ext_col].isna()] = xg.predict(X_train_missing)
            self.application_test[ext_col][self.application_test[ext_col].isna()] = xg.predict(X_test_missing)
            
            columns_for_modelling = columns_for_modelling + [ext_col]
            
        if self.verbose:
            print("Finished.")
            print(f"Time elapsed = {datetime.now() - start}")
                    
    def numeric_feature_engineering(self, data):
        '''
        Function to perform feature engineering on numeric columns based on domain knowledge.
        
        Inputs:
            self
            data: DataFrame
                The tables of whose features are to be generated
        
        Returns: 
            None
        '''
    
        # Income and credit features
        data['CREDIT_INCOME_RATIO'] = data['AMT_CREDIT'] / (data['AMT_INCOME_TOTAL'] + 0.00001)
        data['CREDIT_ANNUITY_RATIO'] = data['AMT_CREDIT'] / (data['AMT_ANNUITY'] + 0.00001)
        data['ANNUITY_INCOME_RATIO'] = data['AMT_ANNUITY'] / (data['AMT_INCOME_TOTAL'] + 0.00001)
        data['INCOME_ANNUITY_DIFF'] = data['AMT_INCOME_TOTAL'] - data['AMT_ANNUITY']
        data['CREDIT_GOODS_RATIO'] = data['AMT_CREDIT'] / (data['AMT_GOODS_PRICE'] + 0.00001)
        data['CREDIT_GOODS_DIFF'] = data['AMT_CREDIT'] - data['AMT_GOODS_PRICE'] + 0.00001
        data['GOODS_INCOME_RATIO'] = data['AMT_GOODS_PRICE'] / (data['AMT_INCOME_TOTAL'] + 0.00001)
        data['INCOME_EXT_RATIO'] = data['AMT_INCOME_TOTAL'] / (data['EXT_SOURCE_3'] + 0.00001)
        data['CREDIT_EXT_RATIO'] = data['AMT_CREDIT'] / (data['EXT_SOURCE_3'] + 0.00001)
        # Age ratios and diffs
        data['AGE_EMPLOYED_DIFF'] = data['DAYS_BIRTH'] - data['DAYS_EMPLOYED']
        data['EMPLOYED_TO_AGE_RATIO'] = data['DAYS_EMPLOYED'] / (data['DAYS_BIRTH'] + 0.00001)
        # Car ratios
        data['CAR_EMPLOYED_DIFF'] = data['OWN_CAR_AGE'] - data['DAYS_EMPLOYED']
        data['CAR_EMPLOYED_RATIO'] = data['OWN_CAR_AGE'] / (data['DAYS_EMPLOYED']+0.00001)
        data['CAR_AGE_DIFF'] = data['DAYS_BIRTH'] - data['OWN_CAR_AGE']
        data['CAR_AGE_RATIO'] = data['OWN_CAR_AGE'] / (data['DAYS_BIRTH'] + 0.00001)
        # Flag contacts sum
        data['FLAG_CONTACTS_SUM'] = data['FLAG_MOBIL'] + data['FLAG_EMP_PHONE'] + data['FLAG_WORK_PHONE'] + data[
                                    'FLAG_CONT_MOBILE'] + data['FLAG_PHONE'] + data['FLAG_EMAIL']
        
        data['HOUR_PROCESS_CREDIT_MUL'] = data['AMT_CREDIT'] * data['HOUR_APPR_PROCESS_START']
        # Family members
        data['CNT_NON_CHILDREN'] = data['CNT_FAM_MEMBERS'] - data['CNT_CHILDREN']
        data['CHILDREN_INCOME_RATIO'] = data['CNT_CHILDREN'] / (data['AMT_INCOME_TOTAL'] + 0.00001)
        data['PER_CAPITA_INCOME'] = data['AMT_INCOME_TOTAL'] / (data['CNT_FAM_MEMBERS'] + 1)
        # Region ratings
        data['REGIONS_RATING_INCOME_MUL'] = (data['REGION_RATING_CLIENT'] + data['REGION_RATING_CLIENT_W_CITY']) * data['AMT_INCOME_TOTAL'] / 2
        data['REGION_RATING_MAX'] = [max(ele1, ele2) for ele1, ele2 in zip(data['REGION_RATING_CLIENT'], data['REGION_RATING_CLIENT_W_CITY'])]
        data['REGION_RATING_MAX'] = [min(ele1, ele2) for ele1, ele2 in zip(data['REGION_RATING_CLIENT'], data['REGION_RATING_CLIENT_W_CITY'])]
        data['REGION_RATING_MEAN'] = (data['REGION_RATING_CLIENT'] + data['REGION_RATING_CLIENT_W_CITY']) / 2
        data['REGION_RATING_MUL'] = data['REGION_RATING_CLIENT'] * data['REGION_RATING_CLIENT_W_CITY']
        # Flag regions
        data['FLAG_REGIONS'] = data['REG_REGION_NOT_LIVE_REGION'] + data['REG_REGION_NOT_WORK_REGION'] + data['LIVE_REGION_NOT_WORK_REGION']+data[
                                'REG_CITY_NOT_LIVE_CITY'] + data['REG_CITY_NOT_WORK_CITY'] + data['LIVE_CITY_NOT_WORK_CITY']   
        # Ext_sources
        data['EXT_SOURCE_MEAN'] = (data['EXT_SOURCE_1'] + data['EXT_SOURCE_2'] + data['EXT_SOURCE_3'] ) / 3
        data['EXT_SOURCE_MUL'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3'] 
        data['EXT_SOURCE_MAX'] = [max(ele1,ele2,ele3) for ele1, ele2, ele3 in zip(data['EXT_SOURCE_1'], data['EXT_SOURCE_2'], data['EXT_SOURCE_3'])]
        data['EXT_SOURCE_MIN'] = [min(ele1,ele2,ele3) for ele1, ele2, ele3 in zip(data['EXT_SOURCE_1'], data['EXT_SOURCE_2'], data['EXT_SOURCE_3'])]
        data['EXT_SOURCE_VAR'] = [np.var([ele1,ele2,ele3]) for ele1, ele2, ele3 in zip(data['EXT_SOURCE_1'], data['EXT_SOURCE_2'], data['EXT_SOURCE_3'])]
        data['WEIGHTED_EXT_SOURCE'] =  data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
        # Apartment scores
        data['APARTMENTS_SUM_AVG'] = data['APARTMENTS_AVG'] + data['BASEMENTAREA_AVG'] + data['YEARS_BEGINEXPLUATATION_AVG'] + data[
                                    'YEARS_BUILD_AVG'] + data['COMMONAREA_AVG'] + data['ELEVATORS_AVG'] + data['ENTRANCES_AVG'] + data[
                                    'FLOORSMAX_AVG'] + data['FLOORSMIN_AVG'] + data['LANDAREA_AVG'] + data['LIVINGAPARTMENTS_AVG'] + data[
                                    'LIVINGAREA_AVG'] + data['NONLIVINGAPARTMENTS_AVG'] + data['NONLIVINGAREA_AVG']

        data['APARTMENTS_SUM_MODE'] = data['APARTMENTS_MODE'] + data['BASEMENTAREA_MODE'] + data['YEARS_BEGINEXPLUATATION_MODE'] + data[
                                    'YEARS_BUILD_MODE'] + data['COMMONAREA_MODE'] + data['ELEVATORS_MODE'] + data['ENTRANCES_MODE'] + data[
                                    'FLOORSMAX_MODE'] + data['FLOORSMIN_MODE'] + data['LANDAREA_MODE'] + data['LIVINGAPARTMENTS_MODE'] + data[
                                    'LIVINGAREA_MODE'] + data['NONLIVINGAPARTMENTS_MODE'] + data['NONLIVINGAREA_MODE'] + data['TOTALAREA_MODE']

        data['APARTMENTS_SUM_MEDI'] = data['APARTMENTS_MEDI'] + data['BASEMENTAREA_MEDI'] + data['YEARS_BEGINEXPLUATATION_MEDI'] + data[
                                    'YEARS_BUILD_MEDI'] + data['COMMONAREA_MEDI'] + data['ELEVATORS_MEDI'] + data['ENTRANCES_MEDI'] + data[
                                    'FLOORSMAX_MEDI'] + data['FLOORSMIN_MEDI'] + data['LANDAREA_MEDI'] + data['LIVINGAPARTMENTS_MEDI'] + data[
                                    'LIVINGAREA_MEDI'] + data['NONLIVINGAPARTMENTS_MEDI'] + data['NONLIVINGAREA_MEDI']
        data['INCOME_APARTMENT_AVG_MUL'] = data['APARTMENTS_SUM_AVG'] * data['AMT_INCOME_TOTAL']
        data['INCOME_APARTMENT_MODE_MUL'] = data['APARTMENTS_SUM_MODE'] * data['AMT_INCOME_TOTAL']
        data['INCOME_APARTMENT_MEDI_MUL'] = data['APARTMENTS_SUM_MEDI'] * data['AMT_INCOME_TOTAL']
        # OBS And DEF
        data['OBS_30_60_SUM'] = data['OBS_30_CNT_SOCIAL_CIRCLE'] + data['OBS_60_CNT_SOCIAL_CIRCLE']
        data['DEF_30_60_SUM'] = data['DEF_30_CNT_SOCIAL_CIRCLE'] + data['DEF_60_CNT_SOCIAL_CIRCLE']
        data['OBS_DEF_30_MUL'] = data['OBS_30_CNT_SOCIAL_CIRCLE'] *  data['DEF_30_CNT_SOCIAL_CIRCLE']
        data['OBS_DEF_60_MUL'] = data['OBS_60_CNT_SOCIAL_CIRCLE'] *  data['DEF_60_CNT_SOCIAL_CIRCLE']
        data['SUM_OBS_DEF_ALL'] = data['OBS_30_CNT_SOCIAL_CIRCLE'] + data['DEF_30_CNT_SOCIAL_CIRCLE'] + data[
                                    'OBS_60_CNT_SOCIAL_CIRCLE'] + data['DEF_60_CNT_SOCIAL_CIRCLE']
        data['OBS_30_CREDIT_RATIO'] = data['AMT_CREDIT'] / (data['OBS_30_CNT_SOCIAL_CIRCLE'] + 0.00001)
        data['OBS_60_CREDIT_RATIO'] = data['AMT_CREDIT'] / (data['OBS_60_CNT_SOCIAL_CIRCLE'] + 0.00001)
        data['DEF_30_CREDIT_RATIO'] = data['AMT_CREDIT'] / (data['DEF_30_CNT_SOCIAL_CIRCLE'] + 0.00001)
        data['DEF_60_CREDIT_RATIO'] = data['AMT_CREDIT'] / (data['DEF_60_CNT_SOCIAL_CIRCLE'] + 0.00001)
        # Flag Documents combined
        data['SUM_FLAGS_DOCUMENTS'] = data['FLAG_DOCUMENT_3'] + data['FLAG_DOCUMENT_5'] + data['FLAG_DOCUMENT_6']  + data[
                                    'FLAG_DOCUMENT_7'] + data['FLAG_DOCUMENT_8'] + data['FLAG_DOCUMENT_9'] + data[
                                    'FLAG_DOCUMENT_11'] + data['FLAG_DOCUMENT_13'] + data['FLAG_DOCUMENT_14'] + data[
                                    'FLAG_DOCUMENT_15'] + data['FLAG_DOCUMENT_16'] + data['FLAG_DOCUMENT_17'] + data[
                                    'FLAG_DOCUMENT_18'] + data['FLAG_DOCUMENT_19'] + data['FLAG_DOCUMENT_21']
        # Details change
        data['DAYS_DETAILS_CHANGE_MUL'] = data['DAYS_LAST_PHONE_CHANGE'] * data['DAYS_REGISTRATION'] * data['DAYS_ID_PUBLISH']
        data['DAYS_DETAILS_CHANGE_SUM'] = data['DAYS_LAST_PHONE_CHANGE'] + data['DAYS_REGISTRATION'] + data['DAYS_ID_PUBLISH']
        # Enquires
        data['AMT_ENQ_SUM'] = data['AMT_REQ_CREDIT_BUREAU_HOUR'] + data['AMT_REQ_CREDIT_BUREAU_DAY'] + data['AMT_REQ_CREDIT_BUREAU_WEEK'] + data[
                            'AMT_REQ_CREDIT_BUREAU_MON'] + data['AMT_REQ_CREDIT_BUREAU_QRT'] + data['AMT_REQ_CREDIT_BUREAU_YEAR']
        data['ENQ_CREDIT_RATIO'] = data['AMT_ENQ_SUM'] / (data['AMT_CREDIT'] + 0.00001)
        
        cnt_payment = self.cnt_payment_prediction(data)
        data['EXPECTED_CNT_PAYMENT'] = cnt_payment
        data['EXPECTED_INTEREST'] = data['AMT_ANNUITY'] *  data['EXPECTED_CNT_PAYMENT'] - data['AMT_CREDIT']
        data['EXPECTED_INTEREST_SHARE'] = data['EXPECTED_INTEREST'] / (data['AMT_CREDIT'] + 0.00001)
        data['EXPECTED_INTEREST_RATE'] = 2 * 12 * data['EXPECTED_INTEREST'] / (data['AMT_CREDIT'] * (data['EXPECTED_CNT_PAYMENT'] + 1))
                    
        return data
    
    def neighbors_EXT_SOURCE_feature(self):
        '''
        Function to generate a feature which contains the means of TARGET of 500 neighbors of a particular row.
        
        Inputs:
            self
        
        Returns:
            None
        '''
        
        # Reference: https://www.kaggle.com/c/home-credit-default-risk/discussion/64821
        # Imputing the mean of 500 nearest neighbor's target values for each application
        # Neighbors are computed using EXT_SOURCE feature and CREDIT_ANNUITY_RATIO
        
        knn = KNeighborsClassifier(500, n_jobs = -1)
        
        train_data_for_neighbors = self.application_train[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','CREDIT_ANNUITY_RATIO']].fillna(0)
        # Saving the training data for neighbors
        with open('TARGET_MEAN_500_Neighbors_training_data.pkl', 'wb') as f:
            pickle.dump(train_data_for_neighbors, f)
        train_target = self.application_train.TARGET
        test_data_for_neighbors = self.application_test[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','CREDIT_ANNUITY_RATIO']].fillna(0)
        
        knn.fit(train_data_for_neighbors, train_target)
        # Pickle the knn model
        with open('KNN_model_TARGET_500_neighbors.pkl', 'wb') as f:
            pickle.dump(knn, f)
            
        train_500_neighbors = knn.kneighbors(train_data_for_neighbors)[1]
        test_500_neighbors = knn.kneighbors(test_data_for_neighbors)[1]
        
        # Adding the means of targets of 500 neighbors to new column
        self.application_train['TARGET_NEIGHBORS_500_MEAN'] = [self.application_train['TARGET'].iloc[ele].mean() for ele in train_500_neighbors]
        self.application_test['TARGET_NEIGHBORS_500_MEAN'] = [self.application_train['TARGET'].iloc[ele].mean() for ele in test_500_neighbors]
    
    def categorical_interaction_features(self, train_data, test_data):
        '''
        Function to generate some features based on categorical groupings.
        
        Inputs:
            self
            train_data, test_data : DataFrames
                train and test dataframes
        
        Returns:
            Train and test datasets, with added categorical interaction features.
        '''
        
        # Create features based on categorical interactions
        columns_to_aggregate_on = [
            ['NAME_CONTRACT_TYPE', 'NAME_INCOME_TYPE', 'OCCUPATION_TYPE'],
            ['CODE_GENDER', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE'],
            ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE'],
            ['NAME_EDUCATION_TYPE','NAME_INCOME_TYPE','OCCUPATION_TYPE'],
            ['OCCUPATION_TYPE','ORGANIZATION_TYPE'],
            ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY']

        ]
        aggregations = {
            'AMT_ANNUITY' : ['mean','max','min'],
            'ANNUITY_INCOME_RATIO' : ['mean','max','min'],
            'AGE_EMPLOYED_DIFF' : ['mean','min'],
            'AMT_INCOME_TOTAL' : ['mean','max','min'],
            'APARTMENTS_SUM_AVG' : ['mean','max','min'],
            'APARTMENTS_SUM_MEDI' : ['mean','max','min'],
            'EXT_SOURCE_MEAN' : ['mean','max','min'],
            'EXT_SOURCE_1' : ['mean','max','min'],
            'EXT_SOURCE_2' : ['mean','max','min'],
            'EXT_SOURCE_3' : ['mean','max','min']
        }
        
        # Extracting values
        for group in columns_to_aggregate_on:
            # Grouping based on categories
            grouped_interactions = train_data.groupby(group).agg(aggregations) 
            grouped_interactions.columns = ['_'.join(ele).upper() + '_AGG_' + '_'.join(group) for ele in grouped_interactions.columns]
            # Saving the grouped interactions to pickle file
            group_name = '_'.join(group)
            with open(f'Application_train_grouped_interactions_{group_name}.pkl', 'wb') as f:
                pickle.dump(grouped_interactions, f)
            # Merging with the original data
            train_data = train_data.join(grouped_interactions, on = group)
            test_data = test_data.join(grouped_interactions, on = group)

        return train_data, test_data

    def response_fit(self, data, column):
        '''
        Response Encoding Fit Function
        Function to create a vocabulary with the probability of occurrence of each category for categorical features
        for a given class label.
        
        Inputs:
            self
            data: DataFrame
                training Dataset
            column: str
                the categorical column for which vocab is to be generated
        
        Returns:
            Dictionary of probability of occurrence of each category in a particular class label.
        '''
        
        dict_occurrences = {1: {}, 0: {}}
        for label in [0,1]:
            dict_occurrences[label] = dict((data[column][data.TARGET == label].value_counts() / data[column].value_counts()).fillna(0))

        return dict_occurrences

    def response_transform(self, data, column, dict_mapping):
        '''
        Response Encoding Transform Function
        Function to transform the categorical feature into two features, which contain the probability
        of occurrence of that category for each class label.
        
        Inputs:
            self
            data: DataFrame
                DataFrame whose categorical features are to be encoded
            column: str
                categorical column whose encoding is to be done
            dict_mapping: dict
                Dictionary obtained from Response Fit function for that particular column
        
        Returns:
            None
        '''
        
        data[column + '_0'] = data[column].map(dict_mapping[0])
        data[column + '_1'] = data[column].map(dict_mapping[1])
    
    def cnt_payment_prediction(self, data_to_predict):
        '''
        Function to predict the Count_payments on Current Loans using data from previous loans.
        
        Inputs:
            self
            data_to_predict: DataFrame
                the values using which the model would predict the Count_payments on current applications
        
        Returns:
            Predicted Count_payments of the current applications.
        '''
        
        # Refernce: https://www.kaggle.com/c/home-credit-default-risk/discussion/64598
        if self.verbose:
            print("\nLoading previous_application.csv for Count Payment Prediction...")
        
        # Load the previous_application dataset
        csv_path = self.file_directory + 'previous_application.csv'
        try:
            self.previous_application = pd.read_csv(csv_path)
            self.initial_shape = self.previous_application.shape
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at path: {csv_path}")
        train_data = self.previous_application[['AMT_CREDIT', 'AMT_ANNUITY', 'CNT_PAYMENT']].dropna()
        train_data['CREDIT_ANNUITY_RATIO'] = train_data['AMT_CREDIT'] / (train_data['AMT_ANNUITY'] + 1)
        # Value to predict is our CNT_PAYMENT
        train_value = train_data.pop('CNT_PAYMENT')
        
        # Test data would be our application_train data
        test_data = data_to_predict[['AMT_CREDIT','AMT_ANNUITY']].fillna(0)
        test_data['CREDIT_ANNUITY_RATIO'] = test_data['AMT_CREDIT'] / (test_data['AMT_ANNUITY'] + 1)
        
        lgbmr = LGBMRegressor(max_depth = 9, n_estimators = 5000, n_jobs = -1, learning_rate = 0.3, 
                              random_state = 125)
        lgbmr.fit(train_data, train_value)
        # Dumping the model to pickle file
        with open('cnt_payment_predictor_lgbmr.pkl', 'wb') as f:
            pickle.dump(lgbmr, f)
        # Predicting the CNT_PAYMENT for test_data
        cnt_payment = lgbmr.predict(test_data)
        
        return cnt_payment

    def main(self):
        '''
        Function to be called for complete preprocessing of application_train and application_test tables.
        
        Inputs:
            self
            
        Returns:
            Final pre=processed application_train and application_test tables.
        '''

        # Loading the DataFrames first
        self.load_dataframes()
        # Data Cleaning
        self.data_cleaning()
        # Predicting the missing values of EXT_SOURCE columns
        self.ext_source_values_predictor()
        
        # Feature engineering
        if self.verbose:
            start = datetime.now()
            print("Starting Feature Engineering...")
            print("Creating Domain Based Features on Numeric Data")
        # Creating Numeric features based on domain knowledge
        self.application_train = self.numeric_feature_engineering(self.application_train)
        self.application_test = self.numeric_feature_engineering(self.application_test)
        # 500 Neighbors Target mean
        self.neighbors_EXT_SOURCE_feature()
        if self.verbose:
            print("Finished.")
            print(f"Time Taken = {datetime.now() - start}")

        if self.verbose:
            start = datetime.now()
            print("Creating features based on Categorical Interactions on some Numeric Features")
        # Creating features based on categorical interactions
        self.application_train, self.application_test = self.categorical_interaction_features(self.application_train, self.application_test)   
        if self.verbose:
            print("Finished.")
            print(f"Time taken = {datetime.now() - start}")
        
        # Using response coding on categorical features, to keep the dimensionality in check
        # Categorical columns to perform response coding on
        categorical_columns_application = self.application_train.dtypes[self.application_train.dtypes == 'object'].index.tolist()
        for col in categorical_columns_application:
            # Extracting the dictionary with values corresponding to TARGET variable 0 and 1 for each of the categories
            mapping_dictionary = self.response_fit(self.application_train, col)
            # Saving the mapping dictionary to pickle file
            with open(f'Response_coding_dict_{col}.pkl', 'wb') as f:
                pickle.dump(mapping_dictionary, f)
            # Mapping this dictionary with our DataFrame
            self.response_transform(self.application_train, col, mapping_dictionary)
            self.response_transform(self.application_test, col, mapping_dictionary)
            # Removing the original categorical columns
            _ = self.application_train.pop(col)
            _ = self.application_test.pop(col)
        
        if self.verbose:
            print('Finished preprocessing appplication_train and application_test.')
            print(f'Size of application_train after Pre-Processing and Feature Engineering: {self.application_train.shape}')
            print(f'Total Time Taken = {datetime.now() - self.start}')

        if self.dump_to_pickle:
            if self.verbose:
                print('Pickling pre-processed application_train and application_test to application_train_preprocessed.pkl and application_test_preprocessed, respectively.')
            with open(self.file_directory + 'application_train_preprocessed.pkl', 'wb') as f:
                pickle.dump(self.application_train, f)
            with open(self.file_directory + 'application_test_preprocessed.pkl', 'wb') as f:
                pickle.dump(self.application_test, f)
            if self.verbose:
                print('Finished.')  
        if self.verbose:
            print('-'*100)
                        
        return self.application_train, self.application_test

# Instantiate the preprocessor
preprocessor = preprocess_application_train_test(
    file_directory='../analytical/assets/data/',
    verbose=True,
    dump_to_pickle=True
)

# Run the preprocessing pipeline
application_train, application_test = preprocessor.main()

# Display a preview of the processed training data
print("Processed application_train DataFrame:")
print(application_train.head())

# Display a preview of the processed test data
print("\nProcessed application_test DataFrame:")
print(application_test.head())

#######################################################
#        Pre-processing application_train.csv         #
#        Pre-processing application_test.csv          #
#######################################################

Loading the DataFrames into memory...
Loading application_train from: ../analytical/assets/data/application_train.csv
Loading application_test from: ../analytical/assets/data/application_test.csv
application_train shape: (307511, 122)
application_test shape: (48744, 121)
Time Taken to load = 0:00:03.891461

Performing Data Cleaning...
Initial shape of application_train: (307511, 122)
Initial shape of application_test: (48744, 121)
Shape after dropping FLAG_DOCUMENT columns: (307511, 117)
Shape after handling OBS_SOCIAL_CIRCLE: (307511, 117)
Shape after removing 'XNA' in CODE_GENDER: (307507, 117)
Shape after converting REGION_RATING_CLIENT: (307507, 117)
Done with Data Cleaning.
Predicting the missing values of EXT_SOURCE columns...
Finished.
Time elapsed = 0:00:37.214085
Starting Feature Engineering...
Creating Domain Based Features on Numeric Data

Loading previous_application.csv for Count Payment Prediction...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1297978, number of used features: 3
[LightGBM] [Info] Start training from score 16.054119

Loading previous_application.csv for Count Payment Prediction...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1297978, number of used features: 3
[LightGBM] [Info] Start training from score 16.054119
Finished.
Time Taken = 0:03:57.533419
Creating features based on Categorical Interactions on some Numeric Features
Finished.
Time taken = 0:00:04.050246
Finished preprocessing appplication_train and application_test.
Size of application_train after Pre-Processing and Feature Engineering: (307507, 369)
Total Time Taken = 0:04:47.366477
Pickling pre-processed application_train and application_test to application_train_preprocessed.pkl and application_test_preprocessed, respectively.
Finished.
----------------------------------------------------------------------------------------------------
Processed application_train DataFrame:
   SK_ID_CURR  TARGET  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  \
0      100002       1             0          202500.0    406597.5   
1      100003       0             0          270000.0   1293502.5   
2      100004       0             0           67500.0    135000.0   
3      100006       0             0          135000.0    312682.5   
4      100007       0             0          121500.0    513000.0   

   AMT_ANNUITY  AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
0      24700.5         351000.0                    0.018801   25.920548   
1      35698.5        1129500.0                    0.003541   45.931507   
2       6750.0         135000.0                    0.010032   52.180822   
3      29686.5         297000.0                    0.008019   52.068493   
4      21865.5         513000.0                    0.028663   54.608219   

   DAYS_EMPLOYED  ...  WALLSMATERIAL_MODE_0  WALLSMATERIAL_MODE_1  \
0         -637.0  ...              0.925941              0.074059   
1        -1188.0  ...              0.929753              0.070247   
2         -225.0  ...              0.908718              0.091282   
3        -3039.0  ...              0.908718              0.091282   
4        -3038.0  ...              0.908718              0.091282   

   EMERGENCYSTATE_MODE_0  EMERGENCYSTATE_MODE_1  REGIONS_RATING_INCOME_MUL_0  \
0               0.930350               0.069650                     0.909563   
1               0.930350               0.069650                     0.919943   
2               0.907392               0.092608                     0.928388   
3               0.907392               0.092608                     0.919943   
4               0.907392               0.092608                     0.903079   

   REGIONS_RATING_INCOME_MUL_1  REGION_RATING_MEAN_0  REGION_RATING_MEAN_1  \
0                     0.090437              0.920995              0.079005   
1                     0.080057              0.951797              0.048203   
2                     0.071612              0.920995              0.079005   
3                     0.080057              0.920995              0.079005   
4                     0.096921              0.920995              0.079005   

   REGION_RATING_MUL_0  REGION_RATING_MUL_1  
0             0.920933             0.079067  
1             0.951797             0.048203  
2             0.920933             0.079067  
3             0.920933             0.079067  
4             0.920933             0.079067  

[5 rows x 369 columns]

Processed application_test DataFrame:
   SK_ID_CURR  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0      100001             0          135000.0    568800.0      20560.5   
1      100005             0           99000.0    222768.0      17370.0   
2      100013             0          202500.0    663264.0      69777.0   
3      100028             2          315000.0   1575000.0      49018.5   
4      100038             1          180000.0    625500.0      32067.0   

   AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  \
0         450000.0                    0.018850   52.715068        -2329.0   
1         180000.0                    0.035792   49.490411        -4469.0   
2         630000.0                    0.019101   54.898630        -4458.0   
3        1575000.0                    0.026392   38.290411        -1866.0   
4         625500.0                    0.010032   35.726027        -2191.0   

   DAYS_REGISTRATION  ...  WALLSMATERIAL_MODE_0  WALLSMATERIAL_MODE_1  \
0            -5170.0  ...              0.925941              0.074059   
1            -9118.0  ...              0.908718              0.091282   
2            -2175.0  ...              0.908718              0.091282   
3            -2000.0  ...              0.936522              0.063478   
4            -4000.0  ...              0.908718              0.091282   

   EMERGENCYSTATE_MODE_0  EMERGENCYSTATE_MODE_1  REGIONS_RATING_INCOME_MUL_0  \
0               0.930350               0.069650                     0.919943   
1               0.907392               0.092608                     0.914215   
2               0.907392               0.092608                     0.909563   
3               0.930350               0.069650                     0.933973   
4               0.907392               0.092608                     0.918350   

   REGIONS_RATING_INCOME_MUL_1  REGION_RATING_MEAN_0  REGION_RATING_MEAN_1  \
0                     0.080057              0.920995              0.079005   
1                     0.085785              0.920995              0.079005   
2                     0.090437              0.920995              0.079005   
3                     0.066027              0.920995              0.079005   
4                     0.081650              0.920995              0.079005   

   REGION_RATING_MUL_0  REGION_RATING_MUL_1  
0             0.920933             0.079067  
1             0.920933             0.079067  
2             0.920933             0.079067  
3             0.920933             0.079067  
4             0.920933             0.079067  

[5 rows x 368 columns]

def merge_all_tables(application_train, application_test, bureau_aggregated, previous_aggregated, 
                    installments_aggregated, pos_aggregated, cc_aggregated):
    '''
    Function to merge all the tables together with the application_train and application_test tables
    on SK_ID_CURR.
    
    Inputs:
        All the previously pre-processed Tables.
        
    Returns:
        Single merged tables, one for training data and one for test data
    '''

    # Merging application_train and application_test with Aggregated bureau table
    app_train_merged = application_train.merge(bureau_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = application_test.merge(bureau_aggregated, on = 'SK_ID_CURR', how = 'left')
    # Merging with aggregated previous_applications
    app_train_merged = app_train_merged.merge(previous_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(previous_aggregated, on = 'SK_ID_CURR', how = 'left')
    # Merging with aggregated installments tables
    app_train_merged = app_train_merged.merge(installments_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(installments_aggregated, on = 'SK_ID_CURR', how = 'left')
    # Merging with aggregated POS_Cash balance table
    app_train_merged = app_train_merged.merge(pos_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(pos_aggregated, on = 'SK_ID_CURR', how = 'left')
    # Merging with aggregated credit card table
    app_train_merged = app_train_merged.merge(cc_aggregated, on = 'SK_ID_CURR', how = 'left')
    app_test_merged = app_test_merged.merge(cc_aggregated, on = 'SK_ID_CURR', how = 'left')

    return reduce_mem_usage(app_train_merged), reduce_mem_usage(app_test_merged)

train_data, test_data = merge_all_tables(application_train, application_test, 
                                         bureau_aggregated, previous_aggregated, 
                                         installments_aggregated, pos_aggregated, 
                                         cc_aggregated)

----------------------------------------------------------------------------------------------------
Memory usage of dataframe: 3629.40 MB
Memory usage after optimization: 1274.81 MB
Decreased by 64.9%
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Memory usage of dataframe: 574.94 MB
Memory usage after optimization: 207.70 MB
Decreased by 63.9%
----------------------------------------------------------------------------------------------------

def create_new_features(data):
    '''
    Function to create few more features after the merging of features, by using the
    interactions between various tables.
    
    Inputs:
        data: DataFrame
    
    Returns:
        None
    '''
    
    # Previous applications columns
    prev_annuity_columns = [ele for ele in previous_aggregated.columns if 'AMT_ANNUITY' in ele]
    for col in prev_annuity_columns:
        data['PREV_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    prev_goods_columns = [ele for ele in previous_aggregated.columns if 'AMT_GOODS' in ele]
    for col in prev_goods_columns:
        data['PREV_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
  
    # Credit_card_balance columns
    cc_amt_principal_cols = [ele for ele in cc_aggregated.columns if 'AMT_RECEIVABLE_PRINCIPAL' in ele]
    for col in cc_amt_principal_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    cc_amt_recivable_cols = [ele for ele in cc_aggregated.columns if 'AMT_RECIVABLE' in ele]
    for col in cc_amt_recivable_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    cc_amt_total_receivable_cols = [ele for ele in cc_aggregated.columns if 'TOTAL_RECEIVABLE' in ele]
    for col in cc_amt_total_receivable_cols:
        data['CC_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    
    # Installments_payments columns
    installments_payment_cols = [ele for ele in installments_aggregated.columns if 'AMT_PAYMENT' in ele and 'RATIO' not in ele and 'DIFF' not in ele]
    for col in installments_payment_cols:
        data['INSTALLMENTS_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    
    # Reference: https://www.kaggle.com/c/home-credit-default-risk/discussion/64821
    installments_max_installment = ['AMT_INSTALMENT_MEAN_MAX', 'AMT_INSTALMENT_SUM_MAX']
    for col in installments_max_installment:
        data['INSTALLMENTS_ANNUITY_' + col + '_RATIO'] = data['AMT_ANNUITY'] / (data[col] + 0.00001)
    
    # POS_CASH_balance features have been created in its own dataframe itself

    # Bureau and bureau_balance columns
    bureau_days_credit_cols = [ele for ele in bureau_aggregated.columns if 'DAYS_CREDIT' in ele and 'ENDDATE' not in ele and 'UPDATE' not in ele]
    for col in bureau_days_credit_cols:
        data['BUREAU_' + col + '_EMPLOYED_DIFF'] = data[col] - data['DAYS_EMPLOYED']
        data['BUREAU_' + col + '_REGISTRATION_DIFF'] = data[col] - data['DAYS_REGISTRATION']  
    bureau_overdue_cols = [ele for ele in bureau_aggregated.columns if 'AMT_CREDIT' in ele and 'OVERDUE' in ele]
    for col in bureau_overdue_cols:
        data['BUREAU_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)
    bureau_amt_annuity_cols = [ele for ele in bureau_aggregated.columns if 'AMT_ANNUITY' in ele and 'CREDIT'  not in ele]
    for col in bureau_amt_annuity_cols:
        data['BUREAU_' + col + '_INCOME_RATIO'] = data[col] / (data['AMT_INCOME_TOTAL'] + 0.00001)

create_new_features(train_data)
create_new_features(test_data)

print("After Pre-processing, aggregation, merging and Feature Engineering,")
print(f"Final Shape of Training Data = {train_data.shape}")
print(f"Final Shape of Test Data = {test_data.shape}")

# Freeing up the memory
del application_train, application_test, bureau_aggregated, previous_aggregated, installments_aggregated, pos_aggregated, cc_aggregated

After Pre-processing, aggregation, merging and Feature Engineering,
Final Shape of Training Data = (307507, 1633)
Final Shape of Test Data = (48744, 1632)

def final_pickle_dump(train_data, test_data, train_file_name, test_file_name, file_directory = '', verbose = True):
    '''
    Function to dump the preprocessed files to pickle.
    
    Inputs:
        train_data: DataFrame
            Training Data
        test_data: DataFrame
            Test Data
        train_file_name: str
            Name of pickle file for training data
        test_file_name: str
            Name of pickle file for test data
        file_directory: str, default = ''
            Path of directory to save pickle file into
        verbose: bool, default = True
            Whether to keep verbosity or not
    
    Returns:
        None
    '''
    if verbose:
        print("Dumping the final preprocessed data to pickle files.")
        start = datetime.now()
    with open(file_directory + train_file_name + '.pkl','wb') as f:
        pickle.dump(train_data, f)
    with open(file_directory + test_file_name + '.pkl','wb') as f:
        pickle.dump(test_data,f)
    
    if verbose:
        print("Finished.")
        print(f"Time elapsed = {datetime.now() - start}")

final_pickle_dump(train_data, test_data, 'train_data_final', 'test_data_final')

Dumping the final preprocessed data to pickle files.
Finished.
Time elapsed = 0:00:00.768607

# Removing the SK_ID_CURR from training and test data
train_data = train_data.drop(['SK_ID_CURR'], axis = 1)
skid_test = test_data.pop('SK_ID_CURR')
# Extracting the class labels for training data
target_train = train_data.pop('TARGET')

empty_columns = []
for col in train_data.columns:
    if len(train_data[col].unique()) <=1:
        empty_columns.append(col)
    
print(f"There are {len(empty_columns)} columns with just 1 unique value")
print("Removing these from dataset")
train_data = train_data.drop(empty_columns, axis = 1)
test_data = test_data.drop(empty_columns, axis = 1)

There are 23 columns with just 1 unique value
Removing these from dataset

class recursive_feature_selector:
    '''
    Class to recursively select top features.
    Contains 2 methods:
        1. init method
        2. main method
    '''
    
    def __init__(self, train_data, test_data, target_train, num_folds = 3, verbose = True, random_state = 5358):
        '''
        Function to initialize the class variables.
        
        Inputs:
            self
            train_data: DataFrame
                Training Data
            test_data: DataFrame
                Test Data
            target_train: Series
                Class Labels for training Data
            num_folds: int, default = 3
                Number of folds for K-Fold CV
            verbose: bool, default = True
                Whether to keep verbosity or not
            random_state: int, default = 5358
                The random state for the classifier for recursive feature selection
                
        Returns: 
            None
        '''
        
        self.train_data = train_data
        self.test_data = test_data
        self.target_train = target_train
        self.num_folds = num_folds
        self.verbose = verbose
        self.random_state = random_state
        
    def main(self):
        '''
        Function to select features recursively
        
        Inputs:
            self
        
        Returns:
            Training and testing data with reduced number of features
        '''
        
        if self.verbose:
            print("Starting Feature Selection...")
            start = datetime.now()
            
        # Set of important features
        self.important_columns = set()
        score = 1
        i = 1
        
        while score > 0.72:
            if self.verbose:
                print(f"Iteration {i}:")
                
            # Removing the features which have been selected from the modelling data
            selection_data = self.train_data.drop(list(self.important_columns), axis = 1)
            #defining the CV strategry
            fold = StratifiedKFold(n_splits = self.num_folds, shuffle = True, random_state = 33)
            # Reinitializing the score
            score = 0
            model_feature_importance = np.zeros_like(selection_data.columns)
            
            # Doing K-Fold Cross validation
            for fold_num, (train_indices, val_indices) in enumerate(fold.split(selection_data, self.target_train),1):
                if self.verbose:
                    print(f"\t\tFitting fold {fold_num}")
                
                # Defining the train and validation data
                x_train = selection_data.iloc[train_indices]
                x_val = selection_data.iloc[val_indices]
                y_train = self.target_train.iloc[train_indices]
                y_val = self.target_train.iloc[val_indices]
                
                # Instantiating the LightGBM Classifier
                lg = LGBMClassifier(n_jobs = -1, random_state = self.random_state)
                lg.fit(x_train, y_train)

                # Appending the feature importance of each feature averaged over different folds
                model_feature_importance += lg.feature_importances_ / self.num_folds
                # Average k-fold ROC-AUC Score
                score += roc_auc_score(y_val, lg.predict_proba(x_val)[:,1]) / self.num_folds

            # Getting the non-zero feature importance columns
            imp_cols_indices = np.where(np.abs(model_feature_importance) > 0)
            # Names of non-zero feature importance columns
            cols_imp = self.train_data.columns[imp_cols_indices]
            
            if score > 0.7:
                self.important_columns.update(cols_imp)
                if self.verbose:
                    print(f"\tNo. of important columns kept = {len(self.important_columns)}")            
            if self.verbose:
                print(f"\tCross Validation score = {score}")
            i += 1
            
        self.important_columns = list(self.important_columns)
        
        if self.verbose:
            print("\nDone Selecting Features.")
            print(f"Total columns removed = {self.train_data.shape[1] - len(self.important_columns)}")
            print(f"\nInitial Shape of train_data = {self.train_data.shape}")
        self.train_data = self.train_data[self.important_columns]
        self.test_data = self.test_data[self.important_columns]
        if self.verbose:
            print(f"Final Shape of train_data = {self.train_data.shape}")
            print(f"\nTotal Time Taken = {datetime.now() - start}")
            
        # Saving the final columns into a pickle file
        with open('final_cols.pkl', 'wb') as f:
            pickle.dump(train_data.columns.tolist(), f)
        
        gc.collect()

        return self.train_data, self.test_data

# Instantiating the class recursive_feature_selector
feature_selector =  recursive_feature_selector(train_data, test_data, target_train)
train_data, test_data = feature_selector.main()
important_columns = feature_selector.important_columns

Starting Feature Selection...
Iteration 1:
		Fitting fold 1
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.975669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 277576
[LightGBM] [Info] Number of data points in the train set: 205004, number of used features: 1602
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432468
[LightGBM] [Info] Start training from score -2.432468
		Fitting fold 2
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188455
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.038324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 277870
[LightGBM] [Info] Number of data points in the train set: 205005, number of used features: 1602
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432473
[LightGBM] [Info] Start training from score -2.432473
		Fitting fold 3
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188455
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.810662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 277809
[LightGBM] [Info] Number of data points in the train set: 205005, number of used features: 1602
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432473
[LightGBM] [Info] Start training from score -2.432473
	No. of important columns kept = 1124
	Cross Validation score = 0.7973582682015632
Iteration 2:
		Fitting fold 1
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47140
[LightGBM] [Info] Number of data points in the train set: 205004, number of used features: 478
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432468
[LightGBM] [Info] Start training from score -2.432468
		Fitting fold 2
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188455
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47329
[LightGBM] [Info] Number of data points in the train set: 205005, number of used features: 478
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432473
[LightGBM] [Info] Start training from score -2.432473
		Fitting fold 3
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 16550, number of negative: 188455
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47347
[LightGBM] [Info] Number of data points in the train set: 205005, number of used features: 478
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432473
[LightGBM] [Info] Start training from score -2.432473
	No. of important columns kept = 1221
	Cross Validation score = 0.7165016706644562

Done Selecting Features.
Total columns removed = 387

Initial Shape of train_data = (307507, 1608)
Final Shape of train_data = (307507, 1221)

Total Time Taken = 0:01:49.506704

# Standardizing the data
scaler = StandardScaler()
x_train_std = scaler.fit_transform(train_data)
x_test_std = scaler.fit_transform(test_data)
y_train = target_train

# Replacing nan values with 0
x_train_std[np.isnan(x_train_std)] = 0
x_test_std[np.isnan(x_test_std)] = 0

# Generate random probabilities
predicted_proba_train = np.random.random(len(target_train))
predicted_proba_test = np.random.random(len(test_data))

# Metrics Calculation
roc_auc = roc_auc_score(target_train, predicted_proba_train)
precision = precision_score(target_train, np.round(predicted_proba_train))
recall = recall_score(target_train, np.round(predicted_proba_train))

# Print Results
print("=" * 100)
print("Training Dataset Results:")
print(f"\tROC-AUC Score = {roc_auc}")
print(f"\tPrecision Score = {precision}")
print(f"\tRecall Score = {recall}")
print("=" * 100)

# Confusion Matrix with Heatmap
conf_mat = confusion_matrix(target_train, np.round(predicted_proba_train))
conf_mat_df = pd.DataFrame(conf_mat, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])

plt.figure(figsize=(8, 6))
plt.title('Confusion Matrix Heatmap')
sns.heatmap(conf_mat_df, annot=True, fmt='d', cmap='coolwarm', linewidths=0.5, annot_kws={"size": 15})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

# Distribution of Class Labels
print("Distribution of Original and Predicted Labels")
plt.figure(figsize=(15, 5))

# Define colors for Class 0 and Class 1
colors = ['steelblue', 'darkorange']

# Original Class Labels
plt.subplot(1, 3, 1)
plt.title('Original Labels')
plt.bar(['Class 0', 'Class 1'], np.bincount(target_train), color=colors, alpha=0.8)
plt.xlabel('Classes')
plt.ylabel('Count')

# Predicted Train Labels
plt.subplot(1, 3, 2)
plt.title('Predicted Train Labels')
plt.bar(['Class 0', 'Class 1'], np.bincount(np.round(predicted_proba_train).astype(int)), color=colors, alpha=0.8)
plt.xlabel('Classes')
plt.ylabel('Count')

# Predicted Test Labels
plt.subplot(1, 3, 3)
plt.title('Predicted Test Labels')
plt.bar(['Class 0', 'Class 1'], np.bincount(np.round(predicted_proba_test).astype(int)), color=colors, alpha=0.8)
plt.xlabel('Classes')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

====================================================================================================
Training Dataset Results:
	ROC-AUC Score = 0.5007396118774923
	Precision Score = 0.08083299082066037
	Recall Score = 0.4990936555891239
====================================================================================================

Distribution of Original and Predicted Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'Random_Model_Submission.csv'

# Create the directory if it doesn't exist
os.makedirs(submission_dir, exist_ok=True)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the submission file
submission_data = pd.DataFrame({'SK_ID_CURR': skid_test, 'TARGET': predicted_proba_test})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print("You can manually upload it to the competition page.")

You can manually upload it to the competition page.

class modelling:
    '''
    Class for Doing Hyperparameter tuning to find best set of hyperparameters, building models on best hyperparams and
    displaying results on best hyperparameters.
    
    It has 4 methods:
        1. init method
        2. random_search_cv method
        3. train_on_best_params method
        4. proba_to_class method
        5. tune_threshold method
        6. results_on_best_params method
        7. feat_importances_show method
    '''
    def __init__(self, base_model, x_train, y_train, x_test, calibration = False, calibration_method = 'isotonic',
                 calibration_cv = 4, k_folds = 4, random_state = 982):
        '''
        Function to initialize the class members.
        
        Inputs: 
            self
            base_model: estimator/classifier
                The base model to be used for the modelling purpose
            x_train: numpy array
                Training standardized data
            y_train: numpy array
                Training class labels
            x_test: numpy array
                Test standardized data
            calibration: bool, default = False
                Whether to calibrate the model for generating class probabilities
            calibration_method: str, default = 'isotonic'
                The type of calibration to use, i.e. sigmoid or isotonic
            calibration_cv: int, default = 4
                Number of cross-validation folds for calibrating the probabilities
            k_folds: int, default = 4
                Number of cross-validation folds for training and tuning the model
            random_state: int, default = 982
                Random state for StratifiedKFold for reproducibility
                
        Returns: 
            None            
        '''
        self.base_model = base_model
        self.num_folds = k_folds
        self.kfolds = StratifiedKFold(n_splits = k_folds, shuffle = True, random_state = random_state)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.calibration = calibration
        if self.calibration:
            self.calibration_method = calibration_method
            self.calibration_cv = calibration_cv

    def random_search_cv(self, hyperparams_dict, n_iter=30, verbose=True, n_jobs=1, random_state=843):
        '''
        Function to do RandomizedSearchCV on training data.
        
        Inputs:
            self
            hyperparams_dict: dict
                Dictionary of hyperparameters to tune
            n_iter: int, default = 30
                Number of iterations to perform for random search
            verbose: bool, default = True
                Whether to keep verbosity or not
            n_jobs: int, default = 1
                Number of cores to use for Random Search
            random_state: int, default = 843
                Random state for reproducibility of RandomizedSearchCV
                
        Returns:
            None
        '''

        if verbose:
            start = datetime.now()
            print(f"Doing Randomized Search CV on Classifier with {n_iter} random initializations...")

        # Suppressing stdout and stderr outputs (including `[CV]` logs)
        with open(os.devnull, 'w') as devnull, redirect_stdout(devnull), redirect_stderr(devnull):
            rscv = RandomizedSearchCV(
                self.base_model, 
                hyperparams_dict, 
                n_iter=n_iter, 
                scoring='roc_auc', 
                cv=self.kfolds, 
                return_train_score=True, 
                verbose=0,
                n_jobs=n_jobs, 
                random_state=random_state
            )
            rscv.fit(self.x_train, self.y_train)

        if verbose:
            print("Done.")
            print(f"Time elapsed = {datetime.now() - start}")
        
        # Getting the Search Results
        self.tuning_results = pd.DataFrame(rscv.cv_results_)
        # Best model
        self.best_model = rscv.best_estimator_
        
        gc.collect()
    
        
    def train_on_best_params(self, verbose = True):
        '''
        Function to train the model on best hyperparameters obtained from previous method.
        Generates Cross-Validation predictions as Out-of-fold predictions
        
        Inputs:
            self
            verbose: bool, default = True
                Whether to keep verbosity or not
        
        Returns:
            None
        '''
        
        if verbose:
            print("Fitting Classifier on best parameters\n")
            print(f"{self.num_folds}-Fold Cross Validation")
            start = datetime.now()
            
        self.cv_preds_probas = np.zeros(self.x_train.shape[0])
        # We will select a best threshold for each fold of cross-validation and average over the
        # folds to find the optimal threshold
        self.best_threshold_train = 0
        for fold_number, (train_indices, val_indices) in enumerate(self.kfolds.split(self.x_train, self.y_train), 1):
            if verbose:
                print(f"\tFitting Fold {fold_number}")
                
            self.best_model.fit(self.x_train[train_indices], self.y_train[train_indices])
            if not self.calibration:
                self.train_preds_probas = self.best_model.predict_proba(self.x_train[train_indices])[:,1]
                self.cv_preds_probas[val_indices] = self.best_model.predict_proba(self.x_train[val_indices])[:,1]
            else:
                # Fitting the calibration Classifier over the base model for calibrated probabilities
                self.calibrated_classifier = CalibratedClassifierCV(self.best_model, method = self.calibration_method, cv = self.calibration_cv)
                self.calibrated_classifier.fit(self.x_train[train_indices], self.y_train[train_indices])
                
                self.train_preds_probas = self.calibrated_classifier.predict_proba(self.x_train[train_indices])[:,1]
                self.cv_preds_probas[val_indices] = self.calibrated_classifier.predict_proba(self.x_train[val_indices])[:,1]
            
            # Tuning the threshold for optimal TPR and FPR from ROC Curve
            self.best_threshold_train += self.tune_threshold(self.y_train[train_indices], self.train_preds_probas) / self.num_folds
        
        # Converting the class probabilities to class labels
        self.cv_preds_class = self.proba_to_class(self.cv_preds_probas, self.best_threshold_train)
        
        if verbose:
            print("Finised.")
            print(f"Time elapsed = {datetime.now() - start}")
            
        gc.collect()
    
    def proba_to_class(self, proba, threshold):
        '''
        Function to convert a given probability to class label based on a threshold value.
        
        Inputs:
            self
            proba: numpy array
                Probabilities of class label = 1
            threshold: int
                Threshold probability to be considered as Positive or Negative Class Label
            
        Returns:
            Converted Class Label
        '''
        return np.where(proba >= threshold, 1, 0)
        
    def tune_threshold(self, true_labels, predicted_probas):
        '''
        Function to find the optimal threshold for maximizing the TPR and minimizing the FPR from ROC-AUC Curve.
        This is found out by using the J Statistic, which is J = TPR - FPR.
        Reference: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
        
        Inputs:
            self
            true_labels: numpy array or pandas series
                True Class Labels
            predicted_probas: numpy array
                Predicted Probability of Positive Class label
            
        Returns:
            Threshold probability.
        '''
        fpr, tpr, threshold = roc_curve(true_labels, predicted_probas)
        j_stat = tpr - fpr
        index_for_best_threshold = np.argmax(j_stat)
        
        return threshold[index_for_best_threshold]

    def results_on_best_params(self, model_name):
        '''
        Function to train the whole data on best parameters and display the results.

        Inputs:
            self
            model_name: str
                model name to get feature importances.

        Returns:
            None
        '''

        # Fit the model only if not already trained
        if not hasattr(self, 'best_model_trained') or not self.best_model_trained:
            self.best_model.fit(self.x_train, self.y_train)
            self.best_model_trained = True

        # Optimize prediction flow
        predict_proba_fn = self.best_model.predict_proba if not self.calibration else self.calibrated_classifier.predict_proba

        self.train_preds_probas = predict_proba_fn(self.x_train)[:, 1]
        self.test_preds_probas = predict_proba_fn(self.x_test)[:, 1]

        # Convert probabilities to class labels
        self.train_preds_class = self.proba_to_class(self.train_preds_probas, self.best_threshold_train)
        self.test_preds_class = self.proba_to_class(self.test_preds_probas, self.best_threshold_train)
        
        # Feature importances
        if hasattr(self.best_model, 'coef_'):  # For linear models
            self.feat_imp = self.best_model.coef_[0]
        elif hasattr(self.best_model, 'feature_importances_'):  # For tree-based models
            self.feat_imp = self.best_model.feature_importances_
        else:
            self.feat_imp = None  # Handle cases where model doesn't support importance
            print("Warning: Feature importances are not available for this model.")

        # Display results efficiently
        print("=" * 80)
        print(f"The best selected Threshold as per the J-Statistic is = {self.best_threshold_train}\n")
        print("Train Results:")
        print(f"\tROC-AUC Score = {roc_auc_score(self.y_train, self.train_preds_probas):.12f}")
        print(f"\tPrecision Score = {precision_score(self.y_train, self.train_preds_class):.12f}")
        print(f"\tRecall Score = {recall_score(self.y_train, self.train_preds_class):.12f}")

        print("CV Results:")
        print(f"\tROC-AUC Score = {roc_auc_score(self.y_train, self.cv_preds_probas):.12f}")
        print(f"\tPrecision Score = {precision_score(self.y_train, self.cv_preds_class):.12f}")
        print(f"\tRecall Score = {recall_score(self.y_train, self.cv_preds_class):.12f}")
        print('=' * 80)

        # Confusion Matrix - Display once
        print("Confusion Matrix of CV data:")
        conf_mat = confusion_matrix(self.y_train, self.cv_preds_class)
        plt.figure(figsize=(7, 6))
        sns.heatmap(conf_mat, annot=True, fmt='g', cmap='Blues', linewidths=0.5,
                    annot_kws={"size": 15})
        plt.title('Confusion Matrix Heatmap')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        # Optimize Plotting - Use precomputed counts and ensure efficiency
        print("Distribution of Original Class Labels and Predicted CV and Test Class Labels")

        # Explicitly convert inputs to pd.Series
        y_train_flat = pd.Series(self.y_train).astype(int)
        cv_preds_class_flat = pd.Series(self.cv_preds_class).astype(int)
        test_preds_class_flat = pd.Series(self.test_preds_class).astype(int)

        # Precompute counts for faster plotting
        y_train_counts = y_train_flat.value_counts().sort_index()
        cv_preds_counts = cv_preds_class_flat.value_counts().sort_index()
        test_preds_counts = test_preds_class_flat.value_counts().sort_index()

        # Plotting
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        # Define a custom palette for class colors
        custom_palette = {0: 'steelblue', 1: 'darkorange'}

        # Original Class Labels
        sns.barplot(x=y_train_counts.index, y=y_train_counts.values, ax=axes[0], 
                    palette=[custom_palette[x] for x in y_train_counts.index])
        axes[0].set_title('Class Distribution of Original Dataset')
        axes[0].set_xlabel('Classes')
        axes[0].set_ylabel('Count')

        # Predicted CV Class Labels
        sns.barplot(x=cv_preds_counts.index, y=cv_preds_counts.values, ax=axes[1], 
                    palette=[custom_palette[x] for x in cv_preds_counts.index])
        axes[1].set_title('Class Distribution of Predicted Class Labels on CV')
        axes[1].set_xlabel('Classes')
        axes[1].set_ylabel('Count')

        # Predicted Test Class Labels
        sns.barplot(x=test_preds_counts.index, y=test_preds_counts.values, ax=axes[2], 
                    palette=[custom_palette[x] for x in test_preds_counts.index])
        axes[2].set_title('Class Distribution of Predicted Test Dataset')
        axes[2].set_xlabel('Classes')
        axes[2].set_ylabel('Count')

        plt.tight_layout()
        plt.show()


    def feat_importances_show(self, feature_names, num_features, figsize=(10, 15)):
        '''
        Function to display the top most important features.

        Inputs:
            self
            feature_names: numpy array
                Names of features of training set
            num_features: int
                Number of top features importances to display
            figsize: tuple, default = (10,15)
                Size of figure to be displayed

        Returns:
            None
        '''

        # Getting the top features indices and their names
        top_feats_indices = np.argsort(self.feat_imp)[::-1][:num_features]
        feat_importance_top = np.nan_to_num(self.feat_imp[top_feats_indices])
        column_names = feature_names[top_feats_indices]

        # Generate multicolored bars
        colors = sns.color_palette("viridis", len(feat_importance_top))

        # Plotting a horizontal bar plot of feature importances
        plt.figure(figsize=figsize)
        sns.barplot(x=feat_importance_top, y=column_names, palette=colors, orient='h')
        plt.title(f'Top {num_features} Features as per Classifier')
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature Names')
        plt.grid()
        plt.show()

        gc.collect


    def random_search_cv_rf(self, n_iter=10, verbose=True, n_jobs=4, random_state=42):
        '''
        Optimized RandomizedSearchCV for Random Forest Classifier.

        Inputs:
            self
            n_iter: int, default = 10
                Number of random hyperparameter configurations to try
            verbose: bool, default = True
                Whether to keep verbosity or not
            n_jobs: int, default = 4
                Number of CPU cores to use
            random_state: int, default = 42
                Random state for reproducibility

        Returns:
            None
        '''

        # Optimized hyperparameter grid for faster Random Search
        hyperparams = {
            'n_estimators': [50, 100, 150],             # Smaller range for faster tuning
            'max_depth': [8, 10, 12, 14],               # Reduced depth
            'min_samples_split': [10, 20, 30],          # Larger splits
            'min_samples_leaf': [5, 10, 15],            # Larger leaf nodes
            'max_samples': [0.7],                       # Fixed subsample to 70%
            'max_features': ['sqrt']                    # Fixed to 'sqrt' for consistency
        }
        # Initialize Random Forest Classifier with consistent parameters
        params = {
            'n_jobs': n_jobs,
            'random_state': random_state,
            'class_weight': 'balanced_subsample',
            'verbose': 0
        }
        rf_clf = RandomForestClassifier(**params)

        # Initialize RandomizedSearchCV
        if verbose:
            start = datetime.now()
            print(f"Performing RandomizedSearchCV with {n_iter} iterations on Random Forest...")

        rscv = RandomizedSearchCV(
            estimator=rf_clf,
            param_distributions=hyperparams,
            n_iter=n_iter,
            scoring='roc_auc',
            return_train_score=True,
            cv=2,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=1
        )

        # Fit RandomizedSearchCV
        rscv.fit(self.x_train, self.y_train)

        # Verbose output for completion and timing
        if verbose:
            print("RandomizedSearchCV completed successfully.")
            print(f"Time elapsed = {datetime.now() - start}")

        # Store results
        self.tuning_results = pd.DataFrame(rscv.cv_results_)
        self.best_model = rscv.best_estimator_

        # Clear memory
        gc.collect()


    def plot_hyperparam_vs_auc(self, hyperparam_name):
        """
        Plot ROC-AUC Score for a given hyperparameter from RandomizedSearchCV results.

        Parameters:
            hyperparam_name (str): Name of the hyperparameter to plot (e.g., 'param_n_estimators')

        Returns:
            None
        """
        # Check if tuning results exist
        if not hasattr(self, 'tuning_results') or self.tuning_results.empty:
            print("Error: Tuning results are not available. Run random_search_cv first.")
            return

        # Extract cv_results
        cv_results = self.tuning_results

        # Validate if the hyperparameter exists in the results
        if hyperparam_name not in cv_results.columns:
            print(f"Error: '{hyperparam_name}' column not found in cv_results.")
            print(f"Available columns: {cv_results.columns.tolist()}")
            return

        # Check for mean_train_score and mean_test_score columns
        required_columns = ['mean_train_score', 'mean_test_score']
        for col in required_columns:
            if col not in cv_results.columns:
                print(f"Error: '{col}' column not found in cv_results.")
                return

        # Sort results by the selected hyperparameter
        sorted_results = cv_results.sort_values(hyperparam_name)

        # Plot train and CV scores
        plt.figure(figsize=(9, 6))
        plt.plot(sorted_results[hyperparam_name], sorted_results['mean_train_score'], 
                 label='Train Score', marker='o', linestyle='-')
        plt.plot(sorted_results[hyperparam_name], sorted_results['mean_test_score'], 
                 label='CV Score', marker='o', linestyle='--')
        plt.title(f'Hyperparameter {hyperparam_name.replace("param_", "")} vs ROC-AUC Score')
        plt.xlabel(hyperparam_name.replace('param_', '').capitalize())
        plt.ylabel('ROC-AUC Score')
        plt.legend()
        plt.grid()
        plt.show()

params = {
    'loss' : 'log_loss',
    'penalty' : 'l2',
    'random_state' : 98,
    'class_weight' : 'balanced',
    'n_jobs' : -1
}
clf = SGDClassifier(**params)
hyperparams = {'alpha' : np.logspace(-4,2)}

sgd_lr_l2 = modelling(clf, x_train_std, y_train, x_test_std)

# Random Search CV
sgd_lr_l2.random_search_cv(hyperparams, n_iter = 15, n_jobs = 2, verbose=False)

# Viewing the results
cv_results = sgd_lr_l2.tuning_results
cv_results = cv_results.sort_values('param_alpha')

# Plotting the results
plt.figure(figsize = (9,6))
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'], label = 'Train Score')
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'], label = 'CV Score')
plt.title('Hyperparameter Alpha vs ROC-AUC Score')
plt.xlabel('Alpha')
plt.ylabel('ROC-AUC Score')
plt.legend()
plt.grid()
plt.xscale('log')
plt.show()

# Training on best parameters
sgd_lr_l2.train_on_best_params()
# Checking the results
sgd_lr_l2.results_on_best_params('linear')
# Showing the feature importances
sgd_lr_l2.feat_importances_show(train_data.columns, num_features = 50)

Fitting Classifier on best parameters

4-Fold Cross Validation
	Fitting Fold 1
	Fitting Fold 2
	Fitting Fold 3
	Fitting Fold 4
Finised.
Time elapsed = 0:00:35.306465
================================================================================
The best selected Threshold as per the J-Statistic is = 0.48515179031861144

Train Results:
	ROC-AUC Score = 0.795863738737
	Precision Score = 0.183668930940
	Recall Score = 0.731923464250
CV Results:
	ROC-AUC Score = 0.789439650259
	Precision Score = 0.180829869644
	Recall Score = 0.720281973817
================================================================================
Confusion Matrix of CV data:

Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'SGD_LR_L2_penalty.csv'

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the submission file
submission_data = pd.DataFrame({'SK_ID_CURR': skid_test, 'TARGET': sgd_lr_l2.test_preds_probas})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/SGD_LR_L2_penalty.csv
You can manually upload it to the competition page.

with open('SGD_LR_L2_Penalty_CV_Preds.pkl', 'wb') as f:
    pickle.dump(sgd_lr_l2.cv_preds_probas, f)
with open('SGD_LR_L2_Penalty_Test_Preds.pkl', 'wb') as f:
    pickle.dump(sgd_lr_l2.test_preds_probas, f)
with open('SGD_LR_L2_Penalty_Model.pkl', 'wb') as f:
    pickle.dump(sgd_lr_l2.best_model, f)

params = {
    'loss' : 'hinge',
    'class_weight' : 'balanced',
    'random_state' : 129,
    'n_jobs' : -1
}
clf = SGDClassifier(**params)
hyperparams = {
    'alpha' : np.logspace(-5,4) 
}
linear_svm = modelling(clf, x_train_std, y_train, x_test_std, calibration = True)

# Randomized Search CV
linear_svm.random_search_cv(hyperparams, n_iter = 15, n_jobs = 2, random_state = 693)

# Showing cv results
cv_results = linear_svm.tuning_results
cv_results = cv_results.sort_values('param_alpha')

# Plotting the train and cv scores
plt.figure(figsize = (9,6))
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'], label = 'Train Score')
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'], label = 'CV Score')
plt.title('Hyperparameter Alpha vs ROC-AUC Score')
plt.xlabel('Alpha')
plt.ylabel('ROC-AUC Score')
plt.legend()
plt.grid()
plt.xscale('log')
plt.show()

Doing Randomized Search CV on Classifier with 15 random initializations...
Done.
Time elapsed = 0:08:26.508617

# Training on best parameters
linear_svm.train_on_best_params()
# Checking the results
linear_svm.results_on_best_params('linear')
# Showing the feature importances
linear_svm.feat_importances_show(train_data.columns, num_features = 50)

Fitting Classifier on best parameters

4-Fold Cross Validation
	Fitting Fold 1
	Fitting Fold 2
	Fitting Fold 3
	Fitting Fold 4
Finised.
Time elapsed = 0:03:08.134316
================================================================================
The best selected Threshold as per the J-Statistic is = 0.08185533849795068

Train Results:
	ROC-AUC Score = 0.795752803781
	Precision Score = 0.185462704694
	Recall Score = 0.725840886203
CV Results:
	ROC-AUC Score = 0.790144219931
	Precision Score = 0.181318570098
	Recall Score = 0.721651560926
================================================================================
Confusion Matrix of CV data:

Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'Linear_SVM_Submission.csv'

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the predictions to the submission file
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test, 
    'TARGET': linear_svm.test_preds_probas
})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/Linear_SVM_Submission.csv
You can manually upload it to the competition page.

with open('SGD_Linear_SVM_CV_Preds.pkl', 'wb') as f:
    pickle.dump(linear_svm.cv_preds_probas, f)
with open('SGD_Linear_SVM_Test_Preds.pkl', 'wb') as f:
    pickle.dump(linear_svm.test_preds_probas, f)
with open('SGD_Linear_SVM_Model.pkl', 'wb') as f:
    pickle.dump(linear_svm.best_model, f)

# Initialize the modelling class
rf = modelling(RandomForestClassifier(), x_train_std, y_train, x_test_std)

# Run improved RandomizedSearchCV
rf.random_search_cv_rf(n_iter=10, verbose=True)

# Print the best parameters
print("Best Parameters:", rf.best_model)

# Add Train-CV difference for overfitting check
rf.tuning_results['Train_CV_Score_Difference'] = (
    rf.tuning_results['mean_train_score'] - rf.tuning_results['mean_test_score']
)

# Display the results
print("Differences between Train and CV Scores:\n")
display(rf.tuning_results[['params', 'mean_test_score', 'mean_train_score', 'Train_CV_Score_Difference']])

Performing RandomizedSearchCV with 10 iterations on Random Forest...
Fitting 2 folds for each of 10 candidates, totalling 20 fits
RandomizedSearchCV completed successfully.
Time elapsed = 0:08:01.969284
Best Parameters: RandomForestClassifier(class_weight='balanced_subsample', max_depth=12,
                       max_samples=0.7, min_samples_leaf=15,
                       min_samples_split=20, n_estimators=150, n_jobs=4,
                       random_state=42)
Differences between Train and CV Scores:

cv_results = rf.tuning_results.copy()

# ROC-AUC curve for various hyperparameter values
fig, axes = plt.subplots(1, 3, figsize=(20, 7))

# Subplot 1: n_estimators
cv_results_n_est = cv_results.sort_values('param_n_estimators')
axes[0].plot(cv_results_n_est['param_n_estimators'], cv_results_n_est['mean_train_score'], 
             marker='o', label='Train Score', linestyle='-', linewidth=2)
axes[0].plot(cv_results_n_est['param_n_estimators'], cv_results_n_est['mean_test_score'], 
             marker='o', label='CV Score', linestyle='--', linewidth=2)
axes[0].set_ylim(0.7, 1)
axes[0].set_title('Hyperparameter n_estimators vs ROC-AUC Score')
axes[0].set_xlabel('n_estimators')
axes[0].set_ylabel('ROC-AUC Score')
axes[0].legend()
axes[0].grid()

# Subplot 2: max_depth
cv_results_max_depth = cv_results.sort_values('param_max_depth')
axes[1].plot(cv_results_max_depth['param_max_depth'], cv_results_max_depth['mean_train_score'], 
             marker='o', label='Train Score', linestyle='-', linewidth=2)
axes[1].plot(cv_results_max_depth['param_max_depth'], cv_results_max_depth['mean_test_score'], 
             marker='o', label='CV Score', linestyle='--', linewidth=2)
axes[1].set_ylim(0.7, 1)
axes[1].set_title('Hyperparameter max_depth vs ROC-AUC Score')
axes[1].set_xlabel('max_depth')
axes[1].set_ylabel('ROC-AUC Score')
axes[1].legend()
axes[1].grid()

# Subplot 3: max_samples
cv_results_max_samples = cv_results.sort_values('param_max_samples')
axes[2].plot(cv_results_max_samples['param_max_samples'], cv_results_max_samples['mean_train_score'], 
             marker='o', label='Train Score', linestyle='-', linewidth=2)
axes[2].plot(cv_results_max_samples['param_max_samples'], cv_results_max_samples['mean_test_score'], 
             marker='o', label='CV Score', linestyle='--', linewidth=2)
axes[2].set_ylim(0.7, 1)
axes[2].set_title('Hyperparameter max_samples vs ROC-AUC Score')
axes[2].set_xlabel('max_samples')
axes[2].set_ylabel('ROC-AUC Score')
axes[2].legend()
axes[2].grid()

plt.tight_layout()
plt.show()

# Find the row with the lowest Train-CV Score Difference
best_diff_row = rf.tuning_results.loc[rf.tuning_results['Train_CV_Score_Difference'].idxmin()]

# Extract and print the hyperparameters and scores
print("Best Hyperparameters (Lowest Train-CV Score Difference):")
print(best_diff_row['params'])
print(f"Mean Test Score: {best_diff_row['mean_test_score']:.6f}")
print(f"Mean Train Score: {best_diff_row['mean_train_score']:.6f}")
print(f"Train-CV Score Difference: {best_diff_row['Train_CV_Score_Difference']:.6f}")

Best Hyperparameters (Lowest Train-CV Score Difference):
{'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_samples': 0.7, 'max_features': 'sqrt', 'max_depth': 8}
Mean Test Score: 0.768048
Mean Train Score: 0.818001
Train-CV Score Difference: 0.049953

# Training on best parameters
rf.train_on_best_params()
# Checking the results
rf.results_on_best_params('linear')
# Showing the feature importances
rf.feat_importances_show(train_data.columns, num_features = 50)

Fitting Classifier on best parameters

4-Fold Cross Validation
	Fitting Fold 1
	Fitting Fold 2
	Fitting Fold 3
	Fitting Fold 4
Finised.
Time elapsed = 0:04:16.592314
================================================================================
The best selected Threshold as per the J-Statistic is = 0.44206178441515626

Train Results:
	ROC-AUC Score = 0.878604762027
	Precision Score = 0.235224940623
	Recall Score = 0.821832829809
CV Results:
	ROC-AUC Score = 0.771939800934
	Precision Score = 0.191108460715
	Recall Score = 0.632547834844
================================================================================
Confusion Matrix of CV data:

Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Re-instantiate the best model with updated hyperparameters
rf.best_model = RandomForestClassifier(
    class_weight='balanced_subsample',
    max_depth=8,                  # Updated max_depth
    max_samples=0.7,              # Updated max_samples
    min_samples_leaf=10,          # Updated min_samples_leaf
    min_samples_split=10,         # Updated min_samples_split
    n_estimators=150,             # Updated n_estimators
    max_features='sqrt',          # Updated max_features
    n_jobs=-1,                    # Use all available cores
    random_state=42,              # For reproducibility
    verbose=0
)

# Training the model with the best parameters
rf.train_on_best_params()

# Displaying results
rf.results_on_best_params('random_forest')

# Plotting feature importances for the top 50 features
rf.feat_importances_show(train_data.columns, num_features=50)

Fitting Classifier on best parameters

4-Fold Cross Validation
	Fitting Fold 1
	Fitting Fold 2
	Fitting Fold 3
	Fitting Fold 4
Finised.
Time elapsed = 0:02:04.594034
================================================================================
The best selected Threshold as per the J-Statistic is = 0.47886330822660783

Train Results:
	ROC-AUC Score = 0.795882936664
	Precision Score = 0.179483397737
	Recall Score = 0.735307150050
CV Results:
	ROC-AUC Score = 0.768744768184
	Precision Score = 0.170488115035
	Recall Score = 0.702074521652
================================================================================
Confusion Matrix of CV data:

Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'Random_Forest_Submission.csv'

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the predictions to the submission file
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test, 
    'TARGET': rf.test_preds_probas  # Assuming rf holds the Random Forest model
})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/Random_Forest_Submission.csv
You can manually upload it to the competition page.

with open('Random_Forest_CV_Preds.pkl', 'wb') as f:
    pickle.dump(rf.cv_preds_probas, f)
with open('Random_Forest_Test_Preds.pkl', 'wb') as f:
    pickle.dump(rf.test_preds_probas, f)
with open('Random_Forest_Model.pkl', 'wb') as f:
    pickle.dump(rf.best_model, f)

# Updated XGBoost evaluation function
def xgb_evaluation(max_depth, min_child_weight, gamma, subsample, colsample_bytree,
                   colsample_bylevel, colsample_bynode, reg_alpha, reg_lambda):
    """
    Objective function for Bayesian Optimization of XGBoost's hyperparameters.
    """
    # Define parameters
    params = {
        'learning_rate': 0.01,
        'max_depth': int(round(max_depth)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': subsample,
        'gamma': gamma,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'colsample_bynode': colsample_bynode,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',  # Use 'gpu_hist' if GPU is enabled
        'random_state': 51412
    }

    # Cross-validation strategy
    stratified_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=33)
    cv_preds = np.zeros(train_data.shape[0])

    # Iterate over folds
    for train_indices, cv_indices in stratified_cv.split(train_data, target_train):
        x_tr = train_data.iloc[train_indices]
        y_tr = target_train.iloc[train_indices]
        x_cv = train_data.iloc[cv_indices]
        y_cv = target_train.iloc[cv_indices]

        # Convert to DMatrix
        dtrain = DMatrix(data=x_tr, label=y_tr)
        dvalid = DMatrix(data=x_cv, label=y_cv)

        # Train the model with early stopping
        evals = [(dtrain, 'train'), (dvalid, 'eval')]
        booster = train(params, dtrain, num_boost_round=10000, evals=evals,
                        early_stopping_rounds=200, verbose_eval=False)

        # Use the number of boosting rounds from the booster for prediction
        best_iteration = booster.attributes().get('best_iteration')
        ntree_limit = int(best_iteration) if best_iteration else 0
        cv_preds[cv_indices] = booster.predict(DMatrix(data=x_cv), iteration_range=(0, ntree_limit))

        # Free memory
        gc.collect()

    # Return the ROC-AUC score
    return roc_auc_score(target_train, cv_preds)

# Define the Bayesian Optimizer
bopt_xgb = BayesianOptimization(
    f=xgb_evaluation,
    pbounds={
        'max_depth': (5, 15),
        'min_child_weight': (5, 80),
        'gamma': (0.2, 1),
        'subsample': (0.5, 1),
        'colsample_bytree': (0.5, 1),
        'colsample_bylevel': (0.3, 1),
        'colsample_bynode': (0.3, 1),
        'reg_alpha': (0.001, 0.3),
        'reg_lambda': (0.001, 0.3)
    },
    random_state=55
)

# Run Bayesian Optimization
bopt_xgb.maximize(
    n_iter=6,
    init_points=4
)

|   iter    |  target   | colsam... | colsam... | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| 1         | 0.8051    | 0.3652    | 0.9802    | 0.7419    | 0.394     | 10.31     | 26.42     | 0.2589    | 0.01329   | 0.5542    |
| 2         | 0.8049    | 0.837     | 0.336     | 0.8879    | 0.2073    | 11.18     | 66.4      | 0.2697    | 0.2957    | 0.7484    |
| 3         | 0.8052    | 0.5466    | 0.907     | 0.6984    | 0.7149    | 5.205     | 65.61     | 0.1295    | 0.168     | 0.8896    |
| 4         | 0.8053    | 0.5818    | 0.9356    | 0.825     | 0.7818    | 12.8      | 31.3      | 0.006878  | 0.1082    | 0.6516    |
| 5         | 0.8051    | 0.8057    | 0.7387    | 0.8461    | 0.9899    | 5.404     | 74.35     | 0.1637    | 0.2228    | 0.9098    |
| 6         | 0.8023    | 0.642     | 0.9929    | 0.5992    | 0.8559    | 12.96     | 5.217     | 0.1832    | 0.199     | 0.8784    |
| 7         | 0.8057    | 0.8291    | 0.7766    | 0.7451    | 0.247     | 5.027     | 45.25     | 0.111     | 0.2364    | 0.7915    |
| 8         | 0.8054    | 0.3113    | 0.3649    | 0.7252    | 0.9805    | 5.022     | 34.27     | 0.1358    | 0.1028    | 0.9909    |
| 9         | 0.8048    | 0.558     | 0.5259    | 0.65      | 0.2643    | 14.99     | 48.01     | 0.09494   | 0.1122    | 0.8904    |
| 10        | 0.8056    | 0.8591    | 0.6145    | 0.6762    | 0.2999    | 5.015     | 53.53     | 0.2283    | 0.2457    | 0.7997    |
=====================================================================================================================================

# Getting the best parameters
target_values = []
for result in bopt_xgb.res:
    target_values.append(result['target'])
    if result['target'] == max(target_values):
        best_params = result['params']
    
print("Best Hyperparameters for XGBoost are:\n")
print(best_params)

Best Hyperparameters for XGBoost are:

{'colsample_bylevel': 0.829110344275626, 'colsample_bynode': 0.7766452949295628, 'colsample_bytree': 0.7451279001773903, 'gamma': 0.24704360150463167, 'max_depth': 5.026923270462706, 'min_child_weight': 45.24908763380125, 'reg_alpha': 0.1110046722163512, 'reg_lambda': 0.23642567413011664, 'subsample': 0.7915292135660588}

class Boosting:
    '''
    Class for Boosting Ensembles and displaying results. Contains methods to:
    
        1. Initialize the class (init)
        2. Train the model (train)
        3. Convert probabilities to class labels (proba_to_class)
        4. Tune the decision threshold (tune_threshold)
        5. Display results (results)
        6. Show feature importance (feat_importances_show)
    '''

    def __init__(self, x_train, y_train, x_test, params, num_folds=3, random_state=33, verbose=True, save_model_to_pickle=False):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.params = params
        self.num_folds = num_folds
        self.stratified_cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
        self.verbose = verbose
        self.save_model = save_model_to_pickle

    def train(self, booster, verbose=400, early_stopping=200, pickle_name=''):
        """
        Train the boosting model using the specified booster ('xgboost' or 'lightgbm').
        """
        # Initialize predictions and other variables
        self.train_preds_proba_mean = np.zeros(self.x_train.shape[0])
        self.cv_preds_proba = np.zeros(self.x_train.shape[0])
        self.test_preds_proba_mean = np.zeros(self.x_test.shape[0])
        self.best_threshold_train = 0
        self.feature_importance = pd.DataFrame(columns=["features", "gain"])

        if self.verbose:
            print(f"Training {booster} with {self.num_folds}-fold cross-validation and Out-Of-Folds Predictions")

        start = datetime.now()

        for fold_number, (train_indices, cv_indices) in enumerate(self.stratified_cv.split(self.x_train, self.y_train), 1):
            if self.verbose:
                print(f"\nFold {fold_number}")

            x_tr, y_tr = self.x_train.iloc[train_indices], self.y_train.iloc[train_indices]
            x_cv, y_cv = self.x_train.iloc[cv_indices], self.y_train.iloc[cv_indices]

            if booster == 'xgboost':
                self.params['eval_metric'] = 'auc'
                dtrain = DMatrix(data=x_tr, label=y_tr)
                dvalid = DMatrix(data=x_cv, label=y_cv)
                evals = [(dtrain, 'train'), (dvalid, 'eval')]

                booster_model = train(
                    params=self.params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    evals=evals,
                    early_stopping_rounds=early_stopping,
                    verbose_eval=verbose
                )

                best_iteration = booster_model.best_iteration
                iteration_range = (0, best_iteration)
                self.train_preds_proba_mean[train_indices] += (
                    booster_model.predict(DMatrix(data=x_tr), iteration_range=iteration_range) / (self.num_folds - 1)
                )
                self.cv_preds_proba[cv_indices] = booster_model.predict(DMatrix(data=x_cv), iteration_range=iteration_range)
                self.test_preds_proba_mean += (
                    booster_model.predict(DMatrix(data=self.x_test), iteration_range=iteration_range) / self.num_folds
                )

                # Feature importance
                gain_fold = booster_model.get_score(importance_type='gain')
                feat_imp = pd.DataFrame(list(gain_fold.items()), columns=['features', 'gain'])

            # elif booster == 'lightgbm':
            #     clf = LGBMClassifier(**self.params)
            #     clf.fit(
            #         x_tr, y_tr,
            #         eval_set=[(x_cv, y_cv)],
            #         eval_metric='auc',
            #         verbose=verbose,
            #         early_stopping_rounds=early_stopping
            #     )
            #     self.train_preds_proba_mean[train_indices] += clf.predict_proba(x_tr)[:, 1] / (self.num_folds - 1)
            #     self.cv_preds_proba[cv_indices] = clf.predict_proba(x_cv)[:, 1]
            #     self.test_preds_proba_mean += clf.predict_proba(self.x_test)[:, 1] / self.num_folds
            elif booster == 'lightgbm':
                clf = LGBMClassifier(**self.params)
                clf.fit(
                    x_tr, y_tr,
                    eval_set=[(x_cv, y_cv)],
                    eval_metric='auc',
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=early_stopping),
                        lgb.log_evaluation(period=verbose if verbose else 0)  # Controls verbosity
                    ]
                )
                self.train_preds_proba_mean[train_indices] += clf.predict_proba(x_tr)[:, 1] / (self.num_folds - 1)
                self.cv_preds_proba[cv_indices] = clf.predict_proba(x_cv)[:, 1]
                self.test_preds_proba_mean += clf.predict_proba(self.x_test)[:, 1] / self.num_folds

                gain_fold = clf.booster_.feature_importance(importance_type='gain')
                feat_imp = pd.DataFrame({'features': self.x_train.columns, 'gain': gain_fold})
            else:
                raise ValueError("Invalid booster type. Use 'xgboost' or 'lightgbm'.")

            # Update threshold
            self.best_threshold_train += self.tune_threshold(y_tr, self.train_preds_proba_mean[train_indices]) / self.num_folds
            self.feature_importance = pd.concat([self.feature_importance, feat_imp], axis=0)

            # Save model if required
            if self.save_model:
                with open(f'clf_{booster}_fold_{fold_number}_model_{pickle_name}.pkl', 'wb') as f:
                    pickle.dump(booster_model if booster == 'xgboost' else clf, f)

        self.feature_importance = self.feature_importance.groupby("features", as_index=False).mean()
        self.feature_importance = self.feature_importance.sort_values(by="gain", ascending=False)

        if self.verbose:
            print(f"Training complete. Time elapsed: {datetime.now() - start}")

    def proba_to_class(self, proba, threshold):
        return np.where(proba >= threshold, 1, 0)

    def tune_threshold(self, true_labels, predicted_probas):
        fpr, tpr, threshold = roc_curve(true_labels, predicted_probas)
        j_stat = tpr - fpr
        return threshold[np.argmax(j_stat)]

    def results(self, roc_auc=True, precision_recall=True, show_confusion_matrix=True, cv_test_distribution=True):
        """
        Function to display the final results of Train, CV, and Test Dataset.
        
        Inputs:
            self
            roc_auc: bool
                Whether to calculate and display ROC-AUC scores.
            precision_recall: bool
                Whether to calculate and display Precision and Recall scores.
            show_confusion_matrix: bool
                Whether to display the confusion matrix.
            cv_test_distribution: bool
                Whether to display distribution of predicted class labels.
        
        Returns:
            None
        """
        # Convert probabilities to crisp class labels
        self.train_preds_class = self.proba_to_class(self.train_preds_proba_mean, self.best_threshold_train)
        self.cv_preds_class = self.proba_to_class(self.cv_preds_proba, self.best_threshold_train)
        self.test_preds_class = self.proba_to_class(self.test_preds_proba_mean, self.best_threshold_train)
        
        print("=" * 100)
        print("Train Results:")
        print(f"\nThe best selected Threshold as per the J-Statistic, which is J = TPR - FPR, is = {self.best_threshold_train}\n")
        if roc_auc:
            print(f"\tTrain ROC-AUC Score = {roc_auc_score(self.y_train, self.train_preds_proba_mean)}")
            print(f"\tCV ROC-AUC Score = {roc_auc_score(self.y_train, self.cv_preds_proba)}")
        if precision_recall:
            print(f"\tTrain Precision Score = {precision_score(self.y_train, self.train_preds_class)}")
            print(f"\tCV Precision Score = {precision_score(self.y_train, self.cv_preds_class)}")
            print(f"\tTrain Recall Score = {recall_score(self.y_train, self.train_preds_class)}")
            print(f"\tCV Recall Score = {recall_score(self.y_train, self.cv_preds_class)}")
        
        if show_confusion_matrix:
            print("Confusion, Precision, and Recall Matrix on CV data:")
            cm = confusion_matrix(self.y_train, self.cv_preds_class)
            cm_df = pd.DataFrame(cm, columns=['Predicted_0', 'Predicted_1'], index=['Actual_0', 'Actual_1'])
            plt.figure(figsize=(7, 6))
            sns.heatmap(cm_df, annot=True, fmt='g', linewidths=0.5, annot_kws={"size": 15}, cmap='Blues')
            plt.title('Confusion Matrix Heatmap')
            plt.show()
        
        if cv_test_distribution:
            print('=' * 100)
            print("Distribution of Original Class Labels and Predicted CV and Test Class Labels")
            
            # Identify unique class labels dynamically
            unique_classes = sorted(self.y_train.unique())
            
            # Define a custom palette for class colors
            custom_palette = {cls: 'steelblue' if cls == 0 else 'darkorange' for cls in unique_classes}
            
            # Calculate counts for each dataset
            y_train_counts = self.y_train.value_counts(sort=False)
            cv_preds_counts = pd.Series(self.cv_preds_class).value_counts(sort=False)
            test_preds_counts = pd.Series(self.test_preds_class).value_counts(sort=False)
            
            # Create subplots
            fig, axes = plt.subplots(1, 3, figsize=(20, 6), sharey=True)
            
            # Original Class Labels
            sns.barplot(x=y_train_counts.index, y=y_train_counts.values, ax=axes[0], 
                        palette=[custom_palette[x] for x in y_train_counts.index])
            axes[0].set_title('Class Distribution of Original Dataset')
            axes[0].set_xlabel('Classes')
            axes[0].set_ylabel('Count')
            
            # Predicted CV Class Labels
            sns.barplot(x=cv_preds_counts.index, y=cv_preds_counts.values, ax=axes[1], 
                        palette=[custom_palette[x] for x in cv_preds_counts.index])
            axes[1].set_title('Class Distribution of Predicted Class Labels on CV')
            axes[1].set_xlabel('Classes')
            axes[1].set_ylabel('Count')
            
            # Predicted Test Class Labels
            sns.barplot(x=test_preds_counts.index, y=test_preds_counts.values, ax=axes[2], 
                        palette=[custom_palette[x] for x in test_preds_counts.index])
            axes[2].set_title('Class Distribution of Predicted Test Dataset')
            axes[2].set_xlabel('Classes')
            axes[2].set_ylabel('Count')
            
            # Adjust layout to avoid overlap
            plt.tight_layout()
            plt.show()


    def feat_importances_show(self, num_features=10, figsize=None):
        """
        Function to display the top most important features.

        Inputs:
            self
            num_features: int, default = 10
                Number of top feature importances to display.
            figsize: tuple, optional
                Size of the figure to display. Default is adjusted based on num_features.
        
        Returns:
            None
        """
        # Validate if feature importance data exists
        if not hasattr(self, 'feature_importance') or self.feature_importance.empty:
            raise ValueError("Feature importance data is missing. Ensure the model has been trained.")

        # Extract top features
        top_features = self.feature_importance.nlargest(num_features, 'gain')
        
        # Adjust default figsize dynamically if not provided
        if figsize is None:
            figsize = (10, max(5, num_features / 2))  # Ensure a reasonable default height based on num_features
        
        # Plot feature importances
        plt.figure(figsize=figsize)
        sns.barplot(data=top_features, x="gain", y="features", palette="viridis", orient="h")
        plt.title(f"Top {num_features} Feature Importances")
        plt.xlabel("Feature Importance (Gain)")
        plt.ylabel("Features")
        plt.grid(axis='x')
        plt.tight_layout()  # Adjust layout to avoid overlap
        plt.show()

params = {
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'tree_method': 'hist',  # Use CPU-based tree method
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'gamma': 0.2,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.8,
    'colsample_bynode': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
}

boosting_model = Boosting(
    x_train=train_data,
    y_train=target_train,
    x_test=test_data,
    params=params,
    num_folds=3,
    verbose=True
)

boosting_model.train(booster='xgboost')
boosting_model.results()

Training xgboost with 3-fold cross-validation and Out-Of-Folds Predictions

Fold 1
[0]	train-auc:0.75122	eval-auc:0.74723
[400]	train-auc:0.80469	eval-auc:0.78699
[800]	train-auc:0.82700	eval-auc:0.79496
[1200]	train-auc:0.84229	eval-auc:0.79791
[1600]	train-auc:0.85420	eval-auc:0.79944
[2000]	train-auc:0.86397	eval-auc:0.80030
[2400]	train-auc:0.87302	eval-auc:0.80070
[2800]	train-auc:0.88135	eval-auc:0.80082
[2962]	train-auc:0.88444	eval-auc:0.80065

Fold 2
[0]	train-auc:0.75305	eval-auc:0.74888
[400]	train-auc:0.80554	eval-auc:0.78416
[800]	train-auc:0.82765	eval-auc:0.79193
[1200]	train-auc:0.84287	eval-auc:0.79494
[1600]	train-auc:0.85503	eval-auc:0.79642
[2000]	train-auc:0.86532	eval-auc:0.79715
[2400]	train-auc:0.87453	eval-auc:0.79735
[2800]	train-auc:0.88239	eval-auc:0.79739
[3200]	train-auc:0.88968	eval-auc:0.79738
[3218]	train-auc:0.89000	eval-auc:0.79737

Fold 3
[0]	train-auc:0.75226	eval-auc:0.74989
[400]	train-auc:0.80456	eval-auc:0.78611
[800]	train-auc:0.82644	eval-auc:0.79450
[1200]	train-auc:0.84122	eval-auc:0.79771
[1600]	train-auc:0.85330	eval-auc:0.79944
[2000]	train-auc:0.86321	eval-auc:0.80041
[2400]	train-auc:0.87203	eval-auc:0.80093
[2800]	train-auc:0.88020	eval-auc:0.80127
[3200]	train-auc:0.88733	eval-auc:0.80143
[3215]	train-auc:0.88762	eval-auc:0.80142
Training complete. Time elapsed: 0:20:07.376944
====================================================================================================
Train Results:

The best selected Threshold as per the J-Statistic, which is J = TPR - FPR, is = 0.07924748957157135

	Train ROC-AUC Score = 0.8857353397142647
	CV ROC-AUC Score = 0.7999337513848167
	Train Precision Score = 0.20752823805974427
	CV Precision Score = 0.17564249316889446
	Train Recall Score = 0.8748036253776436
	CV Recall Score = 0.7664451158106748
Confusion, Precision, and Recall Matrix on CV data:

====================================================================================================
Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Feature Size
boosting_model.feat_importances_show(num_features=50, figsize=(10, 15))

with open('xgb_imp_feats.pkl','wb') as f:
    pickle.dump(boosting_model.feature_importance, f)
with open('xgb_cv_preds.pkl', 'wb') as f:
    pickle.dump(boosting_model.cv_preds_proba, f)
with open('xgb_test_preds.pkl', 'wb') as f:
    pickle.dump(boosting_model.test_preds_proba_mean, f)

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'XGB_Boost_final.csv'  # Updated file name

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Clip the probabilities to be within the range [0, 1]
test_preds_clipped = np.clip(boosting_model.test_preds_proba_mean, 0, 1)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the predictions to the submission file
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test, 
    'TARGET': test_preds_clipped
})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/XGB_Boost_final.csv
You can manually upload it to the competition page.

def lgbm_evaluation(num_leaves, max_depth, min_split_gain, min_child_weight,
                    min_child_samples, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """
    Objective function for Bayesian Optimization of LightGBM's Hyperparameters.
    """
    # Define parameters
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.005,
        'num_leaves': int(round(num_leaves)),
        'max_depth': int(round(max_depth)),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'min_child_samples': int(round(min_child_samples)),
        'subsample': subsample,
        'subsample_freq': 1,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'verbosity': -1,
        'seed': 266
    }

    # Prepare dataset
    train_set = lgb.Dataset(train_data, label=target_train)

    # Perform cross-validation
    cv_results = lgb.cv(
        params=params,
        train_set=train_set,
        num_boost_round=1000,
        nfold=2,
        stratified=True,
        shuffle=True,
        metrics=['auc'],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    # Check for the correct key and return the best AUC score
    if 'valid auc-mean' in cv_results:
        return max(cv_results['valid auc-mean'])
    else:
        raise KeyError("Expected 'valid auc-mean' in cv_results but not found. Available keys are:", cv_results.keys())

# Define Bayesian Optimization
bopt_lgbm = BayesianOptimization(
    f=lgbm_evaluation,
    pbounds={
        'num_leaves': (25, 50),
        'max_depth': (6, 11),
        'min_split_gain': (0, 0.1),
        'min_child_weight': (5, 80),
        'min_child_samples': (5, 80),
        'subsample': (0.5, 1),
        'colsample_bytree': (0.5, 1),
        'reg_alpha': (0.001, 0.3),
        'reg_lambda': (0.001, 0.3)
    },
    random_state=4976
)

# Run Bayesian Optimization
bayesian_optimization = bopt_lgbm.maximize(n_iter=4, init_points=4)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796697 + 0.000627621
| 1         | 0.7967    | 0.9839    | 9.624     | 65.58     | 60.98     | 0.08223   | 39.55     | 0.1133    | 0.2049    | 0.6677    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796871 + 0.000544809
| 2         | 0.7969    | 0.5453    | 10.99     | 36.09     | 42.7      | 0.02383   | 43.12     | 0.1206    | 0.1951    | 0.8343    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796379 + 0.000655726
| 3         | 0.7964    | 0.7313    | 9.478     | 47.08     | 53.08     | 0.0249    | 36.94     | 0.2417    | 0.1072    | 0.5916    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.795094 + 0.000558682
| 4         | 0.7951    | 0.5671    | 7.674     | 26.69     | 8.717     | 0.004937  | 31.48     | 0.118     | 0.09472   | 0.9706    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796242 + 0.00051992
| 5         | 0.7962    | 0.574     | 10.13     | 65.98     | 60.49     | 0.07123   | 38.04     | 0.1395    | 0.242     | 0.7031    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.797208 + 0.000713299
| 6         | 0.7972    | 0.7913    | 10.37     | 35.59     | 39.75     | 0.03426   | 41.8      | 0.04194   | 0.1444    | 0.5961    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796093 + 0.000600025
| 7         | 0.7961    | 0.939     | 8.951     | 37.4      | 40.47     | 0.08256   | 38.25     | 0.1884    | 0.08572   | 0.9433    |
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.796555 + 0.000693965
| 8         | 0.7966    | 0.8941    | 10.92     | 36.08     | 39.69     | 0.02191   | 44.26     | 0.1538    | 0.004957  | 0.9993    |
=====================================================================================================================================

# Extracting the best parameters
target_values = []
for result in bopt_lgbm.res:
    target_values.append(result['target'])
    if result['target'] == max(target_values):
        best_params = result['params']

print("Best Hyperparameters obtained are:\n")
print(best_params)

Best Hyperparameters obtained are:

{'colsample_bytree': 0.7912887079037543, 'max_depth': 10.373045152859307, 'min_child_samples': 35.59081043006549, 'min_child_weight': 39.74982026664126, 'min_split_gain': 0.03426485776776513, 'num_leaves': 41.80294469710104, 'reg_alpha': 0.041938208556952246, 'reg_lambda': 0.1444224117364263, 'subsample': 0.5961139776588129}

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.005,
    'n_estimators': 10000,
    'n_jobs': -1,
    'num_leaves': int(round(41.80294469710104)),  # Rounded to the nearest integer
    'max_depth': int(round(10.373045152859307)),  # Rounded to the nearest integer
    'min_split_gain': 0.03426485776776513,
    'min_child_weight': 39.74982026664126,
    'min_child_samples': int(round(35.59081043006549)),  # Rounded to the nearest integer
    'subsample': 0.5961139776588129,
    'subsample_freq': 1,
    'colsample_bytree': 0.7912887079037543,
    'reg_alpha': 0.041938208556952246,
    'reg_lambda': 0.1444224117364263,
    'verbosity': -1,
    'seed': 266
}

# Create and train the model using the best parameters
lgbm_boosting = Boosting(train_data, target_train, test_data, params, random_state=98, save_model_to_pickle=True)
lgbm_boosting.train(booster='lightgbm')

Training lightgbm with 3-fold cross-validation and Out-Of-Folds Predictions

Fold 1
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.788016	valid_0's binary_logloss: 0.239572
[800]	valid_0's auc: 0.79783	valid_0's binary_logloss: 0.234163
[1200]	valid_0's auc: 0.802161	valid_0's binary_logloss: 0.232206
[1600]	valid_0's auc: 0.804369	valid_0's binary_logloss: 0.231247
[2000]	valid_0's auc: 0.805659	valid_0's binary_logloss: 0.230715
[2400]	valid_0's auc: 0.806336	valid_0's binary_logloss: 0.230428
[2800]	valid_0's auc: 0.806785	valid_0's binary_logloss: 0.23023
[3200]	valid_0's auc: 0.807065	valid_0's binary_logloss: 0.230113
[3600]	valid_0's auc: 0.807229	valid_0's binary_logloss: 0.230062
Early stopping, best iteration is:
[3693]	valid_0's auc: 0.807311	valid_0's binary_logloss: 0.230032

Fold 2
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.783488	valid_0's binary_logloss: 0.240584
[800]	valid_0's auc: 0.794392	valid_0's binary_logloss: 0.235297
[1200]	valid_0's auc: 0.799714	valid_0's binary_logloss: 0.233142
[1600]	valid_0's auc: 0.802224	valid_0's binary_logloss: 0.232165
[2000]	valid_0's auc: 0.803638	valid_0's binary_logloss: 0.231611
[2400]	valid_0's auc: 0.804523	valid_0's binary_logloss: 0.231269
[2800]	valid_0's auc: 0.805008	valid_0's binary_logloss: 0.231081
[3200]	valid_0's auc: 0.805316	valid_0's binary_logloss: 0.230984
[3600]	valid_0's auc: 0.80548	valid_0's binary_logloss: 0.230918
Early stopping, best iteration is:
[3724]	valid_0's auc: 0.805528	valid_0's binary_logloss: 0.230903

Fold 3
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.781848	valid_0's binary_logloss: 0.240495
[800]	valid_0's auc: 0.792144	valid_0's binary_logloss: 0.2354
[1200]	valid_0's auc: 0.797005	valid_0's binary_logloss: 0.233485
[1600]	valid_0's auc: 0.79948	valid_0's binary_logloss: 0.232551
[2000]	valid_0's auc: 0.800852	valid_0's binary_logloss: 0.232056
[2400]	valid_0's auc: 0.801703	valid_0's binary_logloss: 0.231772
[2800]	valid_0's auc: 0.802207	valid_0's binary_logloss: 0.231603
[3200]	valid_0's auc: 0.802514	valid_0's binary_logloss: 0.231492
[3600]	valid_0's auc: 0.802708	valid_0's binary_logloss: 0.231452
Early stopping, best iteration is:
[3462]	valid_0's auc: 0.802715	valid_0's binary_logloss: 0.231445
Training complete. Time elapsed: 0:20:31.213930

# Best model result
lgbm_boosting.results()

# Feature Size
lgbm_boosting.feat_importances_show(num_features=50, figsize=(10, 15))

====================================================================================================
Train Results:

The best selected Threshold as per the J-Statistic, which is J = TPR - FPR, is = 0.07427608047834153

	Train ROC-AUC Score = 0.9006949516141293
	CV ROC-AUC Score = 0.8051536579063042
	Train Precision Score = 0.2228861879349877
	CV Precision Score = 0.18629280346966565
	Train Recall Score = 0.8849546827794562
	CV Recall Score = 0.7440080563947633
Confusion, Precision, and Recall Matrix on CV data:

====================================================================================================
Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'LGBM_2folds.csv'

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the predictions to the submission file
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test,  # Ensure this is defined and corresponds to the test dataset IDs
    'TARGET': lgbm_boosting.test_preds_proba_mean  # Assuming lgbm_boosting holds the LGBM model and predictions
})

# Convert probabilities to the required format (e.g., ensure all values are between 0 and 1)
submission_data['TARGET'] = submission_data['TARGET'].clip(0, 1)

# Save the submission to a CSV file
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/LGBM_2folds.csv
You can manually upload it to the competition page.

with open('lgbm_imp_feats.pkl','wb') as f:
    pickle.dump(lgbm_boosting.feature_importance, f)
with open('lgbm_cv_preds.pkl', 'wb') as f:
    pickle.dump(lgbm_boosting.cv_preds_proba, f)
with open('lgbm_test_preds.pkl', 'wb') as f:
    pickle.dump(lgbm_boosting.test_preds_proba_mean, f)

def load_stacking_data(file_directory='', verbose=True):
    '''
    Objective function to prepare the Stacking Training and Test Data. The training data is the
    Out-Of-Fold predictions of each base learner, and the test data are the predicted probabilities
    on the Test data.

    Inputs:
        file_directory: str, default = ''
            The path of the directory where the predictions are located. Insert '/' at the end if needed.
        verbose: bool, default = True
            Whether to keep verbosity or not.

    Returns:
        Training stacking data, Test stacking data, Training target values.
    '''

    if verbose:
        print("Loading the CV (out-of-folds) and Test Predictions from Base Models...")
        start = datetime.now()

    global sgd_lr_l2_cv, sgd_svm_cv, rf_cv, xgb_cv, lgbm_cv
    global sgd_lr_l2_test, sgd_svm_test, rf_test, xgb_test, lgbm_test

    # Loading the CV (out-of-fold) predictions
    with open(file_directory + 'SGD_LR_L2_Penalty_CV_Preds.pkl', 'rb') as f:
        sgd_lr_l2_cv = pickle.load(f)
    with open(file_directory + 'SGD_Linear_SVM_CV_Preds.pkl', 'rb') as f:
        sgd_svm_cv = pickle.load(f)
    with open(file_directory + 'Random_Forest_CV_Preds.pkl', 'rb') as f:
        rf_cv = pickle.load(f)
    with open(file_directory + 'xgb_cv_preds.pkl', 'rb') as f:
        xgb_cv = pickle.load(f)
    with open(file_directory + 'lgbm_cv_preds.pkl', 'rb') as f:
        lgbm_cv = pickle.load(f)


    # Base directory for your submissions folder (relative to the current working directory)
    submissions_dir = os.path.join(file_directory, '../analytical/assets/submissions')

    # Ensure the directory exists before accessing files
    if not os.path.exists(submissions_dir):
        raise FileNotFoundError(f"Submissions directory not found: {submissions_dir}")

    # Loading the Predicted Probabilities of the Test Set
    sgd_lr_l2_test = pd.read_csv(os.path.join(submissions_dir, 'SGD_LR_L2_penalty.csv'))['TARGET']
    sgd_svm_test = pd.read_csv(os.path.join(submissions_dir, 'Linear_SVM_Submission.csv'))['TARGET']
    rf_test = pd.read_csv(os.path.join(submissions_dir, 'Random_Forest_Submission.csv'))['TARGET']
    xgb_test = pd.read_csv(os.path.join(submissions_dir, 'XGB_Boost_final.csv'))['TARGET']
    lgbm_test = pd.read_csv(os.path.join(submissions_dir, 'LGBM_2folds.csv'))['TARGET']
    
    if verbose:
        print("Making the Training and Test Dataset for Stacking...")

    # Defining the training and test datasets
    stack_train = pd.DataFrame(
        np.stack([sgd_lr_l2_cv, sgd_svm_cv, rf_cv, xgb_cv, lgbm_cv], axis=1),
        columns=['SGD_LR_L2', 'SGD_SVM', 'RF', 'XGB', 'LGBM']
    )
    stack_target = target_train.copy()
    stack_test = pd.concat([sgd_lr_l2_test, sgd_svm_test, rf_test, xgb_test, lgbm_test], axis=1)
    stack_test.columns = ['SGD_LR_L2', 'SGD_SVM', 'RF', 'XGB', 'LGBM']

    if verbose:
        print("Done.")
        print(f"Time elapsed = {datetime.now() - start}")
        print(f"Shape of stacking training data = {stack_train.shape}")
        print(f"Shape of stacking test data = {stack_test.shape}")
        print(f"Shape of stacking training class labels = {stack_target.shape}")

    return stack_train, stack_test, stack_target

stack_train, stack_test, stack_target = load_stacking_data()

Loading the CV (out-of-folds) and Test Predictions from Base Models...
Making the Training and Test Dataset for Stacking...
Done.
Time elapsed = 0:00:00.051291
Shape of stacking training data = (307507, 5)
Shape of stacking test data = (48744, 5)
Shape of stacking training class labels = (307507,)

def lgbm_stacker_optimization(num_leaves, max_depth, min_split_gain, min_child_weight,
                              min_child_samples, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """
    Objective function for Bayesian Optimization of LightGBM's hyperparameters.
    This function evaluates the hyperparameters using cross-validation and returns
    the Cross-Validation AUC score.

    Parameters:
        num_leaves (float): Maximum number of leaves for a tree.
        max_depth (float): Maximum depth of a tree.
        min_split_gain (float): Minimum gain for a node split.
        min_child_weight (float): Minimum sum of instance weight for a child.
        min_child_samples (float): Minimum number of data points in a leaf.
        subsample (float): Fraction of data to be sampled for each tree.
        colsample_bytree (float): Fraction of features to be sampled for each tree.
        reg_alpha (float): L1 regularization term.
        reg_lambda (float): L2 regularization term.

    Returns:
        float: Cross-Validation AUC score.
    """

    # Define LightGBM parameters
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.005,
        'num_leaves': int(round(num_leaves)),
        'max_depth': int(round(max_depth)),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'min_child_samples': int(round(min_child_samples)),
        'subsample': subsample,
        'subsample_freq': 1,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'verbosity': -1,
        'seed': 8956
    }

    # Prepare LightGBM dataset
    train_set = lgb.Dataset(stack_train, label=stack_target)

    # Perform cross-validation
    cv_results = lgb.cv(
        params=params,
        train_set=train_set,
        num_boost_round=1000,  # Cap the number of boosting rounds
        nfold=4,  # 4-Fold Cross-Validation
        stratified=True,
        shuffle=True,
        metrics=['auc'],
        callbacks=[lgb.early_stopping(stopping_rounds=200)],
    )

    # Check for the correct key and return the best AUC score
    if 'valid auc-mean' in cv_results:
        return max(cv_results['valid auc-mean'])
    else:
        raise KeyError("Expected 'valid auc-mean' in cv_results but not found. Available keys are:", cv_results.keys())

# Bayesian Optimization for the stacker
bopt_stacking = BayesianOptimization(
    f=lgbm_stacker_optimization,
    pbounds={
        'num_leaves': (2, 50),
        'max_depth': (2, 6),
        'min_split_gain': (0, 0.5),
        'min_child_weight': (1, 80),
        'min_child_samples': (5, 80),
        'subsample': (0.001, 1),
        'colsample_bytree': (0.2, 1),
        'reg_alpha': (0.00001, 0.3),
        'reg_lambda': (0.00001, 0.3),
    },
    random_state=42,
)

# Optimization with logs only for the Bayesian Optimization table
bayesian_optimization = bopt_stacking.maximize(init_points=5, n_iter=20)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[997]	cv_agg's valid auc: 0.805923 + 0.00177207
| 1         | 0.8059    | 0.4996    | 5.803     | 59.9      | 48.29     | 0.07801   | 9.488     | 0.01743   | 0.2599    | 0.6015    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.806243 + 0.00170084
| 2         | 0.8062    | 0.7665    | 2.082     | 77.74     | 66.76     | 0.1062    | 10.73     | 0.05503   | 0.09128   | 0.5252    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid auc: 0.806333 + 0.00166857
| 3         | 0.8063    | 0.5456    | 3.165     | 50.89     | 12.02     | 0.1461    | 19.59     | 0.1368    | 0.2356    | 0.2005    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.806221 + 0.00172657
| 4         | 0.8062    | 0.6114    | 4.37      | 8.484     | 49.0      | 0.08526   | 5.122     | 0.2847    | 0.2897    | 0.8086    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.805684 + 0.00173898
| 5         | 0.8057    | 0.4437    | 2.391     | 56.32     | 35.77     | 0.06102   | 25.77     | 0.01033   | 0.2728    | 0.2595    |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[175]	cv_agg's valid auc: 0.801583 + 0.002066
| 6         | 0.8016    | 0.2902    | 3.649     | 49.55     | 3.558     | 0.2837    | 16.76     | 0.2661    | 0.16      | 0.5883    |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[482]	cv_agg's valid auc: 0.806247 + 0.00165217
| 7         | 0.8062    | 0.8085    | 4.909     | 52.13     | 6.96      | 0.1414    | 22.54     | 0.007038  | 0.2902    | 0.2111    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[997]	cv_agg's valid auc: 0.805815 + 0.00173393
| 8         | 0.8058    | 0.337     | 4.526     | 53.1      | 5.757     | 0.4224    | 22.59     | 0.07085   | 0.1464    | 0.1816    |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[175]	cv_agg's valid auc: 0.801592 + 0.00205153
| 9         | 0.8016    | 0.2635    | 3.891     | 54.45     | 10.83     | 0.2285    | 24.66     | 0.01639   | 0.2777    | 0.3277    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[937]	cv_agg's valid auc: 0.806174 + 0.0016869
| 10        | 0.8062    | 0.541     | 5.513     | 49.08     | 12.46     | 0.4942    | 17.45     | 0.007644  | 0.01216   | 0.5852    |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[176]	cv_agg's valid auc: 0.801028 + 0.00209791
| 11        | 0.801     | 0.2686    | 2.375     | 49.8      | 14.96     | 0.3778    | 19.6      | 0.0872    | 0.2714    | 0.08904   |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[715]	cv_agg's valid auc: 0.806255 + 0.00164606
| 12        | 0.8063    | 0.9896    | 4.353     | 50.42     | 12.03     | 0.3832    | 17.45     | 0.219     | 0.02937   | 0.8907    |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[632]	cv_agg's valid auc: 0.806255 + 0.00169291
| 13        | 0.8063    | 0.7502    | 5.327     | 51.55     | 9.342     | 0.0993    | 17.9      | 0.2466    | 0.1548    | 0.5895    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid auc: 0.806252 + 0.00166273
| 14        | 0.8063    | 0.5534    | 3.071     | 49.14     | 9.134     | 0.272     | 19.67     | 0.2033    | 0.2384    | 0.7783    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[918]	cv_agg's valid auc: 0.806088 + 0.00167588
| 15        | 0.8061    | 0.5386    | 5.591     | 48.68     | 7.722     | 0.4239    | 21.05     | 0.03899   | 0.2134    | 0.6734    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.806226 + 0.00169331
| 16        | 0.8062    | 0.647     | 2.395     | 53.2      | 10.52     | 0.2098    | 18.7      | 0.2157    | 0.2483    | 0.2155    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[919]	cv_agg's valid auc: 0.806198 + 0.00170959
| 17        | 0.8062    | 0.6896    | 4.817     | 79.97     | 64.68     | 0.4529    | 11.39     | 0.1165    | 0.03243   | 0.9195    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[994]	cv_agg's valid auc: 0.805555 + 0.00175421
| 18        | 0.8056    | 0.4525    | 5.339     | 78.77     | 68.0      | 0.4185    | 9.782     | 0.1531    | 0.258     | 0.05799   |
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[634]	cv_agg's valid auc: 0.806256 + 0.00167097
| 19        | 0.8063    | 0.8265    | 5.812     | 50.36     | 10.84     | 0.4649    | 20.36     | 0.2773    | 0.08155   | 0.2171    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[997]	cv_agg's valid auc: 0.806332 + 0.00165916
| 20        | 0.8063    | 0.6734    | 3.365     | 74.68     | 63.91     | 0.2957    | 12.59     | 0.1206    | 0.01589   | 0.2243    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.805712 + 0.00176001
| 21        | 0.8057    | 0.3857    | 2.534     | 77.37     | 64.96     | 0.3946    | 14.95     | 0.07117   | 0.0737    | 0.9226    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[859]	cv_agg's valid auc: 0.806381 + 0.00171793
| 22        | 0.8064    | 0.9624    | 4.044     | 77.06     | 60.55     | 0.2186    | 8.708     | 0.07141   | 0.05077   | 0.4031    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.8055 + 0.00179349
| 23        | 0.8055    | 0.3931    | 5.81      | 75.86     | 63.2      | 0.3673    | 9.304     | 0.05201   | 0.1214    | 0.04878   |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.801585 + 0.00207885
| 24        | 0.8016    | 0.21      | 2.069     | 74.54     | 57.05     | 0.1512    | 8.975     | 0.04677   | 0.2545    | 0.7701    |
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid auc: 0.805717 + 0.00175311
| 25        | 0.8057    | 0.3696    | 2.443     | 79.59     | 60.87     | 0.1695    | 9.612     | 0.1372    | 0.02053   | 0.2298    |
=====================================================================================================================================

# Extracting the best parameters
target_values = []
for result in bopt_stacking.res:
    target_values.append(result['target'])
    if result['target'] == max(target_values):
        best_params = result['params']

print("Best Hyperparameters obtained are:\n")
print(best_params)

Best Hyperparameters obtained are:

{'colsample_bytree': 0.9623619030850719, 'max_depth': 4.044372175264648, 'min_child_samples': 77.06398337980184, 'min_child_weight': 60.55068432100966, 'min_split_gain': 0.21855186712694075, 'num_leaves': 8.708268447628688, 'reg_alpha': 0.07141471575433322, 'reg_lambda': 0.05077148282564815, 'subsample': 0.4030975636051794}

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.005,
    'n_estimators': 10000,
    'n_jobs': -1,
    'num_leaves': int(round(8.708268447628688)),  # Rounded to nearest integer
    'max_depth': int(round(4.044372175264648)),  # Rounded to nearest integer
    'min_split_gain': 0.21855186712694075,
    'min_child_weight': 60.55068432100966,
    'min_child_samples': int(round(77.06398337980184)),  # Rounded to nearest integer
    'subsample': 0.4030975636051794,
    'subsample_freq': 1,
    'colsample_bytree': 0.9623619030850719,
    'reg_alpha': 0.07141471575433322,
    'reg_lambda': 0.05077148282564815,
    'verbosity': -1,
    'seed': 266
}

# Initialize and train the stacking LightGBM model
stacker_boosting = Boosting(
    stack_train, stack_target, stack_test, params, random_state=96, num_folds=4
)
stacker_boosting.train(booster='lightgbm')

Training lightgbm with 4-fold cross-validation and Out-Of-Folds Predictions

Fold 1
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.804638	valid_0's binary_logloss: 0.231905
[800]	valid_0's auc: 0.804825	valid_0's binary_logloss: 0.2305
Early stopping, best iteration is:
[991]	valid_0's auc: 0.804858	valid_0's binary_logloss: 0.230465

Fold 2
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.804732	valid_0's binary_logloss: 0.232269
[800]	valid_0's auc: 0.805016	valid_0's binary_logloss: 0.230829
Early stopping, best iteration is:
[913]	valid_0's auc: 0.805037	valid_0's binary_logloss: 0.230799

Fold 3
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.80684	valid_0's binary_logloss: 0.23187
[800]	valid_0's auc: 0.806977	valid_0's binary_logloss: 0.230311
Early stopping, best iteration is:
[786]	valid_0's auc: 0.806987	valid_0's binary_logloss: 0.230315

Fold 4
Training until validation scores don't improve for 200 rounds
[400]	valid_0's auc: 0.807963	valid_0's binary_logloss: 0.231397
[800]	valid_0's auc: 0.808144	valid_0's binary_logloss: 0.229735
Early stopping, best iteration is:
[768]	valid_0's auc: 0.808148	valid_0's binary_logloss: 0.229752
Training complete. Time elapsed: 0:00:24.859915

#displaying the results and metrics
stacker_boosting.results()
#displaying top 50 important features
stacker_boosting.feat_importances_show(5, figsize = (8,5))

====================================================================================================
Train Results:

The best selected Threshold as per the J-Statistic, which is J = TPR - FPR, is = 0.051272180795959126

	Train ROC-AUC Score = 0.8080676922466149
	CV ROC-AUC Score = 0.8061177450807068
	Train Precision Score = 0.1514587967698327
	CV Precision Score = 0.15126483514910308
	Train Recall Score = 0.8529707955689829
	CV Recall Score = 0.851238670694864
Confusion, Precision, and Recall Matrix on CV data:

====================================================================================================
Distribution of Original Class Labels and Predicted CV and Test Class Labels

# Define the directory and file name for saving the submission
submission_dir = '../analytical/assets/submissions'
submission_file = 'LGBM_Stacker_4_Fold.csv'

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Define the full path for the submission file
submission_path = os.path.join(submission_dir, submission_file)

# Save the predictions to the submission file
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test,  # Ensure this is defined and corresponds to the test dataset IDs
    'TARGET': stacker_boosting.test_preds_proba_mean  # Assuming stacker_boosting holds the stacking model and predictions
})

# Convert probabilities to the required format (e.g., ensure all values are between 0 and 1)
submission_data['TARGET'] = submission_data['TARGET'].clip(0, 1)

# Save the submission to a CSV file
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Submission file has been saved to: {submission_path}")
print("You can manually upload it to the competition page.")

Submission file has been saved to: ../analytical/assets/submissions/LGBM_Stacker_4_Fold.csv
You can manually upload it to the competition page.

with open('stacker_boosting_imp_feats.pkl','wb') as f:
    pickle.dump(stacker_boosting.feature_importance, f)
with open('stacker_boosting_cv_preds.pkl', 'wb') as f:
    pickle.dump(stacker_boosting.cv_preds_proba, f)
with open('stacker_boosting_test_preds.pkl', 'wb') as f:
    pickle.dump(stacker_boosting.test_preds_proba_mean, f)

print(stacker_boosting.feature_importance)

    features           gain
0       LGBM  790293.152627
4        XGB  148369.383588
3    SGD_SVM   19515.696349
2  SGD_LR_L2   10885.804416
1         RF    7016.198216

# Step 1: Extract feature importance from the stacking classifier
# Ensure feature importance is extracted as a list or 1D structure
feat_imp_gain_stacking = pd.DataFrame({
    'model': ['LGBM', 'XGB', 'SGD_SVM', 'SGD_LR_L2', 'RF'],
    'gain': [790293.152627, 148369.383588, 19515.696349, 10885.804416, 7016.198216]
})

# Step 2: Normalize the gain values to lie between 0 and 1
feat_imp_gain_stacking['normalized_gain'] = feat_imp_gain_stacking['gain'] / feat_imp_gain_stacking['gain'].sum()

# Display normalized gain values
print("Normalized Gain Values as per Stacking Classifier for each base model:")
display(feat_imp_gain_stacking)

# Step 3: Map normalized gains (weights) to the models
weights = feat_imp_gain_stacking.set_index('model')['normalized_gain'].to_dict()

# Step 4: Blend predictions based on weights
# Ensure these test predictions correspond to your models
sgd_lr_l2_test = pd.read_csv('../analytical/assets/submissions/SGD_LR_L2_penalty.csv')['TARGET']
sgd_svm_test = pd.read_csv('../analytical/assets/submissions/Linear_SVM_Submission.csv')['TARGET']
rf_test = pd.read_csv('../analytical/assets/submissions/Random_Forest_Submission.csv')['TARGET']
xgb_test = pd.read_csv('../analytical/assets/submissions/XGB_Boost_final.csv')['TARGET']
lgbm_test = pd.read_csv('../analytical/assets/submissions/LGBM_2folds.csv')['TARGET']

# Combine predictions into a DataFrame for blending
blending_df = pd.DataFrame({
    'SGD_LR_L2': sgd_lr_l2_test,
    'SGD_SVM': sgd_svm_test,
    'RF': rf_test,
    'XGB': xgb_test,
    'LGBM': lgbm_test
})

# Blend predictions using weighted sum
blending_df['blended_prediction'] = (
    blending_df['SGD_LR_L2'] * weights['SGD_LR_L2'] +
    blending_df['SGD_SVM'] * weights['SGD_SVM'] +
    blending_df['RF'] * weights['RF'] +
    blending_df['XGB'] * weights['XGB'] +
    blending_df['LGBM'] * weights['LGBM']
)

# Step 5: Create submission file for blended predictions
submission_dir = '../analytical/assets/submissions'
submission_file = 'Blended_Predictions.csv'

# Ensure the directory exists
os.makedirs(submission_dir, exist_ok=True)

# Save blended predictions to a CSV file
submission_path = os.path.join(submission_dir, submission_file)
submission_data = pd.DataFrame({
    'SK_ID_CURR': skid_test,  # Ensure skid_test corresponds to test IDs
    'TARGET': blending_df['blended_prediction'].clip(0, 1)  # Clip to ensure probabilities are between 0 and 1
})
submission_data.to_csv(submission_path, index=False)

# Inform the user
print(f"Blended predictions saved to: {submission_path}")

Normalized Gain Values as per Stacking Classifier for each base model:

Blended predictions saved to: ../analytical/assets/submissions/Blended_Predictions.csv

# Create a list of model results with updated metrics
model_results = [
    {
        'Model': 'Random Model',
        'Train ROC-AUC': 0.5007396118774923,
        'CV ROC-AUC': '-',
        'Train Precision': 0.08083299082066037,
        'CV Precision': '-',
        'Train Recall': 0.4990936555891239,
        'CV Recall': '-'
    },
    {
        'Model': 'Logistic Regression with L2 Regularization',
        'Train ROC-AUC': 0.795863738737,
        'CV ROC-AUC': 0.789439650259,
        'Train Precision': 0.183668930940,
        'CV Precision': 0.180829869644,
        'Train Recall': 0.731923464250,
        'CV Recall': 0.720281973817
    },
    {
        'Model': 'Linear SVM',
        'Train ROC-AUC': 0.795752803781,
        'CV ROC-AUC': 0.790144219931,
        'Train Precision': 0.185462704694,
        'CV Precision': 0.181318570098,
        'Train Recall': 0.725840886203,
        'CV Recall': 0.721651560926
    },
    {
        'Model': 'Random Forest Classifier',
        'Train ROC-AUC': 0.795882936664,
        'CV ROC-AUC': 0.768744768184,
        'Train Precision': 0.179483397737,
        'CV Precision': 0.170488115035,
        'Train Recall': 0.735307150050,
        'CV Recall': 0.702074521652
    },
    {
        'Model': 'XGBoost with GPU',
        'Train ROC-AUC': 0.8857353397142647,
        'CV ROC-AUC': 0.7999337513848167,
        'Train Precision': 0.20752823805974427,
        'CV Precision': 0.17564249316889446,
        'Train Recall': 0.8748036253776436,
        'CV Recall': 0.7664451158106748
    },
    {
        'Model': 'LightGBM',
        'Train ROC-AUC': 0.9006949516141293,
        'CV ROC-AUC': 0.8051536579063042,
        'Train Precision': 0.2228861879349877,
        'CV Precision': 0.18629280346966565,
        'Train Recall': 0.8849546827794562,
        'CV Recall': 0.7440080563947633
    },
    {
        'Model': 'Stacking Classifier',
        'Train ROC-AUC': 0.8080676922466149,
        'CV ROC-AUC': 0.8061177450807068,
        'Train Precision': 0.1514587967698327,
        'CV Precision': 0.15126483514910308,
        'Train Recall': 0.8529707955689829,
        'CV Recall': 0.851238670694864
    }
]

# Convert the list of results to a DataFrame
results_df = pd.DataFrame(model_results)

# Apply formatting only to numeric columns
numeric_cols = results_df.select_dtypes(include=['float']).columns
results_df[numeric_cols] = results_df[numeric_cols].applymap(lambda x: f"{x:.5f}")

# Display the styled DataFrame without errors
results_df_displayed = results_df.style.set_caption("Summary of Model Results")

# Display the styled DataFrame
results_df_displayed

	params	mean_test_score	mean_train_score	Train_CV_Score_Difference
0	{'n_estimators': 150, 'min_samples_split': 20,...	0.770772	0.907988	0.137217
1	{'n_estimators': 100, 'min_samples_split': 10,...	0.767421	0.817801	0.050380
2	{'n_estimators': 100, 'min_samples_split': 20,...	0.767302	0.818279	0.050977
3	{'n_estimators': 150, 'min_samples_split': 10,...	0.767938	0.957164	0.189226
4	{'n_estimators': 150, 'min_samples_split': 30,...	0.769886	0.912749	0.142863
5	{'n_estimators': 100, 'min_samples_split': 20,...	0.768213	0.912107	0.143895
6	{'n_estimators': 50, 'min_samples_split': 20, ...	0.766535	0.864881	0.098345
7	{'n_estimators': 50, 'min_samples_split': 10, ...	0.767023	0.860042	0.093019
8	{'n_estimators': 150, 'min_samples_split': 20,...	0.770484	0.949115	0.178631
9	{'n_estimators': 150, 'min_samples_split': 10,...	0.768048	0.818001	0.049953

Table of Contents¶

Introduction¶

Feature Engineering and Feature Selection¶

Modeling¶

Notebook settings¶

Pickle Library Intro¶

Data Cleaning and Feature Engineering¶

Bureau and Bureau Balance tables¶

Previous Application table¶

Installment Payments table¶

POS_CASH_balance table¶

Credit Card balance table¶

Application Train and Application Test tables¶

Merging All Preprocessed Tables¶

Features Derived from Interactions Across Tables¶

Saving Processed Data to Pickle Files¶

Preparing Data for Modeling¶

Feature Selection¶

Identifying Empty Features¶

Recursive Feature Selection Using LightGBM¶

Standardizing and Cleaning Data for Model Training¶

Baseline Model Development and Evaluation¶

Random Model¶

Logistic Regression with L2 Regularization¶

Linear SVM¶

Random Forest Classifier¶

XGBoost with GPU¶

Bayesian Optimization¶

LightGBM¶

Hyperparameter Tuning for LightGBM¶

Stacking Classifier¶

Bayesian Optimization for Stacking Model¶

Blending of Predictions¶

Model Evaluation and Comparison¶

Model Results Summary¶

Model Deployment¶

API access for the Stacker and LightGBM models:¶

Improvement Suggestions for Feature Engineering and Modelling¶

iter	target	colsam...	colsam...	colsam...	gamma	max_depth	min_ch...	reg_alpha	reg_la...	subsample
1	0.8051	0.3652	0.9802	0.7419	0.394	10.31	26.42	0.2589	0.01329	0.5542
2	0.8049	0.837	0.336	0.8879	0.2073	11.18	66.4	0.2697	0.2957	0.7484
3	0.8052	0.5466	0.907	0.6984	0.7149	5.205	65.61	0.1295	0.168	0.8896
4	0.8053	0.5818	0.9356	0.825	0.7818	12.8	31.3	0.006878	0.1082	0.6516
5	0.8051	0.8057	0.7387	0.8461	0.9899	5.404	74.35	0.1637	0.2228	0.9098
6	0.8023	0.642	0.9929	0.5992	0.8559	12.96	5.217	0.1832	0.199	0.8784
7	0.8057	0.8291	0.7766	0.7451	0.247	5.027	45.25	0.111	0.2364	0.7915
8	0.8054	0.3113	0.3649	0.7252	0.9805	5.022	34.27	0.1358	0.1028	0.9909
9	0.8048	0.558	0.5259	0.65	0.2643	14.99	48.01	0.09494	0.1122	0.8904
10	0.8056	0.8591	0.6145	0.6762	0.2999	5.015	53.53	0.2283	0.2457	0.7997

	model	gain	normalized_gain
0	LGBM	790293.152627	0.809660
1	XGB	148369.383588	0.152005
2	SGD_SVM	19515.696349	0.019994
3	SGD_LR_L2	10885.804416	0.011153
4	RF	7016.198216	0.007188

	Model	Train ROC-AUC	CV ROC-AUC	Train Precision	CV Precision	Train Recall	CV Recall
0	Random Model	0.50074	-	0.08083	-	0.49909	-
1	Logistic Regression with L2 Regularization	0.79586	0.789440	0.18367	0.180830	0.73192	0.720282
2	Linear SVM	0.79575	0.790144	0.18546	0.181319	0.72584	0.721652
3	Random Forest Classifier	0.79588	0.768745	0.17948	0.170488	0.73531	0.702075
4	XGBoost with GPU	0.88574	0.799934	0.20753	0.175642	0.87480	0.766445
5	LightGBM	0.90069	0.805154	0.22289	0.186293	0.88495	0.744008
6	Stacking Classifier	0.80807	0.806118	0.15146	0.151265	0.85297	0.851239