Table of Contents¶

  1. Introduction
  2. Notebook settings
  3. Data Loading, Cleaning, and Checking
  4. EDA
    1. Target Variable and Hypothesis Formulation
    2. Data Preprocessing and Splitting
    3. Multicollinearity Analysis
    4. Model Testing - First Approach: OLS with All Features (No Transformation)
    5. Model Testing - Second Approach: OLS with All Features -> Box-Cox Transformation
    6. Model Testing - Third Approach: OLS with Reduced Features (Based on VIF)
    7. Hypothesis Testing
    8. Model Comparison and Evaluation
    9. QQ Plots for Residuals Analysis
    10. Final Conclusion
  5. Suggestions for Improvement

Introduction¶

The dataset used in this project can be found here: https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009
It presents chemical measurements of red wine samples and their quality ratings.
Only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

  • Goal:
    • Practice choosing variables and their transformations for a model.
    • Practice fitting a linear regression model.
    • Practice interpreting a fitted model’s coefficients and communicating uncertainty around them.

The dataset contains the following columns:

Input variables (based on physicochemical tests):

Column Description Datatype Count
fixed acidity most acids involved with wine are fixed or nonvolatile (they do not evaporate readily) float64 1599
volatile acidity the amount of acetic acid in wine, which at too high of levels can lead to an unpleasant, vinegar taste float64 1599
citric acid found in small quantities, citric acid can add 'freshness' and flavor to wines float64 1599
residual sugar the amount of sugar remaining after fermentation stops, it's rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet float64 1599
chlorides the amount of salt in the wine float64 1599
free sulfur dioxide the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine. float64 1599
total sulfur dioxide amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine float64 1599
density the density of wine is close to that of water depending on the percent alcohol and sugar content float64 1599
pH describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale float64 1599
sulphates a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which act as an antimicrobial and antioxidant float64 1599
alcohol the percent alcohol content of the wine float64 1599

Output variable (based on sensory data):

Column Description Datatype Count
quality output variable (based on sensory data, score between 0 and 10) int64 1599

Notebook settings¶

In [ ]:
from assets.utils.functions import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor

Data Loading, Cleaning, and Checking¶

In [ ]:
wine_df = pd.read_csv('assets/data/winequality-red.csv')
wine_df.head()
Out[ ]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [ ]:
# Replace blank spaces with _ for each column
wine_df.columns = wine_df.columns.str.replace(' ', '_')
In [ ]:
print(wine_df.describe())
       fixed_acidity  volatile_acidity  citric_acid  residual_sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free_sulfur_dioxide  total_sulfur_dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000              6.000000     0.990070   
25%       0.070000             7.000000             22.000000     0.995600   
50%       0.079000            14.000000             38.000000     0.996750   
75%       0.090000            21.000000             62.000000     0.997835   
max       0.611000            72.000000            289.000000     1.003690   

                pH    sulphates      alcohol      quality  
count  1599.000000  1599.000000  1599.000000  1599.000000  
mean      3.311113     0.658149    10.422983     5.636023  
std       0.154386     0.169507     1.065668     0.807569  
min       2.740000     0.330000     8.400000     3.000000  
25%       3.210000     0.550000     9.500000     5.000000  
50%       3.310000     0.620000    10.200000     6.000000  
75%       3.400000     0.730000    11.100000     6.000000  
max       4.010000     2.000000    14.900000     8.000000  
In [ ]:
wine_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
In [ ]:
print(f"\nNumber of duplicate rows: {wine_df.duplicated().sum()}")
Number of duplicate rows: 240

As we can see, the dataset has no null or missing values, and the datatypes are already correct, so no conversions or further cleaning are needed.

As for the 240 duplicated rows: since the dataset contains only physicochemical measurements, identical rows are plausibly genuine samples rather than data-entry mistakes.

Also, because the records carry no identifying labels, we cannot verify whether the duplicates are correct or not. Therefore, we will keep them in the dataset, and the dataframe is ready for analysis.
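
For reference, here is a quick way to inspect those repeated records side by side (a minimal sketch, not part of the original notebook):

In [ ]:
# List every duplicated record together with its twin(s), sorted so that
# identical physicochemical profiles sit next to each other
dupes = wine_df[wine_df.duplicated(keep=False)]
print(dupes.sort_values(by=list(wine_df.columns)).head(10))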

EDA¶

In [ ]:
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='quality',
                   data=wine_df,
                   hue='quality',
                   palette="viridis")
plt.title('Distribution of Wine Quality Ratings')

# Plot counts for each bar
for container in ax.containers:
    ax.bar_label(container)

# Remove y-axis numbers
ax.yaxis.set_ticks([])

# Remove the legend
ax.legend_.remove()

plt.show()
[Figure: bar chart of wine quality rating counts]

Quality Rating Distribution

  • Distribution Skewed Towards Middle Ratings: The majority of wine samples are rated as 5 and 6. Quality rating 5 has the highest frequency with 681 samples. Quality rating 6 follows closely with 638 samples.

  • Moderate Quality Wines are Most Common: The distribution suggests that moderate quality wines (ratings of 5 and 6) are the most common in the dataset.

  • Imbalance in Quality Ratings: There is an imbalance in the distribution of quality ratings, which could affect model performance, particularly for high (7, 8) and low (3, 4) quality wines.

  • Focus on Middle Quality Predictions: Given the skew towards middle ratings, the model might perform better for moderate quality wines.

Target Variable and Hypothesis Formulation¶

The target variable for this analysis is the quality column, whose scores range from 3 to 8, as seen in the summary statistics above.

The hypothesis we will focus on is formulated as follows:

  • Hypothesis: "Alcohol content has a positive impact on perceived wine quality."
  • Null Hypothesis: "Alcohol content has no impact on perceived wine quality."

Next, we examine how each feature relates to quality.

In [ ]:
plot_features_against_quality(wine_df)
[Figure: each feature plotted against quality]
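
plot_features_against_quality is imported from assets/utils/functions.py, which is not shown in the notebook. A plausible minimal implementation, assuming it draws one boxplot of each feature per quality score, might look like:

In [ ]:
def plot_features_against_quality(df, target='quality'):
    """Sketch: boxplot of every physicochemical feature grouped by quality."""
    features = [c for c in df.columns if c != target]
    fig, axes = plt.subplots(4, 3, figsize=(15, 16))
    for ax, col in zip(axes.ravel(), features):
        sns.boxplot(x=target, y=col, data=df, ax=ax)
        ax.set_title(f'{col} vs {target}')
    for ax in axes.ravel()[len(features):]:  # hide any unused panel
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()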

Summary of Insights:

Positive Correlations with Quality: Citric acid, sulphates, and alcohol show positive trends with higher quality wines.

Negative Correlations with Quality: Volatile acidity and chlorides tend to be lower in higher quality wines.

No Clear Trends: Fixed acidity, residual sugar, free sulfur dioxide, total sulfur dioxide, density, and pH do not show strong relationships with wine quality.

In [ ]:
plot_hist_corr(wine_df)
[Figures: feature distribution histograms and correlation heatmap]

Features Distribution Insights:

  • Features with Roughly Normal Distribution:

    • Fixed Acidity: slightly skewed to the right.
    • Density: Fairly normal but slightly skewed to the left.
    • pH: Roughly normal.
  • Features with Non-Normal Distribution (Right-Skewed):

    • Volatile Acidity
    • Citric Acid: Right-skewed with a peak at the lower end.
    • Residual Sugar: Heavily right-skewed with a long tail.
    • Chlorides: Heavily right-skewed.
    • Free Sulfur Dioxide: Right-skewed with a peak at the lower end.
    • Total Sulfur Dioxide
    • Sulphates
    • Alcohol
  • Features with Categorical Distribution:

    • Quality: Most wines have a quality score of 5, 6, or 7, with fewer wines scoring at the extremes (3, 4, 8).

Multicollinearity Insights:

  • Fixed Acidity and Citric Acid: Correlation of 0.66 suggests these variables are moderately related.
  • Free Sulfur Dioxide and Total Sulfur Dioxide: High correlation (0.79) indicates these variables are strongly related, which is expected since free sulfur dioxide is a part of the total sulfur dioxide.
  • Density and Residual Sugar: Correlation of 0.42 suggests a moderate relationship, which makes sense as residual sugar would contribute to the overall density.
  • pH and Fixed Acidity: High negative correlation (-0.71) suggests an inverse relationship between pH levels and fixed acidity.
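
These pairwise values can be verified directly from the data; for example (a quick spot-check, not part of the original notebook):

In [ ]:
# Spot-check the correlations called out above
pairs = [('fixed_acidity', 'citric_acid'),
         ('free_sulfur_dioxide', 'total_sulfur_dioxide'),
         ('density', 'residual_sugar'),
         ('pH', 'fixed_acidity')]
for a, b in pairs:
    print(f"{a} vs {b}: r = {wine_df[a].corr(wine_df[b]):.2f}")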

Summary:

  • Positive Influences on Quality: Alcohol, Sulphates, Citric Acid.
  • Negative Influences on Quality: Volatile Acidity, Chlorides, Density, Total Sulfur Dioxide.
  • Minimal Influence: Fixed Acidity, Residual Sugar, Free Sulfur Dioxide, pH.

Data Preprocessing and Splitting¶

In [ ]:
# Split the data into training and hold-out sets
train_df, holdout_df = train_test_split(
    wine_df, test_size=0.2, random_state=42)

# Select features and the target variable
X_train = train_df.drop(columns=['quality'])
y_train = train_df['quality']
X_holdout = holdout_df.drop(columns=['quality'])
y_holdout = holdout_df['quality']

Multicollinearity Analysis¶

Before fitting any models, I decided to check for multicollinearity among the independent variables.

In [ ]:
# Calculate VIF for each feature
X_train_const = sm.add_constant(X_train)
vif = pd.DataFrame()
vif['Variable'] = X_train_const.columns
vif['VIF'] = [variance_inflation_factor(X_train_const.values, i) 
              for i in range(X_train_const.shape[1])]
print(vif)
                Variable           VIF
0                  const  1.678384e+06
1          fixed_acidity  7.374554e+00
2       volatile_acidity  1.813146e+00
3            citric_acid  3.211154e+00
4         residual_sugar  1.731605e+00
5              chlorides  1.519827e+00
6    free_sulfur_dioxide  1.982913e+00
7   total_sulfur_dioxide  2.203259e+00
8                density  6.016488e+00
9                     pH  3.275199e+00
10             sulphates  1.455527e+00
11               alcohol  2.933465e+00

VIF Analysis:

The VIF calculation helps identify multicollinearity among the features: for feature j, VIF_j = 1 / (1 − R_j²), where R_j² is the R-squared from regressing feature j on all the other features. High VIF values indicate that a feature is highly correlated with other features in the dataset, which can lead to unreliable coefficient estimates.

Although the interpretation of VIF values is not clear-cut, the general rule of thumb is as follows:

  • VIF < 5: Generally considered acceptable; indicates low multicollinearity.

  • VIF between 5 and 10: Moderate multicollinearity; may warrant further investigation.

  • VIF > 10: High multicollinearity; suggests a problematic level of multicollinearity.

In this dataset, fixed_acidity (VIF = 7.37) and density (VIF = 6.02) fall into the moderate range, suggesting multicollinearity. Features with lower VIF values are generally preferred, as they contribute more independent information to the model.
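
A small sketch applying those cutoffs to the vif table computed above (not part of the original notebook):

In [ ]:
# Flag features above the VIF > 5 rule of thumb, skipping the intercept row
# (the constant's VIF is not meaningful)
flagged = vif[(vif['Variable'] != 'const') & (vif['VIF'] > 5)]
print(flagged.sort_values('VIF', ascending=False))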

Model Testing - First Approach: OLS with All Features (No Transformation)¶

In [ ]:
# Fit the model with all features (no transformation)
X_train_const = sm.add_constant(X_train)
X_holdout_const = sm.add_constant(X_holdout)

model_all = sm.OLS(y_train, X_train_const).fit()
print("\nModel with All Features (No Transformation) Summary:")
print(model_all.summary())
Model with All Features (No Transformation) Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.342
Method:                 Least Squares   F-statistic:                     61.48
Date:                Sat, 10 Aug 2024   Prob (F-statistic):          1.48e-109
Time:                        19:13:48   Log-Likelihood:                -1266.4
No. Observations:                1279   AIC:                             2557.
Df Residuals:                    1267   BIC:                             2619.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   14.3551     23.705      0.606      0.545     -32.150      60.860
fixed_acidity            0.0231      0.029      0.801      0.423      -0.033       0.080
volatile_acidity        -1.0013      0.137     -7.283      0.000      -1.271      -0.732
citric_acid             -0.1408      0.168     -0.839      0.402      -0.470       0.188
residual_sugar           0.0066      0.017      0.391      0.696      -0.026       0.039
chlorides               -1.8065      0.457     -3.949      0.000      -2.704      -0.909
free_sulfur_dioxide      0.0056      0.002      2.252      0.025       0.001       0.011
total_sulfur_dioxide    -0.0036      0.001     -4.419      0.000      -0.005      -0.002
density                -10.3516     24.192     -0.428      0.669     -57.812      37.109
pH                      -0.3937      0.215     -1.830      0.067      -0.816       0.028
sulphates                0.8412      0.126      6.651      0.000       0.593       1.089
alcohol                  0.2819      0.030      9.465      0.000       0.223       0.340
==============================================================================
Omnibus:                       28.708   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               46.050
Skew:                          -0.192   Prob(JB):                     1.00e-10
Kurtosis:                       3.847   Cond. No.                     1.12e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.12e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

Insights from OLS Regression Results -> First Approach¶

  • Overview:

    • R-squared (Training Dataset): 0.348 -> approximately 34.8% of the variance in wine quality is explained by the model, based on the predictors included.
    • Adjusted R-squared: 0.342 -> this suggests a slight penalty for including predictors that do not significantly improve the model fit.
    • Multicollinearity issues raised concerns about the reliability of the model.
  • Significant Predictors (coefficients and p-value < 0.05):

    • Volatile Acidity: Higher volatile acidity is strongly associated with lower wine quality.
    • Chlorides: Higher chloride levels are significantly associated with lower wine quality.
    • Sulphates: Higher sulphates are significantly associated with higher wine quality.
    • Alcohol: Higher alcohol content is strongly associated with higher wine quality.
    • Free Sulfur Dioxide: Higher levels of free sulfur dioxide are associated with a slight increase in wine quality.
    • Total Sulfur Dioxide: Higher levels of total sulfur dioxide are significantly associated with lower wine quality.
  • Other Predictors:

    • Fixed Acidity, Citric Acid, Residual Sugar, Density, pH: These variables have higher p-values (> 0.05), indicating that they are not statistically significant in explaining wine quality in this model.

Model Testing - Second Approach: OLS with All Features -> Box-Cox Transformation¶

The Box-Cox transformation is applied to make each feature more normally distributed, which can improve the linear relationship between the features and the target variable, leading to potentially better model performance. For a strictly positive value x it is defined as x^(λ) = (x^λ − 1)/λ for λ ≠ 0 and ln(x) for λ = 0, with λ estimated per feature by maximum likelihood.

In [ ]:
boxcox_transformer = PowerTransformer(method='box-cox', standardize=False)
X_train_boxcox = boxcox_transformer.fit_transform(X_train + 1) # shift by 1: Box-Cox needs strictly positive inputs (citric_acid has zeros)
X_holdout_boxcox = boxcox_transformer.transform(X_holdout + 1)

# Ensure index alignment
X_train_boxcox_df = pd.DataFrame(X_train_boxcox, 
                                 columns=X_train.columns,
                                 index=X_train.index)
X_holdout_boxcox_df = pd.DataFrame(X_holdout_boxcox, 
                                   columns=X_holdout.columns,
                                   index=X_holdout.index)

X_train_boxcox_df = sm.add_constant(X_train_boxcox_df)
X_holdout_boxcox_df = sm.add_constant(X_holdout_boxcox_df)

# Fit the model with Box-Cox transformed features
model_all_boxcox = sm.OLS(y_train, X_train_boxcox_df).fit()
print("\nModel with All Features (Box-Cox Transformed) Summary:")
print(model_all_boxcox.summary())
Model with All Features (Box-Cox Transformed) Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.358
Model:                            OLS   Adj. R-squared:                  0.352
Method:                 Least Squares   F-statistic:                     64.22
Date:                Sat, 10 Aug 2024   Prob (F-statistic):          9.46e-114
Time:                        19:13:49   Log-Likelihood:                -1256.5
No. Observations:                1279   AIC:                             2537.
Df Residuals:                    1267   BIC:                             2599.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 7.313e+04      3e+04      2.435      0.015    1.42e+04    1.32e+05
fixed_acidity            4.9402      1.680      2.941      0.003       1.645       8.235
volatile_acidity        -2.1957      0.325     -6.754      0.000      -2.833      -1.558
citric_acid             -0.4502      0.219     -2.052      0.040      -0.881      -0.020
residual_sugar           3.0520      1.355      2.252      0.024       0.393       5.711
chlorides              -12.2315      4.509     -2.713      0.007     -21.077      -3.386
free_sulfur_dioxide      0.0744      0.041      1.821      0.069      -0.006       0.155
total_sulfur_dioxide    -0.1268      0.043     -2.916      0.004      -0.212      -0.041
density              -1.047e+06   3.82e+05     -2.740      0.006    -1.8e+06   -2.98e+05
pH                      -1.2888      1.667     -0.773      0.440      -4.559       1.982
sulphates               20.2285      2.159      9.371      0.000      15.993      24.464
alcohol               2.788e+04   3707.756      7.519      0.000    2.06e+04    3.52e+04
==============================================================================
Omnibus:                       31.407   Durbin-Watson:                   2.020
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               51.559
Skew:                          -0.204   Prob(JB):                     6.37e-12
Kurtosis:                       3.895   Cond. No.                     1.12e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.87e-12. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
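
The per-feature λ values estimated by PowerTransformer can be inspected after fitting (a quick sketch; this output is not part of the original notebook):

In [ ]:
# lambdas_ holds the maximum-likelihood lambda estimated for each feature
for name, lam in zip(X_train.columns, boxcox_transformer.lambdas_):
    print(f"{name}: lambda = {lam:.3f}")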

Insights from OLS Regression Results -> Second Approach¶

  • Overview:

    • R-squared: The model explains 35.8% of the variance in wine quality, which indicates a moderate fit.
    • Adjusted R-squared: 0.352 -> is slightly lower, accounting for the number of predictors in the model. This suggests that the model is capturing some, but not all, of the variability in wine quality.
    • Multicollinearity issues raised concerns about the reliability of the model.
  • Significant Predictors (coefficients and p-value < 0.05):

    • Fixed Acidity: Positive and significant, with a coefficient of 4.9402. An increase in fixed acidity is associated with a slight increase in wine quality, indicating that this factor positively influences the perceived quality of wine.
    • Volatile Acidity: Negative and significant, with a coefficient of -2.1957. Higher levels of volatile acidity are associated with a decrease in wine quality, consistent with the understanding that volatile acidity can produce undesirable vinegar-like flavors.
    • Citric Acid: Negative and significant, with a coefficient of -0.4502. Higher levels of citric acid, while often adding freshness, are linked to a slight reduction in wine quality in this dataset.
    • Residual Sugar: Positive and significant, with a coefficient of 3.0520. Higher residual sugar content is associated with a higher quality rating, although the effect size is modest.
    • Chlorides: Strongly negative and significant, with a coefficient of -12.2315. Higher chloride levels, which indicate salt content, are strongly associated with a lower quality rating, as excessive salt can negatively impact taste.
    • Total Sulfur Dioxide: Negative and significant, with a coefficient of -0.1268. Higher levels of total sulfur dioxide are associated with lower wine quality, likely due to its impact on wine preservation, which can lead to unwanted chemical tastes if overused.
    • Density: Strongly negative and significant, with a coefficient of -1.047e+06. Higher density, which typically reflects more residual sugar and less alcohol, is associated with lower wine quality in this model.
    • Sulphates: Strongly positive and significant, with a coefficient of 20.2285. Higher levels of sulphates, which act as preservatives, are associated with higher wine quality.
    • Alcohol: Strongly positive and significant, with a coefficient of 2.788e+04. Higher alcohol content is strongly associated with higher wine quality, aligning with the hypothesis that alcohol content improves perceived quality.

Note that these coefficients are on the Box-Cox-transformed scale of each feature, so their magnitudes (e.g., the very large values for density and alcohol) are not directly comparable to the untransformed model's coefficients or to one another.

Model Testing - Third Approach: OLS with Reduced Features (Based on VIF)¶

In [ ]:
# Drop the high-VIF features ('fixed_acidity', 'density') together with
# features that were not statistically significant in the first model
X_train_reduced = X_train.drop(columns=['fixed_acidity', 'density',
                                        'citric_acid', 'pH',
                                        'residual_sugar', 'free_sulfur_dioxide',
                                        'total_sulfur_dioxide'])
X_holdout_reduced = X_holdout.drop(columns=['fixed_acidity', 'density',
                                            'citric_acid', 'pH',
                                            'residual_sugar', 'free_sulfur_dioxide',
                                            'total_sulfur_dioxide'])

# Fit model with reduced features
X_train_reduced_const = sm.add_constant(X_train_reduced)
X_holdout_reduced_const = sm.add_constant(X_holdout_reduced)

model_reduced = sm.OLS(y_train, X_train_reduced_const).fit()
print("\nModel with Reduced Features Summary (Based on VIF):")
print(model_reduced.summary())
Model with Reduced Features Summary (Based on VIF):
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.329
Model:                            OLS   Adj. R-squared:                  0.327
Method:                 Least Squares   F-statistic:                     156.0
Date:                Sat, 10 Aug 2024   Prob (F-statistic):          1.03e-108
Time:                        19:13:49   Log-Likelihood:                -1284.9
No. Observations:                1279   AIC:                             2580.
Df Residuals:                    1274   BIC:                             2606.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                2.7454      0.226     12.151      0.000       2.302       3.189
volatile_acidity    -1.1001      0.110     -9.971      0.000      -1.317      -0.884
chlorides           -1.5769      0.426     -3.697      0.000      -2.414      -0.740
sulphates            0.8175      0.122      6.688      0.000       0.578       1.057
alcohol              0.2939      0.019     15.819      0.000       0.257       0.330
==============================================================================
Omnibus:                       22.135   Durbin-Watson:                   1.985
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               35.753
Skew:                          -0.132   Prob(JB):                     1.72e-08
Kurtosis:                       3.776   Cond. No.                         247.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Insights from OLS Regression Results -> Third Approach (Reduced Features Based on VIF)¶

  • Overview:

    • R-squared: The model explains 32.9% of the variance in wine quality, which is moderately strong.
    • Adjusted R-squared: 0.327 -> The adjusted R-squared of 32.7% is very close to the R-squared value, suggesting that the model is well-balanced and that the included predictors are relevant without adding unnecessary complexity.
    • It does not present multicollinearity issues, since the high-VIF features were removed.
  • Significant Predictors (coefficients and p-value < 0.05):

    • Intercept (const): The intercept is significant and indicates the baseline level of wine quality when all predictors are zero (an extrapolation, since zero lies outside the observed range of these features).
    • Volatile Acidity: Negative and highly significant, with a coefficient of -1.1001. This indicates that higher levels of volatile acidity strongly decrease wine quality, which is consistent with its potential to produce off-flavors.
    • Chlorides: Negative and significant, with a coefficient of -1.5769. Higher chloride levels, which indicate salt content, are associated with lower wine quality, reflecting the detrimental effect of excessive saltiness on taste.
    • Sulphates: Positive and significant, with a coefficient of 0.8175. Higher levels of sulphates, which act as preservatives, are associated with higher wine quality, indicating that sulphates contribute positively to the preservation and taste of the wine.
    • Alcohol: Strongly positive and significant, with a coefficient of 0.2939. This suggests that higher alcohol content is strongly associated with higher wine quality, aligning with the hypothesis that alcohol content improves perceived quality.
  • Confidence Intervals:

    • Intercept (const):
      Coefficient: 2.7454
      95% Confidence Interval: [2.302, 3.189]
      Interpretation: The intercept (constant) is the expected wine quality score when all the predictor variables (volatile acidity, chlorides, sulphates, alcohol) are zero. The confidence interval does not include zero, indicating that the intercept is statistically significant. The relatively narrow interval suggests that the estimate is precise.

    • Volatile Acidity:
      Coefficient: -1.1001
      95% Confidence Interval: [-1.317, -0.884]
      Interpretation: The negative coefficient indicates that an increase in volatile acidity is associated with a decrease in wine quality. The confidence interval does not include zero, which confirms that volatile acidity has a statistically significant negative effect on wine quality. The interval is reasonably narrow, suggesting a precise estimate of this effect.

    • Chlorides:
      Coefficient: -1.5769
      95% Confidence Interval: [-2.414, -0.740]
      Interpretation: Chlorides also have a negative coefficient, meaning higher levels of chlorides are associated with lower wine quality. The confidence interval excludes zero, indicating statistical significance. However, the interval is somewhat wider compared to volatile acidity, suggesting more variability in the estimate, but it is still significant.

    • Sulphates:
      Coefficient: 0.8175
      95% Confidence Interval: [0.578, 1.057]
      Interpretation: The positive coefficient suggests that an increase in sulphates is associated with an increase in wine quality. The confidence interval does not include zero, confirming statistical significance. The interval is fairly narrow, indicating a precise estimate of the effect.

    • Alcohol:
      Coefficient: 0.2939
      95% Confidence Interval: [0.257, 0.330]
      Interpretation: Alcohol content has a positive effect on wine quality, as indicated by the positive coefficient. The confidence interval does not include zero, showing strong statistical significance. The interval is very narrow, which implies a highly precise estimate, making alcohol one of the most reliable predictors in the model.
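
The intervals quoted above can also be pulled programmatically from the fitted model (a small sketch using the statsmodels results API):

In [ ]:
# 95% confidence intervals for the reduced model's coefficients
ci = model_reduced.conf_int(alpha=0.05)
ci.columns = ['2.5%', '97.5%']
print(ci)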

Hypothesis Testing¶

In [ ]:
# Hypothesis Testing
# H0: "Alcohol content has no impact on perceived wine quality."
# H1: "Alcohol content has a positive impact on perceived wine quality."

# Extract the coefficient and p-value for the 'alcohol' variable
alcohol_coef = model_reduced.params['alcohol']
alcohol_pvalue = model_reduced.pvalues['alcohol']

# Display the results
print(f"Alcohol Coefficient: {alcohol_coef}")
print(f"Alcohol p-value: {alcohol_pvalue}")

# Hypothesis testing decision (the summary p-value is two-sided, so we
# also require a positive coefficient for the directional H1)
if alcohol_pvalue < 0.05 and alcohol_coef > 0:
    print("Reject the null hypothesis: Alcohol content has a significant "
          "positive impact on perceived wine quality.")
else:
    print("Fail to reject the null hypothesis: Alcohol content does not have "
          "a significant positive impact on perceived wine quality.")
Alcohol Coefficient: 0.2939209003966722
Alcohol p-value: 1.3474711519479874e-51
Reject the null hypothesis: Alcohol content has a significant positive impact on perceived wine quality.

Insights on Hypothesis Testing Results:

  • Alcohol Coefficient: 0.2939 -> each one-unit increase in alcohol content (one percentage point) is associated with an increase of roughly 0.294 in predicted quality, holding the other predictors constant.
  • Alcohol p-value: 1.35e-51 -> Extremely low, indicating strong evidence against the null hypothesis.
  • Conclusion: There is strong statistical evidence that alcohol content has a significant positive impact on perceived wine quality.
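
One caveat worth making explicit: the regression summary reports a two-sided p-value, while the alternative hypothesis here is directional (a positive effect). Since the estimated coefficient is positive, the one-sided p-value is simply half the two-sided one (a sketch, not part of the original notebook):

In [ ]:
# Convert the two-sided p-value to a one-sided one for H1: coefficient > 0
p_one_sided = alcohol_pvalue / 2 if alcohol_coef > 0 else 1 - alcohol_pvalue / 2
print(f"One-sided p-value: {p_one_sided:.3e}")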

Model Comparison and Evaluation¶

In [ ]:
# Evaluate each model
mse_all, mae_all, r2_all = evaluate_model(
    model_all, X_holdout_const, y_holdout)
mse_all_boxcox, mae_all_boxcox, r2_all_boxcox = evaluate_model(
    model_all_boxcox, X_holdout_boxcox_df, y_holdout)
mse_reduced, mae_reduced, r2_reduced = evaluate_model(
    model_reduced, X_holdout_reduced_const, y_holdout)

print("\nComparison of Model Performance:")
print(f"Model with All Features (No Transformation) - MSE: {mse_all}, "
      f"MAE: {mae_all}, R2: {r2_all}")
print(f"Model with All Features (Box-Cox) - MSE: {mse_all_boxcox}, "
      f"MAE: {mae_all_boxcox}, R2: {r2_all_boxcox}")
print(f"Model with Reduced Features (VIF Consideration) - MSE: {mse_reduced}, "
      f"MAE: {mae_reduced}, R2: {r2_reduced}")
Comparison of Model Performance:
Model with All Features (No Transformation) - MSE: 0.39002514396397386, MAE: 0.5035304415524223, R2: 0.4031803412795929
Model with All Features (Box-Cox) - MSE: 0.38402287532804735, MAE: 0.49277876529880127, R2: 0.41236506173744314
Model with Reduced Features (VIF Consideration) - MSE: 0.39583576287046485, MAE: 0.5162232158682217, R2: 0.3942888848019904
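
evaluate_model is imported from assets/utils/functions.py and not shown in the notebook. A minimal sketch consistent with how it is called here (a fitted statsmodels result, a design matrix with constant, and the targets) might look like:

In [ ]:
def evaluate_model(model, X, y):
    """Sketch: return (MSE, MAE, R^2) for a fitted statsmodels OLS model."""
    preds = model.predict(X)
    return (mean_squared_error(y, preds),
            mean_absolute_error(y, preds),
            r2_score(y, preds))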

Insights on Model Performance Comparison:

Best Performing Model: The model with all features after the Box-Cox transformation performs best, showing the lowest MSE and MAE and the highest R-squared on the hold-out set. This suggests that transforming the data helped the model better capture the relationships between the predictors and wine quality.

Trade-off with Simplicity: The reduced model, while simpler and easier to interpret, does not perform as well as the full model with or without transformation.

Conclusion: If the goal is to maximize predictive accuracy, the model with all features and Box-Cox transformation should be preferred. However, if interpretability and simplicity are more critical, the reduced model might still be a viable option, albeit with slightly lower predictive power.

QQ Plots for Residuals Analysis¶

In [ ]:
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

sm.qqplot(model_all.resid, line='s', ax=axes[0])
axes[0].set_title('QQ Plot (All Features No Transformation)')

sm.qqplot(model_all_boxcox.resid, line='s', ax=axes[1])
axes[1].set_title('QQ Plot (All Features Box-Cox)')

sm.qqplot(model_reduced.resid, line='s', ax=axes[2])
axes[2].set_title('QQ Plot (Reduced Features Based on VIF)')

plt.tight_layout()
plt.show()
[Figure: QQ plots of residuals for the three models]

Insights from QQ Plots:

  • QQ Plot (All Features No Transformation):
    The residuals in this plot show some deviation from the theoretical quantiles, especially at the tails.
    This indicates that the residuals are not perfectly normally distributed, suggesting potential issues with the model's fit.
    The slight S-shape suggests that the model may not capture the underlying data patterns perfectly, possibly due to the presence of outliers or the effect of untransformed skewed variables.

  • QQ Plot (All Features Box-Cox):
    After applying the Box-Cox transformation, the residuals align more closely with the theoretical quantiles, especially in the middle range.
    This indicates an improvement in normality compared to the model with no transformation.
    The Box-Cox transformation appears to have helped stabilize variance and reduce skewness in the data, leading to a better model fit.
    However, some deviations still exist at the extremes, though they are less pronounced than in the non-transformed model.

  • QQ Plot (Reduced Features Based on VIF):
    The residuals for the model with reduced features (based on VIF) also show alignment with the theoretical quantiles but exhibit slight deviations at the tails.
    The plot suggests that the reduced model, while simpler, still struggles slightly with capturing the data's full complexity. The deviations, particularly at the extremes, might be due to the exclusion of some influential features, leading to a less accurate model.

Summary:

  • Best Normality Fit: The model with all features and Box-Cox transformation demonstrates the best alignment with the normal distribution, suggesting that this model has the most normally distributed residuals and, therefore, the best fit among the three models.

  • Impact of Feature Reduction: The reduced model (based on VIF) shows some deviations in the tails, indicating that reducing the number of features may have slightly compromised the model's ability to fit the data perfectly.
    However, this model still provides a reasonably good fit, with residuals mostly following the normal distribution.

Final Conclusion¶

  • Model with All Features (No Transformation):
    This model included all the available features without any transformations. It explained approximately 40.3% of the variance in wine quality on the hold-out set (R² = 0.4032) and showed moderate predictive performance. However, it faced multicollinearity issues, particularly with fixed acidity and density, which may have introduced noise into the coefficient estimates.

  • Model with All Features (Box-Cox Transformation):
    By applying the Box-Cox transformation to all features, we aimed to stabilize variance and make the data more normally distributed. This model performed slightly better, explaining 41.2% of the variance on the hold-out set (R² = 0.4124), and demonstrated improved predictive accuracy with lower MSE and MAE values. The QQ plot showed that its residuals were more normally distributed than those of the untransformed model, indicating a better fit.

  • Model with Reduced Features (Based on VIF):
    This model excluded features with a high Variance Inflation Factor (VIF) to reduce multicollinearity. While simpler and more interpretable, it explained slightly less variance (hold-out R² = 0.3943) and had higher error metrics than the other two models. Dropping features likely discarded some useful information, resulting in a small loss of predictive power.

Suggestions for Improvement¶

  • Outlier Analysis:
    Outliers can significantly impact regression models by skewing coefficient estimates and violating the normality assumptions; a Cook's distance check is sketched at the end of this section.

  • Handling Multicollinearity:
    Multicollinearity can inflate the variance of coefficient estimates and make the model unstable.

  • In-depth feature analysis:
    I could have explored more features, such as interactions between variables, polynomial terms, or other transformations, to capture more complex relationships in the data.
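
As a concrete starting point for the outlier analysis suggested above, influential observations can be flagged with Cook's distance from statsmodels (a sketch using the common 4/n cutoff, which is a heuristic rather than a hard rule):

In [ ]:
# Flag influential observations in the reduced model via Cook's distance
influence = model_reduced.get_influence()
cooks_d, _ = influence.cooks_distance
threshold = 4 / len(y_train)
print(f"Observations with Cook's D > {threshold:.4f}: "
      f"{(cooks_d > threshold).sum()}")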