from assets.utils.functions import *

%load_ext pycodestyle_magic
#%reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on
%flake8_on --max_line_length 79
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Load the marketing-campaign dataset into the frame used by every cell below.
mkt_camp_df = pd.read_csv("./assets/data/WA_Marketing-Campaign.csv")

# Project helper (presumably from the assets.utils.functions star import);
# its recorded output lists shape, column names, dtypes and summary stats.
dataframe_info(mkt_camp_df)

DataFrame Shape:
(548, 7)

Column Names:
Index(['MarketID', 'MarketSize', 'LocationID', 'AgeOfStore', 'Promotion',
       'week', 'SalesInThousands'],
      dtype='object')

Data Types:
MarketID              int64
MarketSize           object
LocationID            int64
AgeOfStore            int64
Promotion             int64
week                  int64
SalesInThousands    float64
dtype: object

Summary Stats:
       MarketID  LocationID  AgeOfStore  Promotion    week  SalesInThousands
count    548.00      548.00      548.00     548.00  548.00            548.00
mean       5.72      479.66        8.50       2.03    2.50             53.47
std        2.88      287.97        6.64       0.81    1.12             16.76
min        1.00        1.00        1.00       1.00    1.00             17.34
25%        3.00      216.00        4.00       1.00    1.75             42.54
50%        6.00      504.00        7.00       2.00    2.50             50.20
75%        8.00      708.00       12.00       3.00    3.25             60.48
max       10.00      920.00       28.00       3.00    4.00             99.65

# List the distinct values of the only non-numeric column, 'MarketSize'.
check_unique_values(mkt_camp_df, 'MarketSize')

Unique values in column 'MarketSize':
['Medium' 'Small' 'Large']

# Report missing values, non-null counts and duplicate rows before analysis.
check_data_quality(mkt_camp_df)

Missing Values:
MarketID            0
MarketSize          0
LocationID          0
AgeOfStore          0
Promotion           0
week                0
SalesInThousands    0
dtype: int64

Non-null Counts:
MarketID            548
MarketSize          548
LocationID          548
AgeOfStore          548
Promotion           548
week                548
SalesInThousands    548
dtype: int64

Number of duplicate rows: 0

# Project helper; presumably visualises promotions per market size
# (implementation is outside this notebook -- confirm).
prom_mkt_size_view(mkt_camp_df)

# Number of observations for every (MarketSize, Promotion) pair.
# Naming the size column while resetting the index yields the
# three-column frame ['MarketSize', 'Promotion', 'Count'] in one step.
mkt_grp_df = (
    mkt_camp_df
    .groupby(['MarketSize', 'Promotion'])
    .size()
    .reset_index(name='Count')
)

promotion_analysis(mkt_grp_df, mkt_camp_df)

# How many observations were collected under each promotion
promotion_counts = mkt_camp_df['Promotion'].value_counts()
print(promotion_counts)

Promotion
3    188
2    188
1    172
Name: count, dtype: int64

avg_sales_by_mkt_size_and_promotion(mkt_camp_df)

# Scatter of sales against store age, one colour per promotion
sns.scatterplot(
    data=mkt_camp_df,
    x='AgeOfStore',
    y='SalesInThousands',
    hue='Promotion',
)
plt.title('Sales by Age Of Store')
plt.show()

corr_with_sales(mkt_camp_df)

# Rank the 'MarketSize' categories from smallest (1) to largest (3)
market_size_mapping = {
    size: rank
    for rank, size in enumerate(('Small', 'Medium', 'Large'), start=1)
}

# Store the ordinal encoding next to the original text column
mkt_camp_df['MarketSize_Ordinal'] = (
    mkt_camp_df['MarketSize'].map(market_size_mapping)
)

plot_sales_by_market_size(mkt_camp_df)

# Mean, spread and sample size of sales per ordinal market size
sales_by_size = mkt_camp_df.groupby('MarketSize_Ordinal')['SalesInThousands']
mkt_agg_data = sales_by_size.agg(
    mean='mean',
    std='std',
    count='count',
).reset_index()

print(mkt_agg_data)

   MarketSize_Ordinal       mean        std  count
0                   1  57.409333   6.633101     60
1                   2  43.985344   9.051378    320
2                   3  70.116726  17.052156    168

# Project helper; its recorded output includes the 0.33, 0.66 and 1.00
# quantiles of the chosen column.
distribution_by_sales(mkt_camp_df, 'SalesInThousands')

Quantiles:
0.33    45.2559
0.66    55.3914
1.00    99.6500
Name: SalesInThousands, dtype: float64

# Prints a 95% confidence interval computed from all sales observations
# (presumably for the mean -- confirm against the helper).
calc_confidence_interval(mkt_camp_df['SalesInThousands'])

The 95.0% confidence interval is from 52.06336535702625 to 54.86904340209785

# Prints a 95% CI for the mean of the column. NOTE(review): the recorded
# interval is much narrower than the one from calc_confidence_interval, so
# it is presumably built from resampled means -- confirm in the helper.
demonstrate_clt(mkt_camp_df, 'SalesInThousands')

95% Confidence Interval for the mean of SalesInThousands: (53.304578273144756, 53.68956372685524)

# Per-promotion mean and analytical 95% CI; the CI routine is passed in as
# `mean_confidence_interval` (defined outside this notebook).
analytical_promotion_ci(mkt_camp_df, mean_confidence_interval)

Promotion 3:  95% Confidence Interval (Analytical) -> Mean = 55.36, CI = [52.95, 57.78]
Promotion 2:  95% Confidence Interval (Analytical) -> Mean = 47.33, CI = [45.16, 49.50]
Promotion 1:  95% Confidence Interval (Analytical) -> Mean = 58.10, CI = [55.61, 60.59]

# Per-promotion mean and bootstrap 95% CI, for comparison with the
# analytical intervals.
bootstrap_promotion_ci(mkt_camp_df)

Promotion 3:  95% Confidence Interval (Bootstrap) -> Mean = 55.35, CI = [53.05, 57.75]
Promotion 2:  95% Confidence Interval (Bootstrap) -> Mean = 47.34, CI = [45.25, 49.50]
Promotion 1:  95% Confidence Interval (Bootstrap) -> Mean = 58.10, CI = [55.66, 60.63]

# Sales observations for each promotion arm
sales_promo1 = mkt_camp_df.loc[
    mkt_camp_df['Promotion'] == 1, 'SalesInThousands']
sales_promo2 = mkt_camp_df.loc[
    mkt_camp_df['Promotion'] == 2, 'SalesInThousands']
sales_promo3 = mkt_camp_df.loc[
    mkt_camp_df['Promotion'] == 3, 'SalesInThousands']

# Pairwise t-tests on the raw sales of every pair of promotions
print("Promotion 1 vs. Promotion 2:")
perform_ttest(sales_promo1, sales_promo2)

print("\nPromotion 1 vs. Promotion 3:")
perform_ttest(sales_promo1, sales_promo3)

print("\nPromotion 2 vs. Promotion 3:")
perform_ttest(sales_promo2, sales_promo3)

Promotion 1 vs. Promotion 2:
T-statistic: 6.453671702181005, P-value: 3.5506696512134566e-10
Estimated treatment effect (mean difference): 10.76959673428997
95% Confidence Interval for the mean difference: [7.4744540993865805, 14.064739369193358]
Reject the null hypothesis - significant difference in mean sales.

Promotion 1 vs. Promotion 3:
T-statistic: 1.5551383687293547, P-value: 0.12079667272313274
Estimated treatment effect (mean difference): 2.734543542800594
95% Confidence Interval for the mean difference: [-0.7215683089533798, 6.190655394554568]
Fail to reject the null hypothesis - no significant difference in mean sales.

Promotion 2 vs. Promotion 3:
T-statistic: -4.88139271089348, P-value: 1.562894303627738e-06
Estimated treatment effect (mean difference): -8.035053191489375
95% Confidence Interval for the mean difference: [-11.271740842965214, -4.7983655400135365]
Reject the null hypothesis - significant difference in mean sales.

# Range and centre of sales, used as reference when binning for Chi-Square
sales_values = mkt_camp_df['SalesInThousands']
min_sales = sales_values.min()
mean_sales = sales_values.mean()
max_sales = sales_values.max()

# Report the three reference values
print(f"Minimum SalesInThousands: {min_sales}")
print(f"Mean SalesInThousands: {mean_sales}")
print(f"Maximum SalesInThousands: {max_sales}")

Minimum SalesInThousands: 17.34
Mean SalesInThousands: 53.46620437956205
Maximum SalesInThousands: 99.65

# Step 1: Categorize 'SalesInThousands' into terciles (Low / Medium / High).
# The former hard-coded edges [min, mean, max, inf] with right-closed
# intervals left the minimum observation unbinned (NaN) and gave 'High' the
# interval (max, inf], which can never contain data, so only two categories
# were actually compared. Quantile edges are derived from the data, include
# the minimum and populate all three labels.
# NOTE: re-run this cell; results differ from those obtained with the
# old bins.
mkt_camp_df['SalesCategory'] = pd.qcut(
    mkt_camp_df['SalesInThousands'],
    q=3,
    labels=['Low', 'Medium', 'High'])

# Step 2: Create a Contingency Table (promotion x sales category counts)
contingency_table = pd.crosstab(
    mkt_camp_df['Promotion'], mkt_camp_df['SalesCategory'])

# Step 3: Apply the Chi-Square test of independence
chi2_stat, p_val, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-Square Test: Statistic={chi2_stat}, p-value={p_val}")

# Interpretation at the 5% significance level
if p_val < 0.05:
    print("There is a significant difference in the distribution "
          "of sales categories between promotions.")
else:
    print("There is no significant difference in the distribution "
          "of sales categories between promotions.")

Chi-Square Test: Statistic=50.972857884948425, p-value=8.538558276393605e-12
There is a significant difference in the distribution of sales categories between promotions.

Table of Contents

Introduction

Notebook settings

A/B Testing Goals

Data Loading and Verification

EDA

Conclusion about Promotion and Market Size

Analyzing Sales and Age of Store

Correlations

Conclusion between Market Size and Sales Analysis

A/B Testing

Sales Analysis using CLT and Confidence Interval

Conclusion for CLT and CI Analysis

Confidence Interval: Analytical and Bootstrap Approach

Bootstrap method

Conclusion about the Confidence Intervals (Analytical x Bootstrap)

Hypotheses Formulation and Testing

Promotion 1 vs. Promotion 2

Promotion 1 vs. Promotion 3

Promotion 2 vs. Promotion 3

Conclusion about A/B Testing

Additional Approach using Chi-Square Test

Chi-Square Analysis Conclusion

Decision Making

Columns	Description	Datatypes	Count
MarketID	unique identifier for market	int64	548
MarketSize	size of market area by sales	object	548
LocationID	unique identifier for store location	int64	548
AgeOfStore	age of store in years	int64	548
Promotion	one of three promotions that were tested	int64	548
week	one of four weeks when the promotions were run	int64	548
SalesInThousands	sales amount for a specific LocationID, Promotion, and week	float64	548