from assets.utils.functions import *

# Notebook-only IPython magics: load the pycodestyle_magic extension, turn
# on its per-cell style checks and render matplotlib figures inline.
%load_ext pycodestyle_magic
#%reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on
# NOTE(review): flake8 checking is switched on a second time here, now with
# an explicit 79-character limit; the bare call above looks redundant -
# confirm before removing either line.
%flake8_on --max_line_length 79
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency

# Read the Cookie Cats export into a DataFrame and preview its first rows.
DATA_PATH = "./assets/data/cookie_cats.csv"
cocats_df = pd.read_csv(DATA_PATH)
cocats_df.head(n=5)

# Data-quality report from a project helper (presumably provided by the
# star import at the top of the notebook). The recorded output lists
# missing values, non-null counts and the number of duplicate rows.
check_data_quality(cocats_df)

Missing Values:
userid            0
version           0
sum_gamerounds    0
retention_1       0
retention_7       0
dtype: int64

Non-null Counts:
userid            90189
version           90189
sum_gamerounds    90189
retention_1       90189
retention_7       90189
dtype: int64

Number of duplicate rows: 0

# Overview of the raw data from a project helper. The recorded output shows
# the shape, column names, dtypes and summary statistics.
dataframe_info(cocats_df)

DataFrame Shape:
(90189, 5)

Column Names:
Index(['userid', 'version', 'sum_gamerounds', 'retention_1', 'retention_7'], dtype='object')

Data Types:
userid             int64
version           object
sum_gamerounds     int64
retention_1         bool
retention_7         bool
dtype: object

Summary Stats:
           userid  sum_gamerounds
count    90189.00        90189.00
mean   4998412.23           51.87
std    2883285.61          195.05
min        116.00            0.00
25%    2512230.00            5.00
50%    4995815.00           16.00
75%    7496452.00           51.00
max    9999861.00        49854.00

# Check unique values from object type columns
check_unique_values(cocats_df, 'version')

Unique values in column 'version':
['gate_30' 'gate_40']

# Identify the player behind the largest sum_gamerounds value. The maximum
# is read from the data instead of being typed in by hand, so the cell keeps
# working if the underlying file changes.
max_gamerounds = cocats_df['sum_gamerounds'].max()
user_with_max_gamerounds = cocats_df.loc[
    cocats_df['sum_gamerounds'] == max_gamerounds, 'userid'].iloc[0]
print(f"User with sum_gamerounds = {max_gamerounds}: "
      f"{user_with_max_gamerounds}\n")

# Show every record belonging to that player, reusing the id found above
# rather than a hard-coded copy of it.
records_for_user = cocats_df.loc[
    cocats_df['userid'] == user_with_max_gamerounds]
print(records_for_user)

User with sum_gamerounds = 49854: 6390605

        userid  version  sum_gamerounds  retention_1  retention_7
57702  6390605  gate_30           49854        False         True

# Order players by rounds played (ascending), so the largest value ends up
# in the final row.
sorted_df = cocats_df.sort_values('sum_gamerounds')

# Keep every row except that final one (the outlier).
sorted_df_without_outlier = sorted_df.iloc[:-1]

# Largest number of rounds among the remaining players.
new_max_sum_gamerounds = sorted_df_without_outlier['sum_gamerounds'].max()

print(f"The max game rounds without the outlier is: {new_max_sum_gamerounds}")

The max game rounds without the outlier is: 2961

# Keep only players with fewer than 49854 rounds, which drops the outlier.
is_below_outlier = cocats_df['sum_gamerounds'] < 49854
cocats_df = cocats_df[is_below_outlier]

# Repeat the overview on the filtered data. The recorded output shows one
# row fewer (90188) and a maximum of 2961 rounds.
dataframe_info(cocats_df)

DataFrame Shape:
(90188, 5)

Column Names:
Index(['userid', 'version', 'sum_gamerounds', 'retention_1', 'retention_7'], dtype='object')

Data Types:
userid             int64
version           object
sum_gamerounds     int64
retention_1         bool
retention_7         bool
dtype: object

Summary Stats:
           userid  sum_gamerounds
count    90188.00        90188.00
mean   4998396.80           51.32
std    2883297.87          102.68
min        116.00            0.00
25%    2512211.25            5.00
50%    4995804.00           16.00
75%    7496460.75           51.00
max    9999861.00         2961.00

# Distribution of rounds played per player: histogram with a KDE overlay.
sum_gamerounds = cocats_df['sum_gamerounds']

plt.figure(figsize=(10, 6))
# density=True puts the histogram on the same probability-density scale as
# the KDE curve. With raw counts on the y-axis the KDE (whose values are
# tiny by comparison) would be squashed onto the x-axis and be unreadable.
plt.hist(sum_gamerounds, bins=30, density=True, edgecolor='black')
sns.kdeplot(sum_gamerounds.to_numpy(), color='red')

plt.xlabel('Game Rounds Played')
plt.ylabel('Density')
plt.title('Game Rounds Played Distribution')
plt.show()

# Project helper. The recorded output reports the share of the data below
# 500 game rounds and the 95th percentile of rounds played.
analyze_game_rounds(cocats_df)

 As we see, 99.05% of our data is below 500 Game Rounds.
 The 95 percentile of the data is at: 221.0 Game Rounds. 
 This means 85706 players are under this range.

# Project helper. The recorded output reports total, inactive (no rounds
# played) and active player counts, plus the active players' average rounds.
summarize_player_activity(cocats_df)

Total Players: 90188, without the outlier already removed.
Total Inactive Players: 3994, with no rounds played.
Total Active Players: 86194, with more than 1 rounds played.
Average Game Rounds Played by Active Players: 53.70

# Number of players for each distinct count of game rounds played.
players_per_round_count = cocats_df.groupby('sum_gamerounds').count()
df_plot = players_per_round_count.reset_index()

plt.figure(figsize=(10, 6))
# Only the first 100 rows of the grouped table are drawn.
ax = sns.lineplot(data=df_plot.head(100), x='sum_gamerounds', y='userid')
ax.set_xlabel('Number of game rounds')
ax.set_ylabel('Number of players')
ax.set_title('Game rounds played during first 14 days')

Text(0.5, 1.0, 'Game rounds played during first 14 days')

# Project helper. No text output is recorded for it; presumably it only
# draws figures - TODO confirm against its definition.
plot_game_data(cocats_df)

# Print the 1-day retention rate for each version (gate_30 / gate_40).
percent_retention(cocats_df, 'retention_1')

The overall retention for 1 day(s) by version is:
version
gate_30    44.82%
gate_40    44.23%
Name: retention_1, dtype: object

# Bootstrap the 1-day retention with the project's resampling helper.
# NOTE(review): the recorded mean (~0.445) lies between the two per-version
# rates, so this looks like a pooled estimate rather than a per-version
# one - confirm against draw_bs_reps / retention.
iterations = 10000  # how many bootstrap resamples to draw
bs_replicates = draw_bs_reps(
    df=cocats_df, column='retention_1', func=retention, iterations=iterations
)

# Point estimate and 95% percentile interval taken from the replicates.
conf_interval = np.percentile(bs_replicates, q=[2.5, 97.5])
mean_retention = np.mean(bs_replicates)
print(f"Mean retention: {mean_retention}")
print(f"95% confidence interval: {conf_interval}")

Mean retention: 0.44524557923448793
95% confidence interval: [0.44207655 0.44847458]

# Compare 1-day retention between the two versions. The recorded output
# gives a t-statistic, a p-value and a one-sentence verdict.
ttest_compare_retention(cocats_df, 'version', 'retention_1')

T-statistic: 1.787115337299244
P-value: 0.07392220630182518
There is no statistically significant difference in retention rates between gate_30 and gate_40.

# Print the 7-day retention rate for each version (gate_30 / gate_40).
percent_retention(cocats_df, 'retention_7')

The overall retention for 7 day(s) by version is:
version
gate_30    19.02%
gate_40     18.2%
Name: retention_7, dtype: object

# Bootstrap the 7-day retention with the project's resampling helper.
# NOTE(review): the recorded mean (~0.186) lies between the two per-version
# rates, so this looks like a pooled estimate rather than a per-version
# one - confirm against draw_bs_reps / retention.
iterations = 10000  # how many bootstrap resamples to draw
bs_replicates = draw_bs_reps(
    df=cocats_df, column='retention_7', func=retention, iterations=iterations
)

# Point estimate and 95% percentile interval taken from the replicates.
conf_interval = np.percentile(bs_replicates, q=[2.5, 97.5])
mean_retention = np.mean(bs_replicates)
print(f"Mean retention: {mean_retention}")
print(f"95% confidence interval: {conf_interval}")

Mean retention: 0.18606312147957602
95% confidence interval: [0.18350557 0.18862848]

# Compare 7-day retention between the two versions. The recorded output
# gives a t-statistic, a p-value and a one-sentence verdict.
ttest_compare_retention(cocats_df, 'version', 'retention_7')

T-statistic: 3.1575495965685936
P-value: 0.001591535729785478
There is a statistically significant difference in retention rates between gate_30 and gate_40.

Table of Contents¶

Introduction¶

Notebook settings¶

A/B Testing Goals¶

Data Loading and Verification¶

Outlier Analysis Conclusion and Action Plan¶

EDA¶

Game Rounds Played Distribution¶

Retention 1 Analysis¶

Bootstrapping for 1-day Retention¶

Conclusion Retention 1 Analysis¶

Conclusion:¶

Retention 7 Analysis¶

Bootstrapping for 7-day Retention¶

Conclusion Retention 7 Analysis¶

Conclusion:¶

Decision Making¶

Columns	Description	Datatypes	Count
userid	A unique number that identifies each player	int64	90189
version	Whether the player was put in the control group (gate_30 - a gate at level 30) or the group with the moved gate (gate_40 - a gate at level 40)	object	90189
sum_gamerounds	The number of game rounds played by the player during the first 14 days after install	int64	90189
retention_1	Did the player come back and play 1 day after installing?	bool	90189
retention_7	Did the player come back and play 7 days after installing?	bool	90189

	userid	version	sum_gamerounds	retention_1	retention_7
0	116	gate_30	3	False	False
1	337	gate_30	38	True	False
2	377	gate_40	165	True	False
3	483	gate_40	1	False	False
4	488	gate_40	179	True	True