from assets.utils.functions import *

%load_ext pycodestyle_magic
#%reload_ext pycodestyle_magic
%pycodestyle_on
%flake8_on
%flake8_on --max_line_length 79
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Open the SQLite database and keep the returned connection in `conn`,
# the handle that the queries in the following cells pass to pandas.
conn = connect_to_db(sqlite_path)

Connected to the SQLite database

<sqlite3.Connection at 0x17ef34940>

# Print, for every table, its row count, column count and column types
# (`info` is a helper defined outside this notebook).
info(conn)

Table: Answer
Number of rows: 236898
Number of columns: 4
Columns:
- AnswerText (VARCHAR(10000))
- SurveyID (INTEGER)
- UserID (INTEGER)
- QuestionID (INTEGER)

Table: Question
Number of rows: 105
Number of columns: 2
Columns:
- questiontext (VARCHAR(1000))
- questionid (INTEGER)

Table: Survey
Number of rows: 5
Number of columns: 2
Columns:
- SurveyID (INTEGER)
- Description (VARCHAR(255))

# Report the per-column NaN count of each table
# (helper and `table_schema` are defined outside this notebook).
check_nan_values(sqlite_path, table_schema)

Number of NaN values in Answer:
AnswerText    0
SurveyID      0
UserID        0
QuestionID    0
dtype: int64
Number of NaN values in Question:
questiontext    0
questionid      0
dtype: int64
Number of NaN values in Survey:
SurveyID       0
Description    0
dtype: int64

# Report the number of missing values per column, table by table.
# NOTE(review): the exact definition of "missing" lives in the helper
# (presumably NULL / empty values) - confirm against its implementation.
check_missing_values(conn)

Number of missing values in column AnswerText of the Answer table: 0
Number of missing values in column SurveyID of the Answer table: 0
Number of missing values in column UserID of the Answer table: 0
Number of missing values in column QuestionID of the Answer table: 0
Number of missing values in column questiontext of the Question table: 0
Number of missing values in column questionid of the Question table: 0
Number of missing values in column SurveyID of the Survey table: 0
Number of missing values in column Description of the Survey table: 0

# Report the number of duplicated rows found in each table
# (helper defined outside this notebook).
check_duplicates(sqlite_path, table_schema)

Number of duplicate rows in Answer: 0
Number of duplicate rows in Question: 0
Number of duplicate rows in Survey: 0

# Load the whole Survey table and display it
query = "SELECT * FROM Survey "
survey = pd.read_sql_query(sql=query, con=conn)
print(survey)

   SurveyID                    Description
0      2014  mental health survey for 2014
1      2016  mental health survey for 2016
2      2017  mental health survey for 2017
3      2018  mental health survey for 2018
4      2019  mental health survey for 2019

# Load the whole Answer table and display it
query = "SELECT * FROM Answer "
answer = pd.read_sql_query(sql=query, con=conn)
print(answer)

                AnswerText  SurveyID  UserID  QuestionID
0                       37      2014       1           1
1                       44      2014       2           1
2                       32      2014       3           1
3                       31      2014       4           1
4                       31      2014       5           1
...                    ...       ...     ...         ...
236893               Other      2016    2689         117
236894             Support      2016    2690         117
236895  Back-end Developer      2016    2691         117
236896     DevOps/SysAdmin      2016    2692         117
236897               Other      2016    2693         117

[236898 rows x 4 columns]

# Load the whole Question table and display it
query = "SELECT * FROM Question "
result = pd.read_sql_query(sql=query, con=conn)
print(result)

                                          questiontext  questionid
0                                    What is your age?           1
1                                 What is your gender?           2
2                         What country do you live in?           3
3    If you live in the United States, which state ...           4
4                               Are you self-employed?           5
..                                                 ...         ...
100  Do you think that team members/co-workers woul...         114
101  If yes, what condition(s) have you been diagno...         115
102  If maybe, what condition(s) do you believe you...         116
103  Which of the following best describes your wor...         117
104                              Do you work remotely?         118

[105 rows x 2 columns]

# Find gaps in the sequence of question ids.
# A gap starts right after any id whose successor (id + 1) does not exist
# and ends just before the next id that does exist.
# The highest id is excluded: it has no successor by definition, and
# without this filter it was reported as a spurious gap with a NULL end.
query_check_qid = ("SELECT q1.questionid + 1 AS start_of_gap, "
                   "(SELECT MIN(q2.questionid) - 1 "
                   "FROM Question q2 "
                   "WHERE q2.questionid > q1.questionid) AS end_of_gap "
                   "FROM Question q1 "
                   "WHERE NOT EXISTS (SELECT 1 "
                   "FROM Question q2 "
                   "WHERE q2.questionid = q1.questionid + 1) "
                   "AND q1.questionid < "
                   "(SELECT MAX(questionid) FROM Question)")
gaps = pd.read_sql_query(query_check_qid, conn)
print(gaps)

   start_of_gap  end_of_gap
0            35        47.0
1           119         NaN

# Count the distinct question texts to detect repeated wording
query = (
    "SELECT COUNT(DISTINCT questiontext) as num_questions FROM Question "
)
result = pd.read_sql_query(sql=query, con=conn)
print(result)

   num_questions
0            105

# Join every answer with its question text; the join on Survey keeps only
# answers whose SurveyID exists in the Survey table.
query_merge_all = """
SELECT a.*, q.questionid, q.questiontext
FROM Answer a
INNER JOIN Question q ON a.QuestionID = q.questionid
INNER JOIN Survey s ON a.SurveyID = s.SurveyID
"""

# Index by respondent, keep the four working columns and lower-case
# the column names.
main_df = (
    pd.read_sql_query(query_merge_all, conn, index_col='UserID')
    .sort_index()
    .loc[:, ['SurveyID', 'questionid', 'questiontext', 'AnswerText']]
    .rename(columns={'SurveyID': 'surveyid', 'AnswerText': 'answertext'})
)
main_df

# Show every answer row recorded for the respondent with UserID 1
main_df[main_df.index == 1]

# Column dtypes, non-null counts and memory footprint of the merged frame
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 236898 entries, 1 to 4218
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   surveyid      236898 non-null  int64 
 1   questionid    236898 non-null  int64 
 2   questiontext  236898 non-null  object
 3   answertext    236898 non-null  object
dtypes: int64(2), object(2)
memory usage: 9.0+ MB

# Rows that hold the answer to question 2 (gender)
is_gender_question = main_df['questionid'] == 2

# Normalise the free-text gender answers with the project helper and write
# them back in place
selected_answertext = main_df.loc[is_gender_question, 'answertext']
updated_answertext = selected_answertext.apply(group_gender_responses)
main_df.loc[is_gender_question, 'answertext'] = updated_answertext

# Sanity check: after normalisation only these three labels should remain
filtered_df = main_df[is_gender_question]
expected_labels = ['Female', 'Male', 'Other']
other_values = filtered_df[~filtered_df['answertext'].isin(expected_labels)]

if other_values.empty:
    print("No other values found.")
else:
    print(other_values['answertext'].unique())

No other values found.

# Build the gender dataframe from question 2 and give the answer column a
# meaningful name
gender_df = create_df_from_questionid(main_df, 2, "gender_df").rename(
    columns={'answertext': 'gender'}
)

gender_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4218 entries, 1 to 4218
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   surveyid    4218 non-null   int64 
 1   questionid  4218 non-null   int64 
 2   gender      4218 non-null   object
dtypes: int64(2), object(1)
memory usage: 131.8+ KB

# Plot the gender distribution (plotting helper defined outside this notebook)
plot_gender_distribution(gender_df)

# Summary statistics of the raw (unfiltered) answers to question 1 (age)
is_age_question = main_df['questionid'] == 1
raw_age_df = main_df.loc[is_age_question, 'answertext'].astype(float)
raw_age_df.describe().round(2)

count    4218.00
mean       33.92
std        10.48
min       -29.00
25%        28.00
50%        33.00
75%        38.00
max       329.00
Name: answertext, dtype: float64

# Same statistics after keeping only ages in the 18-65 range (inclusive)
filt_age_df = main_df[main_df['questionid'] == 1]
answered_age = filt_age_df['answertext'].astype(int)
filt_age_df = filt_age_df[answered_age.between(18, 65)]
filt_age_df['answertext'].astype(int).describe().round(2)

count    4195.00
mean       33.80
std         7.94
min        18.00
25%        28.00
50%        33.00
75%        38.00
max        65.00
Name: answertext, dtype: float64

# Build the age dataframe from the answers to question 1
age_df = create_df_from_questionid(main_df, 1, "age_df")

# Convert the raw answer text to an integer age
age_df['answertext'] = age_df['answertext'].astype(int)

# Keep respondents aged 18-65 (inclusive). Take an explicit copy so the
# column added below is written to an independent frame rather than to a
# slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
age_df = age_df[age_df['answertext'].between(18, 65)].copy()

# Label each respondent with an age group
age_df['group_age'] = age_df['answertext'].apply(group_ages)

# Rename the 'answertext' column to 'age'
age_df = age_df.rename(columns={'answertext': 'age'})

age_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4195 entries, 1 to 4218
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   surveyid    4195 non-null   int64 
 1   questionid  4195 non-null   int64 
 2   age         4195 non-null   int64 
 3   group_age   4195 non-null   object
dtypes: int64(3), object(1)
memory usage: 163.9+ KB

ages = age_df['age'].astype(int)

# One bin per integer age. The last edge must be 66 so that the final bin
# is [65, 66] and 65-year-olds (kept by the 18-65 filter) are counted;
# range(18, 65) stopped at edge 64 and left them out of the histogram.
# stat='density' makes the y axis agree with its 'Density' label (the
# default statistic is a raw count).
sns.displot(ages, bins=range(18, 67), stat='density', kde=True,
            color='darkblue')

# Labels and title
plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Distribution of Ages')

plt.show()

# Central tendency of the filtered ages
age_column = age_df['age']
mean_age = age_column.mean()
median_age = age_column.median()

print(f'Mean Age Total: {round(mean_age, 2)}')
print(f'Median Age Total: {round(median_age, 2)}')

Mean Age Total: 33.8
Median Age Total: 33.0

# Plot the age distribution (plotting helper defined outside this notebook)
plot_age_distribution(age_df)

# Combine age and gender per respondent (both frames are indexed by user).
# Overlapping columns get '_age' / '_gender' suffixes; the survey id is the
# same on both sides, so only one copy is kept under its plain name.
age_gender_df = (
    age_df.merge(gender_df, left_index=True, right_index=True,
                 suffixes=('_age', '_gender'))
    .drop(columns='surveyid_gender')
    .rename(columns={'surveyid_age': 'surveyid'})
)

age_gender_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4195 entries, 1 to 4218
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   surveyid           4195 non-null   int64 
 1   questionid_age     4195 non-null   int64 
 2   age                4195 non-null   int64 
 3   group_age          4195 non-null   object
 4   questionid_gender  4195 non-null   int64 
 5   gender             4195 non-null   object
dtypes: int64(4), object(2)
memory usage: 229.4+ KB

# Age spread per gender, one coloured box per gender
box_plot = sns.boxplot(
    data=age_gender_df,
    x='gender',
    y='age',
    hue='gender',
    palette=['red', 'green', 'yellow'],
)

plt.title('Age Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Age')
plt.show()

# Plot the age distribution split by gender
# (plotting helper defined outside this notebook)
plot_gender_age_distribution(age_gender_df)

age_percentage = calculate_percentage(age_gender_df, 'group_age')
gender_percentage = calculate_percentage(age_gender_df, 'gender')

# Two pies side by side: age groups on the left, gender on the right
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
pie_specs = [
    (age_percentage, 'Age Distribution'),
    (gender_percentage, 'Gender Distribution'),
]
for ax, (shares, title) in zip(axs, pie_specs):
    ax.pie(shares, labels=shares.index, autopct='%1.2f%%')
    ax.set_title(title)

# Keep the two pies from overlapping, then render
plt.tight_layout()
plt.show()

# Plot the survey data from the merged age/gender frame
# (plotting helper defined outside this notebook)
plot_survey_data(age_gender_df)

# Group the ages by gender once and derive both statistics from it
age_by_gender = age_gender_df.groupby('gender')['age']
mean_age_by_gender = age_by_gender.mean()
median_age_by_gender = age_by_gender.median()

print(f'Mean Age by Gender:\n{mean_age_by_gender.round(2)}\n')
print(f'Median Age by Gender:\n{median_age_by_gender.round(2)}')

Mean Age by Gender:
gender
Female    33.37
Male      34.03
Other     31.65
Name: age, dtype: float64

Median Age by Gender:
gender
Female    32.0
Male      33.0
Other     31.0
Name: age, dtype: float64

# Build the country dataframe from the answers to question 3
country_df = create_df_from_questionid(main_df, 3, "countries_df")

# Give the answer and question-id columns country-specific names
country_df = country_df.rename(columns={
    'answertext': 'country',
    'questionid': 'questionid_country',
})

# Collapse both spellings of the United States into the single label 'USA'
country_df.loc[:, 'country'] = country_df['country'].replace({
    'United States': 'USA',
    'United States of America': 'USA',
})

# Plot the most represented countries
# (plotting helper defined outside this notebook)
plot_top_countries(country_df)

# Add the country to the age/gender frame (joined on the respondent index).
# Only the right-hand duplicate of 'surveyid' receives a suffix, and that
# duplicate is dropped straight away.
population_df = (
    age_gender_df.merge(country_df, left_index=True, right_index=True,
                        suffixes=('', '_country'))
    .drop(columns='surveyid_country')
)

# Plot the average age per country
# (plotting helper defined outside this notebook)
plot_average_age_by_country(population_df)

# Number of distinct respondents overall, and number of rows per survey year
sample_size = population_df.index.nunique()
sample_size_by_year = population_df.groupby('surveyid').size()

print(f"Sample Size: {sample_size}")
print("-" * 20)
print(f"Sample Size by year: {sample_size_by_year}")

Sample Size: 4195
--------------------
Sample Size by year: surveyid
2014    1251
2016    1425
2017     752
2018     416
2019     351
dtype: int64

# Plot the average age per country and gender
# (plotting helper defined outside this notebook)
plot_average_age_by_country_gender(population_df)

# Number of answers recorded per question
question_counts = main_df.groupby(['questionid', 'questiontext']).size()

# Keep the ten most answered questions, then list them by question id
top_10_questions = question_counts.sort_values(ascending=False).head(10)
top_10_questions = top_10_questions.sort_index(level='questionid')
top_10_questions

questionid  questiontext                                                                                                                                  
1           What is your age?                                                                                                                                 4218
2           What is your gender?                                                                                                                              4218
5           Are you self-employed?                                                                                                                            4218
6           Do you have a family history of mental illness?                                                                                                   4218
7           Have you ever sought treatment for a mental health disorder from a mental health professional?                                                    4218
8           How many employees does your company or organization have?                                                                                        4218
9           Is your employer primarily a tech company/organization?                                                                                           4218
10          Does your employer provide mental health benefits as part of healthcare coverage?                                                                 4218
11          Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?    4218
12          Would you bring up a mental health issue with a potential employer in an interview?                                                               4218
dtype: int64

# Build the self-employment dataframe from the answers to question 5 and
# give its columns employment-specific names
selfemp_df = create_df_from_questionid(main_df, 5, "selfemp_df").rename(
    columns={
        'answertext': 'employment',
        'questionid': 'questionid_employment',
    }
)

# Join with age/gender on the respondent index. Both sides carry
# 'surveyid', so pandas adds its default '_x' / '_y' suffixes; keep the
# left copy under the plain name.
employment_df = (
    age_gender_df.merge(selfemp_df, left_index=True, right_index=True)
    .drop(columns='surveyid_y')
    .rename(columns={'surveyid_x': 'surveyid'})
)

bar_chart(employment_df['employment'].value_counts())

# Number of respondents in every (gender, employment) combination
gender_employment_counts = employment_df.groupby(
    ['gender', 'employment']).size()

# Share of each combination in the whole sample, as a percentage
total_respondents = gender_employment_counts.sum()
gender_employment_proportions = (
    gender_employment_counts.div(total_respondents).mul(100).round(2)
)

# Flatten to a plain table for plotting
df = gender_employment_proportions.reset_index()
df.columns = ['Gender', 'Employment', 'Proportion']

# Horizontal bars, one per "employment (gender)" combination
plt.figure(figsize=(10, 6))
bar_labels = df['Employment'] + ' (' + df['Gender'] + ')'
bars = plt.barh(bar_labels, df['Proportion'])

plt.title('Proportion of Employment Status by Gender')
plt.xlabel('Proportion (%)')
plt.ylabel('Employment Status by Gender')

# Write the percentage at the tip of every bar
for rect in bars:
    pct = rect.get_width()
    mid_y = rect.get_y() + rect.get_height() / 2
    plt.text(pct, mid_y, f'{pct}%', ha='left', va='center')

plt.show()

# Build the employer-healthcare dataframe from the answers to question 10
# and give its columns healthcare-specific names
emphealth_df = create_df_from_questionid(main_df, 10, "emphealth_df").rename(
    columns={
        'answertext': 'healthcare',
        'questionid': 'questionid_healthcare',
    }
)

# Frequency of each raw answer
answers = emphealth_df['healthcare'].value_counts()
print(answers)

healthcare
Yes                               1744
No                                 756
I don't know                       658
-1                                 504
Don't know                         408
Not eligible for coverage / NA     148
Name: count, dtype: int64

# Harmonise the answer labels in a single pass:
#   "I don't know" -> "Don't know"  (two spellings of the same answer)
#   "-1"           -> "No"
# NOTE(review): "-1" looks like a placeholder for an unanswered question;
# counting it as "No" inflates that category - confirm this is intended.
emphealth_df['healthcare'] = emphealth_df['healthcare'].replace({
    "I don't know": "Don't know",
    "-1": "No",
})

# Frequency of each answer after harmonisation
answers = emphealth_df['healthcare'].value_counts()
print(answers)

healthcare
Yes                               1744
No                                1260
Don't know                        1066
Not eligible for coverage / NA     148
Name: count, dtype: int64

# Share of each answer to the employer-healthcare question, as a pie
healthcare_counts = emphealth_df['healthcare'].value_counts()
slice_colors = plt.cm.Paired(np.arange(len(healthcare_counts)))
pie_title = (
    'Does your employer provide mental health benefits as part of '
    'healthcare coverage?'
)

plt.figure(figsize=(10, 6))
plt.pie(
    healthcare_counts.values,
    labels=healthcare_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title(pie_title)
plt.show()

# Build the treatment dataframe from the answers to question 7 and give
# its columns treatment-specific names
treathealth_df = create_df_from_questionid(
    main_df, 7, "treathealth_df"
).rename(
    columns={
        'answertext': 'treatment',
        'questionid': 'questionid_treatment',
    }
)

# Frequency of each answer
answers = treathealth_df['treatment'].value_counts()
print(answers)

treatment
1    2412
0    1806
Name: count, dtype: int64

# Share of each answer to the treatment question, as a pie
treatment_counts = treathealth_df['treatment'].value_counts()
slice_colors = plt.cm.Paired(np.arange(len(treatment_counts)))
pie_title = (
    'Have you ever sought treatment for a mental health disorder from a '
    'mental health professional?'
)

plt.figure(figsize=(10, 6))
plt.pie(
    treatment_counts.values,
    labels=treatment_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title(pie_title)
plt.show()

# Distinct respondents in the unfiltered data (the index is the UserID)
sample_size = main_df.index.nunique()

# Distinct respondents per survey year; UserID must be a regular column
# for the group-wise nunique
main_df_reset = main_df.reset_index()
sample_size_by_year = main_df_reset.groupby('surveyid')['UserID'].nunique()

print(f"Sample Size: {sample_size}")
print("-" * 20)
print(f"Sample Size by year: {sample_size_by_year}")

Sample Size: 4218
--------------------
Sample Size by year: surveyid
2014    1260
2016    1433
2017     756
2018     417
2019     352
Name: UserID, dtype: int64

# Sample size = number of rows of the employment dataframe
sample_size = len(employment_df)
print(f'The sample size is {sample_size}.')

The sample size is 4195.

# Number of respondents per gender in the employment dataframe
gender_counts = employment_df.loc[:, 'gender'].value_counts()
print(gender_counts)

gender
Male      3045
Female    1033
Other      117
Name: count, dtype: int64

# Plot the sampling analysis for the population frame
# (plotting helper defined outside this notebook)
plot_sampling_analysis(population_df)

# Expose 'UserID' as a regular column so it can serve as a merge key.
# Guarded so that re-running the cell does not reset the index a second time.
if 'UserID' not in population_df.columns:
    population_df.reset_index(inplace=True)

# Answers to questions 33 and 34, one row per (respondent, question)
filtered_main_df = main_df[main_df['questionid'].isin([33, 34])]
filtered_main_df = filtered_main_df.reset_index()

# Attach to every respondent their OWN answers.
# The previous version de-duplicated the answers per (survey, question) and
# merged on 'surveyid' alone, which copied one arbitrary respondent's answer
# onto everybody who took the same survey.
prevalence_df = pd.merge(population_df, filtered_main_df,
                         on=['UserID', 'surveyid'], how='inner')

# Set 'UserID' as the index again
prevalence_df.set_index('UserID', inplace=True)

# Drop questiontext column
prevalence_df.drop('questiontext', axis=1, inplace=True)

# Answers given to question 33
data_33 = prevalence_df[prevalence_df['questionid'] == 33]
print("Unique answers for question 33:", data_33['answertext'].unique())

# Answers given to question 34
data_34 = prevalence_df[prevalence_df['questionid'] == 34]
print("Unique answers for question 34:", data_34['answertext'].unique())

# Check for NaN values for questionid 33
nan_values_33 = data_33['answertext'].isna().sum()
print("Number of NaN values for questionid 33:", nan_values_33)

# Check for NaN values for questionid 34
nan_values_34 = data_34['answertext'].isna().sum()
print("Number of NaN values for questionid 34:", nan_values_34)

Unique answers for question 33: ['No' 'Possibly' "Don't Know"]
Unique answers for question 34: ['Yes' '-1']
Number of NaN values for questionid 33: 0
Number of NaN values for questionid 34: 0

# Numeric score per answer. Answers missing from a mapping become NaN and
# are removed below.
# 'Yes' (question 33) and 'No' (question 34) are included so that the plain
# positive / negative answers are scored instead of being discarded.
# NOTE(review): '-1' looks like a placeholder for an unanswered question;
# scoring it 0 treats it as a "no" - confirm this is intended.
mapping_33 = {'Yes': 1, 'No': 0, 'Possibly': 1, "Don't Know": 0.5}
mapping_34 = {'Yes': 1, 'No': 0, '-1': 0}

# Apply the mappings to 'questionid' 33 and 34 in 'prevalence_df'
mask_33 = prevalence_df['questionid'] == 33
prevalence_df.loc[mask_33, 'map_33'] = prevalence_df.loc[
    mask_33, 'answertext'].map(mapping_33)

mask_34 = prevalence_df['questionid'] == 34
prevalence_df.loc[mask_34, 'map_34'] = prevalence_df.loc[
    mask_34, 'answertext'].map(mapping_34)

# Remove the question-33 rows that could not be mapped.
# Filter with the boolean mask instead of drop(<index labels>): the index
# holds one label per respondent and is repeated on each of their rows, so
# dropping by label would also delete that respondent's other rows.
mask_33_nan = (
    (prevalence_df['questionid'] == 33) &
    prevalence_df['map_33'].isna()
)
prevalence_df = prevalence_df[~mask_33_nan]

# Remove the question-34 rows that could not be mapped (same reasoning)
mask_34_nan = (
    (prevalence_df['questionid'] == 34) &
    prevalence_df['map_34'].isna()
)
prevalence_df = prevalence_df[~mask_34_nan]

# Prevalence estimate and confidence interval for each mapped question.
# The helper is defined outside this notebook and returns a
# (prevalence, ci) pair - presumably grouped by survey year; TODO confirm.
prevalence_33, ci_33 = calculate_prevalence_and_ci(prevalence_df, 'map_33')
prevalence_34, ci_34 = calculate_prevalence_and_ci(prevalence_df, 'map_34')

# Plot for map_33 and map_34; the last argument is the question text
# (presumably used as the plot title - confirm in the helper)
plot_prevalence_and_ci(
    prevalence_33, ci_33,
    'Do you currently have a mental health disorder?')
plot_prevalence_and_ci(
    prevalence_34, ci_34,
    'Have you ever been diagnosed with a mental health disorder?')

/Users/ctw01365/Documents/fabiano/courses/turing/fchapu-DS.v2.5.2.1.5/.dsvenv/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1185: RuntimeWarning: All-NaN axis encountered
  miny = np.nanmin(masked_verts[..., 1])
/Users/ctw01365/Documents/fabiano/courses/turing/fchapu-DS.v2.5.2.1.5/.dsvenv/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1186: RuntimeWarning: All-NaN axis encountered
  maxy = np.nanmax(masked_verts[..., 1])

/Users/ctw01365/Documents/fabiano/courses/turing/fchapu-DS.v2.5.2.1.5/.dsvenv/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1185: RuntimeWarning: All-NaN axis encountered
  miny = np.nanmin(masked_verts[..., 1])
/Users/ctw01365/Documents/fabiano/courses/turing/fchapu-DS.v2.5.2.1.5/.dsvenv/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1186: RuntimeWarning: All-NaN axis encountered
  maxy = np.nanmax(masked_verts[..., 1])

# Rows of the 2014 survey only
is_2014 = prevalence_df['surveyid'] == 2014
data_2014 = prevalence_df[is_2014]

# Sum of the mapped scores for 2014; a sum over no rows (or over NaN only)
# is 0.0, so a zero here may simply mean there is no data for that year
map_33_occurrences = data_2014['map_33'].sum()
map_34_occurrences = data_2014['map_34'].sum()

print(f"Occurrences of 'map_33' in 2014: {map_33_occurrences}")
print(f"Occurrences of 'map_34' in 2014: {map_34_occurrences}")

Occurrences of 'map_33' in 2014: 0.0
Occurrences of 'map_34' in 2014: 0.0

# Rows of the 2014 survey that belong to question 33 or 34
from_2014 = prevalence_df['surveyid'] == 2014
is_q33_or_q34 = prevalence_df['questionid'].isin([33, 34])
filtered_df = prevalence_df[from_2014 & is_q33_or_q34]

print(filtered_df)

Empty DataFrame
Columns: [surveyid, questionid_age, age, group_age, questionid_gender, gender, questionid_country, country, questionid, answertext, map_33, map_34]
Index: []

# Column dtypes and non-null counts of the prevalence frame
prevalence_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5888 entries, 1261 to 4218
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   surveyid            5888 non-null   int64  
 1   questionid_age      5888 non-null   int64  
 2   age                 5888 non-null   int64  
 3   group_age           5888 non-null   object 
 4   questionid_gender   5888 non-null   int64  
 5   gender              5888 non-null   object 
 6   questionid_country  5888 non-null   int64  
 7   country             5888 non-null   object 
 8   questionid          5888 non-null   int64  
 9   answertext          5888 non-null   object 
 10  map_33              2944 non-null   float64
 11  map_34              2944 non-null   float64
dtypes: float64(2), int64(6), object(4)
memory usage: 598.0+ KB

# Correlation between age and the numeric question-33 score, rounded to two
# decimals. Series.corr ignores pairs with a missing value, so only rows
# that carry a 'map_33' score contribute.
age_map33 = round(prevalence_df['age'].corr(prevalence_df['map_33']), 2)
age_map33

0.05

# Correlation between age and the numeric question-34 score, rounded to two
# decimals. Series.corr ignores pairs with a missing value, so only rows
# that carry a 'map_34' score contribute.
age_map34 = round(prevalence_df['age'].corr(prevalence_df['map_34']), 2)
age_map34

-0.06

Table of Contents¶

Introduction¶

Notebook settings¶

Dependencies¶

Data Load, Cleaning and Validation¶

Checking for NaN records¶

Handling Missing Values¶

Handling Duplicate and Inconsistent Values¶

Main Dataframe Setup¶

Gender Group Analysis¶

Gender dataframe¶

Age feature and Age group analysis¶

Age dataframe¶

Age / Gender Group Analysis¶

Age Group Distribution by Gender¶

Geographical Analysis¶

Check Most Frequent Questions¶

Question id 5 Analysis: Are you self-employed?¶

Question id 10 Analysis: Does your employer provide mental health benefits as part of healthcare coverage?¶

Question id 7 Analysis: Have you ever sought treatment for a mental health disorder from a mental health professional?¶

Sample Size Analysis¶

Sampling Analysis for Gender across Respondents from the Top 5 Countries¶

Sampling Bias Conclusion¶

Prevalence Rate analysis¶

Correlation Analysis for the mapped questions¶

Conclusion¶

Suggestions About How Your Analysis Can Be Improved¶

	surveyid	questionid	questiontext	answertext
UserID
1	2014	1	What is your age?	37
1	2014	101	Would you bring up a physical health issue wit...	Maybe
1	2014	100	Would you be willing to discuss a mental healt...	Yes
1	2014	99	Would you be willing to discuss a mental healt...	Some of them
1	2014	98	Do you think that discussing a mental health i...	No
...	...	...	...	...
4218	2019	29	Would you be willing to bring up a physical he...	Maybe
4218	2019	13	Is your primary role within your company relat...	1
4218	2019	12	Would you bring up a mental health issue with ...	Yes
4218	2019	11	Is your anonymity protected if you choose to t...	Yes
4218	2019	48	If you have a mental health disorder, how ofte...	Sometimes