Introduction¶
The goal of this project is to analyze the Fatal Police Shootings dataset, covering 2015 to 2017. Fatal police shootings have been a national issue in the United States since the 20th century, and the public safety of U.S. citizens is the argument typically used to justify their controversially high number. The analysis is also interesting because the issue is common to many countries, so some of the approaches used here can likely be applied to other datasets. To perform the analysis, I will use concepts from Data Visualization and Data Analysis.
Notebook settings¶
%load_ext pycodestyle_magic
# Enable flake8 linting with an 80-character line limit
%flake8_on --max_line_length 80
%matplotlib inline
Dependencies¶
Below are the imports used in this project:
- first, the helper functions developed for this project (in assets/utils/functions), and
- then the third-party libraries required to run the notebook.
from assets.utils.functions import (
get_geocodes,
classify_age_bins,
get_year_data,
get_grouped_data,
plot_percentage,
process_year_data
)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import folium
from geopy.geocoders import Nominatim
import time
Data cleaning¶
int_df = pd.read_csv("./assets/data/database.csv")
int_df.head()
| id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | A | Shelton | WA | True | attack | Not fleeing | False |
| 1 | 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | W | Aloha | OR | False | attack | Not fleeing | False |
| 2 | 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | H | Wichita | KS | False | other | Not fleeing | False |
| 3 | 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | W | San Francisco | CA | True | attack | Not fleeing | False |
| 4 | 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | H | Evans | CO | False | attack | Not fleeing | False |
print(int_df.shape)
(2142, 14)
# Clean columns
for col in int_df.columns:
    # Check if the column is of object type
    if int_df[col].dtype == 'object':
        # Remove leading and trailing spaces from the column
        int_df[col] = int_df[col].str.strip()
Geopy: Usage and findings¶
Since one of the goals of this analysis is to examine locations (cities and states), I decided to use the Geopy library to obtain the latitude and longitude of each location, as this is a good way to map the individual events of the dataset for further visualization. Geopy is a Python library that provides easy access to several geocoding services and other geographical data APIs. As the code below shows, retrieving the latitudes and longitudes with Geopy took a while (~41 minutes).
# Geopy: to be used just to create a new dataset
int_df = get_geocodes(int_df)
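The helper get_geocodes lives in assets/utils/functions and is not shown here. A minimal sketch of how such a helper could be written with Geopy's Nominatim geocoder and its RateLimiter is given below; the function body, its name and the "city, state, USA" query format are assumptions for illustration, not the project's actual implementation.
from geopy.extra.rate_limiter import RateLimiter


def get_geocodes_sketch(df):
    """Add latitude/longitude columns by geocoding 'city, state, USA'."""
    geolocator = Nominatim(user_agent="fatal-police-shootings-eda")
    # Nominatim allows roughly one request per second, which is why
    # geocoding ~2,000 rows takes tens of minutes.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    coords = {}
    # Geocode each unique (city, state) pair only once
    for _, place in df[['city', 'state']].drop_duplicates().iterrows():
        location = geocode(f"{place['city']}, {place['state']}, USA")
        if location is not None:
            latlon = (location.latitude, location.longitude)
        else:
            latlon = (None, None)
        coords[(place['city'], place['state'])] = latlon
    keys = list(zip(df['city'], df['state']))
    df['latitude'] = [coords[k][0] for k in keys]
    df['longitude'] = [coords[k][1] for k in keys]
    return df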
New Dataset Outcome¶
Due to the time Geopy takes to run, it was decided to save the results in an additional CSV file named geocordsdb.csv. This avoids re-running Geopy whenever someone wants to use the dataset. This CSV file will be used for the cleaning and the respective EDA processes, since it has all the features and observations inherited from the main dataset, database.csv. Other good reasons to create a new dataset are:
- Preservation of Original Data: By creating a new dataset, you preserve the integrity of the original data. If you were to add new columns (lat and lon) to the original dataset, you would be altering the original data, which may not be desirable if you need to maintain the integrity of the original dataset for future analyses or comparisons.
- Traceability and Reproducibility: Keeping the original dataset unchanged allows for better traceability and reproducibility of your analyses. If you ever need to backtrack or reproduce your analysis, having a separate dataset with the added latitude and longitude values ensures that you can always refer back to the original dataset without any modifications.
- Flexibility and Modularity: Creating a new dataset for analytical purposes provides flexibility and modularity in your analysis pipeline. You can easily switch between different versions of the dataset or apply different analyses without affecting the original data. It also allows for easier sharing and collaboration, as you can provide others with the original dataset along with any derived datasets for analysis.
- Data Integrity and Consistency: Separating the original dataset from the derived dataset helps maintain data integrity and consistency. If there are any issues or discrepancies in the derived data (e.g., inaccuracies in the geocoding process), they won't affect the original dataset, and you can address them separately without impacting the original data.
int_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2142 entries, 0 to 2141 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2142 non-null int64 1 name 2123 non-null object 2 date 2142 non-null object 3 manner_of_death 2142 non-null object 4 armed 2136 non-null object 5 age 2099 non-null float64 6 gender 2141 non-null object 7 race 2039 non-null object 8 city 2142 non-null object 9 state 2142 non-null object 10 signs_of_mental_illness 2142 non-null bool 11 threat_level 2142 non-null object 12 flee 2106 non-null object 13 body_camera 2142 non-null bool 14 latitude 2130 non-null float64 15 longitude 2130 non-null float64 dtypes: bool(2), float64(3), int64(1), object(10) memory usage: 238.6+ KB
int_df.head()
| id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | A | Shelton | WA | True | attack | Not fleeing | False | 47.215094 | -123.100706 |
| 1 | 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | W | Aloha | OR | False | attack | Not fleeing | False | 45.491518 | -122.869420 |
| 2 | 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | H | Wichita | KS | False | other | Not fleeing | False | 37.692236 | -97.337545 |
| 3 | 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | W | San Francisco | CA | True | attack | Not fleeing | False | 37.779259 | -122.419329 |
| 4 | 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | H | Evans | CO | False | attack | Not fleeing | False | 40.377217 | -104.697763 |
# Get the latitude and longitude possible Nan records
nan_records = int_df[int_df['latitude'].isna() | int_df['longitude'].isna()]
print(nan_records.to_string())
id name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera latitude longitude 156 196 Shaquille Barrow 2015-03-02 shot gun 20.0 M B Joilet IL False other Foot False NaN NaN 211 531 Justin Fowler 2015-03-20 shot gun 24.0 M N Red Valley AZ False attack Car False NaN NaN 412 518 Alejandro Campos Fernandez 2015-06-06 shot gun 45.0 M H Watsonsville CA False attack Foot False NaN NaN 766 900 Kevin Lau 2015-10-11 shot gun 44.0 M A Lakes Charles LA False attack Not fleeing False NaN NaN 1090 1244 Michael Laniado 2016-02-07 shot knife 27.0 M NaN Manchester NJ False other Not fleeing False NaN NaN 1165 1342 Brandon Stanley 2016-03-04 shot unarmed 30.0 M W East Berstadt KY False undetermined Not fleeing False NaN NaN 1351 1536 Cody Marsh 2016-05-14 shot knife 19.0 M W Jacksonsville FL True other Not fleeing False NaN NaN 1560 1752 Limichael Shine 2016-07-31 shot knife 18.0 M A Manchester NJ True other Not fleeing True NaN NaN 1612 1831 David Fulton 2016-08-18 shot gun 59.0 M W McKinneyville CA True attack Not fleeing False NaN NaN 1681 1886 Joseph Allen Schlosser 2016-09-15 shot gun 69.0 M W Weeki Wachi FL True other Not fleeing False NaN NaN 1773 1988 Renee Davis 2016-10-21 shot gun 23.0 F N Muckleshoot Indian Reservation WA True other Not fleeing False NaN NaN 1879 2096 Dylan M. Dalzell 2016-12-02 shot crowbar 22.0 M W Simpsonsville KY False other Not fleeing False NaN NaN
Fixing city names and getting latitudes and longitudes¶
The output above indicates that some records have NaN values for latitude and longitude. After examining the cities, it becomes apparent that these discrepancies stem from spelling errors in the city names.
As this issue affects only 12 records, the city names will be corrected manually along with their corresponding latitude and longitude values. However, if there were numerous discrepancies, the corrected dataframe could simply be passed back through the Geopy library to look up the missing coordinates.
# Correcting city names
int_df.loc[:, 'city'] = int_df['city'].replace({
'Joilet': 'Joliet',
# 'Red Valley' is spelled correctly; its coordinates are added below
'Red Valley': 'Red Valley',
'Watsonsville': 'Watsonville',
'Lakes Charles': 'Lake Charles',
'Manchester': 'Manchester Township',
'East Berstadt': 'East Bernstadt',
'Jacksonsville': 'Jacksonville',
'McKinneyville': 'Mckinleyville',
'Weeki Wachi': 'Weeki Wachee',
'Muckleshoot Indian Reservation': 'Muckleshoot',
'Simpsonsville': 'Simpsonville'})
# Write the DataFrame back to the file
int_df.to_csv("./assets/data/geocordsdb.csv", index=False)
# Update the dataset with lat and long
new_data = pd.DataFrame({
'city': ['Joliet', 'Red Valley', 'Watsonville', 'Jacksonville',
'Lake Charles', 'Manchester Township', 'East Bernstadt',
'Mckinleyville', 'Weeki Wachee', 'Muckleshoot', 'Simpsonville'],
'state': ['IL', 'AZ', 'CA', 'FL', 'LA', 'NJ', 'KY', 'CA',
'FL', 'WA', 'KY'],
'latitude': [41.527154, 35.915556, 36.909277, 30.332184, 30.228453,
39.950983, 37.190779, 40.875833, 28.5155513,
47.2623, 38.231186],
'longitude': [-88.08241, -109.029167, -121.752907, -81.655651, -93.187966,
-74.360713, -84.145676, -124.083056, -82.57287689,
-122.1467, -85.354797]
})
# Create a dictionary from the new data for easy lookup
new_data_dict = new_data.set_index(['city', 'state']).T.to_dict('list')
# Update the latitude and longitude in the original DataFrame
for index, row in int_df.iterrows():
    if (row['city'], row['state']) in new_data_dict:
        lat = new_data_dict[(row['city'], row['state'])][0]
        long = new_data_dict[(row['city'], row['state'])][1]
        int_df.loc[index, 'latitude'] = lat
        int_df.loc[index, 'longitude'] = long
# Save the updated DataFrame to the file
int_df.to_csv("./assets/data/geocordsdb.csv", index=False)
Data Loading, Cleaning and Transforming Main dataset¶
main_df = pd.read_csv("./assets/data/geocordsdb.csv", index_col='id')
main_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 2142 entries, 3 to 2394 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 2123 non-null object 1 date 2142 non-null object 2 manner_of_death 2142 non-null object 3 armed 2136 non-null object 4 age 2099 non-null float64 5 gender 2141 non-null object 6 race 2039 non-null object 7 city 2142 non-null object 8 state 2142 non-null object 9 signs_of_mental_illness 2142 non-null bool 10 threat_level 2142 non-null object 11 flee 2106 non-null object 12 body_camera 2142 non-null bool 13 latitude 2142 non-null float64 14 longitude 2142 non-null float64 dtypes: bool(2), float64(3), object(10) memory usage: 238.5+ KB
As we can see, two more features, latitude and longitude, were added to the 14 of the original dataset. The other difference is that the id column is now used as the index, which is why 15 columns are listed.
main_df.describe()
| age | latitude | longitude | |
|---|---|---|---|
| count | 2099.000000 | 2142.000000 | 2142.000000 |
| mean | 36.484516 | 36.624621 | -97.081480 |
| std | 12.874000 | 5.246377 | 16.679369 |
| min | 6.000000 | 19.622521 | -157.941667 |
| 25% | 26.000000 | 33.448437 | -112.074141 |
| 50% | 34.000000 | 36.134698 | -94.365801 |
| 75% | 45.000000 | 39.940335 | -83.000707 |
| max | 86.000000 | 71.386517 | -68.015458 |
# Check dataframe shape
print(main_df.shape)
(2142, 15)
# Check for duplicated IDs
unique_ids = main_df.index.nunique()
print("Nº of unique ids: ", unique_ids)
duplicated_ids = main_df[main_df.index.duplicated()]
if duplicated_ids.empty:
    print("No duplicated IDs found.")
else:
    print("Duplicated IDs found:")
    print(duplicated_ids)
Nº of unique ids: 2142 No duplicated IDs found.
Data validation: duplicates and NaN records¶
# Check for duplicate rows (samples)
duplicate_samples = main_df[main_df.duplicated()]
print(duplicate_samples.to_string())
name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera latitude longitude id 2294 Jamake Cason Thomas 2017-02-03 shot gun 22.0 M B Rowland NC False attack Car False 34.536552 -79.291431
Since one duplicate was found, I decided to explore the dataset further: homonyms are a common real-world issue, and although the finding above shows one individual whose features are fully duplicated, the dataset may still have other sample-related issues.
# Check for duplicated names
duplicated_names = main_df[main_df['name'].duplicated(keep=False)].dropna()
duplicated_names_sorted = duplicated_names.sort_values(by='name')
print(duplicated_names_sorted.to_string())
name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera latitude longitude id 229 Brandon Jones 2015-03-19 shot unarmed 18.0 M B Cleveland OH False attack Not fleeing False 41.499657 -81.693677 1991 Brandon Jones 2016-10-25 shot toy weapon 25.0 M W Elkton MD True attack Not fleeing False 39.606507 -75.833093 1129 Daquan Antonio Westbrook 2015-12-24 shot gun 18.0 M B Charlotte NC False attack Not fleeing False 35.227209 -80.843083 2158 Daquan Antonio Westbrook 2016-12-24 shot gun 18.0 M B Charlotte NC False undetermined Not fleeing False 35.227209 -80.843083 291 Eric Harris 2015-04-02 shot unarmed 44.0 M B Tulsa OK False other Not fleeing False 36.156312 -95.992752 1252 Eric Harris 2016-02-08 shot vehicle 22.0 M B New Orleans LA False other Car False 29.975998 -90.078213 2304 Jamake Cason Thomas 2017-02-03 shot gun 22.0 M B Rowland NC False attack Car False 34.536552 -79.291431 2294 Jamake Cason Thomas 2017-02-03 shot gun 22.0 M B Rowland NC False attack Car False 34.536552 -79.291431 1526 Michael Johnson 2016-05-09 shot gun 26.0 M W Chicago IL False other Foot False 41.875562 -87.624421 1586 Michael Johnson 2016-06-01 shot gun 21.0 M B Newnan GA False attack Foot True 33.380672 -84.799657
Duplicated names cleaning process¶
Checking for duplicate names returns a list of 10 records that share a name with another record. Analyzing the other features shows that, for 8 of these individuals, the remaining features differ, so they cannot be concluded to be duplicates and will be treated as unique. The other significant findings are listed below:
One individual named Daquan Antonio Westbrook (id=2158) has almost all features equal to id=1129; only the threat_level and the date (exactly one year later) differ. As these differences do not look consistent with a distinct individual, the record was considered a duplicate and dropped.
The other individual, Jamake Cason Thomas (id=2294), has all features identical to id=2304, so it was also identified as a duplicate record and dropped.
# Get the indices of the rows to be dropped
ids_to_remove = main_df.index.isin([2158, 2294])
main_df = main_df.drop(main_df.index[ids_to_remove])
# Check if the rows have been dropped
print(main_df.index.isin([2158, 2294]).any(),
"for ids 2158 and 2294.\n"
"Records successfully removed.")
False for ids 2158 and 2294. Records successfully removed.
Checking for Null names¶
# Check for Null names
null_names = main_df[main_df['name'].isnull()]
print(null_names.to_string())
print("#####################")
print("Nº of Null names: ", null_names.shape[0])
name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera latitude longitude id 1074 NaN 2015-12-10 shot gun NaN M NaN Hemet CA False attack Not fleeing False 33.747520 -116.971968 1570 NaN 2016-05-26 shot gun NaN M NaN San Antonio TX False other Not fleeing False 29.424600 -98.495141 1581 NaN 2016-05-26 shot knife NaN F NaN Sneads FL False other Not fleeing False 30.707591 -84.924448 1584 NaN 2016-06-01 shot gun NaN M NaN Phoenix AZ False attack Not fleeing False 33.448437 -112.074141 1615 NaN 2016-06-09 shot baton NaN M NaN Somerton AZ False attack Not fleeing False 32.596440 -114.709677 1685 NaN 2016-07-04 shot knife NaN M NaN Rosser TX False other Not fleeing True 32.462640 -96.453319 1848 NaN 2016-09-01 shot and Tasered pipe NaN M H Huntington Park CA False other Not fleeing False 33.980573 -118.225207 2037 NaN 2016-11-11 shot and Tasered NaN NaN M NaN East Point GA False undetermined NaN False 33.679553 -84.439372 2110 NaN 2016-12-08 shot gun NaN M NaN Allen TX False attack Not fleeing False 33.103174 -96.670550 2141 NaN 2016-12-20 shot knife NaN M NaN Brawley CA True other Not fleeing False 33.018515 -115.515943 2154 NaN 2016-12-21 shot knife NaN M NaN Stockton CA False undetermined Not fleeing False 37.957702 -121.290779 2168 NaN 2016-12-23 shot vehicle NaN M NaN El Monte CA False other Car False 34.075157 -118.036849 2164 NaN 2016-12-24 shot gun NaN M NaN Gadsden AL False attack Foot False 34.012832 -86.003025 2182 NaN 2016-12-30 shot gun NaN M NaN Pensacola FL False attack Not fleeing False 30.421309 -87.216915 2232 NaN 2017-01-18 shot hatchet NaN M NaN Los Angeles CA False other Car False 34.053691 -118.242766 2278 NaN 2017-01-31 shot and Tasered knife NaN M B Hollywood CA True attack Not fleeing False 34.098003 -118.329523 2319 NaN 2017-02-09 shot undetermined NaN M NaN Crownpoint NM False undetermined NaN False 35.687329 -108.144297 2328 NaN 2017-02-12 shot vehicle NaN M NaN Terrell TX False other Other False 30.199074 -102.097488 2384 NaN 2017-02-25 shot gun NaN NaN NaN Lumpkin GA False attack Car False 34.554608 -83.992681 ##################### Nº of Null names: 19
Null names cleaning process¶
Since 19 records are missing the name but the other feature information (date, city, state, etc.) is available, it was decided to impute the missing names as Unknown. The reasoning is that, even though the name is missing, the killing still happened and the other features are available, so it is important to keep these records in the dataset.
# Inputing "Unknown" for Null names
main_df['name'].fillna('Unknown', inplace=True)
# Confirm if Null names have been replaced
null_names = main_df[main_df['name'].isnull()]
print(null_names.to_string())
print("Nº of Null names: ", null_names.shape[0])
Empty DataFrame Columns: [name, date, manner_of_death, armed, age, gender, race, city, state, signs_of_mental_illness, threat_level, flee, body_camera, latitude, longitude] Index: [] Nº of Null names: 0
Investigation over other Null feature records¶
# Check for missing values by columns
missing_values = main_df.isnull().sum()
print("Missing Values:")
print(missing_values)
Missing Values: name 0 date 0 manner_of_death 0 armed 6 age 43 gender 1 race 103 city 0 state 0 signs_of_mental_illness 0 threat_level 0 flee 36 body_camera 0 latitude 0 longitude 0 dtype: int64
Null features findings¶
- Features "armed", "gender", "race", "flee" are object datatype and the missing values will imputed as undetermined. Since these features are important to the analysis, it is important to keep these records in the dataset.
- Feature "age" feature: will be kept as a float datatype and the missing values will be imputed as the mean of the feature. The analysis will be shown in the next steps.
# Inputing "undetermined" for Null values in the following columns:
columns_to_fill = ['armed', 'gender', 'race', 'flee']
for column in columns_to_fill:
    main_df[column].fillna(value='undetermined', inplace=True)
# Check for missing values by columns
missing_values = main_df.isnull().sum()
print("Missing Values:")
print(missing_values)
Missing Values: name 0 date 0 manner_of_death 0 armed 0 age 43 gender 0 race 0 city 0 state 0 signs_of_mental_illness 0 threat_level 0 flee 0 body_camera 0 latitude 0 longitude 0 dtype: int64
Data Transformation: 'Age' feature investigation¶
Comparison between missing age and imputed mean age
This approach aims to compare the distribution of the age feature before and after the imputation process. The dataset was therefore cloned to ease the visualization. The idea is to check whether the imputation significantly changes the distribution of the feature, so a histogram plot was created to compare the distribution before and after the imputation.
# Cloning main dataframe for testing
main_clone = main_df.copy()
# Checking total number of records
num_records = main_clone.shape[0]
print("Total number of records:", num_records)
# Checking number of missing ages
age_missing = main_clone['age'].isnull().sum()
print("Nº of missing ages: ", age_missing)
# Determining the average age of the victims
mean_age = main_clone['age'].mean()
print("Average value for ages: ", round(mean_age, 2))
# Percentage of imputed ages into the dataset
age_imputed = age_missing / num_records
print("Percentage of records having 'mean' value as imputed ages: ",
round(age_imputed * 100, 2))
Total number of records: 2140 Nº of missing ages: 43 Average value for ages: 36.5 Percentage of records having 'mean' value as imputed ages: 2.01
# Imputing the mean age rounded to 1 decimal place for missing values
main_clone['age'].fillna(round(main_clone['age'].mean(), 1), inplace=True)
# Show the plot
fig, axes = plt.subplots(1, 2, figsize=(15, 8), sharey=True)
# First Plot:
sns.histplot(data=main_df['age'], bins=20, kde=True, ax=axes[0])
# Set labels and title
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Occurrences')
axes[0].set_title('Distribution of Age before mean imputation')
# Second Plot:
sns.histplot(data=main_clone['age'], bins=20, kde=True, ax=axes[1])
# Set labels and title
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Occurrences')
axes[1].set_title('Distribution of Age after mean imputation')
# Adjust spacing between subplots
plt.subplots_adjust(wspace=0.3)
# Show the plots
plt.show()
As seen, imputing the mean does not significantly change the distribution of the feature, so the same imputation will be applied to the original dataset. Looking at the bins between 30 and 40, there is a slight increase in the number of records and, as a consequence, the KDE curve rises a bit more around the mean, but this is not very significant. The calculation made previously also confirms that only 2.01% of the records had the age imputed.
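To back the visual comparison with numbers, the summary statistics before and after the imputation can also be compared directly; a small check using the main_df and main_clone frames defined above:
# Compare summary statistics of 'age' before and after mean imputation
comparison = pd.DataFrame({'before': main_df['age'].describe(),
                           'after': main_clone['age'].describe()})
print(comparison.round(2))
The mean is unchanged by construction, and the standard deviation shrinks only slightly because the 43 imputed values sit exactly at the mean.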
# Imputing the mean age rounded to 1 decimal place for missing values
main_df['age'].fillna(round(main_df['age'].mean(), 1), inplace=True)
# Checking number of missing ages
age_missing = main_df['age'].isnull().sum()
print("Nº of missing ages: ", age_missing)
Nº of missing ages: 0
# Confirming the number of imputed records
average_age = main_df['age'] == 36.5
print("Nº of records with average age: ", average_age.sum())
Nº of records with average age: 43
We can confirm now that there are no missing values in the dataset.
# Check for missing values by columns
missing_values = main_df.isnull().sum()
print("Missing Values:")
print(missing_values)
Missing Values: name 0 date 0 manner_of_death 0 armed 0 age 0 gender 0 race 0 city 0 state 0 signs_of_mental_illness 0 threat_level 0 flee 0 body_camera 0 latitude 0 longitude 0 dtype: int64
Converting data types¶
Now that the dataset has been cleaned, the data types should be converted to the correct ones. Looking at the features, date is the only one that needs conversion, to the datetime type.
print("Before conversion, date datatype is: ", main_df['date'].dtype)
main_df['date'] = pd.to_datetime(main_df['date'])
print("After conversion, date datatype is: ", main_df['date'].dtype)
Before conversion, date datatype is: object After conversion, date datatype is: datetime64[ns]
main_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 2140 entries, 3 to 2394 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 2140 non-null object 1 date 2140 non-null datetime64[ns] 2 manner_of_death 2140 non-null object 3 armed 2140 non-null object 4 age 2140 non-null float64 5 gender 2140 non-null object 6 race 2140 non-null object 7 city 2140 non-null object 8 state 2140 non-null object 9 signs_of_mental_illness 2140 non-null bool 10 threat_level 2140 non-null object 11 flee 2140 non-null object 12 body_camera 2140 non-null bool 13 latitude 2140 non-null float64 14 longitude 2140 non-null float64 dtypes: bool(2), datetime64[ns](1), float64(3), object(9) memory usage: 238.2+ KB
Data Loading, Cleaning and Transforming Additional dataset¶
This dataset is useful for understanding the total population of the USA, along with its states and regions. It can then be compared against the events that are the target of this analysis.
# Loading additional dataset
us_pop_df = pd.read_csv("./assets/data/sc-est2019-agesex-civ.csv")
us_pop_df.head()
| SUMLEV | REGION | DIVISION | STATE | NAME | SEX | AGE | ESTBASE2010_CIV | POPEST2010_CIV | POPEST2011_CIV | POPEST2012_CIV | POPEST2013_CIV | POPEST2014_CIV | POPEST2015_CIV | POPEST2016_CIV | POPEST2017_CIV | POPEST2018_CIV | POPEST2019_CIV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 0 | 0 | 0 | United States | 0 | 0 | 3944160 | 3951430 | 3963092 | 3926570 | 3931258 | 3954787 | 3983981 | 3954773 | 3893990 | 3815343 | 3783052 |
| 1 | 10 | 0 | 0 | 0 | United States | 0 | 1 | 3978090 | 3957730 | 3966225 | 3977549 | 3942698 | 3948891 | 3973133 | 4002903 | 3972711 | 3908830 | 3829599 |
| 2 | 10 | 0 | 0 | 0 | United States | 0 | 2 | 4096939 | 4090621 | 3970654 | 3978925 | 3991740 | 3958711 | 3966321 | 3991349 | 4020045 | 3987032 | 3922044 |
| 3 | 10 | 0 | 0 | 0 | United States | 0 | 3 | 4119051 | 4111688 | 4101644 | 3981531 | 3991017 | 4005928 | 3974351 | 3982984 | 4006946 | 4033038 | 3998665 |
| 4 | 10 | 0 | 0 | 0 | United States | 0 | 4 | 4063186 | 4077346 | 4121488 | 4111490 | 3992502 | 4004032 | 4020292 | 3989750 | 3997280 | 4018719 | 4043323 |
Some features are not useful for this analysis and will be dropped.
us_pop_df = us_pop_df.drop(
['SUMLEV', 'DIVISION', 'ESTBASE2010_CIV',
'POPEST2010_CIV', 'POPEST2011_CIV',
'POPEST2012_CIV', 'POPEST2013_CIV',
'POPEST2014_CIV', 'POPEST2018_CIV',
'POPEST2019_CIV'], axis=1)
Since the dataset contains aggregate (SUM) rows, I decided to drop them and analyze the individual values.
# Create a mask where all conditions are True
mask = (us_pop_df['REGION'] != 0) & (us_pop_df['REGION'] != 999) & \
(us_pop_df['STATE'] != 0) & (us_pop_df['STATE'] != 999) & \
(us_pop_df['SEX'] != 0) & (us_pop_df['SEX'] != 999) & \
(us_pop_df['AGE'] != 999) & (us_pop_df['NAME'] != 'United States')
# Apply the mask to the DataFrame
us_pop_df = us_pop_df[mask]
# Change feature names to lower case
us_pop_df.columns = us_pop_df.columns.str.lower()
# Renaming feature population names
us_pop_df = us_pop_df.rename(columns={
'popest2015_civ': 'pop2015',
'popest2016_civ': 'pop2016',
'popest2017_civ': 'pop2017',
})
The total US population computed for the target years looks reasonable and in line with the actual population in those years, so the additional dataset is suitable for further analysis.
# Recalculate the total population for each year
total_population_2015 = us_pop_df['pop2015'].sum()
total_population_2016 = us_pop_df['pop2016'].sum()
total_population_2017 = us_pop_df['pop2017'].sum()
print("US estimated population for 2015:", total_population_2015)
print("US estimated population for 2016:", total_population_2016)
print("US estimated population for 2017:", total_population_2017)
US estimated population for 2015: 319438649 US estimated population for 2016: 321738826 US estimated population for 2017: 323794924
Conclusion About Data Cleaning and Validation processes¶
Since the Geopy library and one additional dataset were used to support the EDA process, the data cleaning involved extra steps, such as correcting city names and manually entering the latitudes and longitudes that Geopy could not find. Re-running the library would have been possible, however the running time is a bit over 41 minutes for 2,140 records, even with duplicated city names. The city names also had some spelling issues, and the fix was required to obtain the correct latitudes and longitudes. Some data transformations were carried out for features with missing values, along with a datatype conversion, as summarized below:
| Feature | Transformation |
|---|---|
| name | value added: Unknown |
| armed, gender, race, flee | value added: undetermined |
| age | value added: mean of the feature, affecting ~2% of all age values |
| date | converted to datetime type |
As for the additional dataset, no further cleaning was needed beyond dropping the features that, I believe, are not necessary for this project.
EDA - Exploratory Data Analysis¶
The initial step involves obtaining an overview of the dataset and examining the behavior of each feature, including the distribution of the data. This helps establish a foundation for investigating interesting patterns or trends within the dataset. Some features, such as name and date, will not be plotted because they take on too many distinct values.
The armed feature will be plotted right after this sequence, restricted to the top 10 most used weapons, due to the large number of different types.
# List of features to plot
features = ['manner_of_death', 'age', 'gender',
'race', 'signs_of_mental_illness',
'threat_level', 'flee', 'body_camera']
# List of categorical features
categorical_features = ['manner_of_death', 'gender',
'race', 'signs_of_mental_illness',
'threat_level', 'flee', 'body_camera']
# List of continuous features
continuous_features = ['age']
# Calculate the number of rows needed for the subplots
n_rows = int(np.ceil(len(features) / 2))
# Create a figure and a 2D array of subplots
fig, axs = plt.subplots(n_rows, 2, figsize=(15, n_rows*5))
# Flatten the 2D array of subplots into a 1D array
axs = axs.flatten()
# Plot each feature
for i, feature in enumerate(features):
    if feature in categorical_features:
        sns.countplot(data=main_df,
                      x=feature,
                      ax=axs[i],
                      hue=feature,
                      palette="coolwarm")
    elif feature in continuous_features:
        sns.histplot(data=main_df, x=feature,
                     ax=axs[i], kde=True, bins=20)
    axs[i].set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
# Get the counts of each weapon
weapon_counts = main_df['armed'].value_counts()
# Get the top 10
top_10_weapons = weapon_counts.head(10)
sns.barplot(x=top_10_weapons.index,
y=top_10_weapons.values,
hue=top_10_weapons.index,
palette='coolwarm')
# Title and labels
plt.title('Top 10 Most Used Weapons')
plt.xlabel('Weapon Type')
plt.xticks(rotation=60)
plt.ylabel('Count')
# Show the plot
plt.show()
# Calculate percentages of each body_camera type
for body_camera_type, count in main_df['body_camera'].value_counts().items():
    percentage = count / len(main_df) * 100
    print(f"{body_camera_type}: {count} ({percentage:.2f}%)")
False: 1911 (89.30%) True: 229 (10.70%)
Objective and Analysis goal¶
Based on the plots above, the analysis will focus on incidents where individuals were fatally shot, specifically male victims of the White, Hispanic and Black racial groups. It will exclude cases where signs of mental illness were reported and will prioritize instances where the threat level was identified as an attack and the victim was not fleeing. Moreover, the analysis will cover situations where police officers did not have body cameras, reflecting the absence of such technology in nearly 90% of recorded incidents. The most dangerous places will also be investigated, along with a map of the cities where the incidents occurred and a comparison with population figures.
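For reference, these criteria can be written as a boolean mask over the columns already in main_df; this is just a sketch of the stated objective and is not reused verbatim in the rest of the notebook.
# Sketch of the stated analysis criteria as a boolean mask
focus_mask = (
    (main_df['gender'] == 'M')
    & (main_df['race'].isin(['W', 'H', 'B']))
    & (~main_df['signs_of_mental_illness'])
    & (main_df['threat_level'] == 'attack')
    & (main_df['flee'] == 'Not fleeing')
    & (~main_df['body_camera'])
)
print("Incidents matching the stated criteria:", focus_mask.sum())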
To further analyze events by year and month, I will extract this data from date and add it to new additional features inside the dataframe.
# Convert the 'date' column to datetime format
main_df['date'] = pd.to_datetime(main_df['date'])
# Extract the year and month
main_df['year'] = main_df['date'].dt.year
main_df['month'] = main_df['date'].dt.month
Below, I will explore the incidents grouped by year to see the distribution of each year.
# Extract the year from the 'date' column
main_df['year'] = main_df['date'].dt.year
# Group the data by year and count the number of deaths
deaths_by_year = main_df.groupby('year').size()
# Plot the number of deaths by year
deaths_by_year.plot(kind='bar', edgecolor='none', figsize=(8, 4))
plt.xlabel('Year')
plt.ylabel('Number of Deaths')
plt.title('Number of Deaths by Year')
plt.show()
# Check the dates from 2017
df_2017 = main_df[main_df['date'].dt.year == 2017]
min_date = df_2017['date'].min()
max_date = df_2017['date'].max()
print('Minimum date:', min_date)
print('Maximum date:', max_date)
print('Total of records: ', len(df_2017))
Minimum date: 2017-01-01 00:00:00 Maximum date: 2017-02-28 00:00:00 Total of records: 187
Data inconsistency finding¶
As observed, the distribution appears relatively consistent between 2015 and 2016; however, a notable decline is evident in 2017. As the code above shows, only 187 events were recorded for 2017, and only up to 2017-02-28. The conclusion is that the 2017 data is incomplete, which explains the apparent drop.
The following plot aims to explore the distribution of gender among the most targeted racial groups. It reveals a predominant male presence across all three races: White, Black, and Hispanic.
# Filter the DataFrame to include only the specified races
filtered_df = main_df[main_df['race'].isin(['W', 'B', 'H'])]
# Create a count plot of the races, segmented by gender
plt.figure(figsize=(8, 4))
sns.countplot(data=filtered_df, x='race', edgecolor='none', hue='gender')
plt.title('Incidents by Race and Gender')
plt.xlabel('Race')
plt.ylabel('Count')
plt.legend(title='Gender')
plt.show()
Demographic analysis: Racial group¶
# Calculate the total number of events
total_events = len(main_df)
# Calculate the total for each race
race_filter = main_df['race'].isin(['W', 'B', 'H'])
race_totals = main_df[race_filter]['race'].value_counts()
# Calculate the percentages
race_percentages = race_totals / total_events * 100
# Print the totals and percentages
print(f"Total events: {total_events}")
for race, total in race_totals.items():
    print(f"Total for {race}: {total} ({race_percentages[race]:.2f}%)")
Total events: 2140 Total for W: 1041 (48.64%) Total for B: 540 (25.23%) Total for H: 367 (17.15%)
It's evident that 91% of the victims are from the White, Black, and Hispanic racial groups. Among these groups, Black and Hispanic individuals tend to be the youngest casualties, followed by Native Americans. The White, Black, and Hispanic groups also have a higher number of outliers, indicating victims with ages beyond the usual range observed in the dataset.
import matplotlib.cm as cm
# Calculate the distribution of races with threat level "Attack"
filtered_df_attack = filtered_df[filtered_df['threat_level'] == 'attack']
race_distribution_attack = filtered_df_attack['race'].value_counts()
# Calculate the percentages
race_percentages_attack = (race_distribution_attack /
race_distribution_attack.sum() * 100
)
# Create a horizontal bar plot with a color gradient
colors = cm.coolwarm(np.linspace(0, 1, len(race_distribution_attack)))
plt.figure(figsize=(8, 4)) # Create a new figure
plt.barh(race_distribution_attack.index,
race_distribution_attack,
color=colors)
plt.title('Distribution of Races with Threat Level "Attack"')
plt.xlabel('Count')
plt.ylabel('Race')
plt.gca().invert_yaxis() # Invert the y-axis
# Print the percentages on the plot
for i, race in enumerate(race_distribution_attack.index):
    percentage = race_percentages_attack[race]
    plt.text(race_distribution_attack[race] + 10, i,
             f'{percentage:.2f}%',
             va='center')
plt.show()
print("Race distirbution by threat level 'Attack' are :",
race_distribution_attack)
print("Total of threat level 'Attack' for these racial groups: ",
race_distribution_attack.sum())
percentage = (race_distribution_attack.sum() / len(main_df)) * 100
print("The percentage of 'attack' incidents involving the races 'W', "
f"'B', or 'H' is {percentage:.2f}%")
Race distribution by threat level 'Attack': race W 694 B 362 H 215 Name: count, dtype: int64 Total of threat level 'Attack' for these racial groups: 1271 The percentage of 'attack' incidents involving the races 'W', 'B', or 'H' is 59.39%
Taking into account the combined racial categories of White, Black, and Hispanic individuals, incidents classified as "attack" within this demographic account for more than 59% of all cases. As illustrated in the subsequent plots, within this demographic 91% of incidents occurred without the use of police body cameras. This aligns with the broader pattern observed across all cases, shown previously, where 89.30% of incidents lacked recorded footage from body cameras. This observation prompts questions about how law enforcement determined the presence or type of weapons wielded by victims, if any.
# Filter the DataFrame to include only the specified races and threat level
race_filter = main_df['race'].isin(['W', 'B', 'H'])
attack_df = main_df[race_filter & (main_df['threat_level'] == 'attack')]
# Calculate the distribution of body camera usage
body_camera_distribution = attack_df['body_camera'].value_counts()
print(body_camera_distribution)
body_camera False 1155 True 116 Name: count, dtype: int64
# Create a pie chart of the body camera usage distribution
body_camera_distribution.plot(kind='pie', figsize=(6, 4), autopct='%1.1f%%')
# Set the title of the plot
plt.title('Distribution of Body Camera Usage with Threat Level "Attack"')
# Show the plot
plt.show()
Exploring events locations¶
Now, I want to explore the 10 most dangerous cities within the dataset.
# Count the number of occurrences in each city
city_counts = main_df['city'].value_counts()
# Select the top 10 cities with the most occurrences
top_10_dangerous_cities = city_counts.nlargest(10)
# Create a DataFrame from the Series
df = top_10_dangerous_cities.reset_index()
df.columns = ['City', 'Number of Occurrences']
# Plot the top 10 most dangerous cities
plt.figure(figsize=(8, 4))
sns.barplot(x='City',
y='Number of Occurrences',
hue='City',
legend=False,
data=df,
dodge=False,
palette="flare")
plt.title('Top 10 Most Dangerous Cities')
plt.xlabel('City')
plt.xticks(rotation=45)
plt.ylabel('Number of Occurrences')
plt.show()
Now we will look at the states and the USA overall, to see how the events are distributed.
# Group the data by year and state and count the number of occurrences
occurrences_by_year_state = main_df.groupby(
['year', 'state']).size()
# Unstack the data for plotting
occurrences_by_year_state_unstacked = (
occurrences_by_year_state.unstack(level=0)
)
# Plot the number of occurrences for each state for each year
occurrences_by_year_state_unstacked.plot(
kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('State')
plt.ylabel('Number of Occurrences')
plt.title('Number of Occurrences by State by Year')
plt.show()
Since GitHub does not render the interactive plot, a static image of the following code's output is placed here:
We can see, confirming the plot above, that California, Texas and Florida have the highest number of occurrences, followed by Arizona.
# Count the number of occurrences for each state
state_counts = main_df['state'].value_counts().reset_index()
state_counts.columns = ['state', 'count']
# Create a choropleth map
fig = px.choropleth(state_counts,
locations='state',
locationmode="USA-states",
color='count',
hover_name='state',
color_continuous_scale="Viridis",
title='Number of Occurrences by State',
scope='usa')
# Adjust the map's margins (top, bottom, left, right)
fig.update_layout(
autosize=False,
margin=dict(t=50, b=50, l=50, r=50),
paper_bgcolor="LightSteelBlue",
uirevision='constant' # preserve user's interactions
)
fig.show()
To investigate the events further by location, the latitudes and longitudes are useful for a more precise understanding of the cities where the incidents occurred. The additional dataset can be merged into our analysis to include the regions, which will ease the visualization by region. The code below adds the state_id column (since it is missing from that dataset) and then merges it with the main dataset currently used in the analysis.
state_ids = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY'
}
for state_name, state_id in state_ids.items():
    us_pop_df.loc[us_pop_df['name'] == state_name, 'state_id'] = state_id
us_pop_df.head()
| region | state | name | sex | age | pop2015 | pop2016 | pop2017 | state_id | |
|---|---|---|---|---|---|---|---|---|---|
| 348 | 3 | 1 | Alabama | 1 | 0 | 30028 | 29856 | 29701 | AL |
| 349 | 3 | 1 | Alabama | 1 | 1 | 29846 | 30152 | 29940 | AL |
| 350 | 3 | 1 | Alabama | 1 | 2 | 29463 | 30047 | 30298 | AL |
| 351 | 3 | 1 | Alabama | 1 | 3 | 29896 | 29530 | 30103 | AL |
| 352 | 3 | 1 | Alabama | 1 | 4 | 30557 | 29903 | 29636 | AL |
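As a side note, the per-state loop above could be replaced by a single vectorized mapping; a small equivalent sketch using the same state_ids dictionary:
# Equivalent one-liner: map full state names to their two-letter codes
us_pop_df['state_id'] = us_pop_df['name'].map(state_ids)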
This merge will add the regions to the main dataframe.
us_pop_region_df = (
us_pop_df.groupby('state_id')['region']
.first()
.reset_index()
)
# Set 'state_id' as the index of us_pop_region_df for the merge
us_pop_region_df.set_index('state_id', inplace=True)
main_df = main_df.merge(
us_pop_region_df,
left_on='state',
right_index=True,
how='left',
validate="many_to_many")
Geographical US map with event locations¶
Since GitHub does not render the interactive map, it can be viewed by accessing the link here. To interpret the colors in this map, associate them with the regions as shown in the table below:
| Region | Color |
|---|---|
| Northeast | Green |
| Midwest | Yellow |
| South | Orange |
| West | Red |
# Create a map centered on the United States
map = folium.Map(location=[44.582480, -103.529343],
zoom_start=2,
min_zoom=3,
max_zoom=15)
map.get_root().html.add_child(
folium.Element('<h3 style="text-align:center; color:blue;">'
'Police Shootings Locations by Region</h3>')
)
# Color scale for the regions
color_scale = folium.StepColormap(
colors=['green', 'yellow', 'orange', 'red'],
vmin=1,
vmax=4,
caption='Region'
)
# Add the color scale to the map
map.add_child(color_scale)
# Dictionary to map region numbers to names
region_numbers = {'Northeast': 1, 'Midwest': 2, 'South': 3, 'West': 4}
number_regions = {1: 'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'}
# Iterate over the rows of main_df
for index, row in main_df.iterrows():
    # Get the state, city and region values
    state = row['state']
    city = row['city']
    region = row['region']
    # Latitude and longitude of the event
    latitude = row['latitude']
    longitude = row['longitude']
    # Check if the region exists in the dictionary before accessing it
    if region in number_regions:
        color = color_scale(region)
    else:
        # Fall back to a neutral colour for unmapped regions
        color = 'gray'
        print(f"Region {region} not found in number_regions dictionary.")
    # Define the marker
    marker = folium.CircleMarker(location=[latitude, longitude],
                                 radius=5,
                                 color=color,
                                 fill=True,
                                 fill_color=color,
                                 fill_opacity=0.6,
                                 tooltip=f"{city}, {state}")
    # Add the marker to the map
    marker.add_to(map)
# Display the map
map
# Save the map as an HTML file
# map.save('./assets/data/image/events_region_usa.html')
Targeted racial groups by regions¶
As we can see, the distribution of the racial groups by region shows the impact on each race. Even though California (in the West region) is the state with the most incidents, overall the South leads in the number of events. Of the four US regions, the South and the West together account for 76% of all events, with the South having the larger share.
# Fill missing values in the 'region' column with a default value
main_df['region'] = main_df['region'].fillna('Unknown')
# Define a dictionary to map region numbers to names
region_dict = {1: 'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'}
# Replace the numeric values in the 'region' column
main_df['region'] = main_df['region'].replace(region_dict)
# Define a dictionary to map region names to colors
color_dict = {'Northeast': '#FEB24C',
'Midwest': '#FC4E2A',
'South': '#67001F',
'West': '#BD0026',
'Unknown': '#808080'} # color for the 'Unknown' region
# Create a list of colors for the regions in the same order
colors = main_df['region'].map(color_dict)
# Now you can create your plot with the custom colors
sns.countplot(data=main_df,
x='race',
hue='region',
palette=color_dict)
plt.title('Count of Races by Region')
plt.xlabel('Race')
plt.ylabel('Count')
plt.legend(title='Region')
plt.show()
It is interesting to see in this plot that the Hispanic group has more incidents in the West than in other regions, unlike the Black and White groups, whose counts are highest in the South. A similar pattern is also seen for the Asian, Native, Other and Undetermined groups.
# Group the DataFrame by 'race' and 'region' and count the number of events
event_counts = main_df.groupby(
['race', 'region']
).size().reset_index(name='count')
# Sort the DataFrame by 'count' in descending order and get the first 10 rows
top_10_event_counts = event_counts.sort_values(
by='count', ascending=False
).head(10)
# Print the event counts
print(top_10_event_counts)
race region count 22 W South 461 23 W West 312 6 B South 274 11 H West 244 20 W Midwest 197 4 B Midwest 119 10 H South 90 7 B West 87 21 W Northeast 71 5 B Northeast 60
# Count the total number of events for regions
region_counts = main_df[main_df['region'].isin(['South',
'West',
'Midwest',
'Northeast'])]
region_counts = region_counts.groupby('region').size()
# Count the total number of unique IDs in main_df
total_ids = main_df.index.nunique()
# Divide the region counts by the total number of unique IDs
region_counts_normalized = round(region_counts / total_ids * 100, 2)
# Print the normalized region counts
print(region_counts_normalized)
region Midwest 16.45 Northeast 7.48 South 41.21 West 34.86 dtype: float64
'Age' feature and group analysis¶
sns.boxplot(y=main_df['age'])
plt.title('Distribution of Age')
plt.ylabel('Age')
plt.show()
Some outliers can be seen in the plot above. Below, we plot the distribution of age by race to see whether any trends can be spotted, correlating these two features to understand more about the data distribution, outliers and averages.
sns.boxplot(x='race', y='age', hue='race', data=main_df)
plt.title('Distribution of Age by Race')
plt.xlabel('Race')
plt.ylabel('Age')
plt.show()
The mean age of the overall population is ~36.5, but when looking at specific racial and ethnic groups such as Hispanic, Black and Native, the average age is around 30 years. It is interesting to see that Black victims tend to be killed younger and also account for the majority of the age outliers across all groups. To investigate the events further, the following analysis compares age groups against the population of each state.
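That reading of the boxplot can also be checked numerically by grouping age by race; a quick sketch using the columns already in main_df:
# Summary statistics of age per racial group, to back the boxplot reading
age_by_race = (main_df.groupby('race')['age']
               .describe()[['count', 'mean', '50%']]
               .sort_values('mean'))
print(age_by_race.round(1))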
# Split the data by age: 25 and over vs. under 25
total_population = len(main_df)
over_25 = main_df.loc[main_df['age'] >= 25]
under_25 = main_df.loc[main_df['age'] < 25]
# Count the number of rows
num_over_25 = over_25.shape[0]
num_under_25 = under_25.shape[0]
# Percentage calculation
over_25_percentage = round(num_over_25 / total_population * 100, 2)
under_25_percentage = round(num_under_25 / total_population * 100, 2)
print("Total of individuals aged 25 and over: ", num_over_25)
print("Total of individuals under 25: ", num_under_25)
print("Percentage aged 25 and over: ", over_25_percentage)
print("Percentage under 25: ", under_25_percentage)
Total of individuals aged 25 and over: 1759 Total of individuals under 25: 381 Percentage aged 25 and over: 82.2 Percentage under 25: 17.8
main_df.describe().round(2)
| date | age | latitude | longitude | year | month | |
|---|---|---|---|---|---|---|
| count | 2140 | 2140.00 | 2140.00 | 2140.00 | 2140.00 | 2140.00 |
| mean | 2016-02-01 05:27:42.056075008 | 36.50 | 36.63 | -97.10 | 2015.62 | 6.06 |
| min | 2015-01-02 00:00:00 | 6.00 | 19.62 | -157.94 | 2015.00 | 1.00 |
| 25% | 2015-07-18 18:00:00 | 27.00 | 33.45 | -112.07 | 2015.00 | 3.00 |
| 50% | 2016-01-31 00:00:00 | 35.00 | 36.14 | -94.38 | 2016.00 | 6.00 |
| 75% | 2016-08-16 06:00:00 | 45.00 | 39.94 | -83.00 | 2016.00 | 9.00 |
| max | 2017-02-28 00:00:00 | 86.00 | 71.39 | -68.02 | 2017.00 | 12.00 |
| std | NaN | 12.74 | 5.25 | 16.68 | 0.64 | 3.58 |
I decided to group the ages using equal-width bins, to better understand the distribution of the victims' ages. Since the standard deviation is ~13, I chose bins of 20 years; with 10-year bins the data would be spread too thin. A quartile-based approach could also be used, but it would place an equal number of records in each group, and I preferred consistent intervals regardless of the distribution; quartile boundaries can also be influenced by extreme values. Since the ages go from 6 to 86, the groups are distributed as below (a sketch of the binning helper follows the table).
| Age | Group |
|---|---|
| 6 -> 25 | Young |
| 26 -> 45 | Adult |
| 46 -> 65 | Middle-Aged |
| 66 -> 86 | Senior |
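The classify_age_bins helper is imported from assets/utils/functions and is not reproduced here. A minimal sketch of what such a function could look like for these boundaries is shown below; the exact labels (e.g. 'Middle-aged') and the handling of ages under 6, which appear as "not grouped" in the population dataset, are assumptions inferred from the outputs in this notebook.
def classify_age_bins_sketch(age):
    """Map an age to one of the equal-width groups used in this analysis."""
    if age < 6:
        return 'not grouped'  # population rows also cover ages 0-5
    if age <= 25:
        return 'Young'
    if age <= 45:
        return 'Adult'
    if age <= 65:
        return 'Middle-aged'
    return 'Senior'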
main_df['age_group'] = main_df['age'].apply(classify_age_bins)
main_df.head()
| name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | latitude | longitude | year | month | region | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||
| 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | A | Shelton | WA | True | attack | Not fleeing | False | 47.215094 | -123.100706 | 2015 | 1 | West | Middle-aged |
| 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | W | Aloha | OR | False | attack | Not fleeing | False | 45.491518 | -122.869420 | 2015 | 1 | West | Middle-aged |
| 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | H | Wichita | KS | False | other | Not fleeing | False | 37.692236 | -97.337545 | 2015 | 1 | Midwest | Young |
| 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | W | San Francisco | CA | True | attack | Not fleeing | False | 37.779259 | -122.419329 | 2015 | 1 | West | Adult |
| 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | H | Evans | CO | False | attack | Not fleeing | False | 40.377217 | -104.697763 | 2015 | 1 | West | Adult |
us_pop_df['age_group'] = us_pop_df['age'].apply(classify_age_bins)
us_pop_df.head()
| region | state | name | sex | age | pop2015 | pop2016 | pop2017 | state_id | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|
| 348 | 3 | 1 | Alabama | 1 | 0 | 30028 | 29856 | 29701 | AL | not grouped |
| 349 | 3 | 1 | Alabama | 1 | 1 | 29846 | 30152 | 29940 | AL | not grouped |
| 350 | 3 | 1 | Alabama | 1 | 2 | 29463 | 30047 | 30298 | AL | not grouped |
| 351 | 3 | 1 | Alabama | 1 | 3 | 29896 | 29530 | 30103 | AL | not grouped |
| 352 | 3 | 1 | Alabama | 1 | 4 | 30557 | 29903 | 29636 | AL | not grouped |
Below we can see how the individuals are distributed across the age groups defined above.
# Calculate value counts
age_group_counts = main_df['age_group'].value_counts()
# Get labels from the index of the Series
labels = age_group_counts.index
# Get sizes from the values of the Series
sizes = age_group_counts.values
# Create pie chart
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
# Add values to the legend
legend_labels = [f'{label}: {size}' for label, size in zip(labels, sizes)]
plt.legend(title='Age Group Counts',
labels=legend_labels,
bbox_to_anchor=(1.05, 1),
loc=2)
ax.title.set_text('Age Group Distribution')
plt.show()
Analysis for top 3 most dangerous States by age group - 2015 to 2017¶
As already mentioned in the Dependencies topic of the README.md file, some of the functions developed for this project are used in this part of the analysis.
In a nutshell, they were developed to extract data for a range of years (in this case 2015 to 2017), for pre-defined US states, here the three most dangerous ones (CA, TX and AZ), and for the most targeted age groups (Young, Adult and Middle-Aged). Other functions then group these data and plot them.
The last function, process_year_data, calls all the previous functions together.
The analysis is carried out per age group and per state, so that we can compare the behaviour across states, age groups and years together.
for year in range(2015, 2018):
    process_year_data(main_df, us_pop_df, year)
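These helper functions live in assets/utils/functions and are not reproduced here. As a rough illustration of the kind of computation they perform, normalising incident counts by state population per age group, a sketch could look like the one below; the function name and its internals are assumptions, and the project's actual helpers may differ.
def process_year_sketch(events, population, year,
                        states=('CA', 'TX', 'AZ'),
                        groups=('Young', 'Adult', 'Middle-aged')):
    """Incidents as a share of state population, per age group."""
    # Incidents for the selected year, states and age groups
    ev = events[(events['year'] == year)
                & (events['state'].isin(states))
                & (events['age_group'].isin(groups))]
    counts = ev.groupby(['state', 'age_group']).size()
    # Matching population estimates (columns pop2015 / pop2016 / pop2017)
    pop_col = f'pop{year}'
    pop = (population[population['state_id'].isin(states)
                      & population['age_group'].isin(groups)]
           .groupby(['state_id', 'age_group'])[pop_col].sum())
    pop.index = pop.index.set_names(['state', 'age_group'])
    # Percentage of the corresponding population involved in incidents
    return (counts / pop * 100).round(3)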
Just recapping, for this analysis we are considering the age group distribution for Young, Adult and Middle-Aged as follows:
| Age | Group |
|---|---|
| 6 -> 25 | Young |
| 26 -> 45 | Adult |
| 46 -> 65 | Middle-Aged |
If we consider the total population of each state in these years, the highest percentage is around 0.175 for adults in Arizona in 2016. Looking at the 2015 results, incidents involving adults are clearly the most frequent. However, in the 2016 data a considerable number of events involve the young population, even more than the middle-aged group. This is a significant finding, as it indicates that the young population is particularly vulnerable to these events, especially in Arizona and California.
EDA Conclusion¶
- Focus on Fatal Incidents and Specific Demographics: The analysis primarily focuses on incidents involving fatal shootings, specifically targeting male victims of white, Hispanic, and black races. Cases involving mental illness, fleeing victims, and incidents where the threat level was not an attack are excluded from the examination.
- Absence of Body Cameras: Nearly 90% of recorded incidents lack police body camera footage, highlighting the prevalent absence of this technology in law enforcement encounters.
- Racial Distribution and Age Groups: White, black, and Hispanic racial groups account for 91% of the victims, with black and Hispanic individuals tending to be the youngest casualties. Outliers in age distribution are observed within these racial groups, indicating victims with ages beyond the usual range.
- Demographic Analysis: More than 59% of attacks originate from the combined racial categories of white, black, and Hispanic individuals. The majority of incidents within this demographic occur without the use of police body cameras, raising questions about weapon identification by law enforcement.
- Regional Impact and Dangerous Cities: The South and West regions lead in the number of events, with California identified as the most dangerous state, in the West region. The distribution of racial groups by region highlights the impact on each race, with the South experiencing the higher number of events.
- Age Group Analysis: Age groups were categorized using equal-width bins to better understand the age distribution of victims, considering a bin width of 20 years for consistency.
- Analysis of Top 3 Most Dangerous States by Age Group - 2015 to 2017: Arizona and California show a higher percentage of incidents involving adults, indicating a higher frequency of events in this age group. The young population, particularly in Arizona and California, is also vulnerable to these events, with a considerable number of incidents reported compared to other age groups.
- Overall: The analysis provides valuable insights into the demographics, regional impact, age distribution, and prevalence of incidents without body camera footage, shedding light on key factors influencing fatal shootings and law enforcement encounters.
Project Conclusion¶
Overall, I think the project had a good outcome. Working across different datasets is an interesting and enriching process for learning and for exploring different approaches. Creating a new dataset, because Geopy takes too long to fetch latitudes and longitudes, was a good lesson: it avoids the loading time and also leaves the original data unchanged. I have not explored further data analysis concepts such as clustering or machine learning, mainly due to limited knowledge and time constraints. The functions created can also be refactored to be more efficient and reusable, which is a good takeaway for future projects, and some of the code, such as the mappings, could be turned into functions as well.