In [1]:
import altair as alt
import pandas as pd
# import gdown
import geopandas as gpd
import math
In [2]:
# Load violence reduction data (Chicago victimization dataset hosted on Google Drive).
violence_df = pd.read_csv('https://drive.google.com/uc?id=1IVyt5fFTInsmeTOV9T6RILB5XLM2Gc5R')

# Show the available columns, then preview the first rows.
# FIX: previously `.head()` appeared mid-cell before the print, so its result
# was computed and silently discarded — only the last expression of a cell is
# rendered. The preview is now the final expression so it actually displays.
print(violence_df.columns)
violence_df.head()
Index(['CASE_NUMBER', 'DATE', 'BLOCK', 'VICTIMIZATION_PRIMARY',
       'INCIDENT_PRIMARY', 'GUNSHOT_INJURY_I', 'UNIQUE_ID', 'ZIP_CODE', 'WARD',
       'COMMUNITY_AREA', 'STREET_OUTREACH_ORGANIZATION', 'AREA', 'DISTRICT',
       'BEAT', 'AGE', 'SEX', 'RACE', 'VICTIMIZATION_FBI_CD', 'INCIDENT_FBI_CD',
       'VICTIMIZATION_FBI_DESCR', 'INCIDENT_FBI_DESCR',
       'VICTIMIZATION_IUCR_CD', 'INCIDENT_IUCR_CD',
       'VICTIMIZATION_IUCR_SECONDARY', 'INCIDENT_IUCR_SECONDARY',
       'HOMICIDE_VICTIM_FIRST_NAME', 'HOMICIDE_VICTIM_MI',
       'HOMICIDE_VICTIM_LAST_NAME', 'MONTH', 'DAY_OF_WEEK', 'HOUR',
       'LOCATION_DESCRIPTION', 'STATE_HOUSE_DISTRICT', 'STATE_SENATE_DISTRICT',
       'UPDATED', 'LATITUDE', 'LONGITUDE', 'LOCATION'],
      dtype='object')
In [3]:
# Quick look at the AREA column (float dtype per the output below — presumably
# missing values forced floats; TODO confirm upstream). Last expression, so the
# Series renders as the cell output.
violence_df['AREA']
Out[3]:
0        3.0
1        2.0
2        2.0
3        1.0
4        2.0
        ... 
57343    1.0
57344    1.0
57345    1.0
57346    1.0
57347    1.0
Name: AREA, Length: 57348, dtype: float64
In [4]:
# Load vacancy and abandoned-building citation data (hosted on Google Drive).
vacancy_df = pd.read_csv('https://drive.google.com/uc?id=1ysYE_S9b3pEuTa2Oj6srR1M9BNLL70BQ')

# Show the available columns, then preview the first rows.
# FIX: previously `.head()` appeared mid-cell before the print, so its result
# was computed and silently discarded; it is now the last expression so the
# preview actually displays.
print(vacancy_df.columns)

# NOTE(review): we don't have to use the vacancy data specifically, as long as
# the second dataset relates to crime in the city.
vacancy_df.head()
Index(['Docket Number', 'Violation Number', 'Issued Date',
       'Issuing Department', 'Last Hearing Date', 'Property Address',
       'Violation Type', 'Entity or Person(s)', 'Disposition Description',
       'Total Fines', 'Total Administrative Costs', 'Interest Amount',
       'Collection Costs or Attorney Fees', 'Court Cost',
       'Original Total Amount Due', 'Total Paid', 'Current Amount Due',
       'Latitude', 'Longitude', 'Location'],
      dtype='object')
In [5]:
# Inspect the Location column: WKT-style "POINT (lon lat)" strings (object dtype
# per the output below), not parsed geometries.
vacancy_df['Location']
Out[5]:
0        POINT (-87.62766855145455 41.72433163710899)
1       POINT (-87.70884424604687 41.919468897981616)
2        POINT (-87.63393730049602 41.65966224784699)
3        POINT (-87.65249718861547 41.79047358708863)
4        POINT (-87.65493156629385 41.80222309496589)
                            ...                      
4987     POINT (-87.68512937131078 41.76819724741188)
4988      POINT (-87.6498292763393 41.78117319861422)
4989     POINT (-87.68070798081595 41.77008677281995)
4990     POINT (-87.6850709395897 41.791241248062725)
4991     POINT (-87.68143791102256 41.80309671151574)
Name: Location, Length: 4992, dtype: object
In [6]:
# Inspect the COMMUNITY_AREA column: Chicago community-area names as uppercase
# strings (e.g. "EDGEWATER", "NEW CITY" in the output below).
violence_df['COMMUNITY_AREA']
Out[6]:
0                     EDGEWATER
1                   MORGAN PARK
2            WASHINGTON HEIGHTS
3                 MCKINLEY PARK
4                       CHATHAM
                  ...          
57343                  NEW CITY
57344           WASHINGTON PARK
57345           WASHINGTON PARK
57346    GREATER GRAND CROSSING
57347                  NEW CITY
Name: COMMUNITY_AREA, Length: 57348, dtype: object
In [43]:
# Sample the violence data because the full dataset (~57k rows) is too big to
# plot directly; the fixed random_state makes the sample reproducible.
sampled_violence_data = violence_df[['AGE', 'COMMUNITY_AREA']].sample(n=1000, random_state=20)

# FIX: removed a dead `age_counts = ...value_counts()...` computation that was
# never used anywhere in the notebook.

# Victimization counts per (community area, age group).
age_counts_by_area = sampled_violence_data.groupby(['COMMUNITY_AREA', 'AGE']).size().reset_index(name='Count')
age_counts_by_area.columns = ['COMMUNITY_AREA', 'Age Category', 'Count']

# Total victimization count per community area (used for ranking and sorting).
total_counts_by_area = sampled_violence_data.groupby('COMMUNITY_AREA').size().reset_index(name='TotalCount')

# Top 10 community areas by total count.
top_10_areas = total_counts_by_area.sort_values(by='TotalCount', ascending=False).head(10)['COMMUNITY_AREA']

# Keep only the top-10 areas, then attach each area's total so the chart can
# sort bars by overall volume.
filtered_age_counts_by_area = age_counts_by_area[age_counts_by_area['COMMUNITY_AREA'].isin(top_10_areas)]
filtered_age_counts_by_area = filtered_age_counts_by_area.merge(total_counts_by_area, on='COMMUNITY_AREA')

print(filtered_age_counts_by_area)

# Stacked horizontal bar chart: one bar per community area, segmented by age group.
ViolentAgeCount = alt.Chart(filtered_age_counts_by_area).mark_bar(size=30).encode(
    y=alt.Y('COMMUNITY_AREA:N',
            title='Community Area',
            # Order areas by their total victimization count, descending.
            sort=alt.EncodingSortField(
                field="TotalCount",
                op="sum",
                order="descending"
            ),
    ),
    x=alt.X('Count:Q', title='Number of Violent Offenses'),
    color=alt.Color('Age Category:O', title='Age Group'),
    # Stack segments in a consistent age order within each bar.
    order=alt.Order('Age Category:O', sort='ascending'),
).properties(
    height=400,
    title = 'Top 10 Community Areas: Violent Crime Counts by Age Group'
)

ViolentAgeCount
        COMMUNITY_AREA Age Category  Count  TotalCount
0       AUBURN GRESHAM         0-19      9          40
1       AUBURN GRESHAM        20-29     17          40
2       AUBURN GRESHAM        30-39      8          40
3       AUBURN GRESHAM        40-49      2          40
4       AUBURN GRESHAM        50-59      3          40
..                 ...          ...    ...         ...
56  WEST GARFIELD PARK         0-19      4          41
57  WEST GARFIELD PARK        20-29     14          41
58  WEST GARFIELD PARK        30-39     15          41
59  WEST GARFIELD PARK        40-49      4          41
60  WEST GARFIELD PARK        50-59      4          41

[61 rows x 4 columns]
Out[43]:
In [8]:
# Re-load both datasets fresh from Google Drive so this section does not depend
# on transformations done in earlier cells.
violence_df = pd.read_csv('https://drive.google.com/uc?id=1IVyt5fFTInsmeTOV9T6RILB5XLM2Gc5R')
vacancy_df = pd.read_csv("https://drive.google.com/uc?id=1ysYE_S9b3pEuTa2Oj6srR1M9BNLL70BQ")

# Parse the citation issue date once, then derive the calendar year from it.
issued_dates = pd.to_datetime(vacancy_df['Issued Date'])
vacancy_df['Issued Date'] = issued_dates
vacancy_df['Year'] = issued_dates.dt.year

# Confirm the column layout of both frames.
print(violence_df.columns)
print(vacancy_df.columns)
Index(['CASE_NUMBER', 'DATE', 'BLOCK', 'VICTIMIZATION_PRIMARY',
       'INCIDENT_PRIMARY', 'GUNSHOT_INJURY_I', 'UNIQUE_ID', 'ZIP_CODE', 'WARD',
       'COMMUNITY_AREA', 'STREET_OUTREACH_ORGANIZATION', 'AREA', 'DISTRICT',
       'BEAT', 'AGE', 'SEX', 'RACE', 'VICTIMIZATION_FBI_CD', 'INCIDENT_FBI_CD',
       'VICTIMIZATION_FBI_DESCR', 'INCIDENT_FBI_DESCR',
       'VICTIMIZATION_IUCR_CD', 'INCIDENT_IUCR_CD',
       'VICTIMIZATION_IUCR_SECONDARY', 'INCIDENT_IUCR_SECONDARY',
       'HOMICIDE_VICTIM_FIRST_NAME', 'HOMICIDE_VICTIM_MI',
       'HOMICIDE_VICTIM_LAST_NAME', 'MONTH', 'DAY_OF_WEEK', 'HOUR',
       'LOCATION_DESCRIPTION', 'STATE_HOUSE_DISTRICT', 'STATE_SENATE_DISTRICT',
       'UPDATED', 'LATITUDE', 'LONGITUDE', 'LOCATION'],
      dtype='object')
Index(['Docket Number', 'Violation Number', 'Issued Date',
       'Issuing Department', 'Last Hearing Date', 'Property Address',
       'Violation Type', 'Entity or Person(s)', 'Disposition Description',
       'Total Fines', 'Total Administrative Costs', 'Interest Amount',
       'Collection Costs or Attorney Fees', 'Court Cost',
       'Original Total Amount Due', 'Total Paid', 'Current Amount Due',
       'Latitude', 'Longitude', 'Location', 'Year'],
      dtype='object')
In [9]:
# Parse the date columns in both datasets to proper datetimes.
vacancy_df['Issued Date'] = pd.to_datetime(vacancy_df['Issued Date'])
violence_df['DATE'] = pd.to_datetime(violence_df['DATE'])

# Derive a calendar-year column for each dataset.
vacancy_df['Year'] = vacancy_df['Issued Date'].dt.year
violence_df['Year'] = violence_df['DATE'].dt.year

# Count records per year in each dataset, then align the two series on Year;
# an outer join keeps years that appear in only one of the datasets.
crime_by_year = violence_df.groupby('Year').size().reset_index(name='Number of Crimes')
vacancy_by_year = vacancy_df.groupby('Year').size().reset_index(name='Number of Vacancy Citations')
merged_data = pd.merge(crime_by_year, vacancy_by_year, on="Year", how="outer")

# Restrict the comparison to the 2018-2022 window.
in_range = merged_data['Year'].between(2018, 2022)
filtered_data = merged_data[in_range]
filtered_data

# Largest count across both measures, for a consistent y-axis range.
max_value = max(filtered_data['Number of Crimes'].max(), filtered_data['Number of Vacancy Citations'].max())
In [10]:
# Year-over-year absolute percent change in violent-crime counts, 2018-2022.
# Counts are transcribed from the filtered_data table displayed below.
# FIX: the |(old - new) / old| * 100 formula was copy-pasted four times with
# hand-typed constants; compute all four changes from one list of counts so the
# formula exists in exactly one place. Variable names are unchanged (they are
# consumed by the time-series cell further down).
crime_counts = [2972, 2689, 4165, 4473, 3575]  # 2018..2022
crime_changes = [abs((old - new) / old * 100) for old, new in zip(crime_counts, crime_counts[1:])]
percentageCrime1819, percentageCrime1920, percentageCrime2021, percentageCrime2122 = crime_changes

for change in crime_changes:
    print(change)

# Last expression: display the per-year counts the percentages were taken from.
filtered_data
9.522207267833108
54.89029378951283
7.394957983193278
20.076011625307398
Out[10]:
Year Number of Crimes Number of Vacancy Citations
27 2018 2972 194.0
28 2019 2689 129.0
29 2020 4165 63.0
30 2021 4473 41.0
31 2022 3575 53.0
In [11]:
# Year-over-year absolute percent change in vacancy-citation counts, 2018-2022,
# using the counts from the filtered_data table shown above.
# FIX: the |(old - new) / old| * 100 formula was copy-pasted four times with
# hand-typed constants; compute all four changes from one list of counts.
# Variable names are unchanged (they feed the time-series cell below).
vacancy_counts = [194.0, 129.0, 63.0, 41.0, 53.0]  # 2018..2022
vacancy_changes = [abs((old - new) / old * 100) for old, new in zip(vacancy_counts, vacancy_counts[1:])]
perctVac1819, perctVac1920, perctVac2021, perctVac2122 = vacancy_changes

for change in vacancy_changes:
    print(change)
33.50515463917525
51.162790697674424
34.92063492063492
29.268292682926827
In [12]:
# Assemble a small tidy frame for the time-series charts: one row per pair of
# consecutive years, with the percent change of each measure.
year_ranges = ['2018-2019', '2019-2020', '2020-2021', '2021-2022']
crime_change_values = [percentageCrime1819, percentageCrime1920, percentageCrime2021, percentageCrime2122]
vacancy_change_values = [perctVac1819, perctVac1920, perctVac2021, perctVac2122]

timeSeriesFrame = {
    'YearRange': year_ranges,
    'Percent Change Crimes': crime_change_values,
    'Percent Change Vacancy Citations': vacancy_change_values,
}

# Build and display the DataFrame.
dfSeries = pd.DataFrame(timeSeriesFrame)
print(dfSeries)
   YearRange  Percent Change Crimes  Percent Change Vacancy Citations
0  2018-2019               9.522207                         33.505155
1  2019-2020              54.890294                         51.162791
2  2020-2021               7.394958                         34.920635
3  2021-2022              20.076012                         29.268293
In [13]:
# TODO: add a brush/interval selection by year.

# Line chart: percent change in violent crimes between consecutive years.
# FIX: removed commented-out dead code (an unused tooltip and an abandoned
# color encoding) that cluttered both chart definitions.
crime_change = alt.Chart(dfSeries).mark_line().encode(
    x=alt.X('YearRange:O', title='Year Range'),
    y=alt.Y('Percent Change Crimes:Q', title='Percent Change', scale = alt.Scale(zero = False)),
    color=alt.value('steelblue'),
).properties(
    width = 300, 
    height = 300
)

# Line chart: percent change in vacancy citations, drawn on the same axes.
vacancy_change = alt.Chart(dfSeries).mark_line().encode(
    x=alt.X('YearRange:O', title='Year Range'),
    y=alt.Y('Percent Change Vacancy Citations:Q', title='Percent Change', scale = alt.Scale(zero = False)),
    color = alt.value('purple')
).properties(
    width = 300, 
    height = 300
)

# Layer the two lines, then add a shared title and keep x labels horizontal.
final_chart = crime_change + vacancy_change

final_chart2 = alt.hconcat(final_chart).properties(
    title=" % Change Violent Crimes & Vacancy Violations" 
).configure_axisX(
    labelAngle=0 
)

final_chart2
Out[13]:
In [70]:
 


# GeoJSON sources (hosted as gists): Chicago community-area boundaries,
# 2021 vacancy citations, and 2021 violent crimes.
url_chi = "https://gist.githubusercontent.com/naomiwag99/176d44066f9cd0b6df0536a46c811f9b/raw/c0f530e99b580e906e322cb436eac6dac0544172/ChicagoCommunityAreas.geojson"
chi = alt.Data(url=url_chi, format=alt.DataFormat(property='features', type='json'))

url_vacant21 = "https://gist.githubusercontent.com/mjk7817/4ebc7d4e625aeb5edf463b9cd5b4baaf/raw/349f32e0a0652d87be0bd55ec0fa8dc72dead8ed/vacant21.geojson"
vacant21 = alt.Data(url=url_vacant21, format=alt.DataFormat(property='features', type='json'))

url_violent21 = "https://gist.githubusercontent.com/mjk7817/f64ff6983dff41b9449fb06fe045e8c3/raw/d552c59e8df2346198c0fa1b24e2eafe3b46fecd/violent21.geojson"
violent21 = alt.Data(url=url_violent21, format=alt.DataFormat(property='features', type='json'))


# Base map: community areas in light gray with white borders; tooltip shows
# the community-area name from the GeoJSON properties.
chi_land = alt.Chart(chi).mark_geoshape(
    color='lightgray',
    stroke='white', 
).encode(
    tooltip = alt.Tooltip(['properties.community:N'], title = "Community Area: ")
).project(type="identity", reflectY=True).properties(
    width=600,
    height=600
)

# Vacant-building layer: purple shapes with owner/date tooltips; also carries
# the overall title for the layered map.
vacant21_land = alt.Chart(vacant21).mark_geoshape(
    color='purple',
    size=100
).encode(
    tooltip = ['properties.entity_or_person_s_:N', 'properties.issued_date:N']
).project(type="identity", reflectY=True).properties(
    width=600,
    height=600
).properties(
    title = "2021 Violent Crimes & Vacant Buildings in Chicago"
)

myColor = ['#E377C2', '#1f78b4', '#33a02c', '#e31a1c', '#ff7f00', '#6a3d9a', '#b15928'] #used ChatGPT to generate hex values here

# Dropdown filter on crime type; the None option shows every type.
dropdown = alt.binding_select(options=[None, 'BATTERY', 'HOMICIDE', 'ROBBERY'], name='Type of Crime: ')
selection = alt.selection_point(fields=['properties.incident_primary'], bind=dropdown)

# Selected points keep their category color; unselected points fall back to gray.
#used ChatGPT to learn how to use the range arguement for my custom scale
map_fill = alt.condition(selection, alt.Color('properties.incident_primary:N', scale=alt.Scale(range = myColor), title='Type of Crime'), alt.value('lightgray'))

# FIX: this previously referenced `crime_selection`, a name that is never
# defined anywhere in the notebook, so this cell raised NameError on a fresh
# run. The selection object is named `selection` (defined just above).
opacity_rule = alt.condition(
    selection,  # checks whether the point matches the dropdown selection
    alt.value(0.8),
    alt.value(0.1),
)

# Violent-crime layer: one circle per incident, colored by crime type, dimmed
# when it does not match the dropdown selection.
violent21_land = alt.Chart(violent21).mark_circle().encode(
    latitude='properties.latitude:Q',
    longitude='properties.longitude:Q',
    color=alt.Color('properties.incident_primary:N', scale=alt.Scale(range= myColor), title='Type of Crime'),
    tooltip=['properties.incident_primary:N', 'properties.location_description:N', 'properties.date:N'], 
    fill = map_fill, 
    opacity = opacity_rule
).project(
    type="identity", reflectY=True
).add_params(
    selection
)

# Compose: base map + crime points + vacancy shapes.
final_map = chi_land + violent21_land + vacant21_land
In [71]:
# Render the layered map (last expression of the cell displays the chart).
final_map 
Out[71]:
In [ ]: