# Load necessary libraries
library(tidyverse) # Includes ggplot2, dplyr, and others for data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf) # For handling spatial data
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(leaflet) # For interactive maps
library(dbscan) # For density-based clustering
## Warning: package 'dbscan' was built under R version 4.3.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(ggthemes) # For extra plot themes
# Read the data from a CSV file
crime_df <- read.csv("crime23.csv")
# Explore the structure of the data
names(crime_df)
## [1] "category" "persistent_id" "date" "lat"
## [5] "long" "street_id" "street_name" "context"
## [9] "id" "location_type" "location_subtype" "outcome_status"
str(crime_df)
## 'data.frame': 6878 obs. of 12 variables:
## $ category : chr "anti-social-behaviour" "anti-social-behaviour" "anti-social-behaviour" "anti-social-behaviour" ...
## $ persistent_id : chr "" "" "" "" ...
## $ date : chr "2023-01" "2023-01" "2023-01" "2023-01" ...
## $ lat : num 51.9 51.9 51.9 51.9 51.9 ...
## $ long : num 0.909 0.902 0.898 0.902 0.895 ...
## $ street_id : int 2153366 2153173 2153077 2153186 2153012 2153379 2153105 2153541 2152937 2153107 ...
## $ street_name : chr "On or near Military Road" "On or near " "On or near Culver Street West" "On or near Ryegate Road" ...
## $ context : logi NA NA NA NA NA NA ...
## $ id : int 107596596 107596646 107595950 107595953 107595979 107595985 107596603 107596291 107596305 107596453 ...
## $ location_type : chr "Force" "Force" "Force" "Force" ...
## $ location_subtype: chr "" "" "" "" ...
## $ outcome_status : chr NA NA NA NA ...
# Check for missing values in each column
missing_values <- colSums(is.na(crime_df))
print(missing_values)
## category persistent_id date lat
## 0 0 0 0
## long street_id street_name context
## 0 0 0 6878
## id location_type location_subtype outcome_status
## 0 0 0 677
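Note that is.na() only catches genuine NA values; as the str() output above shows, some character columns (persistent_id, location_subtype) record missingness as empty strings instead. A quick companion check, as a sketch (not run here):
# Count empty-string entries per column (complements the NA check above)
empty_values <- colSums(crime_df == "", na.rm = TRUE)
print(empty_values)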
- DATA CLEANING AND FEATURE ENGINEERING
# Clean and prepare the data in a single pipeline
crime_clean <- crime_df %>%
# Convert the date column from 'YYYY-MM' strings to Date objects (first day of each month)
mutate(date = as.Date(paste0(date, "-01"))) %>%
# Remove rows with missing location data, which are crucial for mapping
drop_na(lat, long) %>%
# Engineer temporal features for deeper analysis
mutate(
month = fct_reorder(format(date, "%B"), as.numeric(format(date, "%m"))), # Ordered Month Factor
season = case_when(
month %in% c("December", "January", "February") ~ "Winter",
month %in% c("March", "April", "May") ~ "Spring",
month %in% c("June", "July", "August") ~ "Summer",
TRUE ~ "Autumn"
)
)
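As a quick sanity check on the engineered features (a sketch, not run here), the month-to-season mapping can be listed to confirm that each of the 12 months landed in the expected season:
# Each month should appear exactly once, paired with a single season
crime_clean %>%
  distinct(month, season) %>%
  arrange(month)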
# Convert to a spatial object (sf) for geospatial analysis
# WGS 84 (EPSG:4326) is the standard for lat/long data
crime_sf <- st_as_sf(crime_clean, coords = c("long", "lat"), crs = 4326, na.fail = FALSE)
# Inspect the cleaned spatial data
glimpse(crime_sf)
## Rows: 6,878
## Columns: 13
## $ category <chr> "anti-social-behaviour", "anti-social-behaviour", "an…
## $ persistent_id <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ date <date> 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023…
## $ street_id <int> 2153366, 2153173, 2153077, 2153186, 2153012, 2153379,…
## $ street_name <chr> "On or near Military Road", "On or near ", "On or nea…
## $ context <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ id <int> 107596596, 107596646, 107595950, 107595953, 107595979…
## $ location_type <chr> "Force", "Force", "Force", "Force", "Force", "Force",…
## $ location_subtype <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ outcome_status <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ month <fct> January, January, January, January, January, January,…
## $ season <chr> "Winter", "Winter", "Winter", "Winter", "Winter", "Wi…
## $ geometry <POINT [°]> POINT (0.909136 51.88306), POINT (0.901681 51.9…
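Before mapping, it is worth confirming that every point falls in a plausible area; a sketch (not run here) using sf's bounding box helper:
# The bounding box should sit close to Colchester town centre
# (about 0.89 E, 51.90 N, the setView() centre used for the leaflet map later)
st_bbox(crime_sf)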
- WHAT ARE THE MAIN CRIMES?
# Calculate the percentage for the top crime category
total_incidents <- nrow(crime_sf)
violent_crime_count <- crime_sf %>%
filter(category == "violent-crime") %>%
nrow()
violent_crime_percentage <- round((violent_crime_count / total_incidents) * 100, 1)
# Bar chart of crime categories, ordered by frequency
ggplot(crime_sf, aes(y = fct_rev(fct_infreq(category)))) +
geom_bar(fill = "steelblue") +
geom_text(stat = 'count', aes(label = after_stat(count)), hjust = -0.2) +
scale_x_continuous(expand = expansion(mult = c(0, 0.1))) +
labs(
title = "Violent Crime Dominates Incidents in Colchester (2023)",
subtitle = paste0("'Violent and sexual offences' account for ", violent_crime_percentage, "% of all recorded incidents."),
x = "Number of Incidents",
y = "Crime Category"
) +
theme_minimal(base_size = 12) +
theme(panel.grid.major.y = element_blank())
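The percentage computed above for violent crime generalises to every category; a sketch (not run here) of the full frequency table behind the chart:
# Count and share of each crime category, most common first
crime_sf %>%
  st_drop_geometry() %>%
  count(category, sort = TRUE) %>%
  mutate(percentage = round(n / sum(n) * 100, 1))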

- WHEN DOES CRIME HAPPEN (TEMPORAL PATTERNS)?
# Identify the top 6 crime categories to avoid a cluttered plot
top_6_crimes <- crime_sf %>%
sf::st_drop_geometry() %>%
count(category, sort = TRUE) %>%
top_n(6) %>%
pull(category)
## Selecting by n
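The 'Selecting by n' message appears because top_n() has to infer its ranking column; top_n() is superseded in current dplyr, so an equivalent, message-free version (a sketch giving the same result) would use slice_max():
# Equivalent to top_n(6): keep the 6 most frequent categories
top_6_crimes <- crime_sf %>%
  sf::st_drop_geometry() %>%
  count(category, sort = TRUE) %>%
  slice_max(n, n = 6, with_ties = FALSE) %>%
  pull(category)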
# Filter for these top categories and plot their trends
crime_sf %>%
filter(category %in% top_6_crimes) %>%
count(month, category) %>%
ggplot(aes(x = month, y = n, group = category, color = category)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
facet_wrap(~ category, ncol = 2, scales = "free_y") +
labs(
title = "Crime Incidents Peak in Summer Months",
subtitle = "Trends for the top 6 crime categories in Colchester, 2023",
x = "Month",
y = "Number of Incidents"
) +
theme_light() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none" # Facet titles make a legend redundant
)
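To back up the 'peak in summer' reading of the facets, the overall monthly totals (all categories combined) can be tabulated; a sketch, not run here:
# Total incidents per month across all categories
crime_sf %>%
  st_drop_geometry() %>%
  count(month, name = "incidents") %>%
  arrange(desc(incidents))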

- GEOSPATIAL ANALYSIS: FINDING HOTSPOTS
# Convert to British National Grid (EPSG:27700) for distance calculations
crime_sf_proj <- st_transform(crime_sf, crs = 27700)
coords <- st_coordinates(crime_sf_proj)
# Run DBSCAN with the chosen settings
# eps = 120: search radius of 120 meters
# minPts = 25: a cluster must contain at least 25 incidents
db <- dbscan(coords, eps = 120, minPts = 25)
# Add cluster assignments to our data. Cluster '0' represents noise (isolated incidents)
crime_sf_proj$cluster <- as.factor(db$cluster)
# --- IMPORTANT DIAGNOSTIC STEP ---
# The table below shows how many incidents fall in each cluster.
# If cluster '0' (noise) dominates and clusters 1, 2, 3, ... are tiny or absent, the
# parameters above need more tuning. A good result has several clusters, each with a
# meaningful number of incidents (an eps-selection sketch follows the cluster table below).
print("Cluster Distribution:")
## [1] "Cluster Distribution:"
print(table(crime_sf_proj$cluster))
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 507 769 144 3533 66 112 50 31 164 226 49 106 42 45 29 49
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 95 50 61 54 66 32 26 45 36 35 37 42 53 70 48 27
## 32 33 34 35 36
## 43 28 29 30 49
# -----------------------------------
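If the parameters do need tuning, a standard aid is the k-nearest-neighbour distance plot from the dbscan package: a sensible eps usually sits near the 'elbow' of that curve. A sketch (not run here; k = 25 simply mirrors minPts), together with the share of points labelled as noise:
# Share of incidents labelled as noise (cluster 0) under the current settings
mean(db$cluster == 0)
# k-NN distance plot: look for the elbow; the dashed line marks the current eps
kNNdistplot(coords, k = 25)
abline(h = 120, lty = 2)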
# Transform the data BACK to the standard GPS format (CRS 4326) for Leaflet
crime_sf_for_map <- st_transform(crime_sf_proj, crs = 4326)
# Create the interactive map of the DBSCAN clusters
# Note: colorFactor stretches these five colors across all cluster IDs (including noise, cluster 0)
pal <- colorFactor(c("grey", "#FF5733", "#3357FF", "#F1C40F", "#8E44AD"), domain = crime_sf_for_map$cluster)
leaflet(crime_sf_for_map) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
radius = 4,
color = ~pal(cluster),
stroke = FALSE,
fillOpacity = 0.7,
popup = ~paste("<b>Category:</b>", category, "<br>", "<b>Cluster ID:</b>", cluster)
) %>%
addLegend("bottomright", pal = pal, values = ~cluster, title = "Crime Cluster ID") %>%
setView(lng = 0.8919, lat = 51.8959, zoom = 13)
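With over 30 clusters the five palette colors are necessarily spread across many IDs, so a density heatmap can be an easier-to-read companion view. A sketch (not run here), assuming the leaflet.extras package (not loaded above) is available:
# Heatmap of raw incident locations as an alternative hotspot view
coords_ll <- st_coordinates(crime_sf_for_map)
leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  leaflet.extras::addHeatmap(
    lng = coords_ll[, "X"], lat = coords_ll[, "Y"],
    radius = 10, blur = 20
  ) %>%
  setView(lng = 0.8919, lat = 51.8959, zoom = 13)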
- WHAT CRIMES HAPPEN IN WHICH HOTSPOTS?
# First, identify the 4 largest clusters (excluding noise, cluster '0')
top_4_clusters <- crime_sf_proj %>%
sf::st_drop_geometry() %>%
filter(cluster != "0") %>%
count(cluster, sort = TRUE) %>%
top_n(4) %>%
pull(cluster)
## Selecting by n
# Now, create the analysis ONLY for these top clusters
cluster_analysis <- crime_sf_proj %>%
sf::st_drop_geometry() %>%
filter(cluster %in% top_4_clusters) %>% # Filter for only the biggest clusters
count(cluster, category, sort = TRUE) %>%
group_by(cluster) %>%
mutate(percentage = round(n / sum(n) * 100, 1)) %>%
top_n(3, n) # Get top 3 crimes for each
# Visualize the composition of the top 4 clusters
ggplot(cluster_analysis, aes(x = percentage, y = fct_reorder(category, percentage), fill = category)) +
geom_col(show.legend = FALSE) +
# Use facet_wrap on our cleaner data
facet_wrap(~ paste("Cluster", cluster), ncol = 2) +
labs(
title = "Crime Profile of Major Hotspots",
subtitle = "Comparing the top 3 crimes in the 4 largest clusters",
x = "Percentage of Incidents within Each Cluster",
y = "Crime Category"
) +
theme_light(base_size = 12)
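To relate these hotspot profiles back to places on the map, a rough follow-up (a sketch, not run here; hotspot_centres is just an illustrative name) is to take the centre of each top cluster and convert it back to latitude/longitude:
# Approximate centre of each top hotspot, back in WGS 84 for easy lookup
hotspot_centres <- crime_sf_proj %>%
  filter(cluster %in% top_4_clusters) %>%
  group_by(cluster) %>%
  summarise() %>%      # unions the points within each cluster
  st_centroid() %>%    # centre of the unioned geometry
  st_transform(4326)
st_coordinates(hotspot_centres)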
