---
title: "Read in TxDoT Crash data"
author: "Alan Jackson"
format:
html:
code-fold: true
code-tools: true
description: "Read in the crash data from TxDoT for Harris county"
date: "4/12/2025"
image: "cover.png"
categories:
- Mapping
- Supporting Activism
- Texas
- Harris County
execute:
freeze: auto # re-render only when source changes
warning: false
editor: source
---
```{r setup, include=FALSE}
library(tidyverse)
library(lubridate)
# F7$rA4&T1Lq5UL!a
inpath <- "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped"
path <- "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/"
# Crash Severity ID table
Severity <- c(
"0", "UNKNOWN",
"1", "SUSPECTED SERIOUS INJURY",
"2", "NON-INCAPACITATING",
"3", "POSSIBLE INJURY",
"4", "FATAL",
"5", "NOT INJURED",
"94", "REPORTED INVALID",
"95", "NOT REPORTED"
)
Severity <- cbind.data.frame(split(Severity, rep(1:2, times=length(Severity)/2)), stringsAsFactors=F)
names(Severity) <- c("Crash_Sev_ID", "Severity")
# Severity$Crash_Sev_ID <- as.integer(Severity$Crash_Sev_ID)
City <- c(
"29", "Baytown",
"30", "Beaumont",
"33", "Bellaire",
"57", "Bunker Hill Village",
"111", "Deer Park",
"135", "El Lago",
"160", "Friendswood",
"164", "Galena Park",
"208", "Houston",
"211", "Humble",
"212", "Hunters Creek Village",
"219", "Jacinto City",
"223", "Jersey Village",
"228", "Katy",
"244", "La Porte",
"255", "League City",
"260", "Liberty",
"296", "Missouri City",
"304", "Nassau Bay",
"325", "Pasadena",
"326", "Pearland",
"333", "Piney Point Village",
"359", "Richmond",
"383", "San Marcos",
"391", "Seabrook",
"405", "South Houston",
"408", "Spring Valley",
"409", "Stafford",
"422", "Taylor Lake Village",
"430", "Tomball",
"434", "Tyler",
"447", "Webster",
"453", "West University Place",
"478", "Aldine",
"625", "Channelview",
"647", "Cloverleaf",
"672", "Crosby",
"843", "Hedwig Village",
"851", "Highlands",
"855", "Hilshire Village",
"1061", "Mission Bend",
"1073", "Morgans Point",
"1326", "Shoreacres",
"1348", "Southside Place",
"1353", "Spring",
"1444", "Waller",
"1535", "Kingwood",
"1585", "Rural Brazoria County",
"1601", "Rural Chambers County",
"1644", "Rural Fort Bend County",
"1649", "Rural Galveston County",
"1665", "Rural Hardin County",
"1666", "Rural Harris County",
"1667", "Rural Harrison County",
"1668", "Rural Hartley County",
"1678", "Rural Houston County",
"1711", "Rural Liberty County",
"1719", "Rural Madison County",
"1735", "Rural Montgomery County",
"1926", "Alief",
"2071", "Atascosita",
"3313", "Cypress",
"4728", "Huffman",
"5659", "Meadows Place",
"6756", "Porter"
)
City <- cbind.data.frame(split(City, rep(1:2, times=length(City)/2)), stringsAsFactors=F)
names(City) <- c("Rpt_City_ID", "City")
# City$Rpt_City_ID <- as.integer(City$Rpt_City_ID)
# Crash Harm ID table
Harm <- c(
"1", "Pedestrian",
"2", "Motor Vehicle in Transport",
"3", "Rail Road",
"4", "Parked Car",
"5", "Pedal Cyclist",
"6", "Animal",
"7", "Fixed Object",
"8", "Other Object",
"9", "Other Non-Collision",
"10", "Overturned",
"11", "Not Reported",
"93", "Undetermined",
"94", "Reported Invalid"
)
Harm <- cbind.data.frame(split(Harm, rep(1:2, times=length(Harm)/2)), stringsAsFactors=F)
names(Harm) <- c("Harm_Evnt_ID", "Harmed")
# Charge Category ID table
Charge <- c(
"10", "ALCOHOL/DRUGS",
"11", "ASSAULT/MANSLAUGHTER WITH VEHICLE",
"12", "BICYCLE OFFENSES",
"13", "COMMERCIAL & MOTOR CARRIER VIOLATIONS",
"14", "DRIVER LICENSE & REGISTRATION VIOLATIONS",
"15", "DRIVER'S MISCELLANEOUS VIOLATIONS",
"16", "INSURANCE AND INSPECTION",
"17", "MOTORCYCLE VIOLATIONS",
"18", "OVERTAKING (PASSING)",
"19", "PARKING VIOLATIONS",
"20", "REPORTING VIOLATIONS (TRAFFIC VIOLATIONS)",
"21", "RIGHT OF WAY",
"22", "SEATBELT / HELMET USAGE",
"23", "SIGNAL INTENTION",
"24", "SPEEDING",
"25", "TRAFFIC SIGNS, SIGNALS, AND ROAD MARKINGS",
"26", "TURNING VIOLATIONS",
"27", "VEHICLE/EQUIPMENT DEFECTS",
"28", "WRONG SIDE/WRONG WAY",
"29", "OTHER",
"30", "NONE",
"93", "UNDETERMINED - FAIL BR",
"94", "REPORTED INVALID",
"95", "NOT REPORTED"
)
Charge <- cbind.data.frame(split(Charge, rep(1:2, times=length(Charge)/2)), stringsAsFactors=F)
names(Charge) <- c("Charge_Cat_ID", "Charge")
knitr::opts_chunk$set(echo = TRUE)
```
```{r Read files}
filenames <- list.files(path = inpath, pattern="*csv$")
filenames <- paste(inpath, filenames, sep="/")
DF <- read_csv(filenames[1], col_types = cols(.default = col_character()))
for (i in filenames[-1]) {
print(i)
df <- read_csv(i, col_types = cols(.default = col_character()))
DF <- full_join(DF, df)
}
df <- DF %>% select(Crash_ID, Crash_Fatal_Fl, Crash_Date, Crash_Time, Rpt_City_ID,
Rpt_Block_Num, Rpt_Street_Pfx, Rpt_Street_Name, Rpt_Street_Sfx, At_Intrsct_Fl,
Latitude, Longitude, Street_Name, Street_Nbr, Tot_Injry_Cnt, Death_Cnt,
Harm_Evnt_ID, Crash_Sev_ID)
df <- left_join(df, City, by="Rpt_City_ID")
df <- left_join(df, Severity, by="Crash_Sev_ID")
df <- left_join(df, Harm, by="Harm_Evnt_ID")
# Create datetime
df <- df %>%
mutate(Date=mdy_hm(paste(Crash_Date, Crash_Time)))
# df <- df %>%
# rename(City=City.x, Severity=Severity.x) %>%
# select(-City.y, -Severity.y)
# df <- left_join(df, Charge, by="Charge_Cat_ID")
# Save raw data
# saveRDS(DF, paste(path, "Raw_Crash_Data_2015_2024.rds", sep="/"))
```
Do some quality control checks
```{r Look for nulls}
# Look for NA's in the various fields
df %>%
map_df(function(x) sum(is.na(x))) %>%
gather(feature, num_nulls) %>%
print(n = 100)
```
```{r Look at Crash_Fatal_Fl}
sort(unique(df$Crash_Fatal_Fl))
```
```{r Look at Crash_Date}
sort(unique(df$Crash_Date)) %>% head()
# make it a date
df$Crash_Date <- mdy(df$Crash_Date)
df %>%
ggplot(aes(Crash_Date)) +
geom_histogram()
```
```{r Look at Crash_Time}
sort(unique(df$Crash_Time)) %>% head()
# make it a time
df <- df %>%
mutate(Crash_Time=ymd_hm(paste("2000/1/1",Crash_Time)))
df %>%
ggplot(aes(Crash_Time)) +
geom_histogram()
```
```{r Look at City ID}
sort(unique(df$Rpt_City_ID))
```
```{r Look at Rpt_Blk_Num}
sort(unique(df$Rpt_Block_Num)) %>% head()
# Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
# Remove non-numeric stuff
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, " BLOCK","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "O","0")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "^\\d+-","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "PP","00")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, " 1/2","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "\\D+","")
# Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0CK","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0\\D+","")
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
```
```{r Look at Rpr_Street_Pfx}
sort(unique(df$Rpt_Street_Pfx))
```
```{r Look at Rpt_Street_Sfx}
sort(unique(df$Rpt_Street_Sfx))
```
```{r Look at Severity}
sort(unique(df$Severity))
```
```{r Look at Harmed}
sort(unique(df$Harmed))
df %>%
ggplot(aes(x=Harmed)) +
geom_bar() +
coord_flip()
```
```{r}
# Save file out
# saveRDS(df, paste(path, "Cleaned_2015_2024.rds", sep="/"))
```