<- read_csv(file = "For_Hire_Vehicles__FHV__-_Active.csv") %>%
fhv_clean clean_names() %>%
rename(hybrid = veh)
7. Mutate
Video Tutorial
Mutate
Mutate is an incredibly powerful tool to create new columns and new variables.
Let’s grab our code to read in the clean dataframe
We create a new column with mutate by setting the name of our new column and a new value
%>%
fhv_clean mutate(city = "New York City")
# A tibble: 98,318 × 24
active vehicle_license_number name license_type expiration_date
<chr> <chr> <chr> <chr> <chr>
1 YES 5608977 AMERICAN,UNITED,T… FOR HIRE VE… 04/30/2025
2 YES 5645622 RAMA,ILIR FOR HIRE VE… 09/11/2023
3 YES 5192507 ORDONEZ,ELIAS FOR HIRE VE… 03/08/2025
4 YES 5378856 RIVERA,ENMA FOR HIRE VE… 11/12/2024
5 YES 5852121 A/VA,SERVICE,CORP FOR HIRE VE… 04/11/2024
6 YES 5415237 REYES,JUAN,E FOR HIRE VE… 10/31/2023
7 YES 5643301 BEGUM,TAZMINUR FOR HIRE VE… 09/30/2025
8 YES 5701439 GONZALEZALVARADO,L FOR HIRE VE… 06/13/2024
9 YES 5790931 GOMEZ,JOSE,A FOR HIRE VE… 05/23/2025
10 YES 5743759 HOSSAIN,SM,KAMAL FOR HIRE VE… 12/08/2024
# ℹ 98,308 more rows
# ℹ 19 more variables: permit_license_number <chr>,
# dmv_license_plate_number <chr>, vehicle_vin_number <chr>,
# wheelchair_accessible <chr>, certification_date <chr>, hack_up_date <chr>,
# vehicle_year <dbl>, base_number <chr>, base_name <chr>, base_type <chr>,
# hybrid <chr>, base_telephone_number <chr>, website <chr>,
# base_address <chr>, reason <chr>, order_date <lgl>, …
You can create multiple new columns (...)
at once
%>%
fhv_clean mutate(city = "New York City",
active = TRUE) #I can overwrite column names too. I've made this active column boolean (true or false)
# A tibble: 98,318 × 24
active vehicle_license_number name license_type expiration_date
<lgl> <chr> <chr> <chr> <chr>
1 TRUE 5608977 AMERICAN,UNITED,T… FOR HIRE VE… 04/30/2025
2 TRUE 5645622 RAMA,ILIR FOR HIRE VE… 09/11/2023
3 TRUE 5192507 ORDONEZ,ELIAS FOR HIRE VE… 03/08/2025
4 TRUE 5378856 RIVERA,ENMA FOR HIRE VE… 11/12/2024
5 TRUE 5852121 A/VA,SERVICE,CORP FOR HIRE VE… 04/11/2024
6 TRUE 5415237 REYES,JUAN,E FOR HIRE VE… 10/31/2023
7 TRUE 5643301 BEGUM,TAZMINUR FOR HIRE VE… 09/30/2025
8 TRUE 5701439 GONZALEZALVARADO,L FOR HIRE VE… 06/13/2024
9 TRUE 5790931 GOMEZ,JOSE,A FOR HIRE VE… 05/23/2025
10 TRUE 5743759 HOSSAIN,SM,KAMAL FOR HIRE VE… 12/08/2024
# ℹ 98,308 more rows
# ℹ 19 more variables: permit_license_number <chr>,
# dmv_license_plate_number <chr>, vehicle_vin_number <chr>,
# wheelchair_accessible <chr>, certification_date <chr>, hack_up_date <chr>,
# vehicle_year <dbl>, base_number <chr>, base_name <chr>, base_type <chr>,
# hybrid <chr>, base_telephone_number <chr>, website <chr>,
# base_address <chr>, reason <chr>, order_date <lgl>, …
Mutate with logical expressions
Where mutate gets powerful is when you use it with logical expressions. Here we use if_else()
<- fhv_clean %>%
fhv_rideshare mutate(rideshare = if_else(
condition = base_name == "UBER USA, LLC",
true = "rideshare",
false = "limo"
#if it's an uber call it rideshare, if its a limo call it something else
)) #notice I named the arguments here! A good practice when the argument is not ...
Tabulate the variable we made with the count
() funtion
%>%
fhv_rideshare count(rideshare)
# A tibble: 2 × 2
rideshare n
<chr> <int>
1 limo 21608
2 rideshare 76710
What if we have more than one logical expression we care about? Check out case_when
.
<- fhv_clean %>%
fhv_blackcar mutate(
ride_type = case_when(
== "UBER USA, LLC" & base_type == "BLACK-CAR" ~ "BLACK CAR RIDESHARE",
base_name != "UBER USA, LLC" & base_type == "BLACK-CAR" ~ "BLACK CAR NON-RIDESHARE",
base_name TRUE ~ base_type #if it doesn't meet either condition, return the base_type
))
Use &
and |
for and and or logical expressions with multiple conditions
%>%
fhv_blackcar count(ride_type)#now we have four categories!
# A tibble: 4 × 2
ride_type n
<chr> <int>
1 BLACK CAR NON-RIDESHARE 16225
2 BLACK CAR RIDESHARE 76710
3 LIVERY 3652
4 LUXURY 1731
Normalizing with Mutate
You can use statistical functions like mean to normalize data with mutate. mean will return the average of all the vehicle years. You can use mutate to generate a new variable that takes the distance from each observation to the mean.
%>%
fhv_clean mutate(year_norm = vehicle_year/mean(vehicle_year, na.rm = T),
year_pct = percent_rank(vehicle_year)) %>%
select(vehicle_license_number, vehicle_year, year_norm, year_pct)
# A tibble: 98,318 × 4
vehicle_license_number vehicle_year year_norm year_pct
<chr> <dbl> <dbl> <dbl>
1 5608977 2015 0.998 0.120
2 5645622 2022 1.00 0.797
3 5192507 2016 0.999 0.212
4 5378856 2018 1.00 0.440
5 5852121 2019 1.00 0.552
6 5415237 2012 0.997 0.0225
7 5643301 2015 0.998 0.120
8 5701439 2016 0.999 0.212
9 5790931 2017 0.999 0.313
10 5743759 2021 1.00 0.714
# ℹ 98,308 more rows