Original from https://www.youtube.com/watch?v=ZM04jn95YP0
install.packages(“tidyverse”)
library(tidyverse)
## ── Attaching packages ─────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# install.packages("nycflights13")
library(nycflights13)
View(flights)
Tip 1 of 18: Create new columns in a count or group_by
flights %>%
mutate(long_flight = (air_time >= 6 * 60)) %>%
View()
flights %>%
mutate(long_flight = (air_time >= 6 * 60)) %>%
count(long_flight)
## # A tibble: 3 x 2
## long_flight n
## <lgl> <int>
## 1 FALSE 322630
## 2 TRUE 4716
## 3 NA 9430
flights %>%
count(long_flight = air_time >= 6 * 60)
## # A tibble: 3 x 2
## long_flight n
## <lgl> <int>
## 1 FALSE 322630
## 2 TRUE 4716
## 3 NA 9430
flights %>%
count(flight_path = str_c(origin, " -> ", dest), sort = TRUE)
## # A tibble: 224 x 2
## flight_path n
## <chr> <int>
## 1 JFK -> LAX 11262
## 2 LGA -> ATL 10263
## 3 LGA -> ORD 8857
## 4 JFK -> SFO 8204
## 5 LGA -> CLT 6168
## 6 EWR -> ORD 6100
## 7 JFK -> BOS 5898
## 8 LGA -> MIA 5781
## 9 JFK -> MCO 5464
## 10 EWR -> BOS 5327
## # … with 214 more rows
flights %>%
group_by(date = make_date(year, month, day)) %>%
summarise(flights_n = n(), air_time_median = median(air_time, na.rm = TRUE)) %>%
ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 365 x 3
## date flights_n air_time_median
## <date> <int> <dbl>
## 1 2013-01-01 842 149
## 2 2013-01-02 943 148
## 3 2013-01-03 914 148
## 4 2013-01-04 915 140
## 5 2013-01-05 720 147
## 6 2013-01-06 832 147
## 7 2013-01-07 933 126.
## 8 2013-01-08 899 126.
## 9 2013-01-09 902 135
## 10 2013-01-10 932 126
## # … with 355 more rows
Tip 2 of 18: Sample and randomly shuffle data with slice_sample()
flights %>% slice_sample(n = 10)
flights %>% slice_sample(prop = 0.01)
flights %>% slice_sample(prop = 1)
flights %>% group_by(origin) %>% slice_sample(n = 3) %>% ungroup()
Tip 3 of 18: Create a date column specifying year, month, and day
flights %>% select(year, month, day) %>% mutate(date = make_date(year, month, day))
Tip 4 of 18: Parse numbers with parse_number()
numbers_1 <- tibble(number_col = c(“#1”, “#2”, “#3”)) numbers_2 <- tibble(number_col = c(“Number 5”, “#6”, “7”)) numbers_3 <- tibble(number_col = c(“1.2%”, “2.5%”, “50.9%”))
numbers_1 numbers_1 %>% mutate(number_col = parse_number(number_col))
numbers_2 numbers_2 %>% mutate(number_col = parse_number(number_col))
numbers_3 numbers_3 %>% mutate(number_col = parse_number(number_col))
Tip 5 of 18: Select columns with starts_with, ends_with, etc.
flights %>% select(starts_with(“dep_”))
flights %>% select(starts_with(“dep_”), everything())
flights %>% select(ends_with(“hour”))
flights %>% select(contains(“dep”))
Tip 6 of 18: case_when to create or change a column when conditions are met
flights %>% select(origin)
flights %>% mutate(origin = case_when( (origin == “EWR”) & dep_delay > 20 ~ “Newark International Airport - DELAYED”, (origin == “EWR”) & dep_delay <= 20 ~ “Newark International Airport - ON TIME DEPARTURE”, origin == “JFK” ~ “John F. Kennedy International Airport”, origin == “LGA” ~ “LaGuardia Airport” )) %>% count(origin)
Tip 7 of 18: str_replace_all to find and replace multiple options at once
flights %>% mutate(origin = str_replace_all(origin, c( “^EWR\(" = "Newark International Airport", "^JFK\)” = “John F. Kennedy International Airport”, “^LGA$” = “LaGuardia Airport” ))) %>% count(origin)
Tip 8 of 18: Transmute to create or change columns and keep only those columns
flights %>% transmute(date = make_date(year, month, day), tailnum)
Tip 9 of 18: Use pipes %>% everywhere including inside mutates
airlines %>% mutate(name = name %>% str_to_upper() %>% str_replace_all(" (INC|CO)\.?\(", "") %>% str_replace_all(" AIR ?(LINES|WAYS)?( CORPORATION)?\)“,”“) %>% str_to_title() %>% str_replace_all(”\bUs\b“,”US")) %>% count(name)
airlines %>% mutate(name = str_replace_all(str_to_title(str_replace_all(str_replace_all(str_to_upper(name), " (INC|CO)\.?\(", ""), " AIR ?(LINES|WAYS)?( CORPORATION)?\)“,”“)),”\bUs\b“,”US"))
Tip 10 of 18: Filter groups without making a new column
flights %>% count(carrier, sort = TRUE)
flights_top_carriers <- flights %>% group_by(carrier) %>% filter(n() >= 10000) %>% ungroup()
flights_top_carriers %>% count(carrier, sort = TRUE)
Tip 11 of 18: Split a string into columns based on a regular expression
airlines %>% count(name)
airlines %>% extract( name, into = c(“short_name”, “remainder”), regex = "^([^\\s]+) (.*)$" )
airlines %>% extract( name, into = c(“short_name”, “remainder”), regex = "^([^\\s]+) (.*)$", remove = FALSE )
Tip 12 of 18: semi_join to pick only rows from the first table which are matched in the second table
airways_beginning_with_a <- airlines %>% filter(name %>% str_detect(“^A”))
flights %>% semi_join(airways_beginning_with_a, by = “carrier”) %>% count(carrier)
Tip 13 of 18: anti_join to pick only rows from the first table which are NOT matched in the second table
flights %>% anti_join(airways_beginning_with_a, by = “carrier”)
Tip 14 of 18: fct_reorder to sort bar charts
flights_with_airline_names <- flights %>% left_join(airlines, by = “carrier”)
flights_with_airline_names %>% count(name) %>% ggplot(aes(name, n)) + geom_col()
flights_with_airline_names %>% count(name) %>% mutate(name = fct_reorder(name, n)) %>% ggplot(aes(name, n)) + geom_col()
Tip 15 of 18: coord_flip to display counts more beautifully
flights_with_airline_names %>% count(name) %>% mutate(name = fct_reorder(name, n)) %>% ggplot(aes(name, n)) + geom_col() + coord_flip()
Tip 16 of 18: fct_lump to lump some factor levels into “Other”
flights_with_airline_names %>% mutate(name = fct_lump(name, n = 5)) %>% count(name) %>% mutate(name = fct_reorder(name, n)) %>% ggplot(aes(name, n)) + geom_col() + coord_flip()
Tip 17 of 18: Generate all combinations using crossing
crossing( customer_channel = c(“Online”, “Physical store”), customer_status = c(“New”, “Repeat”), spend_range = c(“$0-$100”, “$100-$200”, “$200-$500”, “$500+”) )
Tip 18 of 18: Create functions that take column names with double curly braces
col_summary <- function(data, col_names, na.rm = TRUE) { data %>% summarise(across({{ col_names }}, list( min = min, max = max, median = median, mean = mean ), na.rm = na.rm, .names = "{col}_{fn}" )) }
flights_with_airline_names %>% col_summary(c(air_time, arr_delay))
flights_with_airline_names %>% group_by(carrier) %>% col_summary(c(air_time, arr_delay))