# demo_during_talk.R
library(dplyr)
library(tidyr)
library(nycflights13)
library(ggplot2)
# Column-wise overview of the data set.
flights %>% glimpse()

# Subsetting ---------------------------------------------------------------

# A single column, extracted by name.
flights[["carrier"]]

# The fifth column, selected by position.
flights[, 5]

# Keep three named columns and store the narrower table.
flight_small <- flights %>%
  select(carrier, flight, origin)
flight_small

# matches() takes a regular expression: here, any one character followed by
# "time".
flights %>% select(matches(".time"))

# contains() looks for the literal substring "time" anywhere in the name.
flights %>% select(contains("time"))

# Negative position: every column except the fifth.
flights[, -5]

# Negated helper: every column whose name does not contain "time".
flights %>% select(-contains("time"))
# Filtering ----------------------------------------------------------------

# A row is kept only if all three conditions hold; filter() combines
# comma-separated conditions with a logical AND.
flights_special <- flights %>%
  filter(
    dest == "DEN",
    between(dep_time, 1000, 1400),
    month %in% c(6, 7, 8)
  )
flights_special

# Row filtering in base R: build a logical mask, then index the rows with it.
dest_is_DEN <- flights[["dest"]] == "DEN"
flights[dest_is_DEN, ]
# Sampling -----------------------------------------------------------------

# Five rows drawn at random, with replacement.
flights %>% sample_n(5, replace = TRUE)

# A random 10% of the rows.
flights %>% sample_frac(0.1)
# Mutating ------------------------------------------------------------

# Convert distance to kilometres.
# NOTE(review): assumes the column is recorded in miles; 1 mile is exactly
# 1.609344 km, so a factor of 1.6 would be about 0.6% low.
flights_km <- mutate(flights, distance = distance * 1.609344)

# Carrier code and flight number joined into one label (paste() separates
# them with a single space).
flights_cf <- mutate(flights, carrier_flight = paste(carrier, flight))

# Larger of the two delays for each flight, divided by 60.
# pmax() compares the columns element-wise. max(c(dep_delay, arr_delay)) would
# instead collapse both columns into one global maximum and recycle that
# single number onto every row.
# NOTE(review): assumes delays are recorded in minutes, so the result is hours.
flights_max <- mutate(
  flights,
  max_delay = pmax(dep_delay, arr_delay, na.rm = TRUE) / 60
)

# Lower-case every character column.
flights_lower <- mutate_if(flights, is.character, tolower)
# Grouping ----------------------------------------------------------------

flights_by_month <- flights %>% group_by(month)

# Ungrouped: max(day) is taken over the whole table, so every row gets the
# same value.
flights_max <- flights %>% mutate(max_day = max(day))

# Grouped by month: max(day) is evaluated separately within each month.
flights_max2 <- flights_by_month %>% mutate(max_day = max(day))
# Summarising -------------------------------------------------------------

# Mean arrival delay per carrier, ignoring missing delays.
flights_by_carrier <- group_by(flights, carrier)
summarise(flights_by_carrier, Delay_Avg = mean(arr_delay, na.rm = TRUE))

# Several summaries per calendar day (month + day).
flights_by_date <- group_by(flights, month, day)
summarise(
  flights_by_date,
  Delay_Avg = mean(arr_delay, na.rm = TRUE),
  Distance_Avg = mean(distance),
  Distance_SD = sd(distance),
  # which.max() picks the carrier with the highest count. sort() is ascending
  # by default, so taking the first element of sort(table(carrier)) would
  # return the least common carrier instead.
  Most_Common_Carrier = names(which.max(table(carrier)))
)
# Combining ---------------------------------------------------------------

# Show the table that is joined onto flights below.
airlines

# Left join: every flight row is kept, and the columns of airlines are added
# where the carrier code matches.
flights_with_airlines <- flights %>%
  left_join(airlines, by = "carrier")

flights_with_airlines %>% select(carrier, name)
# Piping ------------------------------------------------------------------

# Average distance and arrival delay for four carriers, reported by airline
# name. The carrier filter runs before the join so that only the rows that are
# kept get joined; the result is the same because the filter uses only
# `carrier`, which is also the join key.
flights_sum <- flights %>%
  filter(carrier %in% c("WN", "UA", "VX", "DL")) %>%
  left_join(airlines, by = "carrier") %>%
  group_by(name) %>%
  summarise(
    Avg_Distance = mean(distance, na.rm = TRUE),
    Avg_Delay = mean(arr_delay, na.rm = TRUE)
  )
# Mean of the top five arrival delays (ties included) per carrier and month.
# top_n() needs the ranking column spelled out: with no `wt` it ranks by the
# last column of the table, and a preceding arrange() does not change that.
# summarise() drops only the innermost grouping (month), so the result is
# still grouped by carrier.
flights_long_delay <- flights %>%
  group_by(carrier, month) %>%
  top_n(5, arr_delay) %>%
  summarise(Avg_Delay = mean(arr_delay, na.rm = TRUE))