This article is record note for Hands-on dplyr tutorial.
I updated source code provided as a lecture materials of Gatech MGT6203 based on youtube instruction Hands-on dplyr tutorial for faster data manipulation in R
Packages preparation
1
2
3
4
5
6
7
8
9
10
11
|
suppressMessages(library(dplyr))
if (!require(hflights)) install.packages("hflights")
suppressMessages(library(hflights))
# explore data
data(hflights)
head(hflights)
#tbl_df for pretty printing
flights <- tbl_df(hflights)
|
filter verb
filter rows by value
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
# base R approach to view all flights on January 1
flights[flights$Month==1 & flights$DayofMonth==1, ]
filter(flights, Month==1, DayofMonth==1)
#Using chaining operator %>%
flights %>% filter(Month==1, DayofMonth==1)
#Stroting results
firstDayOfYearFlights<- flights %>% filter(Month==1, DayofMonth==1)
#flights of carrier AA and UA (union of selection. Hence OR is being used)
# use pipe for OR condition
filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")
filter(flights, UniqueCarrier %in% c("AA", "UA"))
|
select
select: Pick columns by name
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
# base R approach to select DepTime, ArrTime, and FlightNum columns
flights[, c("DepTime", "ArrTime", "FlightNum")]
# dplyr approach
select(flights, DepTime, ArrTime, FlightNum)
#Using chaining operator %>%
flights %>% select(DepTime, ArrTime, FlightNum)
# use colon to select multiple contiguous columns, and use `contains` to match columns by name
# note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
#### Using chaining operator for multiple operations
# dplyr nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)
# chaining method
flights %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 60)
#traditional R method
temp<- flights[,c("UniqueCarrier", "DepDelay")]
temp<- temp[temp$DepDelay>60,]
temp<- drop_na(temp)
##Chaining can be used anywhere in R
# Traditional Method: create two vectors and calculate Euclidian distance between them
x1 <- 1:5; x2 <- 2:6
sqrt(sum((x1-x2)^2))
# chaining method
(x1-x2)^2 %>% sum() %>% sqrt()
|
arrange
arrange: Reorder rows
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]
# dplyr approach
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay)
# use `desc` for descending
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
|
select
mutate: Add new variables
1
2
3
4
5
6
7
8
9
10
11
12
|
# base R approach to create a new variable Speed (in mph)
flights$Speed <- flights$Distance / flights$AirTime*60
flights[, c("Distance", "AirTime", "Speed")]
# dplyr approach (prints the new variable but does not store it)
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60)
flights_speed <- flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60)
|
summarise with group_by verb
group_by: Creates the groups that will be operated on
summarise: Uses the provided aggregation function to summarise each group
It’s useful with data that has been grouped by one or more variables
1
2
3
4
5
6
7
8
9
10
11
|
> head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
ABQ AEX AGS AMA ANC ASE
7.226259 5.839437 4.000000 6.840095 26.080645 6.794643
> head(aggregate(ArrDelay ~ Dest, flights, mean))
Dest ArrDelay
1 ABQ 7.226259
2 AEX 5.839437
3 AGS 4.000000
4 AMA 6.840095
5 ANC 26.080645
6 ASE 6.794643
|
Summarise (summamrize) average of arrival delay for each destination
1
2
3
|
flights %>%
group_by(Dest) %>%
summarize(avg_delay = mean(ArrDelay, na.rm=TRUE))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# A tibble: 116 × 2
Dest avg_delay
<chr> <dbl>
1 ABQ 7.23
2 AEX 5.84
3 AGS 4
4 AMA 6.84
5 ANC 26.1
6 ASE 6.79
7 ATL 8.23
8 AUS 7.45
9 AVL 9.97
10 BFL -13.2
# … with 106 more rows
|
summarise_each function with list()
1
2
3
4
5
|
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(list(mean), Cancelled, Diverted)
# summarise_each(funs(mean), Cancelled, Diverted)
# `funs()` was deprecated in dplyr 0.8.0.
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
# A tibble: 15 × 3
UniqueCarrier Cancelled Diverted
<chr> <dbl> <dbl>
1 AA 0.0185 0.00185
2 AS 0 0.00274
3 B6 0.0259 0.00576
4 CO 0.00678 0.00263
5 DL 0.0159 0.00303
6 EV 0.0345 0.00318
7 F9 0.00716 0
8 FL 0.00982 0.00327
9 MQ 0.0290 0.00194
10 OO 0.0139 0.00349
11 UA 0.0164 0.00241
12 US 0.0113 0.00147
13 WN 0.0155 0.00229
14 XE 0.0155 0.00345
15 YV 0.0127 0
|
Summarise UniqueCarrier and get min and max
1
2
3
|
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
# A tibble: 15 × 5
UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
<chr> <int> <int> <int> <int>
1 AA -39 -15 978 970
2 AS -43 -15 183 172
3 B6 -44 -14 335 310
4 CO -55 -18 957 981
5 DL -32 -17 701 730
6 EV -40 -18 469 479
7 F9 -24 -15 277 275
8 FL -30 -14 500 507
9 MQ -38 -23 918 931
10 OO -57 -33 380 360
11 UA -47 -11 861 869
12 US -42 -17 433 425
13 WN -44 -10 499 548
14 XE -70 -19 634 628
15 YV -32 -11 72 54
|
Summarize with tally function
1
2
3
|
flights %>%
group_by(Month, DayofMonth) %>%
tally(sort= TRUE)
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# A tibble: 365 × 3
# Groups: Month [12]
Month DayofMonth n
<int> <int> <int>
1 8 4 706
2 8 11 706
3 8 12 706
4 8 5 705
5 8 3 704
6 8 10 704
7 1 3 702
8 7 7 702
9 7 14 702
10 7 28 701
# … with 355 more rows
|
Helper function n() counts the number of rows in a group
Helper function n_distinction(vector) count the number of unique items in that vector
1
2
3
4
5
|
# for each destination, count the total number of flights and the number of
# distinct planes that flew there
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# A tibble: 116 × 3
Dest flight_count plane_count
<chr> <int> <int>
1 ABQ 2812 716
2 AEX 724 215
3 AGS 1 1
4 AMA 1297 158
5 ANC 125 38
6 ASE 125 60
7 ATL 7886 983
8 AUS 5022 1015
9 AVL 350 142
10 BFL 504 70
# … with 106 more rows
|
table function: for eahch destination, show the number of cancelled and not cancelled flight
1
2
3
4
5
|
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table() %>%
head()
|
output
1
2
3
4
5
6
7
8
9
|
Adding missing grouping variables: `Dest`
Cancelled
Dest 0 1
ABQ 2787 25
AEX 712 12
AGS 1 0
AMA 1265 32
ANC 125 0
ASE 120 5
|
window function
While aggregtaion function (like mean
) takes n inputs and returns 1 value, window function returns n values. The output of a window function depends on all its input values, so window functions don’t include functions that work element-wise, like + or round().
for each carrier, calculate which two days of the year they had their longes departure delays
1
2
3
4
5
6
7
8
9
|
#
# note: smallest (not largest) value is ranked as 1, so you have to use `desc`
# to rank by largest value
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
filter(min_rank(desc(DepDelay)) <=2 ) %>%
arrange(UniqueCarrier, desc(DepDelay))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
Adding missing grouping variables: `UniqueCarrier`
Selecting by DepDelay
# A tibble: 30 × 4
# Groups: UniqueCarrier [15]
UniqueCarrier Month DayofMonth DepDelay
<chr> <int> <int> <int>
1 AA 12 12 970
2 AA 11 19 677
3 AS 2 28 172
4 AS 7 6 138
5 B6 10 29 310
6 B6 8 19 283
7 CO 8 1 981
8 CO 1 20 780
9 DL 10 25 730
10 DL 4 5 497
# … with 20 more rows
|
top_n function
1
2
3
4
5
6
|
# top_n function
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(2) %>%
arrange(UniqueCarrier, desc(DepDelay))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
Adding missing grouping variables: `UniqueCarrier`
Selecting by DepDelay
# A tibble: 30 × 4
# Groups: UniqueCarrier [15]
UniqueCarrier Month DayofMonth DepDelay
<chr> <int> <int> <int>
1 AA 12 12 970
2 AA 11 19 677
3 AS 2 28 172
4 AS 7 6 138
5 B6 10 29 310
6 B6 8 19 283
7 CO 8 1 981
8 CO 1 20 780
9 DL 10 25 730
10 DL 4 5 497
# … with 20 more rows
|
lag function: for each month, calculate the number of flight and the change from the previous month
1
2
3
4
5
6
7
8
9
10
11
|
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count-lag(flight_count))
# rewrite more simply with tally function.
flights %>%
group_by(Month) %>%
tally() %>%
mutate(change = n-lag(n))
|
output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# A tibble: 12 × 3
Month flight_count change
<int> <int> <int>
1 1 18910 NA
2 2 17128 -1782
3 3 19470 2342
4 4 18593 -877
5 5 19172 579
6 6 19600 428
7 7 20548 948
8 8 20176 -372
9 9 18065 -2111
10 10 18696 631
11 11 18021 -675
12 12 19117 1096
|
Other useful function
Randomly sample a fixed number of rows, without replacement
1
|
flights %>% sample_n(5)
|
Randomely sample a franction of rows, with replacement
1
|
flights %> sample_frac(0.25, replace=TRUE)
|
glimpse: better formatting, and adapts to your screen width
References