dplyr introduction

tatsuro included in r

2022-07-10 1709 words 9 minutes

Contents

This article is record note for Hands-on dplyr tutorial.

I updated source code provided as a lecture materials of Gatech MGT6203 based on youtube instruction Hands-on dplyr tutorial for faster data manipulation in R

Packages preparation

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


suppressMessages(library(dplyr))

if (!require(hflights)) install.packages("hflights")
suppressMessages(library(hflights))

# explore data
data(hflights)
head(hflights)

#tbl_df for pretty printing
flights <- tbl_df(hflights)

filter verb

filter rows by value

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


# base R approach to view all flights on January 1
flights[flights$Month==1 & flights$DayofMonth==1, ]

filter(flights, Month==1, DayofMonth==1)

#Using chaining operator %>%
flights %>% filter(Month==1, DayofMonth==1)

#Stroting results
firstDayOfYearFlights<- flights %>% filter(Month==1, DayofMonth==1)

#flights of carrier AA and UA (union of selection. Hence OR is being used)
# use pipe for OR condition
filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")

filter(flights, UniqueCarrier %in% c("AA", "UA"))

select

select: Pick columns by name

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36



# base R approach to select DepTime, ArrTime, and FlightNum columns
flights[, c("DepTime", "ArrTime", "FlightNum")]

# dplyr approach
select(flights, DepTime, ArrTime, FlightNum)

#Using chaining operator %>%
flights %>% select(DepTime, ArrTime, FlightNum)

# use colon to select multiple contiguous columns, and use `contains` to match columns by name
# note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))

#### Using chaining operator for multiple operations

# dplyr nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)

# chaining method
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  filter(DepDelay > 60)

#traditional R method
temp<- flights[,c("UniqueCarrier", "DepDelay")]
temp<- temp[temp$DepDelay>60,]
temp<- drop_na(temp)

##Chaining can be used anywhere in R
# Traditional Method: create two vectors and calculate Euclidian distance between them
x1 <- 1:5; x2 <- 2:6
sqrt(sum((x1-x2)^2))

# chaining method
(x1-x2)^2 %>% sum() %>% sqrt()

arrange

arrange: Reorder rows

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14



# base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]

# dplyr approach
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(DepDelay)

# use `desc` for descending
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(desc(DepDelay))

select

mutate: Add new variables

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12


# base R approach to create a new variable Speed (in mph)
flights$Speed <- flights$Distance / flights$AirTime*60
flights[, c("Distance", "AirTime", "Speed")]

# dplyr approach (prints the new variable but does not store it)
flights %>%
  select(Distance, AirTime) %>%
  mutate(Speed = Distance/AirTime*60)

flights_speed <- flights %>%
  select(Distance, AirTime) %>%
  mutate(Speed = Distance/AirTime*60)

summarise with group_by verb

group_by: Creates the groups that will be operated on summarise: Uses the provided aggregation function to summarise each group

It’s useful with data that has been grouped by one or more variables

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


> head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
      ABQ       AEX       AGS       AMA       ANC       ASE 
 7.226259  5.839437  4.000000  6.840095 26.080645  6.794643 
> head(aggregate(ArrDelay ~ Dest, flights, mean))
  Dest  ArrDelay
1  ABQ  7.226259
2  AEX  5.839437
3  AGS  4.000000
4  AMA  6.840095
5  ANC 26.080645
6  ASE  6.794643

Summarise (summamrize) average of arrival delay for each destination

1
2
3


flights %>%
  group_by(Dest) %>%
  summarize(avg_delay = mean(ArrDelay, na.rm=TRUE))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# A tibble: 116 × 2
   Dest  avg_delay
   <chr>     <dbl>
 1 ABQ        7.23
 2 AEX        5.84
 3 AGS        4   
 4 AMA        6.84
 5 ANC       26.1 
 6 ASE        6.79
 7 ATL        8.23
 8 AUS        7.45
 9 AVL        9.97
10 BFL      -13.2 
# … with 106 more rows

summarise_each function with list()

1
2
3
4
5


flights %>%
  group_by(UniqueCarrier) %>%
  summarise_each(list(mean), Cancelled, Diverted)
# summarise_each(funs(mean), Cancelled, Diverted)
# `funs()` was deprecated in dplyr 0.8.0.

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


# A tibble: 15 × 3
   UniqueCarrier Cancelled Diverted
   <chr>             <dbl>    <dbl>
 1 AA              0.0185   0.00185
 2 AS              0        0.00274
 3 B6              0.0259   0.00576
 4 CO              0.00678  0.00263
 5 DL              0.0159   0.00303
 6 EV              0.0345   0.00318
 7 F9              0.00716  0      
 8 FL              0.00982  0.00327
 9 MQ              0.0290   0.00194
10 OO              0.0139   0.00349
11 UA              0.0164   0.00241
12 US              0.0113   0.00147
13 WN              0.0155   0.00229
14 XE              0.0155   0.00345
15 YV              0.0127   0    

Summarise UniqueCarrier and get min and max

1
2
3


flights %>%
  group_by(UniqueCarrier) %>%
  summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


# A tibble: 15 × 5
   UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
   <chr>                <int>        <int>        <int>        <int>
 1 AA                     -39          -15          978          970
 2 AS                     -43          -15          183          172
 3 B6                     -44          -14          335          310
 4 CO                     -55          -18          957          981
 5 DL                     -32          -17          701          730
 6 EV                     -40          -18          469          479
 7 F9                     -24          -15          277          275
 8 FL                     -30          -14          500          507
 9 MQ                     -38          -23          918          931
10 OO                     -57          -33          380          360
11 UA                     -47          -11          861          869
12 US                     -42          -17          433          425
13 WN                     -44          -10          499          548
14 XE                     -70          -19          634          628
15 YV                     -32          -11           72           54

Summarize with tally function

1
2
3


flights %>%
  group_by(Month, DayofMonth) %>%
  tally(sort= TRUE)

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


# A tibble: 365 × 3
# Groups:   Month [12]
   Month DayofMonth     n
   <int>      <int> <int>
 1     8          4   706
 2     8         11   706
 3     8         12   706
 4     8          5   705
 5     8          3   704
 6     8         10   704
 7     1          3   702
 8     7          7   702
 9     7         14   702
10     7         28   701
# … with 355 more rows

Helper function n() counts the number of rows in a group Helper function n_distinction(vector) count the number of unique items in that vector

1
2
3
4
5


# for each destination, count the total number of flights and the number of 
# distinct planes that flew there
flights %>%
  group_by(Dest) %>%
  summarise(flight_count = n(), plane_count = n_distinct(TailNum))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# A tibble: 116 × 3
   Dest  flight_count plane_count
   <chr>        <int>       <int>
 1 ABQ           2812         716
 2 AEX            724         215
 3 AGS              1           1
 4 AMA           1297         158
 5 ANC            125          38
 6 ASE            125          60
 7 ATL           7886         983
 8 AUS           5022        1015
 9 AVL            350         142
10 BFL            504          70
# … with 106 more rows

table function: for eahch destination, show the number of cancelled and not cancelled flight

1
2
3
4
5


flights %>%
  group_by(Dest) %>%
  select(Cancelled) %>%
  table() %>%
  head()

output

1
2
3
4
5
6
7
8
9


Adding missing grouping variables: `Dest`
     Cancelled
Dest     0    1
  ABQ 2787   25
  AEX  712   12
  AGS    1    0
  AMA 1265   32
  ANC  125    0
  ASE  120    5

window function

While aggregtaion function (like mean) takes n inputs and returns 1 value, window function returns n values. The output of a window function depends on all its input values, so window functions don’t include functions that work element-wise, like + or round().

for each carrier, calculate which two days of the year they had their longes departure delays

1
2
3
4
5
6
7
8
9


# 
# note: smallest (not largest) value is ranked as 1, so you have to use `desc`
# to rank by largest value

flights %>%
  group_by(UniqueCarrier) %>%
  select(Month, DayofMonth, DepDelay) %>%
  filter(min_rank(desc(DepDelay)) <=2 ) %>%
  arrange(UniqueCarrier, desc(DepDelay))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


Adding missing grouping variables: `UniqueCarrier`
Selecting by DepDelay
# A tibble: 30 × 4
# Groups:   UniqueCarrier [15]
   UniqueCarrier Month DayofMonth DepDelay
   <chr>         <int>      <int>    <int>
 1 AA               12         12      970
 2 AA               11         19      677
 3 AS                2         28      172
 4 AS                7          6      138
 5 B6               10         29      310
 6 B6                8         19      283
 7 CO                8          1      981
 8 CO                1         20      780
 9 DL               10         25      730
10 DL                4          5      497
# … with 20 more rows

top_n function

1
2
3
4
5
6


# top_n function
flights %>%
  group_by(UniqueCarrier) %>%
  select(Month, DayofMonth, DepDelay) %>%
  top_n(2) %>%
  arrange(UniqueCarrier, desc(DepDelay))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


Adding missing grouping variables: `UniqueCarrier`
Selecting by DepDelay
# A tibble: 30 × 4
# Groups:   UniqueCarrier [15]
   UniqueCarrier Month DayofMonth DepDelay
   <chr>         <int>      <int>    <int>
 1 AA               12         12      970
 2 AA               11         19      677
 3 AS                2         28      172
 4 AS                7          6      138
 5 B6               10         29      310
 6 B6                8         19      283
 7 CO                8          1      981
 8 CO                1         20      780
 9 DL               10         25      730
10 DL                4          5      497
# … with 20 more rows

lag function: for each month, calculate the number of flight and the change from the previous month

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


flights %>%
  group_by(Month) %>%
  summarise(flight_count = n()) %>%
  mutate(change = flight_count-lag(flight_count))

# rewrite more simply with tally function. 

flights %>%
  group_by(Month) %>%
  tally() %>%
  mutate(change = n-lag(n))

output

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


# A tibble: 12 × 3
   Month flight_count change
   <int>        <int>  <int>
 1     1        18910     NA
 2     2        17128  -1782
 3     3        19470   2342
 4     4        18593   -877
 5     5        19172    579
 6     6        19600    428
 7     7        20548    948
 8     8        20176   -372
 9     9        18065  -2111
10    10        18696    631
11    11        18021   -675
12    12        19117   1096

Other useful function

Randomly sample a fixed number of rows, without replacement

1

flights %>% sample_n(5)

Randomely sample a franction of rows, with replacement

1

flights %> sample_frac(0.25, replace=TRUE)

glimpse: better formatting, and adapts to your screen width

1

glimpse(flights)

References

dplyr introduction
reference Youtube tutorial
Video minutes
Micromasters GaTech MGT6203 Data Analytics for Business Module 11 Introduction to dplyr