Exercices autour de data.table
L’atelier du jour avait pour objectif la découvert du package data.table au travers d’exercises proposés lors de la dernière édition de UseR. Les slides avec les énoncés sont disponibles ici.
L’apport de notre atelier est d’avoir systématiquement essayé de traiter les questions avec
- les fonctionalités R
de base
- les fonctionalités offertes par le tidyverse (principalement dplyr)
- les fonctionnalités du package data.table
Pour les animateurs : vous êtes libre de compléter le post de blog. Nous ajouteroins des comparatifs en terme de temps de calcul plus tard (peut-être).
Préliminaires
C’est parti, on charge les packages requis
library(data.table)
library(tidyverse)
Les exercices seront traités à l’aide de deux tables jouets, dont on propose des contreparties en
- object data.frame
- object tibble
- object data.table
set.seed(20170703L)
DF1 = data.frame(id = sample(1:2, 9L, TRUE),
code = sample(letters[1:3], 9, TRUE),
valA = 1:9, valB = 10:18,
stringsAsFactors = FALSE)
DF2 = data.frame(id = c(3L, 1L, 1L, 2L, 3L),
code = c("b", "a", "c", "c", "d"),
mul = 5:1, stringsAsFactors = FALSE)
## corresponding data tibble
TB1 <- as.tibble(DF1)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
TB2 <- as.tibble(DF2)
## corresponding data tables
DT1 <- as.data.table(DF1)
DT2 <- as.data.table(DF2)
Question 1
Subset all rows where id column equals 1 & code column is not equal to “c”
base
base::subset(DF1, id == 1 & code != "c")
## id code valA valB
## 1 1 a 1 10
## 2 1 a 2 11
## 4 1 a 4 13
with(DF1, DF1[id == 1 & code != "c",])
## id code valA valB
## 1 1 a 1 10
## 2 1 a 2 11
## 4 1 a 4 13
dplyr
filter(TB1, id == 1 & code != "c")
## # A tibble: 3 x 4
## id code valA valB
## <int> <chr> <int> <int>
## 1 1 a 1 10
## 2 1 a 2 11
## 3 1 a 4 13
data.table
DT1[id == 1 & code != "c"]
## id code valA valB
## 1: 1 a 1 10
## 2: 1 a 2 11
## 3: 1 a 4 13
Question 2
Select valA and valB columns from DF1
base R
DF1[, c("valA", "valB")]
## valA valB
## 1 1 10
## 2 2 11
## 3 3 12
## 4 4 13
## 5 5 14
## 6 6 15
## 7 7 16
## 8 8 17
## 9 9 18
dplyr
select(TB1, valA, valB)
## # A tibble: 9 x 2
## valA valB
## <int> <int>
## 1 1 10
## 2 2 11
## 3 3 12
## 4 4 13
## 5 5 14
## 6 6 15
## 7 7 16
## 8 8 17
## 9 9 18
data.table
DT1[ , .(valA,valB)]
## valA valB
## 1: 1 10
## 2: 2 11
## 3: 3 12
## 4: 4 13
## 5: 5 14
## 6: 6 15
## 7: 7 16
## 8: 8 17
## 9: 9 18
Question 3
Get sum(valA) and sum(valB) for id > 1 as a 1-row, 2-col data.frame
base R
colSums(DF1[ DF1$id > 1, c("valA", "valB")])
## valA valB
## 30 66
dplyr
TB1 %>% filter(id > 1) %>% select(valA, valB) %>% summarise_all(sum)
## # A tibble: 1 x 2
## valA valB
## <int> <int>
## 1 30 66
data.table
DT1[id > 1, .(sum(valA), sum(valB))]
## V1 V2
## 1: 30 66
Question 4
Replace valB with valB+1 for all rows where code == “c”
base R
DF1$valB[DF1$code=="c"] = DF1$valB[DF1$code=="c"] + 1
DF1
## id code valA valB
## 1 1 a 1 10
## 2 1 a 2 11
## 3 1 c 3 13
## 4 1 a 4 13
## 5 1 c 5 15
## 6 2 a 6 15
## 7 2 a 7 16
## 8 2 c 8 18
## 9 2 b 9 18
dplyr
TB1 <- TB1 %>% mutate(valB = ifelse(code == "c", valB + 1,valB))
## TB1[TB1$code=="c", ] <- filter(TB1, code=="c") %>% mutate(valB= valB+1) # alternative solution
TB1
## # A tibble: 9 x 4
## id code valA valB
## <int> <chr> <int> <dbl>
## 1 1 a 1 10
## 2 1 a 2 11
## 3 1 c 3 13
## 4 1 a 4 13
## 5 1 c 5 15
## 6 2 a 6 15
## 7 2 a 7 16
## 8 2 c 8 18
## 9 2 b 9 18
data.table
DT1[code == "c", valB := valB + 1L]
DT1
## id code valA valB
## 1: 1 a 1 10
## 2: 1 a 2 11
## 3: 1 c 3 13
## 4: 1 a 4 13
## 5: 1 c 5 15
## 6: 2 a 6 15
## 7: 2 a 7 16
## 8: 2 c 8 18
## 9: 2 b 9 18
Question 5
Add a new column valC column with values equal to valB^2 - valA^2
base R
DF1 <- transform(DF1, valC = valB^2 - valA^2)
## DF1$valC <- DF1$valB^2 - DF1$valA^2 # alternate solution
DF1
## id code valA valB valC
## 1 1 a 1 10 99
## 2 1 a 2 11 117
## 3 1 c 3 13 160
## 4 1 a 4 13 153
## 5 1 c 5 15 200
## 6 2 a 6 15 189
## 7 2 a 7 16 207
## 8 2 c 8 18 260
## 9 2 b 9 18 243
dplyr
TB1 <- mutate(TB1, valC = valB^2 - valA^2)
TB1
## # A tibble: 9 x 5
## id code valA valB valC
## <int> <chr> <int> <dbl> <dbl>
## 1 1 a 1 10 99
## 2 1 a 2 11 117
## 3 1 c 3 13 160
## 4 1 a 4 13 153
## 5 1 c 5 15 200
## 6 2 a 6 15 189
## 7 2 a 7 16 207
## 8 2 c 8 18 260
## 9 2 b 9 18 243
data.table
DT1[, valC := valB^2 - valA^2]
DT1
## id code valA valB valC
## 1: 1 a 1 10 99
## 2: 1 a 2 11 117
## 3: 1 c 3 13 160
## 4: 1 a 4 13 153
## 5: 1 c 5 15 200
## 6: 2 a 6 15 189
## 7: 2 a 7 16 207
## 8: 2 c 8 18 260
## 9: 2 b 9 18 243
Question 6
Get sum(valA) and sum(valB) grouped by id and code (i.e., for each unique combination of id,code)
base
aggregate(.~ id + code, DF1, sum)
## id code valA valB valC
## 1 1 a 7 34 369
## 2 2 a 13 31 396
## 3 2 b 9 18 243
## 4 1 c 8 28 360
## 5 2 c 8 18 260
aggregate(DF1[, c("valA", "valB")], list(DF1$id, DF1$code), sum)
## Group.1 Group.2 valA valB
## 1 1 a 7 34
## 2 2 a 13 31
## 3 2 b 9 18
## 4 1 c 8 28
## 5 2 c 8 18
dplyr
TB1 %>% group_by(id, code) %>% summarise_all(sum)
## # A tibble: 5 x 5
## # Groups: id [2]
## id code valA valB valC
## <int> <chr> <int> <dbl> <dbl>
## 1 1 a 7 34 369
## 2 1 c 8 28 360
## 3 2 a 13 31 396
## 4 2 b 9 18 243
## 5 2 c 8 18 260
data.table
DT1[, .(SumA=sum(valA), SumB = sum(valB)), by=list(id,code) ]
## id code SumA SumB
## 1: 1 a 7 34
## 2: 1 c 8 28
## 3: 2 a 13 31
## 4: 2 c 8 18
## 5: 2 b 9 18
Question 7
Get sum(valA) and sum(valB) grouped by id for id >= 2 & code %in% c(“a”, “c”)
base
aggregate(.~ id , subset(DF1, id >=2 & code %in% c("a","c"), -code), sum)
## id valA valB valC
## 1 2 21 49 656
dplyr
TB1 %>%
group_by(id) %>%
filter(id >=2, code %in% c("a", "c")) %>%
select(-code, -valC) %>%
summarise_all(sum)
## # A tibble: 1 x 3
## id valA valB
## <int> <int> <dbl>
## 1 2 21 49
data.table
DT1[ (id>=2) & code %in% c("a", "c") , .(SumA=sum(valA), SumB = sum(valB)), by=id ]
## id SumA SumB
## 1: 2 21 49
Question 8
Replace valA with max(valA)-min(valA) grouped by code
base
DF1 <- transform(DF1, valA = rep(tapply(valA, code, function(x) diff(range(x)))[code]))
DF1
## id code valA valB valC
## 1 1 a 6 10 99
## 2 1 a 6 11 117
## 3 1 c 5 13 160
## 4 1 a 6 13 153
## 5 1 c 5 15 200
## 6 2 a 6 15 189
## 7 2 a 6 16 207
## 8 2 c 5 18 260
## 9 2 b 0 18 243
dplyr
TB1 <- TB1 %>% group_by(code) %>% mutate(valA= max(valA)-min(valA))
TB1
## # A tibble: 9 x 5
## # Groups: code [3]
## id code valA valB valC
## <int> <chr> <int> <dbl> <dbl>
## 1 1 a 6 10 99
## 2 1 a 6 11 117
## 3 1 c 5 13 160
## 4 1 a 6 13 153
## 5 1 c 5 15 200
## 6 2 a 6 15 189
## 7 2 a 6 16 207
## 8 2 c 5 18 260
## 9 2 b 0 18 243
data.table
DT1[, "valA" := max(valA)-min(valA), by=code]
DT1
## id code valA valB valC
## 1: 1 a 6 10 99
## 2: 1 a 6 11 117
## 3: 1 c 5 13 160
## 4: 1 a 6 13 153
## 5: 1 c 5 15 200
## 6: 2 a 6 15 189
## 7: 2 a 6 16 207
## 8: 2 c 5 18 260
## 9: 2 b 0 18 243
Question 9
Create a new col named valD with max(valB)-min(valA) grouped by code
base
DF1 <- transform(DF1, valD = by(DF1, code, function(x) max(x$valB) - min(x$valA))[code])
DF1
## id code valA valB valC valD
## 1 1 a 6 10 99 10
## 2 1 a 6 11 117 10
## 3 1 c 5 13 160 13
## 4 1 a 6 13 153 10
## 5 1 c 5 15 200 13
## 6 2 a 6 15 189 10
## 7 2 a 6 16 207 10
## 8 2 c 5 18 260 13
## 9 2 b 0 18 243 18
dplyr
TB1 <- TB1 %>% group_by(code) %>% mutate(valD= max(valB)-min(valA))
TB1
## # A tibble: 9 x 6
## # Groups: code [3]
## id code valA valB valC valD
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 a 6 10 99 10
## 2 1 a 6 11 117 10
## 3 1 c 5 13 160 13
## 4 1 a 6 13 153 10
## 5 1 c 5 15 200 13
## 6 2 a 6 15 189 10
## 7 2 a 6 16 207 10
## 8 2 c 5 18 260 13
## 9 2 b 0 18 243 18
data.table
DT1[, "valD" := max(valB)-min(valA), by=code]
DT1
## id code valA valB valC valD
## 1: 1 a 6 10 99 10
## 2: 1 a 6 11 117 10
## 3: 1 c 5 13 160 13
## 4: 1 a 6 13 153 10
## 5: 1 c 5 15 200 13
## 6: 2 a 6 15 189 10
## 7: 2 a 6 16 207 10
## 8: 2 c 5 18 260 13
## 9: 2 b 0 18 243 18
Question 10
Subset DF1 by DF2 on id,code column. That is, for each row of DF2$id, DF2$code, get valA and valB cols from DF1. Include rows that have no matches as well.
base
merge(DF1, DF2[, c("id", "code")], by = c("id", "code"), all.y=TRUE)
## id code valA valB valC valD
## 1 1 a 6 13 153 10
## 2 1 a 6 10 99 10
## 3 1 a 6 11 117 10
## 4 1 c 5 13 160 13
## 5 1 c 5 15 200 13
## 6 2 c 5 18 260 13
## 7 3 b NA NA NA NA
## 8 3 d NA NA NA NA
dplyr
right_join(TB1, TB2 %>% select(id, code), by = c("id", "code"))
## # A tibble: 8 x 6
## # Groups: code [4]
## id code valA valB valC valD
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 a 6 10 99 10
## 2 1 a 6 11 117 10
## 3 1 c 5 13 160 13
## 4 1 a 6 13 153 10
## 5 1 c 5 15 200 13
## 6 2 c 5 18 260 13
## 7 3 b NA NA NA NA
## 8 3 d NA NA NA NA
## inner_join to keep only rows present in tibbles, left_join to keep all rows from TB1
## anti_join to keep only rows from TB1 with no matching rows in TB2
data.table
merge(DT1, DT2[ , .(id, code)], all.y = TRUE)
## id code valA valB valC valD
## 1: 1 a 6 10 99 10
## 2: 1 a 6 11 117 10
## 3: 1 a 6 13 153 10
## 4: 1 c 5 13 160 13
## 5: 1 c 5 15 200 13
## 6: 2 c 5 18 260 13
## 7: 3 b NA NA NA NA
## 8: 3 d NA NA NA NA
Question 11
Same as (10), but fetch just the first matching row of DF1 for each row of DF2$id, DF2$code. Exclude non-matching rows.
base
merge(DF1[!duplicated(DF1[, c("id", "code")]), ],
DF2[, c("id", "code")], by = c("id", "code"))
## id code valA valB valC valD
## 1 1 a 6 10 99 10
## 2 1 c 5 13 160 13
## 3 2 c 5 18 260 13
dplyr
TB1 %>% group_by(id, code) %>% slice(1) %>% ## first entry in each (id, code) group
inner_join(select(TB2, id, code), by = c("id", "code"))
## # A tibble: 3 x 6
## # Groups: id, code [3]
## id code valA valB valC valD
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 a 6 10 99 10
## 2 1 c 5 13 160 13
## 3 2 c 5 18 260 13
data.table
merge(DT1[, .SD[1], by = list(id, code)], ## first entry in each (id, code) group
DT2[ , .(id, code)])
## id code valA valB valC valD
## 1: 1 a 6 10 99 10
## 2: 1 c 5 13 160 13
## 3: 2 c 5 18 260 13
Question 12
For every row of DF2$id, DF2$code that matches with DF1’s, update valA with valA*mul.
base
transform(merge(DF1, DF2, by = c("id", "code"), all.y=TRUE), valA = valA * mul)
## id code valA valB valC valD mul
## 1 1 a 24 13 153 10 4
## 2 1 a 24 10 99 10 4
## 3 1 a 24 11 117 10 4
## 4 1 c 15 13 160 13 3
## 5 1 c 15 15 200 13 3
## 6 2 c 10 18 260 13 2
## 7 3 b NA NA NA NA 5
## 8 3 d NA NA NA NA 1
dplyr
right_join(TB1, TB2, by = c("id", "code")) %>% mutate(valA = valA * mul)
## # A tibble: 8 x 7
## # Groups: code [4]
## id code valA valB valC valD mul
## <int> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 1 a 24 10 99 10 4
## 2 1 a 24 11 117 10 4
## 3 1 c 15 13 160 13 3
## 4 1 a 24 13 153 10 4
## 5 1 c 15 15 200 13 3
## 6 2 c 10 18 260 13 2
## 7 3 b NA NA NA NA 5
## 8 3 d NA NA NA NA 1
data.table
DT3 <- merge(DT1, DT2, all.y = TRUE)[ , "valA" := valA * mul]
show(DT3) ## je ne comprends pas pourquoi il faut utiliser show pour l'affichage
## id code valA valB valC valD mul
## 1: 1 a 24 10 99 10 4
## 2: 1 a 24 11 117 10 4
## 3: 1 a 24 13 153 10 4
## 4: 1 c 15 13 160 13 3
## 5: 1 c 15 15 200 13 3
## 6: 2 c 10 18 260 13 2
## 7: 3 b NA NA NA NA 5
## 8: 3 d NA NA NA NA 1
Question 13
Add a new column val to DF1 with values from DF2$mul where DF2$id, DF2$code matches with DF1’s. Rows that don’t match should have NA.
base
transform(merge(DF1, DF2, by = c("id", "code"), all.x = TRUE, sort = FALSE), mul = mul)
## id code valA valB valC valD mul
## 1 1 a 6 13 153 10 4
## 2 1 a 6 10 99 10 4
## 3 1 a 6 11 117 10 4
## 4 1 c 5 13 160 13 3
## 5 1 c 5 15 200 13 3
## 6 2 c 5 18 260 13 2
## 7 2 b 0 18 243 18 NA
## 8 2 a 6 15 189 10 NA
## 9 2 a 6 16 207 10 NA
dplyr
left_join(TB1, TB2, by = c("id", "code"))
## # A tibble: 9 x 7
## # Groups: code [3]
## id code valA valB valC valD mul
## <int> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 1 a 6 10 99 10 4
## 2 1 a 6 11 117 10 4
## 3 1 c 5 13 160 13 3
## 4 1 a 6 13 153 10 4
## 5 1 c 5 15 200 13 3
## 6 2 a 6 15 189 10 NA
## 7 2 a 6 16 207 10 NA
## 8 2 c 5 18 260 13 2
## 9 2 b 0 18 243 18 NA
data.table
merge(DT1, DT2, all.x = TRUE)
## id code valA valB valC valD mul
## 1: 1 a 6 10 99 10 4
## 2: 1 a 6 11 117 10 4
## 3: 1 a 6 13 153 10 4
## 4: 1 c 5 13 160 13 3
## 5: 1 c 5 15 200 13 3
## 6: 2 a 6 15 189 10 NA
## 7: 2 a 6 16 207 10 NA
## 8: 2 b 0 18 243 18 NA
## 9: 2 c 5 18 260 13 2