1 Introduction

1.1 Functionals in R

  • Functionals: Takes functions as arguments.
  • Functionals in R that can replace loops:
    • base
    • purrr
    • plyr: Not under active development.

1.2 Loops vs. functionals

  • Keep loops in the following scenarios
    • Modifying in place
    • Recursive functions
    • While loops

1.3 Load the datasets

library(datasets)
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
summary(ChickWeight)
##      weight           Time           Chick     Diet   
##  Min.   : 35.0   Min.   : 0.00   13     : 12   1:220  
##  1st Qu.: 63.0   1st Qu.: 4.00   9      : 12   2:120  
##  Median :103.0   Median :10.00   20     : 12   3:120  
##  Mean   :121.8   Mean   :10.72   10     : 12   4:118  
##  3rd Qu.:163.8   3rd Qu.:16.00   17     : 12          
##  Max.   :373.0   Max.   :21.00   19     : 12          
##                                  (Other):506

2 base

2.1 apply

  • Apply functions over array margins
  • apply(X, MARGIN, FUN, …)
  • Margin:
    • 1: rows
    • 2: columns
  • Note, apply() will coerce data frame to an array
rowMax <- apply(ChickWeight[,c("weight","Time")],1,max)
head(rowMax)
##  1  2  3  4  5  6 
## 42 51 59 64 76 93
colMax1 <- apply(t(ChickWeight[,c("weight","Time")]),1,max)
colMax1
## weight   Time 
##    373     21
colMax <- apply(ChickWeight[,c("weight","Time")],2, max)
colMax
## weight   Time 
##    373     21

2.2 lapply

  • Apply a function over a list or vector

lapply

  • Returns a list of the same length as X
  • lapply(X, FUN, …)

sapply

  • Wrapper of lapply, returns a vector, matrix, or array
  • sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)

vapply

  • Similar to sapply, but has a pre-specified type of return value
  • vapply(X, FUN, FUN.VALUE, …, USE.NAMES = TRUE)

simplification

  • sapply: only simplify when X has length >0 and return values from all elements of X are of the same length
  • vapply: always simplify
x <- list(numbers = 1:10, petalLength = iris$Petal.Length, chickWt = ChickWeight$weight)
lapply(x, fivenum)
## $numbers
## [1]  1.0  3.0  5.5  8.0 10.0
## 
## $petalLength
## [1] 1.00 1.60 4.35 5.10 6.90
## 
## $chickWt
## [1]  35  63 103 164 373
sapply(x, fivenum)
##      numbers petalLength chickWt
## [1,]     1.0        1.00      35
## [2,]     3.0        1.60      63
## [3,]     5.5        4.35     103
## [4,]     8.0        5.10     164
## [5,]    10.0        6.90     373
vapply(x, fivenum,  FUN.VALUE = c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))
##         numbers petalLength chickWt
## Min.        1.0        1.00      35
## 1st Qu.     3.0        1.60      63
## Median      5.5        4.35     103
## 3rd Qu.     8.0        5.10     164
## Max.       10.0        6.90     373

2.3 tapply

  • Apply a function to each cell of a ragged array
  • Ragged array: Each (non-empty) group of values given by a unique combination of the levels of certain factors.
  • tapply(X, INDEX, FUN = NULL, …, default = NA, simplify = TRUE)
ChickWeight$Chick <- factor(as.numeric(as.character(ChickWeight$Chick)))
tapply(ChickWeight$weight, ChickWeight$Chick, max )
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 205 215 202 160 223 160 305 134 100 124 184 205  96 266  68  57 142  39 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## 157 117 331 167 175  76 265 251 192 233 309 157 256 305 156 341 373 227 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
## 178 290 272 321 204 281 200 146 197 238 210 322 237 264

2.4 mapply

  • Apply a function to multiple list or vector arguments
  • mapply(FUN, …, MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
withUnits <- mapply(paste, ChickWeight[,c("weight","Time")], c("gm","days"))
head(withUnits)
##      weight  Time     
## [1,] "42 gm" "0 days" 
## [2,] "51 gm" "2 days" 
## [3,] "59 gm" "4 days" 
## [4,] "64 gm" "6 days" 
## [5,] "76 gm" "8 days" 
## [6,] "93 gm" "10 days"

2.5 eapply

  • Apply a function over values in an environment
  • eapply(env, FUN, …, all.names = FALSE, USE.NAMES = TRUE)

2.6 rapply

  • Recursively apply a function to a list
  • rapply(object, f, classes = “ANY”, deflt = NULL, how = c(“unlist”, “replace”, “list”), …)

3 purrr

Install and load the purrr package

install.packages("purrr")
library(purrr)

3.1 map

  • Apply a function to each element of a vector
  • map(.x, .f, …)

  • map, map_at, map_if: Returns a list

  • map_chr, map_dbl, map_int, map_lgl: Returns an atomic vector of corresponding data type

  • map_df, map_dfc, map_dfr: Returns a dataframe

map(iris[,-5], mean)
## $Sepal.Length
## [1] 5.843333
## 
## $Sepal.Width
## [1] 3.057333
## 
## $Petal.Length
## [1] 3.758
## 
## $Petal.Width
## [1] 1.199333

3.2 map2

  • Map over multiple inputs simultaneously: 2 arguments
  • map2(.x, .y, .f, …)
  • Return value type can also be specified by using _chr, _dbl, _int, _lgl, etc.
sepalL <- as.list(iris$Sepal.Length)
sepalW <- as.list(iris$Sepal.Width)   

sepalArea1 <- map2(sepalL, sepalW, ~ .x * .y)
head(sepalArea1)
## [[1]]
## [1] 17.85
## 
## [[2]]
## [1] 14.7
## 
## [[3]]
## [1] 15.04
## 
## [[4]]
## [1] 14.26
## 
## [[5]]
## [1] 18
## 
## [[6]]
## [1] 21.06
# or
sepalArea2 <- map2(sepalL, sepalW, `*`)
head(sepalArea2)
## [[1]]
## [1] 17.85
## 
## [[2]]
## [1] 14.7
## 
## [[3]]
## [1] 15.04
## 
## [[4]]
## [1] 14.26
## 
## [[5]]
## [1] 18
## 
## [[6]]
## [1] 21.06

3.3 pmap

  • Map over multiple inputs simultaneously: any number of arguments
  • pmap(.l, .f, …)
  • Return value type can also be specified by using _chr, _dbl, _int, _lgl, etc.
sepalPetalRatio <- pmap(list(iris$Sepal.Length,iris$Sepal.Width,
                             iris$Petal.Length, iris$Petal.Width), 
                        function(a, b, c, d) a*b / c*d)
head(sepalPetalRatio)
## [[1]]
## [1] 2.55
## 
## [[2]]
## [1] 2.1
## 
## [[3]]
## [1] 2.313846
## 
## [[4]]
## [1] 1.901333
## 
## [[5]]
## [1] 2.571429
## 
## [[6]]
## [1] 4.955294

3.4 imap

  • Apply a function to each element of a vector, and its index
  • imap(.x, .f, …)
  • Return value type can also be specified by using _chr, _dbl, _int, _lgl, etc.
sepalLengthIdx <- imap(iris$Sepal.Length, ~ paste0(.y, ": ", .x))
head(sepalLengthIdx)
## [[1]]
## [1] "1: 5.1"
## 
## [[2]]
## [1] "2: 4.9"
## 
## [[3]]
## [1] "3: 4.7"
## 
## [[4]]
## [1] "4: 4.6"
## 
## [[5]]
## [1] "5: 5"
## 
## [[6]]
## [1] "6: 5.4"
## clean up the output a bit:
sepalLengthIdx2 <- imap_chr(iris$Sepal.Length, ~ paste0(.y, ": ", .x))
head(sepalLengthIdx2)
## [1] "1: 5.1" "2: 4.9" "3: 4.7" "4: 4.6" "5: 5"   "6: 5.4"

3.5 lmap

  • Apply a function to list-elements of a list
  • lmap(.x, .f, …)
  • Functions that take and returns a list or data frame
##Example from the lmap {purrr} documentation
disjoin <- function(x, sep = "_") {
  name <- names(x)
  x <- as.factor(x[[1]])

  out <- lapply(levels(x), function(level) {
    as.numeric(x == level)
  })

  names(out) <- paste(name, levels(x), sep = sep)
  out
}

# Now, we are ready to map disjoin() on each categorical variable of a
# data frame:
iris %>% lmap_if(is.factor, disjoin)
## # A tibble: 150 x 7
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
##           <dbl>       <dbl>        <dbl>       <dbl>          <dbl>
##  1          5.1         3.5          1.4         0.2              1
##  2          4.9         3            1.4         0.2              1
##  3          4.7         3.2          1.3         0.2              1
##  4          4.6         3.1          1.5         0.2              1
##  5          5           3.6          1.4         0.2              1
##  6          5.4         3.9          1.7         0.4              1
##  7          4.6         3.4          1.4         0.3              1
##  8          5           3.4          1.5         0.2              1
##  9          4.4         2.9          1.4         0.2              1
## 10          4.9         3.1          1.5         0.1              1
## # ... with 140 more rows, and 2 more variables: Species_versicolor <dbl>,
## #   Species_virginica <dbl>

4 base vs. purrr

4.1 lapply vs. map

x <- list(numbers = 1:10, petalLength = iris$Petal.Length, chickWt = ChickWeight$weight)
lapply(x, fivenum)
## $numbers
## [1]  1.0  3.0  5.5  8.0 10.0
## 
## $petalLength
## [1] 1.00 1.60 4.35 5.10 6.90
## 
## $chickWt
## [1]  35  63 103 164 373
map(x, fivenum)
## $numbers
## [1]  1.0  3.0  5.5  8.0 10.0
## 
## $petalLength
## [1] 1.00 1.60 4.35 5.10 6.90
## 
## $chickWt
## [1]  35  63 103 164 373

4.2 vapply vs. map_*

vapply(x, mean, numeric(1))
##     numbers petalLength     chickWt 
##      5.5000      3.7580    121.8183
map_dbl(x, mean)
##     numbers petalLength     chickWt 
##      5.5000      3.7580    121.8183

4.3 mapply vs. map2 and pmap

sepalL <- as.list(iris$Sepal.Length)
sepalW <- as.list(iris$Sepal.Width)   
areaBase <- mapply('*', sepalL, sepalW, SIMPLIFY=FALSE)
head(areaBase, 4)
## [[1]]
## [1] 17.85
## 
## [[2]]
## [1] 14.7
## 
## [[3]]
## [1] 15.04
## 
## [[4]]
## [1] 14.26
areaMap2 <- map2(sepalL, sepalW, ~ .x * .y)
head(areaMap2, 4)
## [[1]]
## [1] 17.85
## 
## [[2]]
## [1] 14.7
## 
## [[3]]
## [1] 15.04
## 
## [[4]]
## [1] 14.26
df <- iris[,c("Sepal.Length", "Sepal.Width")]
colnames(df) <- c("x","y")
areaPmap <- pmap(df, function(x, y,...) x * y)
head(areaPmap, 4)
## [[1]]
## [1] 17.85
## 
## [[2]]
## [1] 14.7
## 
## [[3]]
## [1] 15.04
## 
## [[4]]
## [1] 14.26

5 Your thoughts?