Lecture 22: Profiling and microbenchmarking

An order of operations for programming

  1. Make it run
  2. Make it right
  3. Make it fast

When speed matters

  • You are working with very large data
  • You are running a process (a simulation, a data analysis, etc.) many times
  • A piece of code will be called many times (e.g., choosing a split in a decision tree)

Goals

  • Learn how to identify bottlenecks in code
  • Learn approaches for more efficient code in R
  • Time permitting: learn how to use C++ to make code faster

Example: timing code

Suppose we want to compute the mean of each column of a data frame:

# Simulate the example data: n rows by cols columns of normal draws with
# mean 5, stored both as a matrix (data_mat) and as a data frame (data).
n <- 100000
cols <- 150
data_mat <- matrix(rnorm(n * cols, mean = 5), ncol = cols)
data <- as.data.frame(data_mat)

# Preallocate the result vector, then fill in one column mean per iteration.
means <- rep(NA, cols)
for(i in 1:cols){
  means[i] <- mean(data[,i])
}

Example: timing code

Suppose we want to compute the mean of each column of a data frame:

# Simulate the example data: n rows by cols columns of normal draws with
# mean 5, stored both as a matrix (data_mat) and as a data frame (data).
n <- 100000
cols <- 150
data_mat <- matrix(rnorm(n * cols, mean = 5), ncol = cols)
data <- as.data.frame(data_mat)

# system.time() evaluates the braced code once and reports the user, system,
# and elapsed times for that single run.
system.time({
  # Preallocation is inside the timed block, so it is part of the measurement.
  means <- rep(NA, cols)
  for(i in 1:cols){
    means[i] <- mean(data[,i])
  }
})
   user  system elapsed 
  1.930   0.017   1.960 

Alternatives

# Baseline approach: fill a preallocated vector with one column mean per pass
# through the loop (uses data and cols as defined earlier).
means <- rep(NA, cols)
for(i in 1:cols){
  means[i] <- mean(data[,i])
}

What are the alternatives to this for-loop approach?

Alternatives

# Option 1: for loop
for_loop_means <- function(data){
  # Compute the mean of every column of `data` with an explicit for loop.
  #
  # Args:
  #   data: a data frame or matrix (anything that supports ncol() and
  #     data[, i] column extraction).
  #
  # Returns:
  #   A vector with one mean per column, in column order; a zero-length
  #   numeric vector when `data` has no columns.
  cols <- ncol(data)
  # Preallocate as double so the result has the same type even when empty.
  means <- rep(NA_real_, cols)
  # seq_len() gives an empty sequence for zero columns, whereas 1:cols would
  # count down through c(1, 0) and index a column that does not exist.
  for(i in seq_len(cols)){
    # data[, i] (rather than data[[i]]) also works when `data` is a matrix.
    means[i] <- mean(data[, i])
  }
  return(means)
}
means <- for_loop_means(data)

# Option 2: apply
# apply() over margin 2 calls mean() once per column.
means <- apply(data, 2, mean)

# Option 3: colMeans
# colMeans() computes all of the column means in a single call.
means <- colMeans(data)

Comparing performance

Microbenchmarking: Evaluating the performance of a small piece of code

# Benchmark the three approaches side by side. check = FALSE turns off
# bench::mark()'s verification that every expression returns an equal result,
# so only time and memory are compared. FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
bench::mark(
  means <- for_loop_means(data),
  means <- apply(data, 2, mean),
  means <- colMeans(data),
  check = FALSE
)
# A tibble: 3 × 6
  expression                         min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 means <- for_loop_means(data)    1.93s    1.93s     0.519    1.85KB    0    
2 means <- apply(data, 2, mean)    2.03s    2.03s     0.493  400.57MB    0.987
3 means <- colMeans(data)        461.4ms 469.34ms     2.13   114.45MB    1.07 

Profiling

library(profvis)
# Profile all three approaches on the data frame in one profvis() call so the
# time spent in each can be compared.
profvis({
  means <- for_loop_means(data)
  means <- apply(data, 2, mean)
  means <- colMeans(data)
})

Room for further efficiency gains?

colMeans
function (x, na.rm = FALSE, dims = 1L) 
{
    if (is.data.frame(x)) 
        x <- as.matrix(x)
    if (!is.array(x) || length(dn <- dim(x)) < 2L) 
        stop("'x' must be an array of at least two dimensions")
    if (dims < 1L || dims > length(dn) - 1L) 
        stop("invalid 'dims'")
    n <- prod(dn[id <- seq_len(dims)])
    dn <- dn[-id]
    z <- if (is.complex(x)) 
        .Internal(colMeans(Re(x), n, prod(dn), na.rm)) + (0+1i) * 
            .Internal(colMeans(Im(x), n, prod(dn), na.rm))
    else .Internal(colMeans(x, n, prod(dn), na.rm))
    if (length(dn) > 1L) {
        dim(z) <- dn
        dimnames(z) <- dimnames(x)[-id]
    }
    else names(z) <- dimnames(x)[[dims + 1L]]
    z
}
<bytecode: 0x7fe4b5412cc0>
<environment: namespace:base>

Increase efficiency by avoiding extraneous steps

# Simulate the example data: n rows by cols columns of normal draws with
# mean 5, stored both as a matrix (data_mat) and as a data frame (data).
n <- 100000
cols <- 150
data_mat <- matrix(rnorm(n * cols, mean = 5), ncol = cols)
data <- as.data.frame(data_mat)

# Compare colMeans() on the matrix with colMeans() on the data frame holding
# the same values. check = FALSE turns off bench::mark()'s verification that
# both expressions return an equal result; FALSE is spelled out because F is
# an ordinary variable that can be reassigned.
bench::mark(
  means <- colMeans(data_mat),
  means <- colMeans(data),
  check = FALSE
)
# A tibble: 2 × 6
  expression                       min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>                  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 means <- colMeans(data_mat)    437ms    437ms      2.29    1.22KB     0   
2 means <- colMeans(data)        453ms    453ms      2.21  114.45MB     2.21

Profiling

# Profile the same three approaches again, this time passing the matrix
# (data_mat) instead of the data frame.
profvis({
  means <- for_loop_means(data_mat)
  means <- apply(data_mat, 2, mean)
  means <- colMeans(data_mat)
})

Profiling

Class activity

https://sta279-f23.github.io/class_activities/ca_lecture_24.html