Class activity solutions

Author

Ciaran Evans

system.time({
  nsim <- 500
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector()
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    
    coef <- lm(y ~ x)$coef[2]
    save <- c(save, coef)
  }
  save <- data.frame(save)
})

   user  system elapsed 
  0.200   0.012   0.211

About 0.2 seconds

library(profvis)
profvis({
  nsim <- 500
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector()
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    
    coef <- lm(y ~ x)$coef[2]
    save <- c(save, coef)
  }
  save <- data.frame(save)
})

Fitting the regression model and extracting the coefficient takes the most time (lm(y ~ x)$coef[2])
Fitting the regression model still takes a lot of time. But now storing the coefficient (save <- c(save, coef)) also takes a lot of time.

profvis({
  nsim <- 10000
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector()
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    
    coef <- lm(y ~ x)$coef[2]
    save <- c(save, coef)
  }
  save <- data.frame(save)
})

profvis({
  nsim <- 10000
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector(length = nsim)
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    
    coef <- lm(y ~ x)$coef[2]
    save[i] <- coef
  }
  save <- data.frame(save)
})

The lm function parses formulas, checks for errors, and makes the output look pretty

my_lm <- function(X, y){
  return(c(solve(t(X) %*% X) %*% t(X) %*% y))
}

Using the my_lm function is faster

n <- 100
beta0 <- 1
beta1 <- 0.5
x <- rnorm(n)
y <- beta0 + beta1*x + rnorm(n)
X <- cbind(1, x)

lb <- bench::mark(
  coef <- lm(y ~ x)$coef[2],
  coef <- my_lm(X, y)[2],
  check = F
)

lb

# A tibble: 2 × 6
  expression                     min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>                <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 coef <- lm(y ~ x)$coef[2]    335µs  345.2µs     2850.    18.7KB     19.0
2 coef <- my_lm(X, y)[2]      21.5µs   23.2µs    41480.    92.8KB     16.6

plot(lb, type="boxplot")

Loading required namespace: tidyr

With both changes, the code now runs about 10 times faster for 10000 simulations.

system.time({
  nsim <- 10000
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector()
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    
    coef <- lm(y ~ x)$coef[2]
    save <- c(save, coef)
  }
  save <- data.frame(save)
})

   user  system elapsed 
  4.276   0.055   4.335

system.time({
  nsim <- 10000
  n <- 100
  beta0 <- 1
  beta1 <- 0.5
  
  save <- vector(length=nsim)
  
  for (i in 1:nsim) {
    x <- rnorm(n)
    y <- beta0 + beta1*x + rnorm(n)
    X <- cbind(1, x)
    
    coef <- my_lm(X, y)[2]
    save[i] <- coef
  }
  save <- data.frame(save)
})

   user  system elapsed 
  0.367   0.012   0.380