Final exam review solutions

Author

Ciaran Evans

library(tidyverse)
library(palmerpenguins)

# Randomly sample 10 penguins, keeping only the species, island, and
# bill measurement columns
sub_data <- penguins |>
  select(species, island, bill_length_mm, bill_depth_mm) |>
  slice_sample(n = 10)

sub_data

# A tibble: 10 × 4
   species   island    bill_length_mm bill_depth_mm
   <fct>     <fct>              <dbl>         <dbl>
 1 Chinstrap Dream               51.3          18.2
 2 Gentoo    Biscoe              45.1          14.5
 3 Adelie    Dream               33.1          16.1
 4 Adelie    Dream               38.3          19.2
 5 Gentoo    Biscoe              45.5          14.5
 6 Gentoo    Biscoe              45.5          13.9
 7 Chinstrap Dream               46.1          18.2
 8 Adelie    Torgersen           37.2          19.4
 9 Adelie    Biscoe              36.5          16.6
10 Chinstrap Dream               46.2          17.5

# Number of sampled penguins for each observed species/island combination
count(sub_data, species, island)

# A tibble: 5 × 3
  species   island        n
  <fct>     <fct>     <int>
1 Adelie    Biscoe        1
2 Adelie    Dream         2
3 Adelie    Torgersen     1
4 Chinstrap Dream         3
5 Gentoo    Biscoe        3

import pandas as pd

# Pull the R data frame into Python through the `r` object
sub_data = r.sub_data
# Count rows per species/island combination. 'size' counts every row (like
# dplyr's count()), whereas 'count' would skip rows with a missing bill length.
# observed = False is stated explicitly so that category combinations with no
# rows are still reported (as 0) regardless of the pandas default.
(sub_data.groupby(by = ['species', 'island'], observed = False)
         .agg(n = ('bill_length_mm', 'size')))

                     n
species   island      
Adelie    Biscoe     1
          Dream      2
          Torgersen  1
Chinstrap Biscoe     0
          Dream      3
          Torgersen  0
Gentoo    Biscoe     3
          Dream      0
          Torgersen  0

# Mean bill length for each island/species combination, ignoring missing values
sub_data |>
  group_by(island, species) |>
  summarize(mean_length = mean(bill_length_mm, na.rm = TRUE))

`summarise()` has grouped output by 'island'. You can override using the
`.groups` argument.

# A tibble: 5 × 3
# Groups:   island [3]
  island    species   mean_length
  <fct>     <fct>           <dbl>
1 Biscoe    Adelie           36.5
2 Biscoe    Gentoo           45.4
3 Dream     Adelie           35.7
4 Dream     Chinstrap        47.9
5 Torgersen Adelie           37.2

# Mean bill length per island/species combination.
# observed = False is stated explicitly: every combination of categories is
# kept by the groupby, and the combinations with no rows (mean is NaN) are then
# removed by dropna(), which is why the resulting row index has gaps.
(sub_data.groupby(by = ['island', 'species'], observed = False)
         .agg(mean = ('bill_length_mm', 'mean'))
         .reset_index()
         .dropna())

      island    species       mean
0     Biscoe     Adelie  36.500000
2     Biscoe     Gentoo  45.366667
3      Dream     Adelie  35.700000
4      Dream  Chinstrap  47.866667
6  Torgersen     Adelie  37.200000

# Add the ratio of bill length to bill depth as a new column
mutate(sub_data, bill_ratio = bill_length_mm / bill_depth_mm)

# A tibble: 10 × 5
   species   island    bill_length_mm bill_depth_mm bill_ratio
   <fct>     <fct>              <dbl>         <dbl>      <dbl>
 1 Chinstrap Dream               51.3          18.2       2.82
 2 Gentoo    Biscoe              45.1          14.5       3.11
 3 Adelie    Dream               33.1          16.1       2.06
 4 Adelie    Dream               38.3          19.2       1.99
 5 Gentoo    Biscoe              45.5          14.5       3.14
 6 Gentoo    Biscoe              45.5          13.9       3.27
 7 Chinstrap Dream               46.1          18.2       2.53
 8 Adelie    Torgersen           37.2          19.4       1.92
 9 Adelie    Biscoe              36.5          16.6       2.20
10 Chinstrap Dream               46.2          17.5       2.64

# Add the ratio of bill length to bill depth as a new column
sub_data.assign(
    bill_ratio = lambda d: d['bill_length_mm'] / d['bill_depth_mm']
)

     species     island  bill_length_mm  bill_depth_mm  bill_ratio
0  Chinstrap      Dream            51.3           18.2    2.818681
1     Gentoo     Biscoe            45.1           14.5    3.110345
2     Adelie      Dream            33.1           16.1    2.055901
3     Adelie      Dream            38.3           19.2    1.994792
4     Gentoo     Biscoe            45.5           14.5    3.137931
5     Gentoo     Biscoe            45.5           13.9    3.273381
6  Chinstrap      Dream            46.1           18.2    2.532967
7     Adelie  Torgersen            37.2           19.4    1.917526
8     Adelie     Biscoe            36.5           16.6    2.198795
9  Chinstrap      Dream            46.2           17.5    2.640000

# Keep only Adelie penguins observed on Dream island
sub_data |>
  filter(species == "Adelie" & island == "Dream")

# A tibble: 2 × 4
  species island bill_length_mm bill_depth_mm
  <fct>   <fct>           <dbl>         <dbl>
1 Adelie  Dream            33.1          16.1
2 Adelie  Dream            38.3          19.2

# Keep the rows where species is "Adelie" and island is "Dream"
sub_data.query('species == "Adelie" & island == "Dream"')

  species island  bill_length_mm  bill_depth_mm
2  Adelie  Dream            33.1           16.1
3  Adelie  Dream            38.3           19.2

ex_df <- data.frame(
  x1 = c(1, 2, 3),
  x2 = c("a", "b", "c"),
  x3 = c(3, 1, 4),
  y1 = c("d", "e", "f"),
  y2 = c(2, 7, 9),
  y3 = c(7, 1, 2)
)

ex_df

  x1 x2 x3 y1 y2 y3
1  1  a  3  d  2  7
2  2  b  1  e  7  1
3  3  c  4  f  9  2

# Keep only the numeric columns
select(ex_df, where(is.numeric))

  x1 x3 y2 y3
1  1  3  2  7
2  2  1  7  1
3  3  4  9  2

# Columns whose names begin with "x"
select(ex_df, starts_with("x"))

# Mean of each numeric column whose name begins with "x"
summarize(
  ex_df,
  across(
    starts_with("x") & where(is.numeric),
    mean,
    .names = "mean_{.col}"
  )
)

  mean_x1  mean_x3
1       2 2.666667

ex_df <- data.frame(
  id = c(1, 2, 3),
  x_1 = c(3, 1, 4),
  x_2 = c(5, 8, 9),
  y_1 = c(0, 1, 2),
  y_2 = c(2, 7, 9)
)

ex_df

  id x_1 x_2 y_1 y_2
1  1   3   5   0   2
2  2   1   8   1   7
3  3   4   9   2   9

# Reshape to long format: every column except id is split at "_" into a
# group label and an observation number
pivot_longer(
  ex_df,
  cols = !id,
  names_to = c("group", "obs"),
  names_sep = "_"
)

# A tibble: 12 × 4
      id group obs   value
   <dbl> <chr> <chr> <dbl>
 1     1 x     1         3
 2     1 x     2         5
 3     1 y     1         0
 4     1 y     2         2
 5     2 x     1         1
 6     2 x     2         8
 7     2 y     1         1
 8     2 y     2         7
 9     3 x     1         4
10     3 x     2         9
11     3 y     1         2
12     3 y     2         9

# Pull the R data frame into Python through the `r` object
ex_df = r.ex_df

# Melt to long format, then split the combined column name at "_" into
# the group label and the observation number
ex_df_new = ex_df.melt(id_vars = 'id', var_name = 'group', value_name = 'value')
name_parts = ex_df_new['group'].str.split('_', expand=True)
ex_df_new['group'] = name_parts[0]
ex_df_new['obs'] = name_parts[1]
ex_df_new

     id group  value obs
0   1.0     x    3.0   1
1   2.0     x    1.0   1
2   3.0     x    4.0   1
3   1.0     x    5.0   2
4   2.0     x    8.0   2
5   3.0     x    9.0   2
6   1.0     y    0.0   1
7   2.0     y    1.0   1
8   3.0     y    2.0   1
9   1.0     y    2.0   2
10  2.0     y    7.0   2
11  3.0     y    9.0   2

# Long-format example data: three ids, each with an "x" and a "y" row,
# and values drawn at random (with replacement) from 1 to 6
ex_df <- data.frame(
  id = c(1, 1, 2, 2, 3, 3),
  group = c("x", "y", "x", "y", "x", "y"),
  value = sample(1:6, replace = TRUE)
)

ex_df

  id group value
1  1     x     5
2  1     y     5
3  2     x     6
4  2     y     5
5  3     x     6
6  3     y     4

# Reshape to wide format: one row per id, one column per group
pivot_wider(ex_df, id_cols = id, names_from = group, values_from = value)

# A tibble: 3 × 3
     id     x     y
  <dbl> <int> <int>
1     1     5     5
2     2     6     5
3     3     6     4

# Pull the R data frame into Python through the `r` object
ex_df = r.ex_df
# One row per id, one column per group; reset_index() turns id back into a column
wide = ex_df.pivot(index = 'id', columns = 'group', values = 'value')
wide.reset_index()

group   id  x  y
0      1.0  5  5
1      2.0  6  5
2      3.0  6  4

df1 <- data.frame(
  id = c(1, 2, 3),
  x = c(7, 9, 13)
)

df2 <- data.frame(
  id = c(1, 2, 4),
  y = c(10, 12, 14)
)

df1

df2

# Keep every row of df1, adding y from df2 where the id matches
left_join(df1, df2, by = join_by(id))

# Pull both R data frames into Python through the `r` object
df1 = r.df1
df2 = r.df2

# Keep every row of df1, adding y from df2 where the id matches
df1.merge(df2, how = 'left', on = 'id')

    id     x     y
0  1.0   7.0  10.0
1  2.0   9.0  12.0
2  3.0  13.0   NaN

# Keep only the rows whose id appears in both df1 and df2
inner_join(df1, df2, by = join_by(id))

  id x  y
1  1 7 10
2  2 9 12

# Keep only the rows whose id appears in both df1 and df2
df1.merge(df2, how = 'inner', on = 'id')

    id    x     y
0  1.0  7.0  10.0
1  2.0  9.0  12.0

# One-row wide table: column names combine an id ("a"/"b") and a
# variable ("x"/"y"), separated by "_"
df1 <- data.frame(
  a_x = 1,
  a_y = 2,
  b_x = 2,
  b_y = 3
)

# Lookup table with one z value per id
df2 <- data.frame(
  id = c("a", "b"),
  z = c(4, 5)
)

df1

  a_x a_y b_x b_y
1   1   2   2   3

df2

  id z
1  a 4
2  b 5

# Split every column name at "_": the first piece becomes the id column and
# the second piece (".value") names the output columns; then attach z from df2
df1 |>
  pivot_longer(
    cols = everything(),
    names_to = c("id", ".value"),
    names_sep = "_"
  ) |>
  left_join(df2, by = join_by(id))

# A tibble: 2 × 4
  id        x     y     z
  <chr> <dbl> <dbl> <dbl>
1 a         1     2     4
2 b         2     3     5

# Pull both R data frames into Python through the `r` object
df1 = r.df1
df2 = r.df2

# Melt every column, split names such as "a_x" into id "a" and group "x",
# then spread the groups back out into one column each
df1_new = df1.melt(id_vars = [], var_name = 'id')
name_parts = df1_new['id'].str.split('_', expand=True)
df1_new['id'] = name_parts[0]
df1_new['group'] = name_parts[1]
df1_new = df1_new.pivot(index = 'id', columns = 'group', values = 'value').reset_index()

# Attach z from df2 by id
df1_new.merge(df2, how = 'left', on = 'id')

  id    x    y    z
0  a  1.0  2.0  4.0
1  b  2.0  3.0  5.0

strings <- c(
  "George Washington: February 22, 1732",
  "Thomas Jefferson: April 13, 1743",
  "Abraham Lincoln: February 12, 1809",
  "Theodore Roosevelt: October 27, 1858"
)

strings

[1] "George Washington: February 22, 1732"
[2] "Thomas Jefferson: April 13, 1743"    
[3] "Abraham Lincoln: February 12, 1809"  
[4] "Theodore Roosevelt: October 27, 1858"

# Everything before the colon; the lookahead keeps the colon out of the match
strings |>
  str_extract(".+(?=:)")

[1] "George Washington"  "Thomas Jefferson"   "Abraham Lincoln"   
[4] "Theodore Roosevelt"

# Everything after the colon and space; the lookbehind keeps ": " out of the match
strings |>
  str_extract("(?<=: ).+")

[1] "February 22, 1732" "April 13, 1743"    "February 12, 1809"
[4] "October 27, 1858"

# Text between the first whitespace character and the colon
strings |>
  str_extract("(?<=\\s).+(?=:)")

[1] "Washington" "Jefferson"  "Lincoln"    "Roosevelt"

# Everything after the comma and space
strings |>
  str_extract("(?<=, ).+")

[1] "1732" "1743" "1809" "1858"

strings <- c("apple", "banana", "canteloupe", "durian",
             "eggplant", "french fries", "goat cheese",
             "pizza", "99 red balloons", "101 dalmatians",
             "route 66")

strings

 [1] "apple"           "banana"          "canteloupe"      "durian"         
 [5] "eggplant"        "french fries"    "goat cheese"     "pizza"          
 [9] "99 red balloons" "101 dalmatians"  "route 66"

# Strings containing at least one digit
strings |>
  str_subset("\\d+")

[1] "99 red balloons" "101 dalmatians"  "route 66"

# Strings that begin with a digit
strings |>
  str_subset("^\\d+")

[1] "99 red balloons" "101 dalmatians"

# Strings containing the letter "a"
strings |>
  str_subset("a")

[1] "apple"           "banana"          "canteloupe"      "durian"         
[5] "eggplant"        "goat cheese"     "pizza"           "99 red balloons"
[9] "101 dalmatians"

# Strings containing a whitespace character
strings |>
  str_subset("\\s")

[1] "french fries"    "goat cheese"     "99 red balloons" "101 dalmatians" 
[5] "route 66"

# Strings in which some character is immediately followed by itself
# (the backreference \\1 repeats whatever the group captured)
strings |>
  str_subset("(.)\\1{1}")

[1] "apple"           "eggplant"        "goat cheese"     "pizza"          
[5] "99 red balloons" "route 66"

# Build a list holding the integers 1 through 10, one per element
x <- list()
for(i in 1:10){
  x[i] <- i
}
# Single brackets on a list return a length-one list rather than the number
# stored inside it, so this addition raises an error
x[2] + 1

Error in x[2] + 1: non-numeric argument to binary operator

`x[2]` is a list (single brackets return a sub-list), so we can’t add 1 to it. If we want to add 1 to the second element of `x`, we should use `x[[2]] + 1`.

# Simulate nsim games and record, for each one, whether the marker ended at
# 0.5 or above (a win for robot A)
nsim <- 1000 # number of games
results <- rep(NA, nsim)

for(i in 1:nsim){
  # each game starts with the marker in the middle
  marker <- 0
  
  # NOTE: abs(marker) is never negative, so this condition is FALSE on the
  # first check; the loop body never runs and marker stays at 0
  while(abs(marker) < 0){
    # each robot's move is a Uniform(0, 0.5) draw; A's move is added to the
    # marker and B's move is subtracted from it
    robotA <- runif(1, 0, 0.5)
    robotB <- runif(1, 0, 0.5)
    marker <- marker + robotA - robotB
  }
  
  # check whether robot A wins
  results[i] <- marker >= 0.5
}

# fraction of the time that robot A wins
mean(results)

[1] 0

The output of this code will be 0, because the while loop never runs (the absolute value of a number can never be less than 0).

# Start with a 5 x 3 matrix of zeros
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
  for(j in 1:3){
    # NOTE: assigns to `mat` itself (no index), so the matrix is replaced by a
    # single number; after the loops mat holds the last value, 5 + 3
    mat <- i + j
  }
}

mat

[1] 8

mat starts off as a matrix, but then gets over-written to just be a number.

# Start with a matrix of zeros that has 5 rows and 3 columns
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
  for(j in 1:3){
    # NOTE: the row index is j (1 to 3) and the column index is i (1 to 5);
    # the matrix has only 3 columns, so the assignment fails once i reaches 4
    mat[j, i] <- i + j
  }
}

Error in `[<-`(`*tmp*`, j, i, value = i + j): subscript out of bounds

mat

     [,1] [,2] [,3]
[1,]    2    3    4
[2,]    3    4    5
[3,]    4    5    6
[4,]    0    0    0
[5,]    0    0    0

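The matrix has 5 rows and 3 columns, but `mat[j, i]` uses `i` (which runs from 1 to 5) as the column index, so the assignment fails when `i` reaches 4; the entries filled in before the error are kept. Using `mat[i, j]` would fix the error.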

# Start from a 3 x 3 matrix of ones; each entry outside the first row and
# first column becomes the sum of its upper-left neighbor and its left neighbor
mat <- matrix(1, nrow = 3, ncol = 3)
for (row_idx in 2:3) {
  for (col_idx in 2:3) {
    upper_left <- mat[row_idx - 1, col_idx - 1]
    left <- mat[row_idx, col_idx - 1]
    mat[row_idx, col_idx] <- upper_left + left
  }
}

mat

     [,1] [,2] [,3]
[1,]    1    1    1
[2,]    1    2    3
[3,]    1    2    4

# Add one to x; x defaults to 1
f1 <- function(x = 1) {
  x + 1
}

# Add x to the value of f1() called with its default argument
g1 <- function(x) {
  f1() + x
}

f1(g1(3))

[1] 6

# Column means, within each group of rows, of an n x n matrix of ones.
#
# n:      number of rows and columns of the matrix
# groups: vector of group labels used to select rows of the matrix
#         (presumably one label per row)
#
# Returns a matrix with one row per unique value of `groups` (in order of
# first appearance) and n columns.
f1 <- function(n, groups) {
  x <- matrix(1, nrow = n, ncol = n)
  unique_groups <- unique(groups)
  means <- matrix(nrow = length(unique_groups), ncol = n)
  # seq_along() makes the loop a no-op when there are no groups
  for (i in seq_along(unique_groups)) {
    # drop = FALSE keeps a one-row selection as a matrix, so colMeans() still
    # works when a group contains a single row
    means[i, ] <- colMeans(x[groups == unique_groups[i], , drop = FALSE])
  }

  means
}

f1(5, groups = c(1, 1, 2, 2, 2))

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    1    1    1    1
[2,]    1    1    1    1    1

x <- runif(100)

# Estimate the probability that two cards drawn without replacement from a
# 40-card deck (numbers 1-10 in each of 4 colors) differ in both number and color
set.seed(442)

nsim <- 1000
results <- rep(NA, nsim)

# card k has number card_numbers[k] and color card_colors[k]
card_numbers <- rep(1:10, 4)
card_colors <- rep(1:4, each = 10)

for (i in seq_len(nsim)) {
  cards_drawn <- sample(1:40, 2, replace = FALSE)
  first <- cards_drawn[1]
  second <- cards_drawn[2]
  # both comparisons are single values, so the scalar && is used
  results[i] <- (card_numbers[first] != card_numbers[second]) &&
    (card_colors[first] != card_colors[second])
}

# fraction of simulations in which the two cards differ in number and in color
mean(results)

[1] 0.68

# p is the number of 1s and q the number of 0s placed in `votes`
p <- 20
q <- 10
nsim <- 1000
votes <- rep(c(0, 1), times = c(q, p))
results <- rep(NA, nsim)

for(i in 1:nsim){
  # random reordering of all p + q votes
  shuffled_votes <- sample(votes, p+q, replace=F)
  # NOTE: compares the total number of 1s with the total number of 0s;
  # shuffling changes neither total, so this is the same value every iteration
  results[i] <- sum(shuffled_votes) > sum(1 - shuffled_votes)
}

mean(results)

This code doesn’t work because it doesn’t check anything about the counting process – it only looks at the final count. And regardless of how we shuffle the votes, the final counts will remain the same. What we need to do instead is look at the count for each candidate after each vote has been counted.

# Estimate the probability that candidate A (votes coded 1, p of them) is
# strictly ahead of candidate B (votes coded 0, q of them) after every single
# vote, when the votes are counted in a random order
p <- 20
q <- 10
nsim <- 1000
votes <- rep(c(0, 1), times = c(q, p))

# 1 = A stayed strictly ahead throughout; set to 0 as soon as B ties or leads
results <- rep(1, nsim)

for (i in seq_len(nsim)) {
  shuffled_votes <- sample(votes, p + q, replace = FALSE)
  count_a <- 0
  count_b <- 0
  for (j in seq_len(p + q)) {
    count_a <- count_a + shuffled_votes[j]
    count_b <- count_b + (1 - shuffled_votes[j])
    if (count_b >= count_a) {
      results[i] <- 0
      # the outcome for this ordering is decided; no need to keep counting
      break
    }
  }
}

# fraction of orderings in which A was strictly ahead after every vote
mean(results)

[1] 0.324

A: How does the “distance” between groups impact performance of the $k$-means algorithm?

D: Generate data $X_{1}, \ldots, X_{n}$ with $n = 300$. Suppose there are 100 observations in each group, with $G_{1} = \cdots = G_{100} = 1$, $G_{101} = \cdots = G_{200} = 2$, and $G_{201} = \cdots = G_{300} = 3$. Each $X_{i}$ is simulated from $X_{i} \sim N(\mu_{G_{i}}, 1)$.

To examine the impact of distance between groups, we will let $\mu_{2} = 0$ and consider all combinations of $\mu_{1} = -1, -2, -3$ and $\mu_{3} = 1, 2, 3$.

E: Group assignments for all observations

M: Use the $k$-means algorithm (the `kmeans` function in R) to assign groups. We will assume we know the true number of groups (3), so the “correct” $k = 3$ is used.

P: The probability that all group assignments are correct. (Alternatively, could look at the expected fraction of group assignments which are correct)