library(tidyverse)
library(palmerpenguins)
# Draw a random sample of 10 penguins and keep only the species, island,
# and the two bill measurement columns.
sub_data <- penguins |>
  slice_sample(n = 10) |>
  select(species, island, bill_length_mm, bill_depth_mm)
Final exam review solutions
sub_data
# A tibble: 10 × 4
species island bill_length_mm bill_depth_mm
<fct> <fct> <dbl> <dbl>
1 Chinstrap Dream 51.3 18.2
2 Gentoo Biscoe 45.1 14.5
3 Adelie Dream 33.1 16.1
4 Adelie Dream 38.3 19.2
5 Gentoo Biscoe 45.5 14.5
6 Gentoo Biscoe 45.5 13.9
7 Chinstrap Dream 46.1 18.2
8 Adelie Torgersen 37.2 19.4
9 Adelie Biscoe 36.5 16.6
10 Chinstrap Dream 46.2 17.5
# Number of sampled penguins for each observed species/island combination.
sub_data |>
  count(species, island)
# A tibble: 5 × 3
species island n
<fct> <fct> <int>
1 Adelie Biscoe 1
2 Adelie Dream 2
3 Adelie Torgersen 1
4 Chinstrap Dream 3
5 Gentoo Biscoe 3
import pandas as pd

# Access the R data frame from Python through the `r` object.
sub_data = r.sub_data

# Count rows per (species, island) pair using named aggregation; in the
# printed result, unobserved combinations show up with n = 0.
(sub_data.groupby(by = ['species', 'island'])
  .agg(n = ('bill_length_mm', 'count')))
n
species island
Adelie Biscoe 1
Dream 2
Torgersen 1
Chinstrap Biscoe 0
Dream 3
Torgersen 0
Gentoo Biscoe 3
Dream 0
Torgersen 0
# Mean bill length for each island/species pair, ignoring missing values.
sub_data |>
  group_by(island, species) |>
  summarize(mean_length = mean(bill_length_mm, na.rm = TRUE))
`summarise()` has grouped output by 'island'. You can override using the
`.groups` argument.
# A tibble: 5 × 3
# Groups: island [3]
island species mean_length
<fct> <fct> <dbl>
1 Biscoe Adelie 36.5
2 Biscoe Gentoo 45.4
3 Dream Adelie 35.7
4 Dream Chinstrap 47.9
5 Torgersen Adelie 37.2
# Mean bill length per (island, species); reset_index() turns the group keys
# back into columns and dropna() removes combinations with no observations.
(sub_data.groupby(by = ['island', 'species'])
  .agg(mean = ('bill_length_mm', 'mean'))
  .reset_index()
  .dropna())
island species mean
0 Biscoe Adelie 36.500000
2 Biscoe Gentoo 45.366667
3 Dream Adelie 35.700000
4 Dream Chinstrap 47.866667
6 Torgersen Adelie 37.200000
# Add the ratio of bill length to bill depth as a new column.
sub_data |>
  mutate(bill_ratio = bill_length_mm / bill_depth_mm)
# A tibble: 10 × 5
species island bill_length_mm bill_depth_mm bill_ratio
<fct> <fct> <dbl> <dbl> <dbl>
1 Chinstrap Dream 51.3 18.2 2.82
2 Gentoo Biscoe 45.1 14.5 3.11
3 Adelie Dream 33.1 16.1 2.06
4 Adelie Dream 38.3 19.2 1.99
5 Gentoo Biscoe 45.5 14.5 3.14
6 Gentoo Biscoe 45.5 13.9 3.27
7 Chinstrap Dream 46.1 18.2 2.53
8 Adelie Torgersen 37.2 19.4 1.92
9 Adelie Biscoe 36.5 16.6 2.20
10 Chinstrap Dream 46.2 17.5 2.64
# Add the ratio of bill length to bill depth as a new column (returns a copy).
sub_data.assign(bill_ratio = sub_data['bill_length_mm']/sub_data['bill_depth_mm'])
species island bill_length_mm bill_depth_mm bill_ratio
0 Chinstrap Dream 51.3 18.2 2.818681
1 Gentoo Biscoe 45.1 14.5 3.110345
2 Adelie Dream 33.1 16.1 2.055901
3 Adelie Dream 38.3 19.2 1.994792
4 Gentoo Biscoe 45.5 14.5 3.137931
5 Gentoo Biscoe 45.5 13.9 3.273381
6 Chinstrap Dream 46.1 18.2 2.532967
7 Adelie Torgersen 37.2 19.4 1.917526
8 Adelie Biscoe 36.5 16.6 2.198795
9 Chinstrap Dream 46.2 17.5 2.640000
# Keep only Adelie penguins from Dream island (both conditions must hold).
sub_data |>
  filter(species == "Adelie",
         island == "Dream")
# A tibble: 2 × 4
species island bill_length_mm bill_depth_mm
<fct> <fct> <dbl> <dbl>
1 Adelie Dream 33.1 16.1
2 Adelie Dream 38.3 19.2
# Keep only Adelie penguins from Dream island.
sub_data.query('species == "Adelie" & island == "Dream"')
species island bill_length_mm bill_depth_mm
2 Adelie Dream 33.1 16.1
3 Adelie Dream 38.3 19.2
# Example data frame mixing numeric (x1, x3, y2, y3) and character (x2, y1)
# columns, used to demonstrate column selection helpers.
ex_df <- data.frame(
  x1 = c(1, 2, 3),
  x2 = c("a", "b", "c"),
  x3 = c(3, 1, 4),
  y1 = c("d", "e", "f"),
  y2 = c(2, 7, 9),
  y3 = c(7, 1, 2)
)
ex_df
x1 x2 x3 y1 y2 y3
1 1 a 3 d 2 7
2 2 b 1 e 7 1
3 3 c 4 f 9 2
# Keep only the numeric columns.
ex_df |>
  select(where(is.numeric))
x1 x3 y2 y3
1 1 3 2 7
2 2 1 7 1
3 3 4 9 2
# Keep only the columns whose names begin with "x".
ex_df |>
  select(starts_with("x"))
x1 x2 x3
1 1 a 3
2 2 b 1
3 3 c 4
# Mean of every numeric column whose name starts with "x"; the results are
# named mean_<column>.
ex_df |>
  summarize(across(starts_with("x") & where(is.numeric),
                   mean,
                   .names = "mean_{.col}"))
mean_x1 mean_x3
1 2 2.666667
# Wide example data: one row per id, with columns named <group>_<obs>.
ex_df <- data.frame(
  id = c(1, 2, 3),
  x_1 = c(3, 1, 4),
  x_2 = c(5, 8, 9),
  y_1 = c(0, 1, 2),
  y_2 = c(2, 7, 9)
)
ex_df
id x_1 x_2 y_1 y_2
1 1 3 5 0 2
2 2 1 8 1 7
3 3 4 9 2 9
# Reshape to long format: every non-id column name is split at "_" into a
# `group` part and an `obs` part, with the cell contents going to `value`.
ex_df |>
  pivot_longer(cols = -id, names_to = c("group", "obs"), names_sep = "_")
# A tibble: 12 × 4
id group obs value
<dbl> <chr> <chr> <dbl>
1 1 x 1 3
2 1 x 2 5
3 1 y 1 0
4 1 y 2 2
5 2 x 1 1
6 2 x 2 8
7 2 y 1 1
8 2 y 2 7
9 3 x 1 4
10 3 x 2 9
11 3 y 1 2
12 3 y 2 9
# Access the R data frame from Python through the `r` object.
ex_df = r.ex_df

# Melt to long format, then split the original column names at "_" into
# separate `group` and `obs` columns.
ex_df_new = ex_df.melt(id_vars = 'id', var_name = 'group', value_name = 'value')
ex_df_new[['group', 'obs']] = ex_df_new['group'].str.split('_', expand=True)
ex_df_new
id group value obs
0 1.0 x 3.0 1
1 2.0 x 1.0 1
2 3.0 x 4.0 1
3 1.0 x 5.0 2
4 2.0 x 8.0 2
5 3.0 x 9.0 2
6 1.0 y 0.0 1
7 2.0 y 1.0 1
8 3.0 y 2.0 1
9 1.0 y 2.0 2
10 2.0 y 7.0 2
11 3.0 y 9.0 2
# Long example data: two groups (x, y) per id, with values drawn at random
# (with replacement) from 1:6.
ex_df <- data.frame(
  id = c(1, 1, 2, 2, 3, 3),
  group = c("x", "y", "x", "y", "x", "y"),
  value = sample(1:6, replace = TRUE)
)
ex_df
id group value
1 1 x 5
2 1 y 5
3 2 x 6
4 2 y 5
5 3 x 6
6 3 y 4
# Reshape to wide format: one row per id, one column per group.
ex_df |>
  pivot_wider(id_cols = id, names_from = group, values_from = value)
# A tibble: 3 × 3
id x y
<dbl> <int> <int>
1 1 5 5
2 2 6 5
3 3 6 4
# Access the R data frame from Python through the `r` object.
ex_df = r.ex_df

# Reshape to wide format (one column per group), then move `id` from the
# index back into a regular column.
(ex_df.pivot(columns = 'group', values = 'value', index = 'id')
  .reset_index())
group id x y
0 1.0 5 5
1 2.0 6 5
2 3.0 6 4
# Two small tables sharing an `id` key; ids 1 and 2 appear in both, id 3 only
# in df1 and id 4 only in df2.
df1 <- data.frame(
  id = c(1, 2, 3),
  x = c(7, 9, 13)
)
df2 <- data.frame(
  id = c(1, 2, 4),
  y = c(10, 12, 14)
)
df1
id x
1 1 7
2 2 9
3 3 13
df2
id y
1 1 10
2 2 12
3 4 14
# Left join: keep every row of df1, filling y with NA where df2 has no match.
df1 |>
  left_join(df2, join_by(id))
id x y
1 1 7 10
2 2 9 12
3 3 13 NA
# Access the R data frames from Python through the `r` object.
df1 = r.df1
df2 = r.df2

# Left join on `id`: keep every row of df1.
pd.merge(df1, df2, how = 'left', left_on = 'id', right_on = 'id')
id x y
0 1.0 7.0 10.0
1 2.0 9.0 12.0
2 3.0 13.0 NaN
# Inner join: keep only the ids present in both df1 and df2.
df1 |>
  inner_join(df2, join_by(id))
id x y
1 1 7 10
2 2 9 12
# Inner join on `id`: keep only the ids present in both tables.
pd.merge(df1, df2, how = 'inner', left_on = 'id', right_on = 'id')
id x y
0 1.0 7.0 10.0
1 2.0 9.0 12.0
# df1 holds one row with columns named <id>_<variable>; df2 holds one row per
# id with an extra variable z.
df1 <- data.frame(
  a_x = 1,
  a_y = 2,
  b_x = 2,
  b_y = 3
)
df2 <- data.frame(
  id = c("a", "b"),
  z = c(4, 5)
)
df1
a_x a_y b_x b_y
1 1 2 2 3
df2
id z
1 a 4
2 b 5
# Split each column name at "_": the first part becomes the `id` column and
# the second part (".value") names the output columns x and y. Then attach z
# from df2 by id.
df1 |>
  pivot_longer(cols = -c(), names_to = c("id", ".value"), names_sep = "_") |>
  left_join(df2, join_by(id))
# A tibble: 2 × 4
id x y z
<chr> <dbl> <dbl> <dbl>
1 a 1 2 4
2 b 2 3 5
# Access the R data frames from Python through the `r` object.
df1 = r.df1
df2 = r.df2

# Melt every column, split the old column names at "_" into `id` and `group`,
# then pivot so that each group (x, y) becomes its own column.
df1_new = df1.melt(id_vars = [], var_name = 'id')
df1_new[['id', 'group']] = df1_new['id'].str.split('_', expand=True)
df1_new = df1_new.pivot(columns = 'group', values='value', index = 'id').reset_index()

# Attach z from df2 by id.
pd.merge(df1_new, df2, how ='left', left_on = 'id', right_on = 'id')
id x y z
0 a 1.0 2.0 4.0
1 b 2.0 3.0 5.0
# Each entry has the form "<name>: <month> <day>, <year>".
strings <- c(
  "George Washington: February 22, 1732",
  "Thomas Jefferson: April 13, 1743",
  "Abraham Lincoln: February 12, 1809",
  "Theodore Roosevelt: October 27, 1858"
)
strings
[1] "George Washington: February 22, 1732"
[2] "Thomas Jefferson: April 13, 1743"
[3] "Abraham Lincoln: February 12, 1809"
[4] "Theodore Roosevelt: October 27, 1858"
# Full name: everything before the colon (the lookahead keeps ":" out of the match).
str_extract(strings, ".+(?=:)")
[1] "George Washington" "Thomas Jefferson" "Abraham Lincoln"
[4] "Theodore Roosevelt"
# Full date: everything after ": " (the lookbehind keeps ": " out of the match).
str_extract(strings, "(?<=: ).+")
[1] "February 22, 1732" "April 13, 1743" "February 12, 1809"
[4] "October 27, 1858"
# Text between the first whitespace and the colon; for these two-word names
# that is the last name.
str_extract(strings, "(?<=\\s).+(?=:)")
[1] "Washington" "Jefferson" "Lincoln" "Roosevelt"
# Year: everything after ", ".
str_extract(strings, "(?<=, ).+")
[1] "1732" "1743" "1809" "1858"
# Mixed vector of words and phrases, some containing digits or spaces, used
# for the pattern-matching examples.
strings <- c("apple", "banana", "canteloupe", "durian",
             "eggplant", "french fries", "goat cheese",
             "pizza", "99 red balloons", "101 dalmatians",
             "route 66")
strings
[1] "apple" "banana" "canteloupe" "durian"
[5] "eggplant" "french fries" "goat cheese" "pizza"
[9] "99 red balloons" "101 dalmatians" "route 66"
# Strings that contain at least one digit anywhere.
str_subset(strings, "\\d+")
[1] "99 red balloons" "101 dalmatians" "route 66"
# Strings that start with a digit (^ anchors the match to the beginning).
str_subset(strings, "^\\d+")
[1] "99 red balloons" "101 dalmatians"
# Strings that contain the letter "a".
str_subset(strings, "a")
[1] "apple" "banana" "canteloupe" "durian"
[5] "eggplant" "goat cheese" "pizza" "99 red balloons"
[9] "101 dalmatians"
# Strings that contain a whitespace character.
str_subset(strings, "\\s")
[1] "french fries" "goat cheese" "99 red balloons" "101 dalmatians"
[5] "route 66"
# Strings with the same character twice in a row: (.) captures one character
# and \\1{1} requires one more copy of it immediately after.
str_subset(strings, "(.)\\1{1}")
[1] "apple" "eggplant" "goat cheese" "pizza"
[5] "99 red balloons" "route 66"
# Fill a list with the integers 1 to 10, one per element.
x <- list()
for(i in 1:10){
  x[i] <- i
}
# Deliberately left as-is for the exercise: single-bracket indexing returns a
# list, so this line raises an error.
x[2] + 1
Error in x[2] + 1: non-numeric argument to binary operator
`x[2]` is a list, so we can’t add 1 to it. If we want to add 1 to the second element of `x`, we should use `x[[2]] + 1`.
nsim <- 1000 # number of games
results <- rep(NA, nsim)

for(i in 1:nsim){
  # each game starts with the marker in the middle
  marker <- 0

  # Deliberately left as-is for the exercise: abs(marker) < 0 is never TRUE,
  # so the body of this loop never executes.
  while(abs(marker) < 0){
    robotA <- runif(1, 0, 0.5)
    robotB <- runif(1, 0, 0.5)
    marker <- marker + robotA - robotB
  }

  # check whether robot A wins
  results[i] <- marker >= 0.5
}

# fraction of the time that robot A wins
mean(results)
[1] 0
The output of this code will be 0, because the while loop never runs (the absolute value of a number can never be less than 0).
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
  for(j in 1:3){
    # Deliberately left as-is for the exercise: this assigns to `mat` itself,
    # not to an element, so the matrix is replaced by a single number.
    mat <- i + j
  }
}
mat
[1] 8
`mat` starts off as a matrix, but then gets overwritten to be just a single number.
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
  for(j in 1:3){
    # Deliberately left as-is for the exercise: the indices are swapped, so
    # once i reaches 4 the column index exceeds the 3 available columns.
    mat[j, i] <- i + j
  }
}
Error in `[<-`(`*tmp*`, j, i, value = i + j): subscript out of bounds
mat
[,1] [,2] [,3]
[1,] 2 3 4
[2,] 3 4 5
[3,] 4 5 6
[4,] 0 0 0
[5,] 0 0 0
Using `mat[i,j]` instead of `mat[j,i]` would fix the error.
# Start from a 3 x 3 matrix of ones; each entry outside the first row and
# column becomes the sum of the entry up-and-left and the entry directly left.
mat <- matrix(1, 3, 3)
for(i in 2:3){
  for(j in 2:3){
    mat[i,j] <- mat[i-1, j-1] + mat[i, j-1]
  }
}
mat
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 2 3
[3,] 1 2 4
# f1: add 1 to x (x defaults to 1).
f1 <- function(x = 1){
  return(x + 1)
}
# g1: add x to f1() called with its default argument, i.e. 2 + x.
g1 <- function(x){
  return(f1() + x)
}
f1(g1(3))
[1] 6
# Column means of an n x n matrix of ones, computed separately for the rows
# belonging to each distinct value in `groups`.
#
# n:      number of rows and columns of the matrix.
# groups: vector of length n giving the group label of each row.
# Returns a matrix with one row per unique group and n columns.
f1 <- function(n, groups){
  x <- matrix(1, nrow = n, ncol = n)
  unique_groups <- unique(groups)
  means <- matrix(nrow = length(unique_groups), ncol = n)
  for(i in seq_along(unique_groups)){
    # drop = FALSE keeps a one-row selection as a matrix, so colMeans still
    # works when a group has a single member.
    means[i,] <- colMeans(x[groups == unique_groups[i], , drop = FALSE])
  }
  return(means)
}
f1(5, groups = c(1, 1, 2, 2, 2))
[,1] [,2] [,3] [,4] [,5]
[1,] 1 1 1 1 1
[2,] 1 1 1 1 1
# 100 independent draws from the Uniform(0, 1) distribution.
x <- runif(100)
set.seed(442)

nsim <- 1000
results <- rep(NA, nsim)

# 40 cards: numbers 1-10 in each of 4 colors.
card_numbers <- rep(1:10, 4)
card_colors <- rep(1:4, each=10)

for(i in 1:nsim){
  # draw two distinct cards
  cards_drawn <- sample(1:40, 2, replace = FALSE)
  # TRUE when the two cards differ in both number and color
  results[i] <- (card_numbers[cards_drawn[1]] != card_numbers[cards_drawn[2]]) &
    (card_colors[cards_drawn[1]] != card_colors[cards_drawn[2]])
}

# estimated probability that the two cards differ in both number and color
mean(results)
[1] 0.68
p <- 20
q <- 10
nsim <- 1000
# 1 = vote for the candidate with p votes, 0 = vote for the one with q votes
votes <- rep(c(0, 1), times = c(q, p))
results <- rep(NA, nsim)

for(i in 1:nsim){
  shuffled_votes <- sample(votes, p+q, replace = FALSE)
  # Deliberately left as-is for the exercise: this compares only the final
  # totals, which do not depend on the order of the shuffled votes.
  results[i] <- sum(shuffled_votes) > sum(1 - shuffled_votes)
}
mean(results)
This code doesn’t work because it doesn’t check anything about the counting process – it only looks at the final count. And regardless of how we shuffle the votes, the final counts will remain the same. What we need to do instead is look at the count for each candidate after each vote has been counted.
p <- 20
q <- 10
nsim <- 1000
# 1 = vote for candidate A (p votes), 0 = vote for candidate B (q votes)
votes <- rep(c(0, 1), times = c(q, p))
# start by assuming A stays strictly ahead; set to 0 when that fails
results <- rep(1, nsim)

for(i in 1:nsim){
  shuffled_votes <- sample(votes, p+q, replace = FALSE)
  count_a <- 0
  count_b <- 0
  # tally the votes one at a time, in shuffled order
  for(j in 1:(p+q)){
    count_a <- count_a + shuffled_votes[j]
    count_b <- count_b + (1-shuffled_votes[j])
    if(count_b >= count_a){
      # B has tied or passed A at some point in the count
      results[i] <- 0
      # the outcome for this run is settled, so stop counting
      break
    }
  }
}

# fraction of runs in which A was strictly ahead throughout the count
mean(results)
[1] 0.324
A: How does the “distance” between groups impact performance of the k-means algorithm?
D: Generate data \(X_1,...,X_n\) with \(n = 300\). Suppose 100 observations in each group, with \(G_1,...,G_{100} = 1\), \(G_{101},...,G_{200} = 2\), and \(G_{201},...,G_{300} = 3\). Each \(X_i\) is simulated from
\[X_i \sim N(\mu_{G_i}, 1)\]
To examine the impact of distance between groups, we will let \(\mu_2 = 0\) and consider all combinations of \(\mu_1 = -1, -2, -3\) and \(\mu_3 = 1, 2, 3\).
E: Group assignments for all observations
M: Use the \(k\)-means algorithm (the `kmeans` function in R) to assign groups. We will assume we know the true number of groups (3), so the “correct” \(k=3\) is used.
P: The probability that all group assignments are correct. (Alternatively, could look at the expected fraction of group assignments which are correct)