library(tidyverse)
library(palmerpenguins)
sub_data <- penguins |>
slice_sample(n=10) |>
select(species, island, bill_length_mm, bill_depth_mm)Final exam review solutions
sub_data# A tibble: 10 × 4
species island bill_length_mm bill_depth_mm
<fct> <fct> <dbl> <dbl>
1 Chinstrap Dream 51.3 18.2
2 Gentoo Biscoe 45.1 14.5
3 Adelie Dream 33.1 16.1
4 Adelie Dream 38.3 19.2
5 Gentoo Biscoe 45.5 14.5
6 Gentoo Biscoe 45.5 13.9
7 Chinstrap Dream 46.1 18.2
8 Adelie Torgersen 37.2 19.4
9 Adelie Biscoe 36.5 16.6
10 Chinstrap Dream 46.2 17.5
sub_data |>
count(species, island)# A tibble: 5 × 3
species island n
<fct> <fct> <int>
1 Adelie Biscoe 1
2 Adelie Dream 2
3 Adelie Torgersen 1
4 Chinstrap Dream 3
5 Gentoo Biscoe 3
import pandas as pd
sub_data = r.sub_data
(sub_data.groupby(by = ['species', 'island'])
.agg(n = ('bill_length_mm', 'count'))) n
species island
Adelie Biscoe 1
Dream 2
Torgersen 1
Chinstrap Biscoe 0
Dream 3
Torgersen 0
Gentoo Biscoe 3
Dream 0
Torgersen 0
sub_data |>
group_by(island, species) |>
summarize(mean_length = mean(bill_length_mm, na.rm=T))`summarise()` has grouped output by 'island'. You can override using the
`.groups` argument.
# A tibble: 5 × 3
# Groups: island [3]
island species mean_length
<fct> <fct> <dbl>
1 Biscoe Adelie 36.5
2 Biscoe Gentoo 45.4
3 Dream Adelie 35.7
4 Dream Chinstrap 47.9
5 Torgersen Adelie 37.2
(sub_data.groupby(by = ['island', 'species'])
.agg(mean = ('bill_length_mm', 'mean'))
.reset_index()
.dropna()) island species mean
0 Biscoe Adelie 36.500000
2 Biscoe Gentoo 45.366667
3 Dream Adelie 35.700000
4 Dream Chinstrap 47.866667
6 Torgersen Adelie 37.200000
sub_data |>
mutate(bill_ratio = bill_length_mm/bill_depth_mm)# A tibble: 10 × 5
species island bill_length_mm bill_depth_mm bill_ratio
<fct> <fct> <dbl> <dbl> <dbl>
1 Chinstrap Dream 51.3 18.2 2.82
2 Gentoo Biscoe 45.1 14.5 3.11
3 Adelie Dream 33.1 16.1 2.06
4 Adelie Dream 38.3 19.2 1.99
5 Gentoo Biscoe 45.5 14.5 3.14
6 Gentoo Biscoe 45.5 13.9 3.27
7 Chinstrap Dream 46.1 18.2 2.53
8 Adelie Torgersen 37.2 19.4 1.92
9 Adelie Biscoe 36.5 16.6 2.20
10 Chinstrap Dream 46.2 17.5 2.64
sub_data.assign(bill_ratio = sub_data['bill_length_mm']/sub_data['bill_depth_mm']) species island bill_length_mm bill_depth_mm bill_ratio
0 Chinstrap Dream 51.3 18.2 2.818681
1 Gentoo Biscoe 45.1 14.5 3.110345
2 Adelie Dream 33.1 16.1 2.055901
3 Adelie Dream 38.3 19.2 1.994792
4 Gentoo Biscoe 45.5 14.5 3.137931
5 Gentoo Biscoe 45.5 13.9 3.273381
6 Chinstrap Dream 46.1 18.2 2.532967
7 Adelie Torgersen 37.2 19.4 1.917526
8 Adelie Biscoe 36.5 16.6 2.198795
9 Chinstrap Dream 46.2 17.5 2.640000
sub_data |>
filter(species == "Adelie",
island == "Dream")# A tibble: 2 × 4
species island bill_length_mm bill_depth_mm
<fct> <fct> <dbl> <dbl>
1 Adelie Dream 33.1 16.1
2 Adelie Dream 38.3 19.2
sub_data.query('species == "Adelie" & island == "Dream"') species island bill_length_mm bill_depth_mm
2 Adelie Dream 33.1 16.1
3 Adelie Dream 38.3 19.2
ex_df <- data.frame(
x1 = c(1, 2, 3),
x2 = c("a", "b", "c"),
x3 = c(3, 1, 4),
y1 = c("d", "e", "f"),
y2 = c(2, 7, 9),
y3 = c(7, 1, 2)
)
ex_df x1 x2 x3 y1 y2 y3
1 1 a 3 d 2 7
2 2 b 1 e 7 1
3 3 c 4 f 9 2
ex_df |>
select(where(is.numeric)) x1 x3 y2 y3
1 1 3 2 7
2 2 1 7 1
3 3 4 9 2
ex_df |>
select(starts_with("x")) x1 x2 x3
1 1 a 3
2 2 b 1
3 3 c 4
ex_df |>
summarize(across(starts_with("x") & where(is.numeric),
mean, .names = "mean_{.col}")) mean_x1 mean_x3
1 2 2.666667
ex_df <- data.frame(
id = c(1, 2, 3),
x_1 = c(3, 1, 4),
x_2 = c(5, 8, 9),
y_1 = c(0, 1, 2),
y_2 = c(2, 7, 9)
)
ex_df id x_1 x_2 y_1 y_2
1 1 3 5 0 2
2 2 1 8 1 7
3 3 4 9 2 9
ex_df |>
pivot_longer(cols = -id, names_to = c("group", "obs"), names_sep = "_")# A tibble: 12 × 4
id group obs value
<dbl> <chr> <chr> <dbl>
1 1 x 1 3
2 1 x 2 5
3 1 y 1 0
4 1 y 2 2
5 2 x 1 1
6 2 x 2 8
7 2 y 1 1
8 2 y 2 7
9 3 x 1 4
10 3 x 2 9
11 3 y 1 2
12 3 y 2 9
ex_df = r.ex_df
ex_df_new = ex_df.melt(id_vars = 'id', var_name = 'group', value_name = 'value')
ex_df_new[['group', 'obs']] = ex_df_new['group'].str.split('_', expand=True)
ex_df_new id group value obs
0 1.0 x 3.0 1
1 2.0 x 1.0 1
2 3.0 x 4.0 1
3 1.0 x 5.0 2
4 2.0 x 8.0 2
5 3.0 x 9.0 2
6 1.0 y 0.0 1
7 2.0 y 1.0 1
8 3.0 y 2.0 1
9 1.0 y 2.0 2
10 2.0 y 7.0 2
11 3.0 y 9.0 2
ex_df <- data.frame(
id = c(1, 1, 2, 2, 3, 3),
group = c("x", "y", "x", "y", "x", "y"),
value = sample(1:6, replace=T)
)
ex_df id group value
1 1 x 5
2 1 y 5
3 2 x 6
4 2 y 5
5 3 x 6
6 3 y 4
ex_df |>
pivot_wider(id_cols = id, names_from = group, values_from = value)# A tibble: 3 × 3
id x y
<dbl> <int> <int>
1 1 5 5
2 2 6 5
3 3 6 4
ex_df = r.ex_df
(ex_df.pivot(columns = 'group', values = 'value', index = 'id')
.reset_index())group id x y
0 1.0 5 5
1 2.0 6 5
2 3.0 6 4
df1 <- data.frame(
id = c(1, 2, 3),
x = c(7, 9, 13)
)
df2 <- data.frame(
id = c(1, 2, 4),
y = c(10, 12, 14)
)df1 id x
1 1 7
2 2 9
3 3 13
df2 id y
1 1 10
2 2 12
3 4 14
df1 |>
left_join(df2, join_by(id)) id x y
1 1 7 10
2 2 9 12
3 3 13 NA
df1 = r.df1
df2 = r.df2
pd.merge(df1, df2, how = 'left', left_on = 'id', right_on = 'id') id x y
0 1.0 7.0 10.0
1 2.0 9.0 12.0
2 3.0 13.0 NaN
df1 |>
inner_join(df2, join_by(id)) id x y
1 1 7 10
2 2 9 12
pd.merge(df1, df2, how = 'inner', left_on = 'id', right_on = 'id') id x y
0 1.0 7.0 10.0
1 2.0 9.0 12.0
df1 = data.frame(
a_x = 1,
a_y = 2,
b_x = 2,
b_y = 3
)
df2 = data.frame(
id = c("a", "b"),
z = c(4, 5)
)df1 a_x a_y b_x b_y
1 1 2 2 3
df2 id z
1 a 4
2 b 5
df1 |>
pivot_longer(cols = -c(), names_to = c("id", ".value"), names_sep = "_") |>
left_join(df2, join_by(id))# A tibble: 2 × 4
id x y z
<chr> <dbl> <dbl> <dbl>
1 a 1 2 4
2 b 2 3 5
df1 = r.df1
df2 = r.df2
df1_new = df1.melt(id_vars = [], var_name = 'id')
df1_new[['id', 'group']] = df1_new['id'].str.split('_', expand=True)
df1_new = df1_new.pivot(columns = 'group', values='value', index = 'id').reset_index()
pd.merge(df1_new, df2, how='left', left_on = 'id', right_on = 'id') id x y z
0 a 1.0 2.0 4.0
1 b 2.0 3.0 5.0
strings <- c(
"George Washington: February 22, 1732",
"Thomas Jefferson: April 13, 1743",
"Abraham Lincoln: February 12, 1809",
"Theodore Roosevelt: October 27, 1858"
)
strings[1] "George Washington: February 22, 1732"
[2] "Thomas Jefferson: April 13, 1743"
[3] "Abraham Lincoln: February 12, 1809"
[4] "Theodore Roosevelt: October 27, 1858"
str_extract(strings, ".+(?=:)")[1] "George Washington" "Thomas Jefferson" "Abraham Lincoln"
[4] "Theodore Roosevelt"
str_extract(strings, "(?<=: ).+")[1] "February 22, 1732" "April 13, 1743" "February 12, 1809"
[4] "October 27, 1858"
str_extract(strings, "(?<=\\s).+(?=:)")[1] "Washington" "Jefferson" "Lincoln" "Roosevelt"
str_extract(strings, "(?<=, ).+")[1] "1732" "1743" "1809" "1858"
strings <- c("apple", "banana", "canteloupe", "durian",
"eggplant", "french fries", "goat cheese",
"pizza", "99 red balloons", "101 dalmatians",
"route 66")strings [1] "apple" "banana" "canteloupe" "durian"
[5] "eggplant" "french fries" "goat cheese" "pizza"
[9] "99 red balloons" "101 dalmatians" "route 66"
str_subset(strings, "\\d+")[1] "99 red balloons" "101 dalmatians" "route 66"
str_subset(strings, "^\\d+")[1] "99 red balloons" "101 dalmatians"
str_subset(strings, "a")[1] "apple" "banana" "canteloupe" "durian"
[5] "eggplant" "goat cheese" "pizza" "99 red balloons"
[9] "101 dalmatians"
str_subset(strings, "\\s")[1] "french fries" "goat cheese" "99 red balloons" "101 dalmatians"
[5] "route 66"
str_subset(strings, "(.)\\1{1}")[1] "apple" "eggplant" "goat cheese" "pizza"
[5] "99 red balloons" "route 66"
x <- list()
for(i in 1:10){
x[i] <- i
}
x[2] + 1Error in x[2] + 1: non-numeric argument to binary operator
x[2] is a list (of length one), so we can’t add 1 to it. If we want to add 1 to the second element of x, we should extract it with double brackets: x[[2]] + 1.
# Simulate `nsim` games and record, for each game, whether the marker ended
# at or beyond +0.5 (counted as a win for robot A).
nsim <- 1000 # number of games
results <- rep(NA, nsim)
for(i in 1:nsim){
# each game starts with the marker in the middle
marker <- 0
# NOTE(review): abs(marker) is never negative, so this condition is FALSE on
# the first check and the loop body is skipped; marker stays at 0.
while(abs(marker) < 0){
robotA <- runif(1, 0, 0.5)
robotB <- runif(1, 0, 0.5)
marker <- marker + robotA - robotB
}
# check whether robot A wins
results[i] <- marker >= 0.5
}
mean(results)[1] 0
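The output of this code will be 0, because the while loop never runs (the absolute value of a number can never be less than 0). The marker therefore stays at 0 in every game, so marker >= 0.5 is always FALSE and the mean of the results is 0.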
# Start with a 5 x 3 matrix of zeros.
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
for(j in 1:3){
# No index on the left-hand side: this rebinds the name `mat` to the single
# number i + j instead of writing into one cell of the matrix.
mat <- i + j
}
}
mat[1] 8
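mat starts off as a matrix, but mat <- i + j has no index, so every iteration over-writes the whole object with a single number. The value left at the end comes from the last iteration: 5 + 3 = 8.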
mat <- matrix(0, nrow=5, ncol=3)
for(i in 1:5){
for(j in 1:3){
mat[j, i] <- i + j
}
}Error in `[<-`(`*tmp*`, j, i, value = i + j): subscript out of bounds
mat [,1] [,2] [,3]
[1,] 2 3 4
[2,] 3 4 5
[3,] 4 5 6
[4,] 0 0 0
[5,] 0 0 0
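Using mat[i,j] would fix the error. mat has 5 rows and 3 columns, but mat[j, i] uses i (which runs up to 5) as the column index, so the assignment fails as soon as i = 4.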
# Start with a 3 x 3 matrix of ones; row 1 and column 1 are never modified.
mat <- matrix(1, 3, 3)
for(i in 2:3){
for(j in 2:3){
# Each cell becomes the sum of its upper-left diagonal neighbour and its left
# neighbour; both already hold their final values at this point in the sweep.
mat[i,j] <- mat[i-1, j-1] + mat[i, j-1]
}
}
mat [,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 2 3
[3,] 1 2 4
# Return `x` incremented by one. `x` defaults to 1, so `f1()` gives 2.
f1 <- function(x = 1) {
  x + 1
}
# Add `x` to the value f1() returns when called with its default argument.
g1 <- function(x) {
  f1_default <- f1()
  f1_default + x
}
f1(g1(3))[1] 6
# Compute the column means of an n x n matrix of ones separately for each
# group of rows.
#
# n:      number of rows and columns of the matrix.
# groups: vector with one label per row (length n); rows sharing a label
#         form a group.
#
# Returns a matrix with one row per distinct label (in order of first
# appearance in `groups`) and n columns.
f1 <- function(n, groups) {
  x <- matrix(1, nrow = n, ncol = n)
  unique_groups <- unique(groups)
  means <- matrix(nrow = length(unique_groups), ncol = n)
  # seq_along() iterates zero times when there are no groups, unlike 1:0.
  for (i in seq_along(unique_groups)) {
    # drop = FALSE keeps a one-row selection as a matrix; without it a group
    # with a single member collapses to a vector and colMeans() errors.
    group_rows <- x[groups == unique_groups[i], , drop = FALSE]
    means[i, ] <- colMeans(group_rows)
  }
  means
}
f1(5, groups = c(1, 1, 2, 2, 2)) [,1] [,2] [,3] [,4] [,5]
[1,] 1 1 1 1 1
[2,] 1 1 1 1 1
x <- runif(100)set.seed(442)
# Monte Carlo estimate of the chance that two cards drawn without replacement
# differ in BOTH number and color. Card k (k = 1..40) has number
# card_numbers[k] and color card_colors[k].
nsim <- 1000
card_numbers <- rep(1:10, times = 4)
card_colors <- rep(1:4, each = 10)
results <- rep(NA, nsim)
for (i in seq_len(nsim)) {
  cards_drawn <- sample(1:40, 2, replace = FALSE)
  # With exactly two cards, a non-zero difference means the two values differ.
  results[i] <- (diff(card_numbers[cards_drawn]) != 0) &
    (diff(card_colors[cards_drawn]) != 0)
}
mean(results)[1] 0.68
p <- 20
q <- 10
nsim <- 1000
votes <- rep(c(0, 1), times = c(q, p))
results <- rep(NA, nsim)
for(i in 1:nsim){
shuffled_votes <- sample(votes, p+q, replace=F)
results[i] <- sum(shuffled_votes) > sum(1 - shuffled_votes)
}
mean(results)
This code doesn’t work because it doesn’t check anything about the counting process – it only looks at the final count. Regardless of how we shuffle the votes, the final counts will remain the same. What we need to do instead is look at the count for each candidate after each vote has been counted.
# Monte Carlo estimate of the chance that the candidate with p votes is
# strictly ahead after every single vote is counted, when the p + q ballots
# are counted in random order. A vote of 1 is for that candidate, 0 for the
# other one.
p <- 20
q <- 10
nsim <- 1000
votes <- rep(c(0, 1), times = c(q, p))
results <- rep(1, nsim)
for (i in seq_len(nsim)) {
  shuffled_votes <- sample(votes, p + q, replace = FALSE)
  # Running totals for each candidate after each counted vote.
  count_a <- cumsum(shuffled_votes)
  count_b <- cumsum(1 - shuffled_votes)
  # A tie or a deficit at any point in the count disqualifies this run.
  if (any(count_b >= count_a)) {
    results[i] <- 0
  }
}
mean(results)[1] 0.324
A: How does the “distance” between groups impact performance of the k-means algorithm?
D: Generate data \(X_1, \ldots, X_n\) with \(n = 300\). There are 100 observations in each group, with \(G_1 = \cdots = G_{100} = 1\), \(G_{101} = \cdots = G_{200} = 2\), and \(G_{201} = \cdots = G_{300} = 3\). Each \(X_i\) is simulated from
\[X_i \sim N(\mu_{G_i}, 1)\]
To examine the impact of distance between groups, we will let \(\mu_2 = 0\) and consider all combinations of \(\mu_1 = -1, -2, -3\) and \(\mu_3 = 1, 2, 3\).
E: Group assignments for all observations
M: Use the \(k\)-means algorithm (kmeans function in R) to assign groups. We will assume we know the true number of groups (3), so the “correct” \(k=3\) is used.
P: The probability that all group assignments are correct. (Alternatively, we could look at the expected fraction of group assignments that are correct.)