Class activity solutions

Author

Ciaran Evans

library(tidyverse)
library(googlesheets4)
gs4_deauth()
gdp_data <- read_sheet("https://docs.google.com/spreadsheets/d/1RctTQmKB0hzbm1E8rGcufYdMshRdhmYdeL29nXqmvsc/pub?gid=0")
lit_data = read_sheet("https://docs.google.com/spreadsheets/d/1hDinTIRHQIaZg1RUn6Z_6mo12PtKwEPFIz_mJVF6P5I/pub?gid=0")
gdp_data <- gdp_data |>
  rename(country = starts_with("Income")) |>
  pivot_longer(-country, names_to = "year", values_to = "gdp",
               values_drop_na = TRUE)

lit_data <- lit_data |>
  rename(country = starts_with("Adult")) |>
  pivot_longer(-country, names_to = "year", values_to = "literacy",
               values_drop_na = TRUE)
lit_gdp <- lit_data |>
  inner_join(gdp_data, join_by(country, year))

In R:

air_quality <- read.csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/air_quality_long.csv")
air_parameters <- read.csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/air_quality_parameters.csv")

air_quality <- air_quality |>
  left_join(air_parameters, join_by(parameter == id))

head(air_quality)
       city country                  date.utc location parameter value  unit
1 Antwerpen      BE 2019-06-18 06:00:00+00:00  BETR801      pm25  18.0 µg/m³
2 Antwerpen      BE 2019-06-17 08:00:00+00:00  BETR801      pm25   6.5 µg/m³
3 Antwerpen      BE 2019-06-17 07:00:00+00:00  BETR801      pm25  18.5 µg/m³
4 Antwerpen      BE 2019-06-17 06:00:00+00:00  BETR801      pm25  16.0 µg/m³
5 Antwerpen      BE 2019-06-17 05:00:00+00:00  BETR801      pm25   7.5 µg/m³
6 Antwerpen      BE 2019-06-17 04:00:00+00:00  BETR801      pm25   7.5 µg/m³
                                               description  name
1 Particulate matter less than 2.5 micrometers in diameter PM2.5
2 Particulate matter less than 2.5 micrometers in diameter PM2.5
3 Particulate matter less than 2.5 micrometers in diameter PM2.5
4 Particulate matter less than 2.5 micrometers in diameter PM2.5
5 Particulate matter less than 2.5 micrometers in diameter PM2.5
6 Particulate matter less than 2.5 micrometers in diameter PM2.5

In Python:

import pandas as pd

air_quality = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/air_quality_long.csv")
air_parameters = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/air_quality_parameters.csv")

air_quality = pd.merge(air_quality, air_parameters, how = 'left',  
                       left_on = 'parameter', right_on = 'id')
air_quality.head()
        city country  ...                                        description   name
0  Antwerpen      BE  ...  Particulate matter less than 2.5 micrometers i...  PM2.5
1  Antwerpen      BE  ...  Particulate matter less than 2.5 micrometers i...  PM2.5
2  Antwerpen      BE  ...  Particulate matter less than 2.5 micrometers i...  PM2.5
3  Antwerpen      BE  ...  Particulate matter less than 2.5 micrometers i...  PM2.5
4  Antwerpen      BE  ...  Particulate matter less than 2.5 micrometers i...  PM2.5

[5 rows x 10 columns]
library(nycflights13)

flights |>
  left_join(planes, join_by(tailnum)) |>
  head()
# A tibble: 6 × 27
  year.x month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1   2013     1     1      517            515         2      830            819
2   2013     1     1      533            529         4      850            830
3   2013     1     1      542            540         2      923            850
4   2013     1     1      544            545        -1     1004           1022
5   2013     1     1      554            600        -6      812            837
6   2013     1     1      554            558        -4      740            728
# ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
#   manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
#   engine <chr>
flights |>
  left_join(planes, join_by(tailnum)) |>
  rename(manufacture_date = year.y) |>
  select(tailnum, manufacture_date) |>
  distinct() |>
  slice_min(manufacture_date, n=3)
# A tibble: 3 × 2
  tailnum manufacture_date
  <chr>              <int>
1 N381AA              1956
2 N201AA              1959
3 N567AA              1959

The oldest plane has tail number N381AA.