Formative Exercise 07: Data Wrangling

Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.

UK Baby Names

Here we will convert the data table scotbabynames from the ukbabynames package to a tibble and assign it the variable name sbn. Use this data tibble for questions 1-13.

# do not alter this code chunk
# Builds sbn, the tibble version of scotbabynames that questions 1-13 work on.
sbn <- as_tibble(scotbabynames) # convert to a tibble

Question 1

How many records are in the dataset?

# Option 1: ask for the number of rows directly.
nrecords <- nrow(sbn)

# Option 2: count() collapses the table to a single column n;
# pull that column out as a bare number and display it.
nrecords <- sbn |>
  count() |>
  pull(n) |>
  print()

## [1] 248420

Question 2

Remove the column rank from the dataset.

# Keep every column except rank, then preview the result.
norank <- glimpse(select(sbn, !rank))

## Rows: 248,420
## Columns: 5
## $ year   <dbl> 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 197…
## $ sex    <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M…
## $ name   <chr> "David", "John", "Paul", "Mark", "James", "Andrew", "Scott", "Steven", "Robert", "Steph…
## $ n      <dbl> 1794, 1528, 1260, 1234, 1202, 1067, 1060, 1020, 885, 866, 777, 749, 735, 710, 702, 685,…
## $ nation <chr> "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Sc…

Question 3

What is the range of birth years contained in the dataset? Use summarise to make a table with two columns: minyear and maxyear.

# One-row summary holding the earliest and latest birth year.
birth_range <- summarise(
  sbn,
  minyear = min(year),
  maxyear = max(year)
)
print(birth_range)

## # A tibble: 1 × 2
##   minyear maxyear
##     <dbl>   <dbl>
## 1    1974    2020

Question 4

Make a table of only the data from babies named Hermione.

# Rows whose name is exactly "Hermione" (the comparison is case-sensitive).
hermiones <- filter(sbn, name == "Hermione")
print(hermiones)

## # A tibble: 22 × 6
##     year sex   name         n  rank nation  
##    <dbl> <chr> <chr>    <dbl> <dbl> <chr>   
##  1  1976 F     Hermione     1   833 Scotland
##  2  1990 F     Hermione     1  1033 Scotland
##  3  1994 F     Hermione     1  1112 Scotland
##  4  1995 F     Hermione     1  1101 Scotland
##  5  1998 F     Hermione     1  1149 Scotland
##  6  2000 F     Hermione     1  1162 Scotland
##  7  2001 F     Hermione     1  1129 Scotland
##  8  2002 F     Hermione     1  1139 Scotland
##  9  2004 F     Hermione     1  1230 Scotland
## 10  2005 F     Hermione     1  1258 Scotland
## # … with 12 more rows

Question 5

Sort the dataset by sex and then by year (descending) and then by rank (descending).

# Sort keys are applied left to right; each later key breaks ties in the earlier ones.
sorted_babies <- sbn |>
  arrange(
    sex,         # ascending
    desc(year),  # most recent year first
    desc(rank)   # largest rank number first
  ) |>
  glimpse()

## Rows: 248,420
## Columns: 6
## $ year   <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ sex    <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F…
## $ name   <chr> "Aadhya", "Aadyantaa", "Aahana", "Aahna", "Aaila", "Aalayah", "Aalisha", "Aaliya", "Aal…
## $ n      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rank   <dbl> 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 1488, 148…
## $ nation <chr> "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Sc…

Question 6

Create a new numeric column, decade, that contains the decade of birth (1990, 2000, 2010). Hint: see ?floor

# Numeric approach: integer-divide the year by 10 to drop the last digit,
# then scale back up (1974 -> 197 -> 1970).
sbn_decade <- sbn |>
  mutate(decade = year %/% 10 * 10)

# String approach: take the first three characters of the year, append a
# "0" and convert the result to an integer.
sbn_decade <- sbn |>
  mutate(decade = as.integer(paste0(substr(year, 1, 3), "0"))) |>
  glimpse()

## Rows: 248,420
## Columns: 7
## $ year   <dbl> 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 197…
## $ sex    <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M…
## $ name   <chr> "David", "John", "Paul", "Mark", "James", "Andrew", "Scott", "Steven", "Robert", "Steph…
## $ n      <dbl> 1794, 1528, 1260, 1234, 1202, 1067, 1060, 1020, 885, 866, 777, 749, 735, 710, 702, 685,…
## $ rank   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, …
## $ nation <chr> "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Scotland", "Sc…
## $ decade <int> 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 197…

Question 7

Make a table of only the data from male babies named Courtney that were born between 1988 and 2001 (inclusive).

# All four conditions must hold; both year bounds are inclusive.
courtney <- sbn |>
  filter(
    name == "Courtney" &
      sex == "M" &
      year >= 1988 &
      year <= 2001
  ) |>
  print()

## # A tibble: 5 × 6
##    year sex   name         n  rank nation  
##   <dbl> <chr> <chr>    <dbl> <dbl> <chr>   
## 1  1989 M     Courtney     1   551 Scotland
## 2  1991 M     Courtney     1   618 Scotland
## 3  1994 M     Courtney     3   375 Scotland
## 4  1995 M     Courtney     2   455 Scotland
## 5  1996 M     Courtney     1   704 Scotland

Question 8

How many distinct names are represented in the dataset? Make sure distinct_names is an integer, not a data table.

# Option 1: count the unique values of the name column directly.
distinct_names <- sbn |>
  pull(name) |>
  n_distinct()

# Option 2: reduce the table to one row per name, count those rows,
# and extract the count column n as a plain number.
distinct_names <- sbn |>
  distinct(name) |>
  count() |>
  pull(n) |>
  print()

## [1] 66620

Question 9

Make a table of only the data from the Scottish female babies named Frankie that were born before 1990 or after 2015. Order it by year.

# Scottish girls named Frankie, excluding the years 1990-2015 (inclusive),
# i.e. born before 1990 or after 2015; sorted chronologically.
frankie <- sbn |>
  filter(
    nation == "Scotland" &
      name == "Frankie" &
      sex == "F" &
      !(year >= 1990 & year <= 2015)
  ) |>
  arrange(year) |>
  print()

## # A tibble: 9 × 6
##    year sex   name        n  rank nation  
##   <dbl> <chr> <chr>   <dbl> <dbl> <chr>   
## 1  1984 F     Frankie     1   875 Scotland
## 2  1985 F     Frankie     1   919 Scotland
## 3  1987 F     Frankie     1   970 Scotland
## 4  1988 F     Frankie     1  1005 Scotland
## 5  2016 F     Frankie    22   205 Scotland
## 6  2017 F     Frankie    23   191 Scotland
## 7  2018 F     Frankie    24   182 Scotland
## 8  2019 F     Frankie    26   168 Scotland
## 9  2020 F     Frankie    25   162 Scotland

Question 10

How many total babies in the dataset were named ‘Emily’? Make sure emily is an integer, not a data table.

# Add up the per-row counts (n) across every Emily row to get one number.
emily <- sbn |>
  filter(name == "Emily") |>
  pull(n) |>
  sum() |>
  print()

## [1] 11939

Question 11

How many distinct names are there for each sex?

# Reduce to one row per sex/name pair, then count the rows within each sex.
# The result stays grouped by sex.
names_per_sex <- sbn |>
  distinct(sex, name) |>
  group_by(sex) |>
  count() |>
  print()

## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex       n
##   <chr> <int>
## 1 F     42628
## 2 M     27999

Question 12

What is the most popular name in the sbn dataset? Make sure most_popular_scottish_name is a character vector, not a table.

# Option 1: total the babies per name, sort with the largest total first,
# and keep the first name of the resulting vector.
most_popular_scottish_name <- sbn |>
  group_by(name) |>
  summarise(total = sum(n), .groups = "drop") |>
  arrange(desc(total)) |>
  pull(name) |>
  head(1)

## Option 2: rank the totals and keep every name that shares the top rank,
## so tied names are all returned.
most_popular_scottish_name <- sbn |>
  group_by(name) |>
  summarise(total = sum(n), .groups = "drop") |>
  mutate(total_rank = rank(total)) |>
  filter(total_rank == max(total_rank)) |>
  pull(name) |>
  print()

## [1] "David"

Question 12b

What is the most popular name for each nation and sex in the ukbabynames dataset? Make a table with the columns nation, male and female, with three rows: one for each nation.

# Most popular name per nation and sex, reshaped to one row per nation
# with the columns nation, male and female.
most_popular <- ukbabynames |>
  # total number of babies for every nation / sex / name combination
  group_by(nation, sex, name) |>
  summarise(total = sum(n), .groups = "drop") |>
  # within each nation and sex, keep the name with the largest total
  group_by(nation, sex) |>
  arrange(desc(total)) |>
  slice(1) |>
  ungroup() |>
  # drop total first: if it stayed, pivot_wider() would treat it as an
  # identifier column and the M and F names would not share a row
  select(-total) |>
  # long to wide: one column per sex, holding the winning name
  # (pivot_wider() replaces the superseded spread())
  pivot_wider(names_from = sex, values_from = name) |>
  # rename the sex columns and put them in the requested order
  select(nation, male = M, female = F) |>
  print()

## # A tibble: 3 × 3
##   nation           male  female
##   <chr>            <chr> <chr> 
## 1 England & Wales  JACK  EMILY 
## 2 Northern Ireland Jack  Sophie
## 3 Scotland         David Emma

Question 13

How many babies were born each year for each sex? Make a plot where the y-axis starts at 0 so you have the right perspective on changes.

# Total number of babies recorded for each year and sex.
babies_per_year <- sbn |>
  group_by(year, sex) |>
  summarise(total = sum(n), .groups = "drop")

# One line per sex. ylim(0, NA) pins the lower limit at 0 and lets ggplot2
# take the upper limit from the data, so no point is dropped for exceeding
# a hand-picked maximum.
ggplot(babies_per_year, aes(year, total, color = sex)) +
  geom_line() +
  ylim(0, NA)

Select helpers

Load the dataset reprores::personality.

Select only the personality question columns (not the user_id or date).

# Everything except the two non-question columns.
q_only <- reprores::personality |>
  select(!c(user_id, date)) |>
  glimpse()

## Rows: 15,000
## Columns: 41
## $ Op1  <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, 6, 4, 1, 2, 5, 6, 4, 6, 3, NA, 5, 6, 6, 0, 6, 6…
## $ Ne1  <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, 1, 3, 4, 5, 2, 4, 5, 1, 4, NA, 2, 3, 3, 3, 2, 1…
## $ Ne2  <dbl> 0, 6, 6, 4, 2, 1, 2, 3, 1, 2, 5, 5, 3, 1, 1, 1, 1, 6, 1, 2, 5, 1, 3, NA, 4, 3, 2, 4, 4, 6…
## $ Op2  <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, 5, 3, 3, 4, 1, 6, 6, 4, 4, NA, 3, 6, 6, 6, 6, 5…
## $ Ex1  <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, 1, 3, 5, 1, 0, 4, 1, 5, 5, NA, 3, 3, 6, 1, 0, 1…
## $ Ex2  <dbl> 3, 0, 0, 3, 3, 4, 5, 2, 5, 3, 4, 1, 3, 2, 1, 6, 5, 3, 4, 4, 1, 6, 5, NA, 3, 3, 6, 0, 3, 4…
## $ Co1  <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, 5, 4, 4, 5, 6, 4, 2, 5, 4, 6, 4, 2, 3, 6, 6, 1,…
## $ Co2  <dbl> 3, 0, 0, 3, 4, 3, 3, 4, 5, 3, 5, 3, 3, 4, 5, 1, 5, 4, 5, 2, 5, 4, 5, NA, 4, 3, 2, 6, 5, 2…
## $ Ne3  <dbl> 0, 0, 0, 1, 0, 1, 4, 4, 0, 4, 2, 5, 1, 2, 5, 5, 2, 2, 1, 2, 5, 1, 4, NA, 0, 1, 1, 3, 1, 4…
## $ Ag1  <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, 5, 3, 5, 5, 5, 4, 4, 5, 4, NA, 6, 2, 5, 3, 5, 5…
## $ Ag2  <dbl> 1, 6, 6, 0, 5, 4, 5, 3, 4, 3, 5, 1, 5, 4, 2, 6, 5, 5, 5, 5, 2, 5, 6, NA, 4, 2, 1, 2, 5, 5…
## $ Ne4  <dbl> 3, 6, 6, 2, 3, 2, 3, 3, 0, 4, 4, 5, 5, 4, 5, 3, 2, 5, 2, 4, 5, 0, 5, NA, 2, 4, 5, 3, 1, 5…
## $ Ex3  <dbl> 3, 6, 5, 5, 3, 3, 3, 0, 6, 1, 4, 2, 3, 2, 1, 2, 5, 1, 0, 5, 5, 6, 5, NA, 5, 3, 6, 0, 3, 3…
## $ Co3  <dbl> 2, 0, 1, 3, 4, 4, 5, 4, 5, 3, 4, 3, 4, 4, 5, 4, 2, 4, 5, 2, 2, 5, 5, 3, 3, 5, 3, 6, 6, 2,…
## $ Op3  <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, 6, 5, 4, 4, 3, 6, 5, 6, 5, NA, 6, 6, 1, 3, 6, 5…
## $ Ex4  <dbl> 1, 0, 1, 3, 3, 3, 4, 3, 5, 3, 2, 0, 3, 3, 1, 2, NA, 4, 4, 4, 1, 4, 6, 0, 4, 5, 6, 0, 3, 1…
## $ Op4  <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, 6, 6, 2, 2, 4, 5, 5, 5, 5, NA, 0, 4, 6, 0, 5, 5…
## $ Ex5  <dbl> 3, 0, 1, 6, 3, 3, 4, 2, 5, 2, 2, 4, 2, 3, 0, 4, 5, 2, 3, 1, 1, 5, 4, NA, 3, 3, 6, 5, 3, 4…
## $ Ag3  <dbl> 1, 0, 1, 1, 0, 4, 4, 4, 3, 3, 4, 4, 3, 4, 4, 5, 5, 4, 5, 3, 4, 6, 4, NA, 4, 2, 6, 0, 5, 4…
## $ Co4  <dbl> 3, 6, 5, 5, 5, 3, 2, 4, 3, 1, 4, 3, 1, 2, 4, 2, NA, 5, 6, 1, 1, 3, 1, 5, 2, 3, 3, 6, 2, 0…
## $ Co5  <dbl> 0, 6, 5, 5, 5, 3, 3, 1, 5, 1, 2, 4, 4, 4, 2, 1, 6, 4, 3, 1, 3, 3, 5, NA, 5, 2, 2, 3, 2, 1…
## $ Ne5  <dbl> 3, 0, 1, 4, 1, 1, 4, 5, 0, 3, 4, 6, 2, 0, 1, 1, 0, 4, 3, 1, 5, 1, 1, NA, 1, 1, 4, 1, 1, 2…
## $ Op5  <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, 6, 6, 6, 5, 3, 3, 5, 5, 1, NA, 5, 6, 5, 1, 4, 1…
## $ Ag4  <dbl> 1, 0, 1, 4, 6, 5, 5, 6, 6, 6, 4, 2, 4, 5, 4, 5, 6, 4, 5, 6, 5, 4, 5, NA, 5, 6, 6, 1, 4, 5…
## $ Op6  <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, 5, 2, 5, 1, 1, 6, 2, 4, 5, NA, 6, 6, 6, 6, 5, 4…
## $ Co6  <dbl> 6, 0, 1, 4, 6, 5, 6, 5, 4, 3, 5, 5, 4, 6, 6, 1, 3, 4, 5, 4, 6, 3, 5, NA, 6, 2, 4, 6, 5, 6…
## $ Ex6  <dbl> 3, 6, 5, 3, 0, 4, 3, 1, 6, 3, 2, 1, 4, 2, 1, 5, 6, 2, 1, 2, 1, 6, 4, NA, 2, 3, 6, 1, 3, 2…
## $ Ne6  <dbl> 1, 6, 5, 1, 0, 1, 3, 4, 0, 4, 4, 5, 2, 1, 5, 6, 1, 2, 2, 3, 5, 0, 4, NA, 2, 2, 3, 5, 1, 4…
## $ Co7  <dbl> 3, 6, 5, 1, 3, 4, NA, 2, 3, 3, 2, 2, 4, 2, 5, 2, 5, 5, 3, 1, 1, 2, 2, NA, 5, 5, 3, 3, 4, …
## $ Ag5  <dbl> 3, 6, 5, 0, 2, 5, 6, 2, 2, 3, 4, 1, 3, 5, 2, 6, 5, 6, 5, 3, 3, 5, 4, NA, 6, 3, 5, 3, 4, 4…
## $ Co8  <dbl> 3, 0, 1, 1, 3, 4, 3, 0, 1, 3, 2, 2, 1, 2, 4, 3, 2, 4, 5, 2, 6, 2, 4, NA, 5, 1, 1, 5, 3, 1…
## $ Ex7  <dbl> 3, 6, 5, 4, 1, 2, 5, 3, 6, 3, 4, 3, 5, 1, 1, 6, 6, 3, 1, 1, 3, 6, 5, NA, 2, 4, 6, 3, 2, 4…
## $ Ne7  <dbl> NA, 0, 1, 2, 0, 2, 4, 4, 0, 3, 2, 5, 1, 2, 5, 2, 2, 4, 1, 3, 5, 1, 2, NA, 1, 4, 0, 3, 2, …
## $ Co9  <dbl> 3, 6, 5, 4, 3, 4, 5, 3, 5, 3, 4, 3, 4, 4, 2, 4, 6, 5, 5, 2, 2, 4, 3, NA, 6, 3, 4, 5, 5, 1…
## $ Op7  <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, 6, 3, 6, 5, 2, 6, 5, 6, 5, NA, 6, 6, 6, 6, 6, 5…
## $ Ne8  <dbl> 2, 0, 1, 1, 1, 1, 5, 4, 0, 4, 4, 5, 1, 2, 5, 2, 1, 5, 1, 2, 5, 1, 3, NA, 1, 3, 2, 3, 1, 5…
## $ Ag6  <dbl> NA, 6, 5, 2, 3, 4, 5, 6, 1, 3, 4, 2, 3, 5, 1, 6, 2, 6, 6, 5, 3, 5, 2, NA, 5, 5, 1, 2, 6, …
## $ Ag7  <dbl> 3, 0, 1, 1, 1, 3, 3, 5, 0, 3, 2, 1, 2, 3, 5, 6, 4, 4, 6, 6, 2, 4, 5, NA, 6, 1, 1, 0, 4, 4…
## $ Co10 <dbl> 1, 6, 5, 5, 3, 5, 1, 2, 5, 2, 4, 3, 4, 4, 3, 2, 5, 5, 5, 2, 2, 4, 5, NA, 5, 3, 3, 6, 6, 6…
## $ Ex8  <dbl> 2, 0, 1, 4, 3, 4, 2, 4, 6, 2, 4, 0, 4, 4, 1, 3, 5, 4, 3, 1, 1, 6, 5, NA, 5, 3, 6, 0, 2, 4…
## $ Ex9  <dbl> 4, 6, 5, 5, 5, 2, 3, 3, 6, 3, 3, 4, 4, 3, 2, 5, 5, 4, 4, 0, 4, 6, 4, 1, 3, 2, 6, 2, 3, 5,…

Select the user_id column and all of the columns with questions about openness.

# user_id plus every column whose name begins with "Op".
openness <- glimpse(
  select(reprores::personality, user_id, starts_with("Op"))
)

## Rows: 15,000
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 685, 807, 871, 881, 948, 1023, 1052, 1197, 129…
## $ Op1     <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, 6, 4, 1, 2, 5, 6, 4, 6, 3, NA, 5, 6, 6, 0, 6…
## $ Op2     <dbl> 6, 0, 0, 4, 6, 4, 4, 0, 0, 3, 4, 3, 3, 4, 5, 3, 3, 4, 1, 6, 6, 4, 4, NA, 3, 6, 6, 6, 6…
## $ Op3     <dbl> 2, 6, 5, 5, 5, 4, 3, 2, 4, 3, 3, 6, 5, 5, 6, 5, 4, 4, 3, 6, 5, 6, 5, NA, 6, 6, 1, 3, 6…
## $ Op4     <dbl> 3, 0, 1, 6, 6, 3, 3, 0, 6, 3, 4, 5, 4, 5, 6, 6, 2, 2, 4, 5, 5, 5, 5, NA, 0, 4, 6, 0, 5…
## $ Op5     <dbl> 6, 6, 5, 2, 5, 4, 3, 2, 6, 6, 2, 4, 3, 4, 6, 6, 6, 5, 3, 3, 5, 5, 1, NA, 5, 6, 5, 1, 4…
## $ Op6     <dbl> 0, 6, 5, 1, 6, 4, 6, 0, 0, 3, 5, 3, 5, 5, 5, 2, 5, 1, 1, 6, 2, 4, 5, NA, 6, 6, 6, 6, 5…
## $ Op7     <dbl> 0, 6, 5, 5, 5, 4, 6, 2, 1, 3, 2, 4, 5, 5, 6, 3, 6, 5, 2, 6, 5, 6, 5, NA, 6, 6, 6, 6, 6…

Select the user_id column and all of the columns with the first question for each personality trait.

# user_id plus every column whose name ends in "1".
q1 <- glimpse(
  select(reprores::personality, user_id, ends_with("1"))
)

## Rows: 15,000
## Columns: 6
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 685, 807, 871, 881, 948, 1023, 1052, 1197, 129…
## $ Op1     <dbl> 3, 6, 6, 6, 6, 3, 3, 6, 6, 3, 4, 5, 5, 5, 6, 4, 1, 2, 5, 6, 4, 6, 3, NA, 5, 6, 6, 0, 6…
## $ Ne1     <dbl> 4, 0, 0, 4, 1, 2, 3, 4, 0, 3, 3, 3, 2, 1, 1, 3, 4, 5, 2, 4, 5, 1, 4, NA, 2, 3, 3, 3, 2…
## $ Ex1     <dbl> 3, 0, 0, 2, 2, 4, 4, 3, 5, 4, 1, 1, 3, 3, 1, 3, 5, 1, 0, 4, 1, 5, 5, NA, 3, 3, 6, 1, 0…
## $ Co1     <dbl> 3, 0, 0, 3, 5, 4, 3, 4, 5, 3, 3, 3, 1, 5, 5, 4, 4, 5, 6, 4, 2, 5, 4, 6, 4, 2, 3, 6, 6,…
## $ Ag1     <dbl> 2, 0, 0, 4, 6, 5, 5, 4, 2, 5, 4, 3, 2, 4, 5, 3, 5, 5, 5, 4, 4, 5, 4, NA, 6, 2, 5, 3, 5…

Window functions

The code below sets up a fake dataset where 10 subjects respond to 20 trials with a dv on a 5-point Likert scale.

# Fix the random seed so the sampled responses are reproducible.
set.seed(10)

# 10 subjects x 20 trials, one row per trial; dv is drawn uniformly from
# 1-5 with replacement.
fake_data <- tibble(
  subj_id = rep(seq_len(10), each = 20),
  trial   = rep(seq_len(20), times = 10),
  dv      = sample.int(n = 5, size = 10 * 20, replace = TRUE)
)

Question 14

You want to know how many times each subject responded with the same dv as their last trial. For example, if someone responded 2,3,3,3,4 for five trials they would have repeated their previous response on the third and fourth trials. Use an offset function to determine how many times each subject repeated a response.

# For each subject, compare every dv with the one on the row before it
# (lag) and count the matches. Each subject's first row has no predecessor,
# so its comparison is NA and na.rm = TRUE leaves it out of the sum.
repeated_data <- fake_data |>
  group_by(subj_id) |>
  summarise(
    repeats = sum(dv == lag(dv), na.rm = TRUE),
    .groups = "drop"
  ) |>
  print()

## # A tibble: 10 × 2
##    subj_id repeats
##      <int>   <int>
##  1       1       4
##  2       2       3
##  3       3       6
##  4       4       4
##  5       5       5
##  6       6       2
##  7       7       3
##  8       8       4
##  9       9       5
## 10      10       4

Question 15

Create a table too_many_repeats with the subjects who have the highest-ranked and second-highest-ranked unique repeats values from repeated_data using ranking functions. For example, if 3 people are tied for the highest value and 2 people are tied for the next-highest value, the table would return 5 people. (Hint: check the differences among rank(), min_rank() and dense_rank())

# dense_rank() gives tied values the same rank and leaves no gaps, so the
# two largest distinct repeats values are the rows ranked max or max - 1.
too_many_repeats <- repeated_data |>
  mutate(rank = dense_rank(repeats)) |>
  filter(rank >= max(rank) - 1) |>
  print()

## # A tibble: 3 × 3
##   subj_id repeats  rank
##     <int>   <int> <int>
## 1       3       6     5
## 2       5       5     4
## 3       9       5     4

Advanced Questions

There are several ways to complete the following two tasks. Different people will solve them different ways, but you should be able to tell if your answers make sense.

Question 16

Load the dataset reprores::family_composition from last week’s exercise.

Calculate how many siblings of each sex each person has, narrow the dataset down to people with fewer than 6 siblings, and generate at least two different ways to graph this.

# Total number of brothers and sisters per person, restricted to people
# with fewer than 6 siblings overall.
sib6 <- reprores::family_composition |>
  # Long format: one row per person and sibling column. names_pattern
  # splits each column name into everything before the last three
  # characters (sibage) and the last three characters (sibsex), e.g.
  # "oldbro" -> "old" + "bro".
  pivot_longer(
    oldbro:twinsis,
    names_to = c("sibage", "sibsex"),
    names_pattern = "^(.*)(.{3})$",
    values_to = "n"
  ) |>
  # add up the sibling counts within each person and sibling sex
  group_by(user_id, sex, sibsex) |>
  summarise(n = sum(n), .groups = "drop") |>
  # sum(n) is evaluated per user_id, i.e. brothers + sisters for one person
  group_by(user_id) |>
  filter(sex %in% c("male", "female"), sum(n) < 6) |>
  # drop the per-person grouping so later steps get an ungrouped table
  ungroup()

# Wide format: one row per person, with one column per sibsex value
# holding that person's count (pivot_wider() replaces the superseded
# spread()).
sib6_wide <- sib6 |>
  pivot_wider(names_from = sibsex, values_from = n)

# Side-by-side histograms of sibling counts, one fill colour per sibsex.
ggplot(sib6, aes(x = n, fill = sibsex)) +
  geom_histogram(
    binwidth = 1,
    colour = "black",
    position = "dodge"
  ) +
  scale_fill_discrete(name = "", labels = c("Brothers", "Sisters")) +
  xlab("Number of Siblings") +
  ylab("Number of Participants")

# Brothers against sisters; geom_count() sizes each point by the number of
# observations at that combination.
ggplot(sib6_wide, aes(x = bro, y = sis)) +
  geom_count() +
  xlab("Number of brothers") +
  ylab("Number of sisters")

# Heat map of brothers against sisters in 1 x 1 bins, with each bin's count
# printed on top. after_stat(count) refers to the count computed by the
# stat and replaces the deprecated ..count.. notation.
ggplot(sib6_wide, aes(bro, sis)) +
  geom_bin2d(binwidth = c(1, 1), show.legend = FALSE) +
  stat_bin2d(geom = "text", aes(label = after_stat(count)),
             binwidth = c(1, 1), color = "white") +
  labs(x = "Number of brothers",
       y = "Number of sisters")

Question 17

Use the dataset reprores::eye_descriptions from last week’s exercise.

Create a list of the 10 most common descriptions from the eyes dataset. Remove useless descriptions and merge redundant descriptions.

# The 10 most frequent eye descriptions across all faces.
eyes <- reprores::eye_descriptions |>
  # long format: one row per rater and face; values are coerced to character
  # so columns of differing types can still be stacked, as gather() allowed
  pivot_longer(
    t1:t50,
    names_to = "face_id",
    values_to = "description",
    values_transform = list(description = as.character)
  ) |>
  # split on runs of "," ";" or "/" into at most four parts; anything past
  # the third separator stays in d4, and missing parts are filled with NA
  separate(description, c("d1", "d2", "d3", "d4"), sep = "(,|;|\\/)+", extra = "merge", fill = "right") |>
  # one row per individual description, dropping the empty (NA) parts
  pivot_longer(
    d1:d4,
    names_to = "desc_n",
    values_to = "description",
    values_drop_na = TRUE
  ) |>
  # normalise: strip surrounding white space and lowercase everything
  mutate(description = tolower(trimws(description))) |>
  # discard empty and 1-character descriptions
  filter(nchar(description) > 1) |>
  # count occurrences of each description, most frequent first
  count(description) |>
  arrange(desc(n)) |>
  # keep the top 10
  head(n = 10) |>
  print()

## # A tibble: 10 × 2
##    description     n
##    <chr>       <int>
##  1 brown         364
##  2 blue          314
##  3 small         276
##  4 pretty        261
##  5 big           240
##  6 round         233
##  7 sad           225
##  8 tired         219
##  9 dark          190
## 10 average       176