Formative Exercise 06: Tidy Data

Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.

Tidy data

The following data table is not tidy. Use tibble() or tribble() to manually create the tidy version of this table.

# do not edit this chunk
# Untidy example: `stats` packs the degrees of freedom and the t-value into a
# single string, and `conf.int` packs both interval bounds into a single string.
untidy <- tribble(
  ~id, ~stats, ~p.value, ~conf.int,
  "A", "t(26) = -0.424", 0.6749,  "[-0.444, 0.292]",
  "B", "t(19) =  0.754", 0.4600,  "[-0.287, 0.610]",
  "C", "t(19) =  4.289", 0.0004,  "[ 0.374, 1.088]"
) %>% print()

## # A tibble: 3 x 4
##   id    stats          p.value conf.int       
##   <chr> <chr>            <dbl> <chr>          
## 1 A     t(26) = -0.424  0.675  [-0.444, 0.292]
## 2 B     t(19) =  0.754  0.46   [-0.287, 0.610]
## 3 C     t(19) =  4.289  0.0004 [ 0.374, 1.088]

# your version can have different column names in a different order
# Tidy version built column by column: every cell holds exactly one value,
# so df / t-value and the two confidence bounds each get their own column.
tidy <- tibble(
  id            = c("A", "B", "C"),
  df            = c(26, 19, 19),
  t.value       = c(-0.424, 0.754, 4.289),
  p.value       = c(0.6749, 0.4600, 0.0004),
  conf.int.low  = c(-0.444, -0.287, 0.374),
  conf.int.high = c(0.292, 0.610, 1.088)
)
print(tidy)

## # A tibble: 3 x 6
##   id       df t.value p.value conf.int.low conf.int.high
##   <chr> <dbl>   <dbl>   <dbl>        <dbl>         <dbl>
## 1 A        26  -0.424  0.675        -0.444         0.292
## 2 B        19   0.754  0.46         -0.287         0.61 
## 3 C        19   4.29   0.0004        0.374         1.09

Pivot functions

The questions in this section all have errors. Fix the errors.

Load data

Load the dataset reprores::sensation_seeking as ss.

# has an error
# read_csv() needs a file path, raw vector or connection, but
# reprores::sensation_seeking is an object from the package, not a file name
ss <- read_csv(reprores::sensation_seeking)

## Error: `file` must be a string, raw vector or a connection.

# corrects the error
# the dataset is an object exported by the reprores package: assign it directly
ss <- reprores::sensation_seeking

## alternatively
# read_csv() works when it is given a string, here the URL of the CSV file

ss <- read_csv("https://psyteachr.github.io/reprores/data/sensation_seeking.csv")

pivot_longer

Convert from wide to long format.

# has an error
# the `cols` argument is missing: pivot_longer() is never told which columns
# to move into the question/score pair
ss_long <- ss %>%
  pivot_longer(names_to = "question", 
               values_to = "score") %>%
  glimpse()

## Error: `cols` must select at least one column.

# corrects the error
# `cols` selects the 14 item columns; their names go to `question` and their
# values to `score`, while id, user_id and date are repeated on every row.
ss_long <- pivot_longer(
  data = ss,
  cols = sss1:sss14,
  names_to = "question",
  values_to = "score"
)
glimpse(ss_long)

## Rows: 378,294
## Columns: 5
## $ id       <dbl> 3144, 3144, 3144, 3144, 3144, 3144, 3144, 3144, 3144, 3144, 3144…
## $ user_id  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1…
## $ date     <date> 2006-05-07, 2006-05-07, 2006-05-07, 2006-05-07, 2006-05-07, 200…
## $ question <chr> "sss1", "sss2", "sss3", "sss4", "sss5", "sss6", "sss7", "sss8", …
## $ score    <dbl> 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1…

pivot_wider

Convert back to wide format. Make sure ss_wide is the same as ss.

# has an error
# `question` and `score` are passed positionally, so they are not matched to
# names_from / values_from (the corrected chunk names both arguments)
ss_wide <- ss_long %>%
  pivot_wider(question, score) %>%
  glimpse()

## Rows: 14
## Columns: 1
## $ question <chr> "sss1", "sss2", "sss3", "sss4", "sss5", "sss6", "sss7", "sss8", …

# corrects the error
# Name both arguments explicitly: the values of `question` become the new
# column names and the values of `score` fill those columns.
ss_wide <- ss_long %>%
  pivot_wider(values_from = score, names_from = question)
glimpse(ss_wide)

## Rows: 27,021
## Columns: 17
## $ id      <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, …
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32,…
## $ date    <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-02-22, 2004-09-29, 2004…
## $ sss1    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,…
## $ sss2    <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,…
## $ sss3    <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,…
## $ sss4    <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ sss5    <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,…
## $ sss6    <dbl> 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,…
## $ sss7    <dbl> 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,…
## $ sss8    <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,…
## $ sss9    <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,…
## $ sss10   <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,…
## $ sss11   <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,…
## $ sss12   <dbl> 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,…
## $ sss13   <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,…
## $ sss14   <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…

Tidy verbs

The questions in this section all have errors. Fix the errors.

gather

Use the gather() function to convert ss from wide to long.

# has an error
# no columns are specified, so gather() stacks every column (id, user_id and
# date included) into question/score instead of only the sss items
ss_long <- gather(ss, "question", "score") %>%
  glimpse()

## Warning: attributes are not identical across measure variables;
## they will be dropped

## Rows: 459,357
## Columns: 2
## $ question <chr> "id", "id", "id", "id", "id", "id", "id", "id", "id", "id", "id"…
## $ score    <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14,…

# corrects the error
# Restrict the gathering to the item columns sss1..sss14; the remaining
# columns (id, user_id, date) stay as identifiers on each row.
ss_long <- ss %>%
  gather(key = "question", value = "score", sss1:sss14) %>%
  glimpse()

## Rows: 378,294
## Columns: 5
## $ id       <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14,…
## $ user_id  <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32…
## $ date     <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-02-22, 2004-09-29, 200…
## $ question <chr> "sss1", "sss1", "sss1", "sss1", "sss1", "sss1", "sss1", "sss1", …
## $ score    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1…

separate

Split the question column from ss_long into two columns: domain and qnumber.

# has an error
# the new column names must be given as one character vector; the bare name
# `domain` is looked up as an object, which does not exist
ss_sep <- ss_long %>%
  separate(question, domain, qnumber, sep = 3) %>%
  glimpse()

## Error in str_separate(value, into = into, sep = sep, convert = convert, : object 'domain' not found

# corrects the error
# `into` takes the new column names as a character vector; sep = 3 cuts each
# question label after its third character ("sss" | item number).
ss_sep <- separate(
  data = ss_long,
  col = question,
  into = c("domain", "qnumber"),
  sep = 3
)
glimpse(ss_sep)

## Rows: 378,294
## Columns: 6
## $ id      <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, …
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32,…
## $ date    <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-02-22, 2004-09-29, 2004…
## $ domain  <chr> "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "s…
## $ qnumber <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", …
## $ score   <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,…

unite

Put the id and user_id columns together into a new column named super_id. Make it in a format like “id-user_id”.

# has an error
# the arguments are in the wrong order: the new column name has to come first,
# so here "super_id" is treated as an existing column to combine
ss_unite <- ss_sep %>%
  unite(id, user_id, "super_id", sep = "-") %>%
  glimpse()

## Error: Can't subset columns that don't exist.
## x Column `super_id` doesn't exist.

# corrects the error
# The new column name comes first, followed by the columns to paste together;
# id and user_id are joined with "-" and replaced by super_id.
ss_unite <- unite(ss_sep, col = "super_id", id, user_id, sep = "-")
glimpse(ss_unite)

## Rows: 378,294
## Columns: 5
## $ super_id <chr> "3144-0", "133-1", "175-2", "285-5", "1-8", "3-9", "4-10", "5-17…
## $ date     <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-02-22, 2004-09-29, 200…
## $ domain   <chr> "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "…
## $ qnumber  <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ score    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1…

spread

Convert back to wide format. (N.B. the new question column headers will just be numbers, not “sss#”)

# has an error
# the function name is misspelled: it is spread(), not spreadr()
# (the corrected chunk also drops the stray trailing comma)
ss_wide <- ss_unite %>%
  spreadr(qnumber, score, ) %>%
  glimpse()

## Error in spreadr(., qnumber, score, ): could not find function "spreadr"

# corrects the error
# Each distinct qnumber becomes a column filled with the matching score.
# The new headers are the bare item numbers, ordered as text (1, 10, 11, ... 9).
ss_wide <- ss_unite %>%
  spread(key = qnumber, value = score)
glimpse(ss_wide)

## Rows: 27,021
## Columns: 17
## $ super_id <chr> "1-8", "10-23", "100-426", "10000-64553", "10001-64554", "10002-…
## $ date     <date> 2004-09-29, 2004-10-08, 2004-11-25, 2007-01-25, 2007-01-25, 200…
## $ domain   <chr> "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "sss", "…
## $ `1`      <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0…
## $ `10`     <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0…
## $ `11`     <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0…
## $ `12`     <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0…
## $ `13`     <dbl> 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0…
## $ `14`     <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `2`      <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0…
## $ `3`      <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0…
## $ `4`      <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0…
## $ `5`      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ `6`      <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1…
## $ `7`      <dbl> 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1…
## $ `8`      <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ `9`      <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1…

Pipes

Connect with pipes

Re-write the following sequence of commands into a single ‘pipeline’.

# do not edit this chunk
# step-by-step version: every intermediate result is stored under its own name
x <- 1:20      # integers from 1 to 20
y <- rep(x, 2) # then repeat the whole sequence twice
z <- sum(y)    # and then take the sum
z

## [1] 420

# Single-pipeline equivalent of the x / y / z sequence: start from 1:20,
# repeat it twice, then sum. The result is stored in z (not x), because the
# piped value is the final sum that the step-by-step code calls z; assigning it
# to x would overwrite the input vector with the total.
# print() displays the value and returns it, so the assignment still receives it.
z <- 1:20 %>%
  rep(2) %>%
  sum() %>%
  print()

## [1] 420

Disconnect pipes

Deconstruct the pipeline below back into separate commands.

# do not edit this chunk
# takes letters 18, 5, 7, 1 and 12 of the alphabet (R, E, G, A, L), reverses
# their order, and collapses them into a single string
lager <- LETTERS[c(18, 5, 7, 1, 12)] %>%
  rev() %>%
  paste(collapse = "") %>%
  print()

## [1] "LAGER"

# Unpiped version: each stage of the pipeline is stored in its own object.
positions <- c(18, 5, 7, 1, 12)  # alphabet positions of R, E, G, A, L
regal <- LETTERS[positions]      # stage 1: pick the letters
reversed <- rev(regal)           # stage 2: flip their order
# stage 3: collapse the letters into one string
lager <- paste(reversed, collapse = "")
print(lager)

## [1] "LAGER"

Pivot vs tidy verbs

Load the dataset reprores::family_composition.

The columns oldbro through twinsis give the number of siblings of that age and sex. Put this into long format and create separate columns for sibling age (sibage = old, young, twin) and sex (sibsex = bro, sis).

Use pivot functions

# Long format in one step: each sibling-count column name is split into two
# new columns. names_sep = -3 cuts three characters from the end, separating
# the age part (old / young / twin) from the sex part (bro / sis).
family_pivot <- pivot_longer(
  data = reprores::family_composition,
  cols = oldbro:twinsis,
  names_to = c("sibage", "sibsex"),
  names_sep = -3,
  values_to = "n"
)
glimpse(family_pivot)

## Rows: 115,014
## Columns: 8
## $ user_id <dbl> 8, 8, 8, 8, 8, 8, 67, 67, 67, 67, 67, 67, 98, 98, 98, 98, 98, 98,…
## $ sex     <chr> "male", "male", "male", "male", "male", "male", "female", "female…
## $ age     <dbl> 38.1, 38.1, 38.1, 38.1, 38.1, 38.1, 19.7, 19.7, 19.7, 19.7, 19.7,…
## $ momage  <dbl> 25, 25, 25, 25, 25, 25, 29, 29, 29, 29, 29, 29, NA, NA, NA, NA, N…
## $ dadage  <dbl> 27, 27, 27, 27, 27, 27, 31, 31, 31, 31, 31, 31, NA, NA, NA, NA, N…
## $ sibage  <chr> "old", "old", "young", "young", "twin", "twin", "old", "old", "yo…
## $ sibsex  <chr> "bro", "sis", "bro", "sis", "bro", "sis", "bro", "sis", "bro", "s…
## $ n       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 0,…

Use tidy verbs

# Same reshaping with the older verbs, in two explicit steps.
# Step 1: stack the sibling-count columns into sibtype / n.
family_gathered <- gather(
  reprores::family_composition,
  key = "sibtype",
  value = "n",
  oldbro:twinsis
)
# Step 2: split sibtype three characters from the end into age and sex parts.
family_tidy <- separate(
  family_gathered,
  col = sibtype,
  into = c("sibage", "sibsex"),
  sep = -3
)
glimpse(family_tidy)

## Rows: 115,014
## Columns: 8
## $ user_id <dbl> 8, 67, 98, 103, 164, 233, 235, 253, 256, 271, 298, 332, 426, 429,…
## $ sex     <chr> "male", "female", "female", "female", "female", "female", "male",…
## $ age     <dbl> 38.1, 19.7, 19.4, 20.6, 20.3, 19.3, 18.7, 19.5, 19.7, 24.5, 17.7,…
## $ momage  <dbl> 25, 29, NA, NA, 24, NA, NA, 24, NA, 21, 28, NA, NA, NA, NA, NA, N…
## $ dadage  <dbl> 27, 31, NA, NA, NA, NA, NA, 25, NA, 22, NA, NA, NA, NA, NA, NA, N…
## $ sibage  <chr> "old", "old", "old", "old", "old", "old", "old", "old", "old", "o…
## $ sibsex  <chr> "bro", "bro", "bro", "bro", "bro", "bro", "bro", "bro", "bro", "b…
## $ n       <dbl> 0, 1, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 0, 0, 0,…

Multiple steps

Tidy the data from reprores::eye_descriptions. This dataset contains descriptions of the eyes of 50 people by 220 raters (user_id). Some raters wrote more than one description per face (maximum 4), separated by commas, semicolons, or slashes.

Create a dataset with separate columns for face_id, description, and description number (desc_n).

Hint: to separate a string by tildes or commas, you would set the sep argument to "(~|,)+".

# Tidy the eye descriptions into one row per rater, face and description.
eyes <- reprores::eye_descriptions %>%
  # 1. stack the face columns t1..t50 into face_id / description
  gather(key = "face_id", value = "description", t1:t50) %>%
  # 2. split each description on runs of commas, semicolons or slashes into
  #    up to four pieces; fill = "right" pads the missing pieces with NA
  separate(
    col = description,
    into = c("d1", "d2", "d3", "d4"),
    sep = "(,|;|\\/)+",
    fill = "right"
  ) %>%
  # 3. stack d1..d4 so that every piece sits on its own row, labelled by desc_n
  gather(key = "desc_n", value = "description", d1:d4) %>%
  # 4. drop rows with no description (this includes the NA padding from step 2)
  filter(!is.na(description)) %>%
  glimpse()

## Rows: 12,304
## Columns: 6
## $ user_id     <dbl> 508844, 508966, 508976, 509196, 509286, 509400, 509503, 50966…
## $ sex         <chr> "male", "female", "female", "female", "female", "male", "fema…
## $ age         <dbl> 19.0, 20.4, 24.8, 14.6, 16.7, NA, 36.2, 20.7, 5.0, 18.2, 17.4…
## $ face_id     <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "…
## $ desc_n      <chr> "d1", "d1", "d1", "d1", "d1", "d1", "d1", "d1", "d1", "d1", "…
## $ description <chr> "empty", "bored", "Dark high on drugs", "soft brown", "brown"…