Formative Exercise 04: MSc Data Skills Course

Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.

Tidy data

The following data table is not tidy. Use tibble() or tribble() to manually create the tidy version of this table.

# do not edit this chunk
# Untidy example table: `stats` stores the degrees of freedom and the t-value
# together in one string, and `conf.int` stores both interval bounds in one
# string, so several variables share a single cell.
untidy <- tribble(
  ~id, ~stats, ~p.value, ~conf.int,
  "A", "t(26) = -0.424", 0.6749,  "[-0.444, 0.292]",
  "B", "t(19) =  0.754", 0.4600,  "[-0.287, 0.610]",
  "C", "t(19) =  4.289", 0.0004,  "[ 0.374, 1.088]"
) %>% print()

## # A tibble: 3 x 4
##   id    stats          p.value conf.int       
##   <chr> <chr>            <dbl> <chr>          
## 1 A     t(26) = -0.424  0.675  [-0.444, 0.292]
## 2 B     t(19) =  0.754  0.46   [-0.287, 0.610]
## 3 C     t(19) =  4.289  0.0004 [ 0.374, 1.088]

# your version can have different column names in a different order
# Tidy version, built column by column: every variable gets its own numeric
# column and every cell holds exactly one value.
tidy <- tibble(
  id            = c("A", "B", "C"),
  df            = c(26, 19, 19),
  t.value       = c(-0.424, 0.754, 4.289),
  p.value       = c(0.6749, 0.4600, 0.0004),
  conf.int.low  = c(-0.444, -0.287, 0.374),
  conf.int.high = c(0.292, 0.610, 1.088)
)
print(tidy)

## # A tibble: 3 x 6
##   id       df t.value p.value conf.int.low conf.int.high
##   <chr> <dbl>   <dbl>   <dbl>        <dbl>         <dbl>
## 1 A        26  -0.424  0.675        -0.444         0.292
## 2 B        19   0.754  0.46         -0.287         0.61 
## 3 C        19   4.29   0.0004        0.374         1.09

Pivot functions

The questions in this section all have errors. Fix the errors.

Load data

Load the dataset dataskills::sensation_seeking as ss.

# Deliberately broken version: read_csv() is handed the
# dataskills::sensation_seeking object instead of a file path or connection,
# which produces the `file` error shown below.
ss <- read_csv(dataskills::sensation_seeking)

## Error: `file` must be a string, raw vector or a connection.

# Fix: take the dataset straight from the dataskills package; no file reader
# is involved.
ss <- dataskills::sensation_seeking

## alternatively

# read_csv() is given a string (here a URL to the CSV file), which is what
# its `file` argument accepts.
ss <- read_csv("https://psyteachr.github.io/msc-data-skills/data/sensation_seeking.csv")

pivot_longer

Convert from wide to long format.

# Deliberately broken version: no columns are passed to pivot_longer(), so
# there is nothing to reshape (see the `cols` error below).
ss_long <- ss %>%
  pivot_longer(names_to = "question", 
               values_to = "score") %>%
  glimpse()

## Error: `cols` must select at least one column.

# Fix: say which columns to stack. The item columns sss1 through sss14 are
# gathered into a `question` label column and a `score` value column.
ss_long <- pivot_longer(
  data = ss,
  cols = sss1:sss14,
  names_to = "question",
  values_to = "score"
)
glimpse(ss_long)

## Rows: 378,294
## Columns: 5
## $ id       <dbl> 3144, 3144, 3144, 3144, 3144, 3144, 3144,…
## $ user_id  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ date     <date> 2006-05-07, 2006-05-07, 2006-05-07, 2006…
## $ question <chr> "sss1", "sss2", "sss3", "sss4", "sss5", "…
## $ score    <dbl> 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,…

pivot_wider

Convert back to wide format. Make sure ss_wide is the same as ss.

# Deliberately broken version: `question` and `score` are passed by position
# rather than as names_from / values_from, and the result (see output below)
# is not the original wide table.
ss_wide <- ss_long %>%
  pivot_wider(question, score) %>%
  glimpse()

## Rows: 14
## Columns: 1
## $ question <chr> "sss1", "sss2", "sss3", "sss4", "sss5", "…

# Fix: name the arguments so that the `question` labels become the new column
# headers and the `score` values fill those columns.
ss_wide <- pivot_wider(
  data = ss_long,
  names_from = question,
  values_from = score
)
glimpse(ss_wide)

## Rows: 27,021
## Columns: 17
## $ id      <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, …
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, …
## $ date    <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-…
## $ sss1    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, …
## $ sss2    <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, …
## $ sss3    <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ sss4    <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ sss5    <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, …
## $ sss6    <dbl> 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, …
## $ sss7    <dbl> 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ sss8    <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, …
## $ sss9    <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, …
## $ sss10   <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ sss11   <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ sss12   <dbl> 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ sss13   <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ sss14   <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …

Tidy verbs

The questions in this section all have errors. Fix the errors.

gather

Use the gather() function to convert ss from wide to long.

# Deliberately broken version: no columns are selected, so gather() stacks
# every column -- including id -- into question/score (see output below).
ss_long <- gather(ss, "question", "score") %>%
  glimpse()

## Warning: attributes are not identical across measure variables;
## they will be dropped

## Rows: 459,357
## Columns: 2
## $ question <chr> "id", "id", "id", "id", "id", "id", "id",…
## $ score    <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8,…

# Fix: restrict the gather to the item columns sss1 through sss14; the
# remaining columns are kept as they are.
ss_long <- ss %>%
  gather(key = "question", value = "score", sss1:sss14)
glimpse(ss_long)

## Rows: 378,294
## Columns: 5
## $ id       <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8,…
## $ user_id  <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22,…
## $ date     <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005…
## $ question <chr> "sss1", "sss1", "sss1", "sss1", "sss1", "…
## $ score    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,…

separate

Split the question column from ss_long into two columns: domain and qnumber.

# Deliberately broken version: the new column names are given as bare,
# unquoted symbols instead of one character vector, so R looks for an object
# called `domain` and fails (see error below).
ss_sep <- ss_long %>%
  separate(question, domain, qnumber, sep = 3) %>%
  glimpse()

## Error in str_separate(value, into = into, sep = sep, convert = convert, : object 'domain' not found

# Fix: pass the new column names as a character vector. `sep = 3` splits
# `question` after its third character, e.g. "sss1" -> "sss" and "1".
ss_sep <- ss_long %>%
  separate(
    col = question,
    into = c("domain", "qnumber"),
    sep = 3
  )
glimpse(ss_sep)

## Rows: 378,294
## Columns: 6
## $ id      <dbl> 3144, 133, 175, 285, 1, 3, 4, 5, 6, 7, 8, …
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, …
## $ date    <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005-…
## $ domain  <chr> "sss", "sss", "sss", "sss", "sss", "sss", …
## $ qnumber <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1…
## $ score   <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, …

unite

Put the id and user_id columns together into a new column named super_id. Make it in a format like “id-user_id”.

# Deliberately broken version: the name of the new column is given last
# instead of first, so "super_id" is treated as an existing column to combine
# and cannot be found (see error below).
ss_unite <- ss_sep %>%
  unite(id, user_id, "super_id", sep = "-") %>%
  glimpse()

## Error: Can't subset columns that don't exist.
## x Column `super_id` doesn't exist.

# Fix: the new column name comes first, followed by the columns to paste
# together; values are joined with a hyphen, e.g. "3144-0".
ss_unite <- ss_sep %>%
  unite(col = "super_id", id, user_id, sep = "-")
glimpse(ss_unite)

## Rows: 378,294
## Columns: 5
## $ super_id <chr> "3144-0", "133-1", "175-2", "285-5", "1-8…
## $ date     <date> 2006-05-07, 2004-12-08, 2005-01-14, 2005…
## $ domain   <chr> "sss", "sss", "sss", "sss", "sss", "sss",…
## $ qnumber  <chr> "1", "1", "1", "1", "1", "1", "1", "1", "…
## $ score    <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,…

spread

Convert back to wide format. (N.B. the new question column headers will just be numbers, not “sss#”)

# Deliberately broken version: `spreadr` is not an existing function (see
# error below); the call also carries a stray trailing comma.
ss_wide <- ss_unite %>%
  spreadr(qnumber, score, ) %>%
  glimpse()

## Error in spreadr(., qnumber, score, ): could not find function "spreadr"

# Fix: call spread() itself. The `qnumber` values become the new column
# headers and the `score` values fill them.
ss_wide <- ss_unite %>%
  spread(key = qnumber, value = score)
glimpse(ss_wide)

## Rows: 27,021
## Columns: 17
## $ super_id <chr> "1-8", "10-23", "100-426", "10000-64553",…
## $ date     <date> 2004-09-29, 2004-10-08, 2004-11-25, 2007…
## $ domain   <chr> "sss", "sss", "sss", "sss", "sss", "sss",…
## $ `1`      <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,…
## $ `10`     <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ `11`     <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,…
## $ `12`     <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,…
## $ `13`     <dbl> 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,…
## $ `14`     <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,…
## $ `2`      <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ `3`      <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ `4`      <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,…
## $ `5`      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,…
## $ `6`      <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ `7`      <dbl> 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,…
## $ `8`      <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,…
## $ `9`      <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,…

Pipes

Connect with pipes

Re-write the following sequence of commands into a single ‘pipeline’.

# do not edit this chunk
# Step-by-step version: each intermediate result is stored under its own name
# and the final sum ends up in `z`.
x <- 1:20      # integers from 1 to 20
y <- rep(x, 2) # then repeat them twice
z <- sum(y)    # and then take the sum
z

## [1] 420

# Single-pipeline version of the step-by-step chunk: build the integers
# 1 to 20, repeat them twice, sum them and print the total.
# The result is stored in `z`, the name that holds the final sum in the
# step-by-step version; storing it in `x` would label the total with the
# name of the input vector.
z <- 1:20 %>%
  rep(2) %>%
  sum() %>%
  print()

## [1] 420

Deconstruct pipes

Deconstruct the pipeline below back into separate commands.

# do not edit this chunk
# Pipeline version: pick five letters from LETTERS by position, reverse their
# order, collapse them into a single string, then print that string.
lager <- LETTERS[c(18, 5, 7, 1, 12)] %>%
  rev() %>%
  paste(collapse = "") %>%
  print()

## [1] "LAGER"

# The pipeline taken apart: one statement per step, each result named.
regal <- LETTERS[c(18, 5, 7, 1, 12)]       # the letters R, E, G, A, L
reversed <- rev(regal)                     # same letters, back to front
lager <- paste0(reversed, collapse = "")   # collapse into a single string
print(lager)

## [1] "LAGER"

Pivot vs tidy verbs

Load the dataset dataskills::family_composition.

The columns oldbro through twinsis give the number of siblings of that age and sex. Put this into long format and create separate columns for sibling age (sibage = old, young, twin) and sex (sibsex = bro, sis).

Use pivot functions

# Reshape the sibling-count columns (oldbro through twinsis) to long format.
# Each column name is split three characters from its end, giving the sibling
# age ("old", "young", "twin") and sex ("bro", "sis"); counts go into `n`.
family_pivot <- pivot_longer(
  data = dataskills::family_composition,
  cols = oldbro:twinsis,
  names_to = c("sibage", "sibsex"),
  names_sep = -3,
  values_to = "n"
)
glimpse(family_pivot)

## Rows: 115,014
## Columns: 8
## $ user_id <dbl> 8, 8, 8, 8, 8, 8, 67, 67, 67, 67, 67, 67, …
## $ sex     <chr> "male", "male", "male", "male", "male", "m…
## $ age     <dbl> 38.1, 38.1, 38.1, 38.1, 38.1, 38.1, 19.7, …
## $ momage  <dbl> 25, 25, 25, 25, 25, 25, 29, 29, 29, 29, 29…
## $ dadage  <dbl> 27, 27, 27, 27, 27, 27, 31, 31, 31, 31, 31…
## $ sibage  <chr> "old", "old", "young", "young", "twin", "t…
## $ sibsex  <chr> "bro", "sis", "bro", "sis", "bro", "sis", …
## $ n       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, …

Use tidy verbs

# Same reshape with the older verbs: first stack the sibling-count columns
# into sibtype/n, then split sibtype three characters from its end into
# sibling age and sibling sex.
family_tidy <- dataskills::family_composition %>%
  gather(key = "sibtype", value = "n", oldbro:twinsis) %>%
  separate(
    col = sibtype,
    into = c("sibage", "sibsex"),
    sep = -3
  )
glimpse(family_tidy)

## Rows: 115,014
## Columns: 8
## $ user_id <dbl> 8, 67, 98, 103, 164, 233, 235, 253, 256, 2…
## $ sex     <chr> "male", "female", "female", "female", "fem…
## $ age     <dbl> 38.1, 19.7, 19.4, 20.6, 20.3, 19.3, 18.7, …
## $ momage  <dbl> 25, 29, NA, NA, 24, NA, NA, 24, NA, 21, 28…
## $ dadage  <dbl> 27, 31, NA, NA, NA, NA, NA, 25, NA, 22, NA…
## $ sibage  <chr> "old", "old", "old", "old", "old", "old", …
## $ sibsex  <chr> "bro", "bro", "bro", "bro", "bro", "bro", …
## $ n       <dbl> 0, 1, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …

Multiple steps

Tidy the data from dataskills::eye_descriptions. This dataset contains descriptions of the eyes of 50 people by 220 raters (user_id). Some raters wrote more than one description per face (maximum 4), separated by commas, semicolons, or slashes.

Create a dataset with separate columns for face_id, description, and description number (desc_n).

Hint: to separate a string by tildes or commas, you would set the sep argument to "(~|,)+".

# Tidy the eye descriptions in four steps:
#   1. stack the face columns t1..t50 into face_id/description;
#   2. split each description on runs of commas, semicolons or slashes into
#      up to four pieces d1..d4 (missing pieces are filled with NA on the
#      right);
#   3. stack d1..d4 into desc_n/description;
#   4. drop the rows that have no description.
eyes <- dataskills::eye_descriptions %>%
  gather(key = "face_id", value = "description", t1:t50) %>%
  separate(
    col = description,
    into = paste0("d", 1:4),
    sep = "(,|;|\\/)+",
    fill = "right"
  ) %>%
  gather(key = "desc_n", value = "description", d1:d4) %>%
  filter(!is.na(description))
glimpse(eyes)

## Rows: 12,304
## Columns: 6
## $ user_id     <dbl> 508844, 508966, 508976, 509196, 509286…
## $ sex         <chr> "male", "female", "female", "female", …
## $ age         <dbl> 19.0, 20.4, 24.8, 14.6, 16.7, NA, 36.2…
## $ face_id     <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t…
## $ desc_n      <chr> "d1", "d1", "d1", "d1", "d1", "d1", "d…
## $ description <chr> "empty", "bored", "Dark high on drugs"…