Formative Exercise 05: Data Relations

Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.

Mutating joins

Question 1A

Load the following data from the reprores package (or access the linked CSV files online). Each participant is identified by a unique user_id.

data("disgust_scores")
data("personality_scores")
data("users")

# or 

disgust_scores <- read_csv("https://psyteachr.github.io/reprores/data/disgust_scores.csv")
personality_scores <- read_csv("https://psyteachr.github.io/reprores/data/personality_scores.csv")
users <- read_csv("https://psyteachr.github.io/reprores/data/users.csv")

Question 1B

Add users data to the disgust_scores table.

study1 <- left_join(disgust_scores, users, by = "user_id")

glimpse(study1)

## Rows: 20,000
## Columns: 8
## $ id        [3m[38;5;246m<dbl>[39m[23m 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, …
## $ user_id   [3m[38;5;246m<dbl>[39m[23m 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 155429, 155450, 155451, 1…
## $ date      [3m[38;5;246m<date>[39m[23m 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, 2008-07-12, 2008-07-13,…
## $ moral     [3m[38;5;246m<dbl>[39m[23m 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3.285714, 4.428571, 3.71…
## $ pathogen  [3m[38;5;246m<dbl>[39m[23m 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5.285714, 3.857143, 5.42…
## $ sexual    [3m[38;5;246m<dbl>[39m[23m 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.5714286, 3.0000000, 2.42857…
## $ sex       [3m[38;5;246m<chr>[39m[23m "female", "female", "male", "female", "male", "male", "female", "female", "female", …
## $ birthyear [3m[38;5;246m<dbl>[39m[23m 1976, 1984, 1982, 1968, 1983, 1983, 1987, 1978, 1986, 1970, 1986, 1985, 1966, 1984, …

Question 2

Add the users data to the disgust_scores data, but have the columns from the participant table first.

study2 <- right_join(users, disgust_scores, by = "user_id")

glimpse(study2)

## Rows: 20,000
## Columns: 8
## $ user_id   [3m[38;5;246m<dbl>[39m[23m 0, 1, 2, 2118, 2311, 3630, 4458, 4651, 4976, 5469, 6066, 6093, 6172, 7129, 7646, 837…
## $ sex       [3m[38;5;246m<chr>[39m[23m NA, "female", "male", "female", "male", "male", "female", "female", "male", "female"…
## $ birthyear [3m[38;5;246m<dbl>[39m[23m NA, 1976, 1985, 1985, 1982, 1968, 1933, 1979, 1981, 1988, 1977, 1965, 1987, 1988, 19…
## $ id        [3m[38;5;246m<dbl>[39m[23m 1199, 1, 1599, 13332, 23, 1160, 7980, 552, 37829, 6902, 6158, 4850, 1134, 6362, 2248…
## $ date      [3m[38;5;246m<date>[39m[23m 2008-10-07, 2008-07-10, 2008-10-27, 2012-01-02, 2008-07-15, 2008-10-06, 2011-09-05,…
## $ moral     [3m[38;5;246m<dbl>[39m[23m 5.2857143, 1.4285714, NA, 1.0000000, 4.0000000, NA, 3.4285714, 3.8571429, 3.4285714,…
## $ pathogen  [3m[38;5;246m<dbl>[39m[23m 4.714286, 2.714286, NA, 5.000000, 4.285714, 2.142857, 3.571429, 4.857143, 4.285714, …
## $ sexual    [3m[38;5;246m<dbl>[39m[23m 2.1428571, 1.7142857, NA, 3.0000000, 1.8571429, 1.1428571, 3.0000000, 4.2857143, 0.0…

Question 3A

Create a table with only disgust_scores and personality_scores data from the same user_id collected on the same date.

study3 <- inner_join(disgust_scores, personality_scores, 
                     by = c("user_id", "date"))

glimpse(study3)

## Rows: 555
## Columns: 11
## $ id       [3m[38;5;246m<dbl>[39m[23m 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46, 47, 49, 51, 54, 55, 56,…
## $ user_id  [3m[38;5;246m<dbl>[39m[23m 155324, 155386, 155567, 155571, 155665, 155682, 155712, 155764, 155864, 155880, 15588…
## $ date     [3m[38;5;246m<date>[39m[23m 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, 2008-07-15, 2008-07-15, …
## $ moral    [3m[38;5;246m<dbl>[39m[23m 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4.428571, 2.571429, 5.571…
## $ pathogen [3m[38;5;246m<dbl>[39m[23m 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4.000000, 4.571429, 4.571…
## $ sexual   [3m[38;5;246m<dbl>[39m[23m 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142857, 1.1428571, 4.285714…
## $ Ag       [3m[38;5;246m<dbl>[39m[23m 4.000000, 3.142857, 5.285714, 3.714286, 2.857143, 3.428571, 3.571429, 2.142857, 4.285…
## $ Co       [3m[38;5;246m<dbl>[39m[23m 3.300000, 2.600000, 5.700000, 3.800000, 1.800000, 3.000000, 4.000000, 4.600000, 3.200…
## $ Ex       [3m[38;5;246m<dbl>[39m[23m 4.8888889, 4.0000000, 3.8888889, 4.5555556, 4.6666667, 3.5555556, 2.7777778, 3.666666…
## $ Ne       [3m[38;5;246m<dbl>[39m[23m 2.375000, 0.250000, 1.125000, 2.250000, 3.125000, 1.375000, 3.375000, 3.000000, 4.375…
## $ Op       [3m[38;5;246m<dbl>[39m[23m 4.714286, 5.142857, 3.142857, 2.857143, 4.571429, 4.857143, 5.285714, 3.857143, 5.142…

Question 3B

Join data from the same user_id, regardless of date. Does this give you the same data table as above?

study3_nodate <- inner_join(disgust_scores, personality_scores, 
                            by = c("user_id"))

glimpse(study3_nodate)

## Rows: 677
## Columns: 12
## $ id       [3m[38;5;246m<dbl>[39m[23m 1, 3, 6, 17, 18, 20, 21, 22, 24, 25, 32, 33, 34, 35, 36, 37, 39, 41, 43, 44, 46, 47, …
## $ user_id  [3m[38;5;246m<dbl>[39m[23m 1, 155324, 155386, 155567, 155571, 124756, 155665, 155682, 155712, 155764, 155864, 15…
## $ date.x   [3m[38;5;246m<date>[39m[23m 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-14, 2008-07-15, …
## $ moral    [3m[38;5;246m<dbl>[39m[23m 1.428571, 3.000000, 1.428571, 5.571429, 2.714286, 5.428571, 4.142857, 2.714286, 4.428…
## $ pathogen [3m[38;5;246m<dbl>[39m[23m 2.714286, 2.571429, 3.857143, 4.714286, 6.000000, 5.142857, 4.142857, 3.000000, 4.000…
## $ sexual   [3m[38;5;246m<dbl>[39m[23m 1.7142857, 1.8571429, 3.7142857, 2.5714286, 4.4285714, 2.7142857, 3.4285714, 0.714285…
## $ date.y   [3m[38;5;246m<date>[39m[23m 2006-02-08, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-01-23, 2008-07-15, …
## $ Ag       [3m[38;5;246m<dbl>[39m[23m 2.571429, 4.000000, 3.142857, 5.285714, 3.714286, 4.857143, 2.857143, 3.428571, 3.571…
## $ Co       [3m[38;5;246m<dbl>[39m[23m 3.000000, 3.300000, 2.600000, 5.700000, 3.800000, 3.800000, 1.800000, 3.000000, 4.000…
## $ Ex       [3m[38;5;246m<dbl>[39m[23m 2.6666667, 4.8888889, 4.0000000, 3.8888889, 4.5555556, 2.1111111, 4.6666667, 3.555555…
## $ Ne       [3m[38;5;246m<dbl>[39m[23m 2.250000, 2.375000, 0.250000, 1.125000, 2.250000, 3.375000, 3.125000, 1.375000, 3.375…
## $ Op       [3m[38;5;246m<dbl>[39m[23m 4.285714, 4.714286, 5.142857, 3.142857, 2.857143, 5.285714, 4.571429, 4.857143, 5.285…

Question 4

Create a table of the disgust_scores and personality_scores data containing all of the data from both tables.

study4 <- full_join(disgust_scores, personality_scores, 
                    by = c("user_id", "date"))

glimpse(study4)

## Rows: 34,445
## Columns: 11
## $ id       [3m[38;5;246m<dbl>[39m[23m 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2…
## $ user_id  [3m[38;5;246m<dbl>[39m[23m 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 155429, 155450, 155451, 15…
## $ date     [3m[38;5;246m<date>[39m[23m 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, 2008-07-12, 2008-07-13, …
## $ moral    [3m[38;5;246m<dbl>[39m[23m 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3.285714, 4.428571, 3.714…
## $ pathogen [3m[38;5;246m<dbl>[39m[23m 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5.285714, 3.857143, 5.428…
## $ sexual   [3m[38;5;246m<dbl>[39m[23m 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.5714286, 3.0000000, 2.428571…
## $ Ag       [3m[38;5;246m<dbl>[39m[23m NA, 4.000000, NA, NA, 3.142857, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5.285714, 3.7…
## $ Co       [3m[38;5;246m<dbl>[39m[23m NA, 3.3, NA, NA, 2.6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5.7, 3.8, NA, NA, 1.8, …
## $ Ex       [3m[38;5;246m<dbl>[39m[23m NA, 4.888889, NA, NA, 4.000000, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3.888889, 4.5…
## $ Ne       [3m[38;5;246m<dbl>[39m[23m NA, 2.375, NA, NA, 0.250, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.125, 2.250, NA, N…
## $ Op       [3m[38;5;246m<dbl>[39m[23m NA, 4.714286, NA, NA, 5.142857, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3.142857, 2.8…

Filtering joins

Question 5

Create a table of just the data from the disgust_scores table for users who completed the personality_scores questionnaire that same day.

study5 <- semi_join(disgust_scores, personality_scores, 
                    by = c("user_id", "date"))

glimpse(study5)

## Rows: 555
## Columns: 6
## $ id       [3m[38;5;246m<dbl>[39m[23m 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46, 47, 49, 51, 54, 55, 56,…
## $ user_id  [3m[38;5;246m<dbl>[39m[23m 155324, 155386, 155567, 155571, 155665, 155682, 155712, 155764, 155864, 155880, 15588…
## $ date     [3m[38;5;246m<date>[39m[23m 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, 2008-07-15, 2008-07-15, …
## $ moral    [3m[38;5;246m<dbl>[39m[23m 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4.428571, 2.571429, 5.571…
## $ pathogen [3m[38;5;246m<dbl>[39m[23m 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4.000000, 4.571429, 4.571…
## $ sexual   [3m[38;5;246m<dbl>[39m[23m 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142857, 1.1428571, 4.285714…

Question 6

Create a table of data from users who did not complete either the personality_scores questionnaire or the disgust_scores questionnaire. (Hint: this will require two steps.)

no_personality <- anti_join(users, personality_scores, by = "user_id")
study6 <- anti_join(no_personality, disgust_scores, by = "user_id")

glimpse(study6)

## Rows: 17,728
## Columns: 3
## $ user_id   [3m[38;5;246m<dbl>[39m[23m 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32, 33, 34, 35, 37, 47, 51, 67, 78, 8…
## $ sex       [3m[38;5;246m<chr>[39m[23m "male", "female", "female", "female", "male", "male", "male", "female", "female", "f…
## $ birthyear [3m[38;5;246m<dbl>[39m[23m 1972, 1978, 1981, 1980, 1964, 1945, 1973, 1985, 1982, 1965, 1985, 1974, 1984, 1954, …

Binding and sets

Question 7

Load new user data from users2. Bind this table and the original users table into a single table called users_all.

data("users2")
users_all <- bind_rows(users, users2)

glimpse(users_all)

## Rows: 112,043
## Columns: 3
## $ user_id   [3m[38;5;246m<dbl>[39m[23m 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32, 33, 34, 35, 37, 47…
## $ sex       [3m[38;5;246m<chr>[39m[23m NA, "female", "male", "male", "male", "male", "female", "female", "female", "male", …
## $ birthyear [3m[38;5;246m<dbl>[39m[23m NA, 1976, 1985, 1980, 1968, 1972, 1978, 1981, 1980, 1964, 1945, 1973, 1985, 1982, 19…

Question 8

How many users are in both the first and second user table? Use code to get this number; don’t read the row number from the environment and type it in. (Hint: What does nrow(mtcars) return?)

b_table <- dplyr::intersect(users, users2)
both_n <- nrow(b_table)

print(both_n)

## [1] 11602

Question 9

How many unique users are there in total across the first and second user tables?

uu_table <- dplyr::union(users, users2)
unique_users <- nrow(uu_table)

print(unique_users)

## [1] 100441

Question 10

How many users are in the first, but not the second, user table?

fu_table <- dplyr::setdiff(users, users2)
first_users <- nrow(fu_table)

print(first_users)

## [1] 40441

Question 11

How many users are in the second, but not the first, user table?

su_table <- dplyr::setdiff(users2, users)
second_users <- nrow(su_table)

print(second_users)

## [1] 48398