Edit the code chunks below and knit the document. You can pipe your objects to glimpse()
or print()
to display them.
Load the following data from the reprores package (or access the linked CSV files online). Each participant is identified by a unique user_id
.
data("disgust_scores")
data("personality_scores")
data("users")
# or
disgust_scores <- read_csv("https://psyteachr.github.io/reprores/data/disgust_scores.csv")
personality_scores <- read_csv("https://psyteachr.github.io/reprores/data/personality_scores.csv")
users <- read_csv("https://psyteachr.github.io/reprores/data/users.csv")
Add users
data to the disgust_scores
table.
study1 <- left_join(disgust_scores, users, by = "user_id")
glimpse(study1)
## Rows: 20,000
## Columns: 8
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,…
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 1554…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, 20…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3.2…
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5.2…
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.571428…
## $ sex <chr> "female", "female", "male", "female", "male", "male", "female",…
## $ birthyear <dbl> 1976, 1984, 1982, 1968, 1983, 1983, 1987, 1978, 1986, 1970, 198…
Add the users
data to the disgust_scores
data, but have the columns from the participant table first.
study2 <- right_join(users, disgust_scores, by = "user_id")
glimpse(study2)
## Rows: 20,000
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 2118, 2311, 3630, 4458, 4651, 4976, 5469, 6066, 6093, …
## $ sex <chr> NA, "female", "male", "female", "male", "male", "female", "fema…
## $ birthyear <dbl> NA, 1976, 1985, 1985, 1982, 1968, 1933, 1979, 1981, 1988, 1977,…
## $ id <dbl> 1199, 1, 1599, 13332, 23, 1160, 7980, 552, 37829, 6902, 6158, 4…
## $ date <date> 2008-10-07, 2008-07-10, 2008-10-27, 2012-01-02, 2008-07-15, 20…
## $ moral <dbl> 5.2857143, 1.4285714, NA, 1.0000000, 4.0000000, NA, 3.4285714, …
## $ pathogen <dbl> 4.714286, 2.714286, NA, 5.000000, 4.285714, 2.142857, 3.571429,…
## $ sexual <dbl> 2.1428571, 1.7142857, NA, 3.0000000, 1.8571429, 1.1428571, 3.00…
Create a table with only disgust_scores
and personality_scores
data from the same user_id
collected on the same date
.
study3 <- inner_join(disgust_scores, personality_scores,
by = c("user_id", "date"))
glimpse(study3)
## Rows: 555
## Columns: 11
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46, 47…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 155682, 155712, 155764, …
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, 200…
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4.42…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4.00…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142857…
## $ Ag <dbl> 4.000000, 3.142857, 5.285714, 3.714286, 2.857143, 3.428571, 3.57…
## $ Co <dbl> 3.300000, 2.600000, 5.700000, 3.800000, 1.800000, 3.000000, 4.00…
## $ Ex <dbl> 4.8888889, 4.0000000, 3.8888889, 4.5555556, 4.6666667, 3.5555556…
## $ Ne <dbl> 2.375000, 0.250000, 1.125000, 2.250000, 3.125000, 1.375000, 3.37…
## $ Op <dbl> 4.714286, 5.142857, 3.142857, 2.857143, 4.571429, 4.857143, 5.28…
Join data from the same user_id
, regardless of date
. Does this give you the same data table as above?
study3_nodate <- inner_join(disgust_scores, personality_scores,
by = c("user_id"))
glimpse(study3_nodate)
## Rows: 677
## Columns: 12
## $ id <dbl> 1, 3, 6, 17, 18, 20, 21, 22, 24, 25, 32, 33, 34, 35, 36, 37, 39,…
## $ user_id <dbl> 1, 155324, 155386, 155567, 155571, 124756, 155665, 155682, 15571…
## $ date.x <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 200…
## $ moral <dbl> 1.428571, 3.000000, 1.428571, 5.571429, 2.714286, 5.428571, 4.14…
## $ pathogen <dbl> 2.714286, 2.571429, 3.857143, 4.714286, 6.000000, 5.142857, 4.14…
## $ sexual <dbl> 1.7142857, 1.8571429, 3.7142857, 2.5714286, 4.4285714, 2.7142857…
## $ date.y <date> 2006-02-08, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 200…
## $ Ag <dbl> 2.571429, 4.000000, 3.142857, 5.285714, 3.714286, 4.857143, 2.85…
## $ Co <dbl> 3.000000, 3.300000, 2.600000, 5.700000, 3.800000, 3.800000, 1.80…
## $ Ex <dbl> 2.6666667, 4.8888889, 4.0000000, 3.8888889, 4.5555556, 2.1111111…
## $ Ne <dbl> 2.250000, 2.375000, 0.250000, 1.125000, 2.250000, 3.375000, 3.12…
## $ Op <dbl> 4.285714, 4.714286, 5.142857, 3.142857, 2.857143, 5.285714, 4.57…
Create a table of the disgust_scores
and personality_scores
data containing all of the data from both tables.
study4 <- full_join(disgust_scores, personality_scores,
by = c("user_id", "date"))
glimpse(study4)
## Rows: 34,445
## Columns: 11
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, …
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 15542…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, 200…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3.28…
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5.28…
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.5714286…
## $ Ag <dbl> NA, 4.000000, NA, NA, 3.142857, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Co <dbl> NA, 3.3, NA, NA, 2.6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5.…
## $ Ex <dbl> NA, 4.888889, NA, NA, 4.000000, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Ne <dbl> NA, 2.375, NA, NA, 0.250, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Op <dbl> NA, 4.714286, NA, NA, 5.142857, NA, NA, NA, NA, NA, NA, NA, NA, …
Create a table of just the data from the disgust_scores
table for users who completed the personality_scores
questionnaire that same day.
study5 <- semi_join(disgust_scores, personality_scores,
by = c("user_id", "date"))
glimpse(study5)
## Rows: 555
## Columns: 6
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46, 47…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 155682, 155712, 155764, …
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, 200…
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4.42…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4.00…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142857…
Create a table of data from users
who did not complete either the personality_scores
questionnaire or the disgust_scores
questionnaire. (Hint: this will require two steps.)
no_personality <- anti_join(users, personality_scores, by = "user_id")
study6 <- anti_join(no_personality, disgust_scores, by = "user_id")
glimpse(study6)
## Rows: 17,728
## Columns: 3
## $ user_id <dbl> 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32, 33, 34, 35, …
## $ sex <chr> "male", "female", "female", "female", "male", "male", "male", "…
## $ birthyear <dbl> 1972, 1978, 1981, 1980, 1964, 1945, 1973, 1985, 1982, 1965, 198…
Load new user data from users2. Bind this table and the original users
table into a single table called users_all
.
data("users2")
users_all <- bind_rows(users, users2)
glimpse(users_all)
## Rows: 112,043
## Columns: 3
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 3…
## $ sex <chr> NA, "female", "male", "male", "male", "male", "female", "female…
## $ birthyear <dbl> NA, 1976, 1985, 1980, 1968, 1972, 1978, 1981, 1980, 1964, 1945,…
How many users are in both the first and second user table? Use code to get this number; don’t read the row number from the environment and type it in. (Hint: What does nrow(mtcars)
return?)
b_table <- dplyr::intersect(users, users2)
both_n <- nrow(b_table)
print(both_n)
## [1] 11602
How many unique users are there in total across the first and second user tables?
uu_table <- dplyr::union(users, users2)
unique_users <- nrow(uu_table)
print(unique_users)
## [1] 100441
How many users are in the first, but not the second, user table?
fu_table <- dplyr::setdiff(users, users2)
first_users <- nrow(fu_table)
print(first_users)
## [1] 40441
How many users are in the second, but not the first, user table?
su_table <- dplyr::setdiff(users2, users)
second_users <- nrow(su_table)
print(second_users)
## [1] 48398