List the datasets in dplyr
.
# With only `package` supplied, data() lists the datasets bundled with dplyr
data(package = "dplyr")
Load the built-in dataset starwars
and use
glimpse()
to see an overview.
# Load the starwars dataset, then print a compact column-by-column overview
data("starwars")
glimpse(starwars)
## Rows: 87
## Columns: 14
## $ name <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Organa", "Owen Lars", "Ber…
## $ height <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 180, 228, 180, 173, 175, 170, …
## $ mass <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0, 77.0, 84.0, NA, 112.0, 80.0…
## $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brown", NA, "black", "auburn, whi…
## $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "light", "light", "white, red", "l…
## $ eye_color <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "blue", "red", "brown", "blue-g…
## $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 57.0, 41.9, 64.0, 200.0, 29.0,…
## $ sex <chr> "male", "none", "none", "male", "female", "male", "female", "none", "male", "male",…
## $ gender <chr> "masculine", "masculine", "masculine", "masculine", "feminine", "masculine", "femin…
## $ homeworld <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", "Tatooine", "Tatooine", "T…
## $ species <chr> "Human", "Droid", "Droid", "Human", "Human", "Human", "Human", "Droid", "Human", "H…
## $ films <list> <"The Empire Strikes Back", "Revenge of the Sith", "Return of the Jedi", "A New Ho…
## $ vehicles <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <>, "Imperial Speeder Bike", <>,…
## $ starships <list> <"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanced x1", <>, <>, <>, <>, "X-wing…
Convert the built-in base R mtcars
dataset to a tibble
(you will need to find the function for this; it isn’t in the chapter),
and store it in the object mt
.
# as_tibble() converts the base data.frame into a tibble
mt <- tibble::as_tibble(mtcars)
Using the data directory created by reprores::getdata()
(or download the zip
file), read “disgust_scores.csv” into a table.
# Read the CSV using readr's guessed column types
disgust <- read_csv(file = "data/disgust_scores.csv")
Override the default column specifications to skip the
id
column.
# Column specification: drop `id` on import, leave every other column guessed
my_cols <- cols(id = col_skip())
disgust_skip <- read_csv(
  file = "data/disgust_scores.csv",
  col_types = my_cols
)
How many rows and columns are in the disgust
dataset?
# dim() gives c(n_rows, n_cols); pull each count out by position
dimensions <- dim(disgust)
disgust_rows <- dimensions[[1]]
disgust_cols <- dimensions[[2]]
# Equivalent single-purpose helpers: nrow() for rows, ncol() for columns
disgust_rows <- nrow(disgust)
disgust_cols <- ncol(disgust)
Load the data in “data/stroop.csv” as stroop1
and
“data/stroop.xlsx” as stroop2
.
# Same data stored in two formats: CSV via readr, Excel workbook via readxl
stroop1 <- read_csv(file = "data/stroop.csv")
stroop2 <- readxl::read_xlsx(path = "data/stroop.xlsx")
Use glimpse()
to figure out the difference between the
two data tables and fix the problem.
# Compare the two tables: rt is a double in stroop1 but a character in stroop2
glimpse(stroop1)
## Rows: 12,500
## Columns: 5
## $ sub_id <chr> "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "…
## $ word <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue…
## $ ink <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "purp…
## $ response <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "purp…
## $ rt <dbl> 447.9500, 447.9500, 532.8690, 532.8690, 502.0845, 502.0845, 601.9780, 601.9780, 616.4…
glimpse(stroop2)
## Rows: 12,500
## Columns: 5
## $ sub_id <chr> "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "…
## $ word <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue…
## $ ink <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "purp…
## $ response <chr> "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "purp…
## $ rt <chr> "447.94998317180102", "447.94998317180102", "532.86898595419302", "532.86898595419302…
# Missing values are written as the text "NA" in the workbook, so declare that
# string as the NA marker when importing
stroop2b <- readxl::read_xlsx(path = "data/stroop.xlsx", na = "NA")
Create a tibble with the columns name
, age
,
and country
of origin for 2 people you know.
# Column-wise construction: one vector per column via tibble()
people <- tibble(
  name = c("Lisa", "Robbie"),
  age = c(43, 12),
  country = c("US", "UK")
)
# Row-wise construction: tribble() takes ~headers followed by one line per row,
# which reads more like the finished table
people <- tribble(
  ~name, ~age, ~country,
  "Lisa", 43, "US",
  "Robbie", 12, "UK"
)
Create a tibble that has the structure of the table below, using the
minimum typing possible. (Hint: rep()
). Store it in the
variable my_tbl
.
ID | A | B | C |
---|---|---|---|
1 | A1 | B1 | C1 |
2 | A1 | B2 | C1 |
3 | A1 | B1 | C1 |
4 | A1 | B2 | C1 |
5 | A2 | B1 | C1 |
6 | A2 | B2 | C1 |
7 | A2 | B1 | C1 |
8 | A2 | B2 | C1 |
# Build the 8-row table with rep() and recycling instead of typing every cell
my_tbl <- tibble(
  ID = seq_len(8),
  # each = 4 repeats every element four times in a row: A1 x4, then A2 x4
  A = rep(c("A1", "A2"), each = 4),
  # times = 4 repeats the whole pair four times: B1, B2, B1, B2, ...
  B = rep(c("B1", "B2"), times = 4),
  # a length-1 value is recycled down all rows
  C = "C1"
)
Set the following objects to the number 1 with the indicated data type:
one_int
(integer)
one_dbl
(double)
one_chr
(character)
one_int <- 1L
# A plain numeric literal is stored as a double
one_dbl <- 1
# Quoting the digit makes it a character string
one_chr <- "1"
Set the objects T_log
, T_chr
,
T_int
and T_dbl
to logical, character, integer
and double values that will all be equal to TRUE
.
# One value per type, each of which compares equal to TRUE
T_dbl <- 1      # double
T_int <- 1L     # integer (the L suffix)
T_chr <- "TRUE" # character
T_log <- TRUE   # logical
Check your answers with this code:
# Self-check for the T_* objects: every entry of the list should be TRUE
tests <- list(
# value checks: each object must compare equal to TRUE
T_log_is_TRUE = T_log == TRUE,
T_chr_is_TRUE = T_chr == TRUE,
T_int_is_TRUE = T_int == TRUE,
T_dbl_is_TRUE = T_dbl == TRUE,
# type checks: each object must have the requested data type
T_log_is_log = is.logical(T_log),
T_chr_is_chr = is.character(T_chr),
T_int_is_int = is.integer(T_int),
T_dbl_is_dbl = is.double(T_dbl)
)
str(tests) # this shows a condensed version of the list
## List of 8
## $ T_log_is_TRUE: logi TRUE
## $ T_chr_is_TRUE: logi TRUE
## $ T_int_is_TRUE: logi TRUE
## $ T_dbl_is_TRUE: logi TRUE
## $ T_log_is_log : logi TRUE
## $ T_chr_is_chr : logi TRUE
## $ T_int_is_int : logi TRUE
## $ T_dbl_is_dbl : logi TRUE
Create a vector of the numbers 3, 6, and 9.
# 3, 6, 9 expressed as a sequence from 3 to 9 in steps of 3
threes <- seq(from = 3, to = 9, by = 3)
The built-in vector letters
contains the letters of the
English alphabet. Use an indexing vector of integers to extract the
letters that spell ‘cat’.
# Alphabet positions of "c", "a" and "t", written as integer literals
cat <- letters[c(3L, 1L, 20L)]
The function colors()
returns all of the color names
that R is aware of. What is the length of the vector returned by this
function? (Use code to find the answer.)
# Count the colour names R knows about
col_length <- length(grDevices::colors())
Create a named list called col_types
where the name is
each column in the built-in dataset table1
and the value is
the column data type (e.g., “double”, “character”, “integer”,
“logical”).
# Option 1: type the names and types out by hand
col_types <- list(
  country = "character",
  year = "integer",
  cases = "integer",
  population = "integer"
)
# Option 2: look each type up with typeof() and attach the column names
col_types <- setNames(
  list(
    typeof(table1[[1]]),
    typeof(table1[[2]]),
    typeof(table1[[3]]),
    typeof(table1[[4]])
  ),
  names(table1)
)
# Option 3: do it all in one step. lapply() calls typeof() on every column
# of table1 and keeps the column names on the resulting list.
col_types <- lapply(table1, typeof)
Set the object x
to the integers 1 to 100. Use
vectorised operations to set y
to x
squared.
Use plot(x, y)
to visualise the relationship between these
two numbers.
# The exercise asks for the integers 1 to 100 (the range must not start at -100)
x <- 1:100
# `^` is vectorised, so every element of x is squared in a single step
y <- x^2
# Scatterplot of y against x
plot(x, y)
Set t
to the numbers 0 to 100 in increments of 0.1. Set
x
to the sine of t
and y
to the
cosine of t
(you will need to find the functions for sine
and cosine). Plot x
against y
.
# Values from 0 to 100 in steps of 0.1
t <- seq(from = 0, to = 100, by = 0.1)
# sin() and cos() are vectorised over t
x <- sin(t)
y <- cos(t)
# Plot the sine values against the cosine values
plot(x, y)
The function call runif(n, min, max)
will draw
n
numbers from a uniform distribution from min
to max
. If you set n
to 10000,
min
to 0 and max
to 1, this simulates the
p-values that you would get from 10000 experiments where the null
hypothesis is true. Create the following objects:
pvals
: 10000 simulated p-values using
runif()
is_sig
: a logical vector that is TRUE
if
the corresponding element of pvals
is less than .05,
FALSE
otherwise
sig_vals
: a vector of just the significant
p-values
prop_sig
: the proportion of those p-values that were
significant
set.seed(8675309) # ensures you get the same random numbers each time you run this code chunk
# 10000 draws from a uniform distribution between 0 and 1
pvals <- runif(n = 10000, min = 0, max = 1)
# Logical mask: TRUE wherever the simulated p-value is below .05
is_sig <- pvals < 0.05
# Keep only the significant p-values
sig_vals <- pvals[is_sig]
# Proportion significant = number of significant values / total number
prop_sig <- length(sig_vals) / length(pvals)
# Alternatively: the mean of a logical vector is the proportion of TRUEs
prop_sig <- mean(is_sig)
prop_sig <- mean(pvals < 0.05)