Edit the code chunks below and knit the document. You can pipe your objects to glimpse() or print() to display them.
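For example, glimpse() gives a compact, column-wise preview of a data frame (a minimal sketch using a built-in dataset rather than one of the exercise objects):

library(tidyverse)   # if not already loaded; provides glimpse(), the pipe, map(), tibble()
mtcars %>% glimpse() # one line of output per column: name, type, and first few values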
Set the vector v1 equal to the following: 11, 13, 15, 17, 19, …, 99, 101 (use a function; don’t just type all the numbers).
v1 <- seq(11, 101, by = 2) %>% print()
## [1] 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47
## [20] 49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85
## [39] 87 89 91 93 95 97 99 101
Set the vector v2 equal to the following: “A” “A” “B” “B” “C” “C” “D” “D” “E” “E” (note the letters are all uppercase).
v2 <- rep(LETTERS[1:5], each = 2) %>% print()
## [1] "A" "A" "B" "B" "C" "C" "D" "D" "E" "E"
Set the vector v3 equal to the words “dog” 10 times, “cat” 9 times, “fish” 6 times, and “ferret” 1 time.
pets <- c("dog", "cat", "fish", "ferret")
pet_n <- c(10, 9, 6, 1)
v3 <- rep(pets, times = pet_n) %>% print()
## [1] "dog" "dog" "dog" "dog" "dog" "dog" "dog" "dog"
## [9] "dog" "dog" "cat" "cat" "cat" "cat" "cat" "cat"
## [17] "cat" "cat" "cat" "fish" "fish" "fish" "fish" "fish"
## [25] "fish" "ferret"
Use apply() or map() functions to create a list of 11 vectors of 100 numbers sampled from 11 random normal distributions with means of 0 to 1.0 (in steps of 0.1) and SDs of 1. Assign this list to the object samples. Set the seed to 321 before you generate the random numbers to ensure reproducibility.
set.seed(321)
mu <- seq(0, 1, 0.1) # the 11 means: 0, 0.1, ..., 1.0
# map() passes each value of mu to rnorm() as its mean; n is fixed at 100 by name
samples <- map(mu, rnorm, n = 100)
# alternatively
samples <- lapply(mu, rnorm, n = 100)
Use apply() or map() functions to create a vector of the sample means from the list samples in the previous question.
sample_means <- map_dbl(samples, mean)
# alternatively
sample_means <- sapply(samples, mean) %>% print()
## [1] 0.08304484 0.21226580 0.16840128 0.12930317 0.53216771 0.40842952 0.72589166
## [8] 0.63446436 0.81040369 1.02317536 0.91126680
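Each of these means is an average of 100 draws with SD 1, so its standard error is 1/sqrt(100) = 0.1 and the values should track mu fairly closely. An optional sanity check:

round(sample_means - mu, 2) # deviations should mostly fall within about two standard errors (0.2)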
Write a function called my_add that adds two numbers (x and y) together and returns the result.
my_add <- function(x, y) {
  x + y
}
Create a vector testing your function my_add. Every item in the vector should evaluate to TRUE if your function is working correctly.
my_add_test <- c(
  my_add(1, 2) == 3,
  my_add(10, 20) == 30,
  my_add(-1, -3) == -4,
  my_add(0.2, 0.334) == 0.534
) %>% print()
## [1] TRUE TRUE TRUE TRUE
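One caveat about testing with ==: floating-point arithmetic is not always exact, so equality checks on decimal results can fail even when the function is correct. dplyr::near() compares with a small tolerance; a minimal sketch:

0.1 + 0.2 == 0.3                   # FALSE: a classic floating-point surprise
dplyr::near(my_add(0.1, 0.2), 0.3) # TRUE: near() allows a tiny tolerance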
Copy the function my_add above and add an error message that returns “x and y must be numbers” if either x or y is not a number.
my_add <- function(x, y) {
  if (!is.numeric(x) | !is.numeric(y)) stop("x and y must be numbers")
  x + y
}
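An optional way to confirm that the guard fires, without stopping the knit, is to catch the error and inspect its message:

tryCatch(my_add("a", 2), error = function(e) conditionMessage(e)) # returns "x and y must be numbers"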
Create a tibble called dat that contains 20 rows and three columns: id (integers 101 through 120), pre and post (both 20-item vectors of random numbers from a normal distribution with mean = 0 and sd = 1). Set the seed to 90210 to ensure reproducible values.
set.seed(90210)
dat <- tibble(
  id = 101:120,
  pre = rnorm(20),
  post = rnorm(20)
) %>% print()
## # A tibble: 20 x 3
## id pre post
## <int> <dbl> <dbl>
## 1 101 -0.444 0.664
## 2 102 -0.515 -1.78
## 3 103 1.18 -1.21
## 4 104 1.57 -1.40
## 5 105 0.0934 1.83
## 6 106 0.165 -1.43
## 7 107 -0.339 -2.17
## 8 108 -0.780 -0.789
## 9 109 0.386 0.172
## 10 110 -1.83 -1.24
## 11 111 -0.544 0.319
## 12 112 -0.138 -1.63
## 13 113 0.296 0.395
## 14 114 -0.434 -1.85
## 15 115 -0.502 -2.53
## 16 116 0.113 0.279
## 17 117 -0.781 0.824
## 18 118 0.916 -0.622
## 19 119 0.179 0.928
## 20 120 0.0141 0.266
Run a two-tailed, paired-samples t-test comparing pre and post (check the help for t.test).
t <- t.test(dat$post, dat$pre, paired = TRUE) %>% print()
##
## Paired t-test
##
## data: dat$post and dat$pre
## t = -1.5392, df = 19, p-value = 0.1402
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.1299798 0.1722842
## sample estimates:
## mean of the differences
## -0.4788478
Use broom::tidy to save the results of the t-test in question 8 in a table called stats.
stats <- t.test(dat$post, dat$pre, paired = TRUE) %>%
  broom::tidy() %>%
  print()
## # A tibble: 1 x 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 -0.479 -1.54 0.140 19 -1.13 0.172 Paired t-test two.sided
Create a function called report_t that takes a data table as an argument and returns the result of a two-tailed, paired-samples t-test between the columns pre and post in the following format:
“The mean increase from pre-test to post-test was #.###: t(#) = #.###, p = 0.###, 95% CI = [#.###, #.###].”
Hint: look at the function paste0() (simpler) or sprintf() (more complicated but more powerful).
NB: Make sure all numbers are reported to three decimal places (except degrees of freedom).
report_t <- function(data) {
  stats <- t.test(data$post, data$pre, paired = TRUE) %>%
    broom::tidy()
  diff <- pull(stats, estimate) %>% round(3)
  t <- pull(stats, statistic) %>% round(3)
  p <- pull(stats, p.value) %>% round(3)
  df <- pull(stats, parameter)
  ci1 <- pull(stats, conf.low) %>% round(3)
  ci2 <- pull(stats, conf.high) %>% round(3)
  paste0("The mean increase from pre-test to post-test was ", diff,
         ": t(", df, ") = ", t,
         ", p = ", p,
         ", 95% CI = [", ci1, ", ", ci2, "].")
}
# glue works well in a pipeline
# but doesn't format digits as nicely as sprintf (below),
# so we have to round the values before constructing the text
report_t <- function(data) {
  t.test(data$post, data$pre, paired = TRUE) %>%
    broom::tidy() %>%
    mutate(across(.cols = where(is.numeric),
                  .fns = ~ round(.x, 3))) %>%
    mutate(
      text = glue::glue("The mean increase from pre-test to post-test was {estimate}: t({parameter}) = {statistic}, p = {p.value}, 95% CI = [{conf.low}, {conf.high}].")) %>%
    pull(text)
}
# sprintf() is a complicated function, but can be easier to use in long text strings with a lot of things to replace
# it also formats values with leading or trailing zeroes
report_t <- function(data) {
  stats <- t.test(data$post, data$pre, paired = TRUE) %>%
    broom::tidy() %>%
    # make sure to round values
    mutate(across(.cols = where(is.numeric),
                  .fns = ~ round(.x, 3)))
  sprintf("The mean increase from pre-test to post-test was %.3f: t(%.0f) = %.3f, p = %.3f, 95%% CI = [%.3f, %.3f].",
          pull(stats, estimate),
          pull(stats, parameter),
          pull(stats, statistic),
          pull(stats, p.value),
          pull(stats, conf.low),
          pull(stats, conf.high))
}
Use inline R to include the results of report_t() on the dat data table in a paragraph below.
The mean increase from pre-test to post-test was -0.479: t(19) = -1.539, p = 0.140, 95% CI = [-1.130, 0.172].
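In the R Markdown source, that sentence is produced with inline code rather than a separate chunk, along these lines (the exact source text is assumed):

`r report_t(dat)`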
Write a function to simulate data of the form:
\(Y_i = \beta_0 + \beta_1 X_i + e_i\)
The function should take arguments for the number of observations to return (n), the intercept (b0), the effect (b1), the mean and SD of the predictor variable X (X_mu and X_sd), and the SD of the residual error (err_sd). The function should return a tibble with n rows and the columns id, X and Y.
sim_lm_data <- function(n = 100, b0 = 0, b1 = 0,
                        X_mu = 0, X_sd = 1, err_sd = 1) {
  tibble(
    id = 1:n,
    X = rnorm(n, X_mu, X_sd),
    err = rnorm(n, 0, err_sd),
    Y = b0 + b1 * X + err
  ) %>%
    select(id, X, Y)
}
dat12 <- sim_lm_data(n = 10) %>% print() # do not edit
## # A tibble: 10 x 3
## id X Y
## <int> <dbl> <dbl>
## 1 1 -0.652 -0.596
## 2 2 -0.203 -2.74
## 3 3 1.47 0.925
## 4 4 0.167 1.97
## 5 5 1.05 0.430
## 6 6 -1.54 2.87
## 7 7 -0.844 0.863
## 8 8 1.28 -1.53
## 9 9 0.228 3.08
## 10 10 -0.139 -1.03
Use the function from Question 12 to generate a data table with 10000 subjects, an intercept of 80, an effect of X of 0.5, where X has a mean of 0 and SD of 1, and residual error SD of 2.
dat13 <- sim_lm_data(n = 10000, b0 = 80, b1 = 0.5,
                     X_mu = 0, X_sd = 1, err_sd = 2)
Analyse the data with lm(). Find where the analysis summary estimates the values of b0 and b1. What happens if you change the simulation values?
mod13 <- lm(Y ~ X, data = dat13)
summary(mod13) # print summary
##
## Call:
## lm(formula = Y ~ X, data = dat13)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0142 -1.3482 -0.0096 1.3461 7.6196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.02120 0.01995 4011.41 <2e-16 ***
## X 0.51326 0.02001 25.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.995 on 9998 degrees of freedom
## Multiple R-squared: 0.06174, Adjusted R-squared: 0.06165
## F-statistic: 657.9 on 1 and 9998 DF, p-value: < 2.2e-16
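The (Intercept) row estimates b0 (about 80) and the X row estimates b1 (about 0.5), with the residual standard error close to err_sd. If you change the simulation values, the estimates follow them; for example (the new parameter values here are arbitrary):

# re-simulate with different parameters and refit; the estimates should track the new values
dat13b <- sim_lm_data(n = 10000, b0 = 100, b1 = 2, X_mu = 0, X_sd = 1, err_sd = 5)
coef(lm(Y ~ X, data = dat13b)) # intercept near 100, slope near 2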
Use the function from Question 12 to calculate power by simulation for the effect of X on Y in a design with 50 subjects, an intercept of 80, an effect of X of 0.5, where X has a mean of 0 and SD of 1, residual error SD of 2, and alpha of 0.05.
Hint: use broom::tidy() to get the p-value for the effect of X.
# ... lets you include any arguments to send to sim_lm_data()
sim_lm_power <- function(...) {
  dat <- sim_lm_data(...)
  lm(Y ~ X, data = dat) %>%
    broom::tidy() %>%
    filter(term == "X") %>%
    pull(p.value)
}
p_values <- replicate(1000, sim_lm_power(n = 50, b0 = 80, b1 = 0.5, X_mu = 0, X_sd = 1, err_sd = 2))
power <- mean(p_values < .05)
power # print the value
## [1] 0.424
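The same helper can be mapped over design parameters, for example to see how power changes with sample size (an optional sketch; the sample sizes and 500 replicates per setting are arbitrary choices):

ns <- c(25, 50, 100, 200)
power_by_n <- map_dbl(ns, function(n) {
  p <- replicate(500, sim_lm_power(n = n, b0 = 80, b1 = 0.5, X_mu = 0, X_sd = 1, err_sd = 2))
  mean(p < .05)
})
power_by_n # power should increase with n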
Calculate power for the effect of X on Y in a design with 50 subjects where there is no effect and alpha is 0.05; because there is no true effect, this “power” is the false positive rate.
p_values <- replicate(1000, sim_lm_power(n = 50, b1 = 0))
false_pos <- mean(p_values < .05)
false_pos # print the value
## [1] 0.056
Make a histogram of the p-values from the simulation above. Use geom_histogram() with binwidth = 0.05 and boundary = 0. What kind of distribution is this?
ggplot(mapping = aes(x = p_values)) +
  geom_histogram(color = "black", fill = "white", binwidth = .05, boundary = 0)
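Because there is no true effect in these simulations, the p-values follow a roughly uniform distribution on [0, 1]: each bin of width 0.05 should hold about 5% of the simulations, which is why alpha = .05 gives a false positive rate near .05. A quick numeric check of the uniform shape (optional):

quantile(p_values, c(.25, .5, .75)) # should be near 0.25, 0.50, 0.75 for a uniform distribution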