Boot Tutorial

```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) knitr::opts_knit$set(root.dir = './') source("resources/preamble.R") f <- function (x) {formatC(x, format="d", big.mark=',')} bold <- function(x) {paste('{\\textbf{',x,'}}', sep ='')} gray <- function(x) {paste('{\\textcolor{gray}{',x,'}}', sep ='')} wrapify <- function (x) {paste("{", x, "}", sep="")} p <- function (x) {formatC(x, format='f', digits=1, big.mark=',')} ```

Introduction to boot


Jeremy Foote
# Bootstrapping ## What is it? >- Calculating the statistic of interest on "bootstrap resamples" of data > - Resamples come by sampling with replacement many times >- [Seeing Theory](https://seeing-theory.brown.edu/frequentist-inference/index.html#section3) ## What is the point? >- Estimate statistics when assumptions of parametric models may not hold >- Reduce distortions caused by small sample size ## What's the catch? >- Has its own assumptions: > - Sample is representative of population > - Samples are independent # Boot package ## Basic function ```{r, echo = T, eval=FALSE} library(boot) boot(data = data, statistic = fun, # What function are we using to generate a statistic? R=reps # How many times should we repeat this?, ... # Can also pass additional parameters to the function ) ``` ## Estimating the mean ```{r, echo = T, message=F, collapse=T} # Setup library(boot) library(tidyverse) get_mpg_mean <- function(data, indices){ # Function has to take in data and indices new_data = data[indices,] # Resample based on indices return(mean(new_data$mpg)) # Return statistic of interest } # Create the boot object boot_obj = boot(data = mtcars, statistic = get_mpg_mean, R = 1000) boot_obj ``` ## Estimating the mean - visualization ```{r, echo = T, message=F, fig.width=5, fig.height=4} boot_obj$t %>% as.tibble %>% ggplot() + geom_histogram(aes(x=V1), fill = 'orange',binwidth=.2) + xlab('Mean weight in bootstrapped samples') + theme_light() ``` ## Advanced example - bootstrapped confidence intervals for regression >- This should be easier! ```{r, echo = T, message=F, fig.width=5, fig.height=4} get_coefs <- function(data, indices){ new_data = data[indices,] fit_obj = lm(mpg ~ wt + hp + disp, data = new_data) return(coef(fit_obj)) } boot_obj = boot(data = mtcars, statistic = get_coefs, R = 2000) ``` ## Visualize bootstrapped coefficients ```{r, echo = T, message=F, fig.width=6, fig.height=2.7} boot_df <- as.data.frame(boot_obj$t) var_names = names(boot_obj$t0) colnames(boot_df) <- var_names library(ggridges) boot_df %>% stack %>% filter(ind != '(Intercept)') %>% ggplot() + theme_light() + geom_density_ridges(aes(x=values, y=ind), fill='orange', alpha=.4) ``` ## Calculate confidence intervals ```{r, echo = T, message=F, fig.width=5, fig.height=4} simple_cis <- sapply(boot_df, quantile, probs=c(.025, .975)) print(simple_cis) cis <- sapply(1:length(var_names), function(x) boot.ci(boot_obj, index=x, type = 'bca')$bca[4:5]) colnames(cis) <- var_names print(cis) ``` ## Use bootstrapped confidence intervals ```{r, echo = T, message=F, fig.width=5, fig.height=2.5} library(dotwhisker) library(broom) tidy(lm(mpg ~ wt + hp + disp, data=mtcars), conf.int = T) %>% mutate(conf.low = as.numeric(cis[1,]), conf.high = as.numeric(cis[2,])) %>% by_2sd(mtcars) %>% dwplot(show_intercept = F) + theme_bw() + theme(legend.position="none") + xlab('Beta coefficient with bootstrapped 95% CIs') + ylab('Variable') + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) # ```