library(tidyverse) # for data import, manipulation, and plotting
library(GGally) # for ggpairs() function
library(ggrepel) # for geom_text_repel() function
library(car) # for vif() function
library(conflicted)
conflicts_prefer(dplyr::filter)
6 R demo
See also Agresti 2.6, Dunn and Smyth 2.6
The R demo will be based on the ScotsRaces data from the Agresti textbook. Data description (quoted from the textbook):
“Each year the Scottish Hill Runners Association publishes a list of hill races in Scotland for the year. The table below shows data on the record time for some of the races (in minutes). Explanatory variables listed are the distance of the race (in miles) and the cumulative climb (in thousands of feet).”
We will also familiarize ourselves with several important functions from the tidyverse packages, including the ggplot2 package for data visualization and the dplyr package for data manipulation.
# read the data into R
scots_races <- read_tsv("data/ScotsRaces.dat") # read_tsv from readr for data import
scots_races
# A tibble: 35 × 4
race distance climb time
<chr> <dbl> <dbl> <dbl>
1 GreenmantleNewYearDash 2.5 0.65 16.1
2 Carnethy5HillRace 6 2.5 48.4
3 CraigDunainHillRace 6 0.9 33.6
4 BenRhaHillRace 7.5 0.8 45.6
5 BenLomondHillRace 8 3.07 62.3
6 GoatfellHillRace 8 2.87 73.2
7 BensofJuraFellRace 16 7.5 205.
8 CairnpappleHillRace 6 0.8 36.4
9 ScoltyHillRace 5 0.8 29.8
10 TraprainLawRace 6 0.65 39.8
# ℹ 25 more rows
6.1 Exploration
Before modeling our data, let’s first explore it.
# pairs plot
# Q: What are the typical ranges of the variables?
# Q: What are the relationships among the variables?
scots_races |>
  select(-race) |> # select() from dplyr for selecting columns
  ggpairs() # ggpairs() from GGally to create pairs plot
# mile time versus distance
# Q: How does mile time vary with distance?
# Q: What races deviate from this trend?
# Q: How does climb play into it?
# add mile time variable to scots_races
scots_races <- scots_races |>
  mutate(mile_time = time / distance) # mutate() from dplyr to add column
# plot mile time versus distance
scots_races |>
  ggplot(aes(x = distance, y = mile_time)) +
  geom_point()
# add climb information as point color
scots_races |>
  ggplot(aes(x = distance, y = mile_time, colour = climb)) +
  geom_point()
# highlight extreme points
scots_races_extreme <- scots_races |>
  filter(distance > 15 | mile_time > 9) # filter() from dplyr to subset rows
# plot mile time versus distance
scots_races |>
  ggplot(aes(x = distance, y = mile_time, label = race, colour = climb)) +
  geom_point() +
  geom_text_repel(aes(label = race), data = scots_races_extreme)
# clean up plot
scots_races |>
  ggplot(aes(x = distance, y = mile_time, label = race, color = climb)) +
  geom_point() +
  geom_text_repel(aes(label = race), data = scots_races_extreme) +
  labs(
    x = "Distance (miles)",
    y = "Mile Time (minutes per mile)",
    color = "Climb\n(thousands of ft)"
  )
6.2 Linear model coefficient interpretation
Let’s fit some linear models and interpret the coefficients.
# Q: What is the effect of an extra mile of distance on time?
lm_fit <- lm(time ~ distance + climb, data = scots_races)
coef(lm_fit)
(Intercept) distance climb
-13.108551 6.350955 11.780133
# Linear model with interaction
# Q: What is the effect of an extra mile of distance on time
# for a run with low climb?
# Q: What is the effect of an extra mile of distance on time
# for a run with high climb?
lm_fit_int <- lm(time ~ distance * climb, data = scots_races)
coef(lm_fit_int)
(Intercept) distance climb distance:climb
-0.7671925 4.9622542 3.7132519 0.6598256
scots_races |>
  summarise(min_climb = min(climb), max_climb = max(climb))
# A tibble: 1 × 2
min_climb max_climb
<dbl> <dbl>
1 0.3 7.5
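In the interaction model, the slope on distance depends on climb: it equals the distance coefficient plus the interaction coefficient times climb. As a quick sketch (using the coefficients printed above), we can evaluate this slope at the lowest and highest observed climbs:
# distance slope at climb = 0.3 and climb = 7.5 (about 5.2 and 9.9 minutes per extra mile)
coef(lm_fit_int)["distance"] + coef(lm_fit_int)["distance:climb"] * c(0.3, 7.5)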
Let’s take a look at the regression summary for lm_fit:
lm_fit <- lm(time ~ distance + climb, data = scots_races)
summary(lm_fit)
Call:
lm(formula = time ~ distance + climb, data = scots_races)
Residuals:
Min 1Q Median 3Q Max
-16.654 -4.842 1.110 4.667 27.762
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -13.1086 2.5608 -5.119 1.41e-05 ***
distance 6.3510 0.3578 17.751 < 2e-16 ***
climb 11.7801 1.2206 9.651 5.37e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.734 on 32 degrees of freedom
Multiple R-squared: 0.9717, Adjusted R-squared: 0.97
F-statistic: 549.9 on 2 and 32 DF, p-value: < 2.2e-16
We get a coefficient of 6.35 with standard error 0.36 for distance, where the standard error is an estimate of the quantity (5.5).
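These two numbers can also be extracted programmatically from the coefficient table (a minimal sketch):
# estimate and standard error for distance
summary(lm_fit)$coefficients["distance", c("Estimate", "Std. Error")]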
6.3 \(R^2\) and sum-of-squares decompositions
We can extract the \(R^2\) from this fit by reading it off from the bottom of the summary, or by typing
summary(lm_fit)$r.squared
[1] 0.971725
We can construct sum-of-squares decompositions (4.1) using the anova function. This function takes as arguments the partial model and the full model. For example, consider the partial model time ~ distance.
lm_fit_partial <- lm(time ~ distance, data = scots_races)
anova(lm_fit_partial, lm_fit)
Analysis of Variance Table
Model 1: time ~ distance
Model 2: time ~ distance + climb
Res.Df RSS Df Sum of Sq F Pr(>F)
1 33 9546.9
2 32 2441.3 1 7105.6 93.14 5.369e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
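As a quick consistency check (a sketch based on the table just printed), the F statistic is the mean square for the added term divided by the residual mean square of the full model:
# reproduce F = (7105.6 / 1) / (2441.3 / 32), about 93.14
aov_partial <- anova(lm_fit_partial, lm_fit)
(aov_partial$`Sum of Sq`[2] / aov_partial$Df[2]) / (aov_partial$RSS[2] / aov_partial$Res.Df[2])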
We find that adding the predictor climb reduces the RSS by 7106, from 9547 to 2441. As another example, we can compute the \(R^2\) by comparing the full model with the null model:
lm_fit_null <- lm(time ~ 1, data = scots_races)
anova(lm_fit_null, lm_fit)
Analysis of Variance Table
Model 1: time ~ 1
Model 2: time ~ distance + climb
Res.Df RSS Df Sum of Sq F Pr(>F)
1 34 86340
2 32 2441 2 83899 549.87 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Therefore, the \(R^2\) is 83899/86340 = 0.972, consistent with the above regression summary.
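The same arithmetic can be done programmatically from the RSS column of the anova table (a minimal sketch):
# R^2 = 1 - RSS(full model) / RSS(null model)
aov_null <- anova(lm_fit_null, lm_fit)
1 - aov_null$RSS[2] / aov_null$RSS[1]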
6.4 Adjustment and collinearity
We can also test the adjustment formula (5.4) numerically. Let’s consider the coefficient of distance in the regression time ~ distance + climb. We can obtain this coefficient by first regressing climb out of distance and time:
lm_dist_on_climb <- lm(distance ~ climb, data = scots_races)
lm_time_on_climb <- lm(time ~ climb, data = scots_races)

scots_races_resid <- tibble(
  dist_residuals = residuals(lm_dist_on_climb),
  time_residuals = residuals(lm_time_on_climb)
)
lm_adjusted <- lm(time_residuals ~ dist_residuals - 1,
  data = scots_races_resid
)
summary(lm_adjusted)
Call:
lm(formula = time_residuals ~ dist_residuals - 1, data = scots_races_resid)
Residuals:
Min 1Q Median 3Q Max
-16.654 -4.842 1.110 4.667 27.762
Coefficients:
Estimate Std. Error t value Pr(>|t|)
dist_residuals 6.3510 0.3471 18.3 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.474 on 34 degrees of freedom
Multiple R-squared: 0.9078, Adjusted R-squared: 0.9051
F-statistic: 334.8 on 1 and 34 DF, p-value: < 2.2e-16
We find a coefficient of 6.35, which matches that obtained in the original regression. The standard error (0.35) is close to, but not exactly, the original 0.36, because this residual-based regression estimates the error variance with 34 rather than 32 degrees of freedom.
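As a quick check, we can print the two point estimates side by side:
# both should be about 6.35
coef(lm_fit)["distance"]
coef(lm_adjusted)["dist_residuals"]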
We can get the partial correlation between distance and time by taking the empirical correlation between the residuals. We can compare this quantity to the usual correlation.
scots_races_resid |>
  summarise(cor(dist_residuals, time_residuals)) |>
  pull()
[1] 0.9527881
scots_races |>
  summarise(cor(distance, time)) |>
  pull()
[1] 0.9430944
In this case, the two correlation quantities are similar.
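Note also that squaring the partial correlation recovers the \(R^2\) of the residual-on-residual regression above (a quick sketch):
# squared partial correlation, about 0.908 (the Multiple R-squared of lm_adjusted)
scots_races_resid |>
  summarise(cor(dist_residuals, time_residuals)^2) |>
  pull()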
To obtain the variance inflation factors defined in equation (5.6), we can use the vif function from the car package:
vif(lm_fit)
distance climb
1.740812 1.740812
Why are these two VIF values the same?
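One way to see it: with exactly two predictors, each VIF equals 1 / (1 - r^2), where r is the correlation between distance and climb, so the two values must coincide. A quick sketch verifying this:
# with two predictors, VIF = 1 / (1 - r^2) for both; should reproduce 1.740812
r_dc <- cor(scots_races$distance, scots_races$climb)
1 / (1 - r_dc^2)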