18  R demo

We illustrate how to deal with heteroskedasticity, group-correlated errors, autocorrelated errors, and outliers in the following sections.

18.1 Heteroskedasticity

Next, let’s look at another dataset, from the Current Population Survey (CPS).

library(readr)
library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)

cps_data <- read_tsv("data/cps2.tsv")
cps_data
# A tibble: 1,000 × 10
    wage  educ exper female black married union south fulltime metro
   <dbl> <dbl> <dbl>  <dbl> <dbl>   <dbl> <dbl> <dbl>    <dbl> <dbl>
 1  2.03    13     2      1     0       0     0     1        0     0
 2  2.07    12     7      0     0       0     0     0        0     1
 3  2.12    12    35      0     0       0     0     1        1     1
 4  2.54    16    20      1     0       0     0     1        1     1
 5  2.68    12    24      1     0       1     0     1        0     1
 6  3.09    13     4      0     0       0     0     1        0     1
 7  3.16    13     1      0     0       0     0     0        0     0
 8  3.17    12    22      1     0       1     0     1        0     1
 9  3.2     12    23      0     0       1     0     1        1     1
10  3.27    12     4      1     0       0     0     0        1     1
# ℹ 990 more rows

Suppose we want to regress wage on educ, exper, and metro.

lm_fit <- lm(wage ~ educ + exper + metro, data = cps_data)

18.1.1 Diagnostics

Let’s take a look at the standard linear model diagnostic plots built into R.

# residuals versus fitted
plot(lm_fit, which = 1)

# residual QQ plot
plot(lm_fit, which = 2)

# residuals versus leverage (with Cook's distance)
plot(lm_fit, which = 5)

The residuals versus fitted plot suggests significant heteroskedasticity, with variance growing as a function of the fitted value.
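Beyond the visual check, one common formal diagnostic is the Breusch-Pagan test, which regresses the (studentized) squared residuals on the predictors and tests whether they have any explanatory power. A minimal sketch using bptest() from the lmtest package (which we also use later in this chapter):

# Breusch-Pagan test for heteroskedasticity
library(lmtest)
bptest(lm_fit)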

18.1.2 Sandwich standard errors

To get standard errors robust to this heteroskedasticity, we can use one of the robust estimators discussed in Section 13.2. Most of the robust standard error constructions discussed in that section are implemented in the R package sandwich.

library(sandwich)

For example, the Huber-White heteroskedasticity-consistent estimate \(\widehat{\text{Var}}[\boldsymbol{\widehat \beta}]\) can be obtained via vcovHC:

HW_cov <- vcovHC(lm_fit)
HW_cov
             (Intercept)          educ         exper         metro
(Intercept)  1.484328645 -0.0967891868 -0.0096871141 -0.1218518012
educ        -0.096789187  0.0070467982  0.0004037764  0.0018334348
exper       -0.009687114  0.0004037764  0.0002517826  0.0008369831
metro       -0.121851801  0.0018334348  0.0008369831  0.1197713348
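To connect this to the formula, here is a by-hand sketch of the simplest (HC0) sandwich estimator, \((\boldsymbol{X}^\top \boldsymbol{X})^{-1} \boldsymbol{X}^\top \text{diag}(\widehat e_i^2)\, \boldsymbol{X} (\boldsymbol{X}^\top \boldsymbol{X})^{-1}\). Note that vcovHC() defaults to the HC3 variant, so the entries will differ slightly from HW_cov above; the object names below are introduced just for illustration.

# illustration: the HC0 sandwich estimate computed by hand
X <- model.matrix(lm_fit)          # design matrix
e <- residuals(lm_fit)             # residuals
bread <- solve(t(X) %*% X)         # (X'X)^{-1}
meat <- t(X) %*% diag(e^2) %*% X   # X' diag(e_i^2) X
bread %*% meat %*% bread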

Compare this to the traditional estimate:

usual_cov <- vcovHC(lm_fit, type = "const")
usual_cov
             (Intercept)          educ         exper         metro
(Intercept)  1.157049852 -0.0671656102 -0.0070323974 -0.1287058354
educ        -0.067165610  0.0048945781  0.0001924359 -0.0018227782
exper       -0.007032397  0.0001924359  0.0002320022  0.0001471354
metro       -0.128705835 -0.0018227782  0.0001471354  0.1858394060
# extract the standard errors (square roots of the diagonal variance estimates)
tibble(
  variable = rownames(usual_cov),
  usual_se = sqrt(diag(usual_cov)),
  HW_se = sqrt(diag(HW_cov))
)
# A tibble: 4 × 3
  variable    usual_se  HW_se
  <chr>          <dbl>  <dbl>
1 (Intercept)   1.08   1.22  
2 educ          0.0700 0.0839
3 exper         0.0152 0.0159
4 metro         0.431  0.346 

Bootstrap standard errors are also implemented in sandwich:

# pairs bootstrap
bootstrap_cov <- vcovBS(lm_fit, type = "xy")
tibble(
  variable = rownames(usual_cov),
  usual_variance = diag(usual_cov),
  HW_variance = diag(HW_cov),
  bootstrap_variance = diag(bootstrap_cov)
)
# A tibble: 4 × 4
  variable    usual_variance HW_variance bootstrap_variance
  <chr>                <dbl>       <dbl>              <dbl>
1 (Intercept)       1.16        1.48               1.54    
2 educ              0.00489     0.00705            0.00769 
3 exper             0.000232    0.000252           0.000245
4 metro             0.186       0.120              0.116   
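To make clear what vcovBS() is doing, here is a minimal by-hand sketch of the pairs bootstrap: resample rows of the data with replacement, refit the model, and take the empirical covariance of the coefficient estimates. The result should be close to bootstrap_cov above, up to bootstrap randomness.

# illustration: pairs bootstrap by hand
set.seed(1)
boot_coefs <- replicate(1000, {
  boot_rows <- sample(nrow(cps_data), replace = TRUE)                  # resample rows
  coef(lm(wage ~ educ + exper + metro, data = cps_data[boot_rows, ]))  # refit
})
cov(t(boot_coefs)) # empirical covariance of the bootstrap coefficient estimates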

The covariance estimate produced by sandwich can be easily integrated into linear model inference using the package lmtest.

library(lmtest)
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
# fit linear model as usual
lm_fit <- lm(wage ~ educ + exper + metro, data = cps_data)

# robust t-tests for coefficients
coeftest(lm_fit, vcov. = vcovHC)

t test of coefficients:

             Estimate Std. Error t value  Pr(>|t|)    
(Intercept) -9.913984   1.218330 -8.1374 1.197e-15 ***
educ         1.233964   0.083945 14.6996 < 2.2e-16 ***
exper        0.133244   0.015868  8.3972 < 2.2e-16 ***
metro        1.524104   0.346080  4.4039 1.178e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# robust confidence intervals for coefficients
coefci(lm_fit, vcov. = vcovHC)
                  2.5 %     97.5 %
(Intercept) -12.3047729 -7.5231954
educ          1.0692342  1.3986938
exper         0.1021058  0.1643816
metro         0.8449747  2.2032337
# robust F-test
lm_fit_partial <- lm(wage ~ educ, data = cps_data) # a partial model
waldtest(lm_fit_partial, lm_fit, vcov = vcovHC)
Wald test

Model 1: wage ~ educ
Model 2: wage ~ educ + exper + metro
  Res.Df Df      F    Pr(>F)    
1    998                        
2    996  2 40.252 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

18.1.3 Bootstrap confidence intervals

One R package for performing bootstrap inference is simpleboot. Let’s see how to get pairs bootstrap distributions for the coefficient estimates.

library(simpleboot)
Simple Bootstrap Routines (1.1-8)
boot_out <- lm.boot(
  lm.object = lm_fit, # input the fit object from lm()
  R = 1000
) # R is the number of bootstrap replicates
perc(boot_out) # get the percentile 95% confidence intervals
      (Intercept)     educ     exper     metro
2.5%   -12.593797 1.071418 0.1031716 0.8449157
97.5%   -7.645662 1.417886 0.1681944 2.1429463

We can extract the resampling distributions for the coefficient estimates using the samples function:

samples(boot_out, name = "coef")[, 1:5]
                    1          2          3           4           5
(Intercept) -9.506667 -9.9339448 -7.8354993 -11.5963414 -11.2588844
educ         1.174144  1.2300298  1.1084766   1.3158311   1.3653399
exper        0.149829  0.1301164  0.1103397   0.1497207   0.1170897
metro        1.866064  1.5479406  1.6266666   1.8888379   1.7062736

We can plot these as follows:

boot_pctiles <- boot_out |>
  perc() |>
  t() |>
  as.data.frame() |>
  rownames_to_column(var = "var") |>
  filter(var != "(Intercept)")

samples(boot_out, name = "coef") |>
  as.data.frame() |>
  rownames_to_column(var = "var") |>
  filter(var != "(Intercept)") |>
  pivot_longer(-var, names_to = "resample", values_to = "coefficient") |>
  group_by(var) |>
  ggplot(aes(x = coefficient)) +
  geom_histogram(bins = 30, colour = "black") +
  geom_vline(aes(xintercept = `2.5%`), data = boot_pctiles, linetype = "dashed") +
  geom_vline(aes(xintercept = `97.5%`), data = boot_pctiles, linetype = "dashed") +
  facet_wrap(~var, scales = "free")

In this case, the bootstrap sampling distributions look roughly normal.
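Since the bootstrap distributions look roughly normal, normal-approximation intervals based on the bootstrap standard errors should be similar to the percentile intervals above. A quick sketch, reusing the bootstrap covariance matrix bootstrap_cov computed earlier:

# normal-approximation 95% confidence intervals from bootstrap standard errors
tibble(
  variable = names(coef(lm_fit)),
  estimate = coef(lm_fit),
  lower = estimate - qnorm(0.975) * sqrt(diag(bootstrap_cov)),
  upper = estimate + qnorm(0.975) * sqrt(diag(bootstrap_cov))
)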

18.2 Group-correlated errors

Credit for this data example: https://www.r-bloggers.com/2021/05/clustered-standard-errors-with-r/.

Let’s consider the nlswork data from the webuse package:

library(webuse)
nlswork_orig <- webuse("nlswork")
nlswork <- nlswork_orig |>
  filter(idcode <= 100) |>
  select(idcode, year, ln_wage, age, tenure, union) |>
  na.omit() |>
  mutate(
    union = as.integer(union),
    idcode = as.factor(idcode)
  )
nlswork
# A tibble: 386 × 6
   idcode  year ln_wage   age tenure union
   <fct>  <dbl>   <dbl> <dbl>  <dbl> <int>
 1 1         72    1.59    20  0.917     1
 2 1         77    1.78    25  1.5       0
 3 1         80    2.55    28  1.83      1
 4 1         83    2.42    31  0.667     1
 5 1         85    2.61    33  1.92      1
 6 1         87    2.54    35  3.92      1
 7 1         88    2.46    37  5.33      1
 8 2         71    1.36    19  0.25      0
 9 2         77    1.73    25  2.67      1
10 2         78    1.69    26  3.67      1
# ℹ 376 more rows

The data comes from the US National Longitudinal Survey (NLS) and contains information about more than 4,000 young working women. We’re interested in the relationship between wage, here the log-scaled GNP-adjusted wage ln_wage, and the independent variables age (the participant’s current age), tenure (job tenure in years), and union (union membership). It’s a longitudinal survey, so subjects were surveyed repeatedly between 1968 and 1988, and each subject is identified by a unique idcode. Here we restrict attention to the first 100 subjects and remove any rows with missing data.

Let’s start by fitting a linear regression of the log wage on age, tenure, union, and the interaction between tenure and union:

lm_fit <- lm(ln_wage ~ age + tenure + union + tenure:union, data = nlswork)
summary(lm_fit)

Call:
lm(formula = ln_wage ~ age + tenure + union + tenure:union, data = nlswork)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.42570 -0.28330  0.01694  0.27303  1.65052 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.379103   0.099658  13.838  < 2e-16 ***
age           0.013553   0.003388   4.000 7.60e-05 ***
tenure        0.022175   0.008051   2.754  0.00617 ** 
union         0.309936   0.070344   4.406 1.37e-05 ***
tenure:union -0.009629   0.012049  -0.799  0.42473    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4099 on 381 degrees of freedom
Multiple R-squared:  0.1811,    Adjusted R-squared:  0.1725 
F-statistic: 21.07 on 4 and 381 DF,  p-value: 1.047e-15

Let’s plot the residuals against the individuals:

nlswork |>
  mutate(resid = lm_fit$residuals) |>
  ggplot(aes(x = idcode, y = resid)) +
  geom_boxplot() +
  labs(
    x = "Subject",
    y = "Residual"
  ) +
  theme(axis.text.x = element_blank())
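We can also quantify this within-subject dependence numerically, for instance (a rough sketch, mirroring the ANOVA we run for tenure below) by an ANOVA of the residuals on the subject identifier:

# how much of the residual variation is explained by subject?
summary(aov(lm_fit$residuals ~ nlswork$idcode))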

Clearly, there is dependence among the residuals within subjects. Therefore, we have model bias, correlated errors, or both. To help assess whether we have model bias, we should check whether the variables of interest are correlated with the grouping variable idcode. We can check this with a plot, e.g., for the tenure variable:

nlswork |>
  ggplot(aes(x = idcode, y = tenure)) +
  geom_boxplot() +
  labs(
    x = "Subject",
    y = "Tenure"
  ) +
  theme(axis.text.x = element_blank())

Again, there seems to be nontrivial association between tenure and idcode. We can check this more formally with an ANOVA test:

summary(aov(tenure ~ idcode, data = nlswork))
             Df Sum Sq Mean Sq F value   Pr(>F)    
idcode       81   2529  31.220   3.558 8.83e-16 ***
Residuals   304   2668   8.775                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, in this case, we do have model bias on our hands. We can address this using fixed effects for each subject.

lm_fit_FE <- lm(ln_wage ~ age + tenure + union + tenure:union + idcode, data = nlswork)
lm_fit_FE |>
  summary() |>
  coef() |>
  as.data.frame() |>
  rownames_to_column(var = "var") |>
  filter(!grepl("idcode", var)) |> # remove coefficients for fixed effects
  column_to_rownames(var = "var")
                Estimate  Std. Error   t value     Pr(>|t|)
(Intercept)  1.882478232 0.131411504 14.325064 8.022367e-36
age          0.005630809 0.003109803  1.810664 7.119315e-02
tenure       0.020756426 0.006964417  2.980353 3.114742e-03
union        0.174619394 0.060646038  2.879321 4.272027e-03
tenure:union 0.014974113 0.009548509  1.568215 1.178851e-01

Note the changes in the standard errors and p-values. Sometimes, we may have remaining correlation among residuals even after adding cluster fixed effects. Therefore, it is common practice to compute clustered (i.e., Liang-Zeger) standard errors in conjunction with cluster fixed effects. We can get clustered standard errors via the vcovCL function from sandwich:

LZ_cov <- vcovCL(lm_fit_FE, cluster = nlswork$idcode)
coeftest(lm_fit_FE, vcov. = LZ_cov)[, ] |>
  as.data.frame() |>
  rownames_to_column(var = "var") |>
  filter(!grepl("idcode", var)) |> # remove coefficients for fixed effects
  column_to_rownames(var = "var")
                Estimate  Std. Error    t value     Pr(>|t|)
(Intercept)  1.882478232 0.157611390 11.9437956 3.667970e-27
age          0.005630809 0.006339777  0.8881715 3.751601e-01
tenure       0.020756426 0.011149190  1.8616981 6.362342e-02
union        0.174619394 0.101970509  1.7124500 8.784708e-02
tenure:union 0.014974113 0.009646023  1.5523613 1.216301e-01

Again, note the changes in the standard errors and p-values.
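An alternative to the Liang-Zeger plug-in estimate is a cluster bootstrap, which resamples whole subjects rather than individual observations. In sandwich this is available through vcovBS() via its cluster argument. Here is a sketch applied to the model without subject fixed effects (to keep the resampled design matrices full rank); results will vary across bootstrap runs.

# cluster (block) bootstrap: resample subjects, not rows
cluster_boot_cov <- vcovBS(lm_fit, cluster = nlswork$idcode)
coeftest(lm_fit, vcov. = cluster_boot_cov)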

18.3 Autocorrelated errors

Let’s take a look at the EuStockMarkets data built into R, containing the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI, France CAC, and UK FTSE. Let’s regress DAX on FTSE and take a look at the residuals:

lm_fit <- lm(DAX ~ FTSE, data = EuStockMarkets)
summary(lm_fit)

Call:
lm(formula = DAX ~ FTSE, data = EuStockMarkets)

Residuals:
    Min      1Q  Median      3Q     Max 
-408.43 -172.53  -45.71  137.68  989.96 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.331e+03  2.109e+01  -63.12   <2e-16 ***
FTSE         1.083e+00  5.705e-03  189.84   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 240.3 on 1858 degrees of freedom
Multiple R-squared:  0.951, Adjusted R-squared:  0.9509 
F-statistic: 3.604e+04 on 1 and 1858 DF,  p-value: < 2.2e-16

We find an extremely significant association between the two stock indices. But let’s examine the residuals for autocorrelation:

EuStockMarkets |>
  as.data.frame() |>
  mutate(
    date = row_number(),
    resid = lm_fit$residuals
  ) |>
  ggplot(aes(x = date, y = resid)) +
  geom_line() +
  labs(
    x = "Day",
    y = "Residual"
  )

There is clearly some autocorrelation in the residuals. Let’s quantify it using the autocorrelation function (acf() in R):

acf(lm_fit$residuals, lag.max = 1000)

We see that the autocorrelation decays to a reasonably low level by around lag 200. We can account for this autocorrelation using Newey-West standard errors (NeweyWest() selects its truncation lag automatically):

NW_cov <- NeweyWest(lm_fit)
coeftest(lm_fit, vcov. = NW_cov)

t test of coefficients:

              Estimate Std. Error t value Pr(>|t|)
(Intercept) -1331.2374  4398.3722 -0.3027   0.7622
FTSE            1.0831     1.4645  0.7396   0.4597

We see that the p-value for the association between the two indices jumps from below 2.2e-16 to 0.46 once we account for the autocorrelation.
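If we instead wanted to impose a truncation lag based on the acf plot (say, around 200), we could pass it explicitly. This is just a sketch (the name NW_cov_200 is introduced here), and the resulting standard errors will differ somewhat from the automatic choice:

# Newey-West covariance with an explicit truncation lag
NW_cov_200 <- NeweyWest(lm_fit, lag = 200, prewhite = FALSE)
coeftest(lm_fit, vcov. = NW_cov_200)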

18.4 Outliers

Let’s take a look at the crime data from HW2:

# read crime data
crime_data <- read_tsv("data/Statewide_crime.dat")
Rows: 51 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): STATE
dbl (5): Violent, Murder, Metro, HighSchool, Poverty

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read and transform population data
population_data <- read_csv("data/state-populations.csv")
Rows: 52 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): State
dbl (8): rank, Pop, Growth, Pop2018, Pop2010, growthSince2010, Percent, density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
population_data <- population_data |>
  filter(State != "Puerto Rico") |>
  select(State, Pop) |>
  rename(state_name = State, state_pop = Pop)

# collate state abbreviations
state_abbreviations <- tibble(
  state_name = state.name,
  state_abbrev = state.abb
) |>
  add_row(state_name = "District of Columbia", state_abbrev = "DC")

# add CrimeRate to crime_data
crime_data <- crime_data |>
  mutate(STATE = ifelse(STATE == "IO", "IA", STATE)) |>
  rename(state_abbrev = STATE) |>
  left_join(state_abbreviations, by = "state_abbrev") |>
  left_join(population_data, by = "state_name") |>
  mutate(CrimeRate = Violent / state_pop) |>
  select(state_abbrev, CrimeRate, Metro, HighSchool, Poverty)

crime_data
# A tibble: 51 × 5
   state_abbrev CrimeRate Metro HighSchool Poverty
   <chr>            <dbl> <dbl>      <dbl>   <dbl>
 1 AK           0.000819   65.6       90.2     8  
 2 AL           0.0000871  55.4       82.4    13.7
 3 AR           0.000150   52.5       79.2    12.1
 4 AZ           0.0000682  88.2       84.4    11.9
 5 CA           0.0000146  94.4       81.3    10.5
 6 CO           0.0000585  84.5       88.3     7.3
 7 CT           0.0000867  87.7       88.8     6.4
 8 DE           0.000664   80.1       86.5     5.8
 9 FL           0.0000333  89.3       85.9     9.7
10 GA           0.0000419  71.6       85.2    10.8
# ℹ 41 more rows

Let’s fit the linear regression:

# note: we make the state abbreviations row names for better diagnostic plots
lm_fit <- lm(
  CrimeRate ~ Metro + HighSchool + Poverty,
  data = crime_data |> column_to_rownames(var = "state_abbrev")
)

We can get the standard linear regression diagnostic plots as follows:

# residuals versus fitted
plot(lm_fit, which = 1)

# residual QQ plot
plot(lm_fit, which = 2)

# residuals versus leverage (with Cook's distance)
plot(lm_fit, which = 5)

The information underlying these diagnostic plots can be extracted as follows:

tibble(
  state = crime_data$state_abbrev,
  std_residual = rstandard(lm_fit),
  fitted_value = fitted.values(lm_fit),
  leverage = hatvalues(lm_fit),
  cooks_dist = cooks.distance(lm_fit)
)
# A tibble: 51 × 5
   state std_residual fitted_value leverage cooks_dist
   <chr>        <dbl>        <dbl>    <dbl>      <dbl>
 1 AK           2.17     0.000227    0.0463   0.0574  
 2 AL          -0.422    0.000200    0.0769   0.00371 
 3 AR           1.10    -0.000132    0.153    0.0547  
 4 AZ          -1.02     0.000344    0.0568   0.0156  
 5 CA          -0.264    0.0000839   0.114    0.00224 
 6 CO          -0.383    0.000163    0.0405   0.00155 
 7 CT          -0.175    0.000134    0.0561   0.000456
 8 DE           2.81    -0.0000888   0.0754   0.161   
 9 FL          -0.804    0.000252    0.0452   0.00764 
10 GA          -0.599    0.000207    0.0232   0.00213 
# ℹ 41 more rows
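DC does not appear among the first ten rows shown, so a quick way to surface the most influential states is to sort by Cook's distance:

# states with the largest Cook's distances
tibble(
  state = crime_data$state_abbrev,
  std_residual = rstandard(lm_fit),
  leverage = hatvalues(lm_fit),
  cooks_dist = cooks.distance(lm_fit)
) |>
  arrange(desc(cooks_dist)) |>
  head(3)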

Clearly, DC is an outlier. We can either run a robust estimation procedure or redo the analysis without DC. Let’s try both. First, we try robust regression using rlm() from the MASS package:

rlm_fit <- MASS::rlm(CrimeRate ~ Metro + HighSchool + Poverty, data = crime_data)
summary(rlm_fit)

Call: rlm(formula = CrimeRate ~ Metro + HighSchool + Poverty, data = crime_data)
Residuals:
       Min         1Q     Median         3Q        Max 
-8.297e-05 -3.787e-05 -2.249e-05  4.407e-05  2.063e-03 

Coefficients:
            Value   Std. Error t value
(Intercept) -0.0009  0.0004    -2.2562
Metro        0.0000  0.0000    -1.2963
HighSchool   0.0000  0.0000     2.6506
Poverty      0.0000  0.0000     2.7546

Residual standard error: 6.048e-05 on 47 degrees of freedom

The summary for rlm() does not report p-values. We can compute approximate two-sided p-values ourselves from the reported t values, using a normal approximation:

summary(rlm_fit)$coef |>
  as.data.frame() |>
  rename(Estimate = Value) |>
  mutate(`p value` = 2 * pnorm(-abs(`t value`)))
                 Estimate   Std. Error   t value  p value
(Intercept) -8.538466e-04 3.784466e-04 -2.256188 0.024059
Metro       -8.639252e-07 6.664623e-07 -1.296285 0.194877
HighSchool   1.037849e-05 3.915573e-06  2.650568 0.008036
Poverty      1.252839e-05 4.548172e-06  2.754600 0.005876

To see robust estimation in action visually, let’s consider a univariate example:

lm_fit <- lm(CrimeRate ~ Metro, data = crime_data)
rlm_fit <- MASS::rlm(CrimeRate ~ Metro, data = crime_data)

# collate the fits into a tibble
line_fits <- tibble(
  method = c("Usual", "Robust"),
  intercept = c(
    coef(lm_fit)["(Intercept)"],
    coef(rlm_fit)["(Intercept)"]
  ),
  slope = c(
    coef(lm_fit)["Metro"],
    coef(rlm_fit)["Metro"]
  )
)
# plot the usual and robust univariate fits
crime_data |>
  ggplot() +
  geom_point(aes(x = Metro, y = CrimeRate)) +
  geom_abline(aes(intercept = intercept, slope = slope, colour = method), data = line_fits)

Next, let’s try removing DC and running a usual linear regression.

lm_fit_no_dc <- lm(CrimeRate ~ Metro + HighSchool + Poverty,
  data = crime_data |>
    filter(state_abbrev != "DC") |>
    column_to_rownames(var = "state_abbrev")
)

# residuals versus fitted
plot(lm_fit_no_dc, which = 1)

# residual QQ plot
plot(lm_fit_no_dc, which = 2)

# residuals versus leverage (with Cook's distance)
plot(lm_fit_no_dc, which = 5)
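To wrap up, we can put the three approaches side by side. Note that lm_fit and rlm_fit were overwritten by the univariate fits above, so we refit the full models here; this is just a sketch for comparing the coefficient estimates (the names lm_fit_full and rlm_fit_full are introduced for this comparison).

# compare coefficients: usual fit, robust fit, and usual fit without DC
lm_fit_full <- lm(CrimeRate ~ Metro + HighSchool + Poverty, data = crime_data)
rlm_fit_full <- MASS::rlm(CrimeRate ~ Metro + HighSchool + Poverty, data = crime_data)
tibble(
  variable = names(coef(lm_fit_full)),
  usual = coef(lm_fit_full),
  robust = coef(rlm_fit_full),
  without_dc = coef(lm_fit_no_dc)
)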