Regression Modelling for Epidemiology

Riga Stradins University Workshop, July 2020

  • Lecturer: Sergio Uribe, Assoc Prof, PhD
  • Riga Stradins University, July 2020
pacman::p_load(
  car,
  broom,
  tidyverse,
  ggfortify,
  mosaic,
  huxtable,
  jtools,
  latex2exp,
  pubh,
  sjlabelled,
  sjPlot,
  sjmisc
)

theme_set(sjPlot::theme_sjplot2(base_size = 10))
theme_update(legend.position = "top")
# options('huxtable.knit_print_df' = FALSE)
options('huxtable.knit_print_df_theme' = theme_article)
options('huxtable.autoformat_number_format' = list(numeric = "%5.2f"))
knitr::opts_chunk$set(collapse = TRUE, comment = NA)

Regression

data(birthwt, package = "MASS")
birthwt <- birthwt %>%
  mutate(smoke = factor(smoke, labels = c("Non-smoker", "Smoker")),
         race = factor(race, labels = c("White", "African American", "Other"))) %>%
  var_labels(bwt = 'Birth weight (g)',
             smoke = 'Smoking status',
             race = 'Race')
birthwt %>%
  group_by(race, smoke) %>%
  summarise(
    n = n(),
    Mean = mean(bwt, na.rm = TRUE),
    SD = sd(bwt, na.rm = TRUE),
    Median = median(bwt, na.rm = TRUE),
    CV = rel_dis(bwt)
  )
`summarise()` regrouping output by 'race' (override with `.groups` argument)
# A tibble: 6 x 7
# Groups:   race [3]
  race             smoke          n  Mean    SD Median    CV
  <fct>            <fct>      <int> <dbl> <dbl>  <dbl> <dbl>
1 White            Non-smoker    44 3429.  710.  3593  0.207
2 White            Smoker        52 2827.  626.  2776. 0.222
3 African American Non-smoker    16 2854.  621.  2920  0.218
4 African American Smoker        10 2504   637.  2381  0.254
5 Other            Non-smoker    55 2816.  709.  2807  0.252
6 Other            Smoker        12 2757.  810.  3146. 0.294
birthwt %>%
  gen_bst_df(bwt ~ race|smoke)
Birth weight (g)LowerCIUpperCIRaceSmoking status
3428.753214.983623.23WhiteNon-smoker
2826.852661.952988.81WhiteSmoker
2854.502565.293141.09African AmericanNon-smoker
2504.002089.212867.61African AmericanSmoker
2815.782632.033009.99OtherNon-smoker
2757.172300.543144.49OtherSmoker
birthwt %>%
  bar_error(bwt ~ race, fill = ~ smoke) %>%
  axis_labs() %>%
  gf_labs(fill = "Smoking status:")
No summary function supplied, defaulting to `mean_se()`

model_norm <- lm(bwt ~ smoke + race, data = birthwt)
model_norm %>% Anova() %>% tidy()
termsumsqdfstatisticp.value
smoke7322574.73 1.0015.46 0.00
race8712354.03 2.00 9.20 0.00
Residuals87631355.83185.00      
model_norm %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)3334.9591.7836.34 0.00
smokeSmoker-428.73109.04-3.93 0.00
raceAfrican American-450.36153.12-2.94 0.00
raceOther-452.88116.48-3.89 0.00
model_norm %>% 
  glm_coef(labels = model_labels(model_norm))

ParameterCoefficientPr(>|t|)
Constant3334.95 (3036.11, 3633.78)< 0.001
Smoking status: Smoker-428.73 (-667.02, -190.44)< 0.001
Race: African American-450.36 (-750.31, -150.4)0.003
Race: Other-452.88 (-775.17, -130.58)0.006
Compare models

model_norm %>%
  glm_coef(se_rob = TRUE, labels = model_labels(model_norm))
ParameterCoefficientPr(>|t|)
Constant3334.95 (3036.11, 3633.78)< 0.001
Smoking status: Smoker-428.73 (-667.02, -190.44)< 0.001
Race: African American-450.36 (-750.31, -150.4)0.003
Race: Other-452.88 (-775.17, -130.58)0.006
model_norm %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residual
0.12 0.11688.25 8.68 0.004-1501.113012.223028.4387631355.83185
model_norm %>%
  plot_model("pred", terms = ~race|smoke, dot.size = 1.5, title = "")

emmip(model_norm, smoke ~ race) %>%
  gf_labs(y = get_label(birthwt$bwt), x = "", col = "Smoking status")

Logistic regression

data(diet, package = "Epi")
diet <- diet %>%
  mutate(
    chd = factor(chd, labels = c("No CHD", "CHD"))
  ) %>%
  var_labels(
    chd = "Coronary Heart Disease",
    fibre = "Fibre intake (10 g/day)"
    )
diet %>% estat(~ fibre|chd)
Coronary Heart DiseaseNMin.Max.MeanMedianSDCV
Fibre intake (10 g/day)No CHD288.00 0.60 5.35 1.75 1.69 0.58 0.33
CHD45.00 0.76 2.43 1.49 1.51 0.40 0.27
diet %>%
  gf_boxploth(chd ~ fibre, fill = "indianred3", alpha = 0.7) %>%
  axis_labs()
Warning: Removed 4 rows containing non-finite values (stat_boxploth).

model_binom <- glm(chd ~ fibre, data = diet, family = binomial)

model_binom %>%
  glm_coef(labels = model_labels(model_binom))
ParameterOdds ratioPr(>|z|)
Constant0.95 (0.3, 3.01) 0.93
Fibre intake (10 g/day)0.33 (0.16, 0.67) 0.00
model_binom %>%
  plot_model("pred", terms = "fibre [all]", title = "")

Matched Case-Control Studies: Conditional Logistic Regression

data(bdendo, package = "Epi") 
bdendo <- bdendo %>%
  mutate(
    cancer = factor(d, labels = c('Control', 'Case')),
    gall = factor(gall, labels = c("No GBD", "GBD")),
    est = factor(est, labels = c("No oestrogen", "Oestrogen"))
  ) %>%
  var_labels(
    cancer = 'Endometrial cancer',
    gall = 'Gall bladder disease',
    est = 'Oestrogen'
  )
bdendo %>%
  mutate(
    cancer = relevel(cancer, ref = "Case"),
    est = relevel(est, ref = "Oestrogen"),
    gall = relevel(gall, ref = "GBD")
  ) %>%
  copy_labels(bdendo) %>%
  cross_tab(cancer ~ est + gall) 
Endometrial cancerlevelsCaseControlTotal
Total N (%)63 (20.0)252 (80.0)315
OestrogenOestrogen56 (88.9)127 (50.4)183 (58.1)
No oestrogen7 (11.1)125 (49.6)132 (41.9)
Gall bladder diseaseGBD17 (27.0)24 (9.5)41 (13.0)
No GBD46 (73.0)228 (90.5)274 (87.0)
library(survival)
model_clogit <- clogit(cancer == 'Case'  ~ est * gall + strata(set), data = bdendo)

model_clogit %>%
  glm_coef(labels = c("Oestrogen/No oestrogen", "GBD/No GBD",
                      "Oestrogen:GBD Interaction"))
ParameterOdds ratioPr(>|z|)
Oestrogen/No oestrogen14.88 (4.49, 49.36)< 0.001
GBD/No GBD18.07 (3.2, 102.01)0.001
Oestrogen:GBD Interaction0.13 (0.02, 0.9)0.039
require(ggeffects)
Loading required package: ggeffects
bdendo_pred <- ggemmeans(model_clogit, terms = c('gall', 'est'))
bdendo_pred %>%
  gf_pointrange(predicted + conf.low + conf.high ~ x|group, col = ~ x) %>%
  gf_labs(y = "P(cancer)", x = "", col = get_label(bdendo$gall))

Ordinal Logistic Regression

library(ordinal)

Attaching package: 'ordinal'
The following object is masked from 'package:dplyr':

    slice
data(housing, package = "MASS")
housing <- housing %>%
  var_labels(
    Sat = "Satisfaction",
    Infl = "Perceived influence",
    Type = "Type of rental",
    Cont = "Contact"
    )
model_clm <- clm(Sat ~ Infl + Type + Cont, weights = Freq, data = housing)
model_clm %>%
  glm_coef(labels = model_labels(model_clm, intercept = FALSE))
ParameterOrdinal ORPr(>|Z|)
Perceived influence: Low0.61 (0.48, 0.78)< 0.001
Perceived influence: Medium2 (1.56, 2.55)< 0.001
Contact: Low1.76 (1.44, 2.16)< 0.001
Perceived influence: High3.63 (2.83, 4.66)< 0.001
Contact: High0.56 (0.45, 0.71)< 0.001
Type of rental: Apartment0.69 (0.51, 0.94)0.018
Type of rental: Atrium0.34 (0.25, 0.45)< 0.001
Type of rental: Terrace1.43 (1.19, 1.73)< 0.001
model_clm %>%
  plot_model(type = "pred", terms = c("Infl", "Cont"), 
           dot.size = 1, title = "") %>%
  gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

model_clm %>%
  plot_model(type = "pred", terms = c("Infl", "Type"), 
           dot.size = 1, title = "") %>%
  gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

emmip(model_clm, Infl ~ Type |Cont) %>%
  gf_labs(x = "Type of rental", col = "Perceived influence")

Poisson Regression

data(quine, package = "MASS")
levels(quine$Eth) <- c("Aboriginal", "White")
levels(quine$Sex) <- c("Female", "Male")
quine <- quine %>%
  var_labels(
    Days = "Number of absent days",
    Eth = "Ethnicity",
    Age = "Age group"
    )

EDA

quine %>%
  group_by(Eth, Sex, Age) %>%
  summarise(
    n = n(),
    Mean = mean(Days, na.rm = TRUE),
    SD = sd(Days, na.rm = TRUE),
    Median = median(Days, na.rm = TRUE),
    CV = rel_dis(Days)
  )
`summarise()` regrouping output by 'Eth', 'Sex' (override with `.groups` argument)
# A tibble: 16 x 8
# Groups:   Eth, Sex [4]
   Eth        Sex    Age       n  Mean    SD Median    CV
   <fct>      <fct>  <fct> <int> <dbl> <dbl>  <dbl> <dbl>
 1 Aboriginal Female F0        5 17.6  17.4      11 0.987
 2 Aboriginal Female F1       15 18.9  16.3      13 0.865
 3 Aboriginal Female F2        9 32.6  27.3      20 0.839
 4 Aboriginal Female F3        9 14.6  14.9      10 1.02 
 5 Aboriginal Male   F0        8 11.5   7.23     12 0.629
 6 Aboriginal Male   F1        5  9.6   4.51      7 0.469
 7 Aboriginal Male   F2       11 30.9  17.8      32 0.576
 8 Aboriginal Male   F3        7 27.1  10.4      28 0.382
 9 White      Female F0        5 19.8   9.68     20 0.489
10 White      Female F1       17  7.76  6.48      6 0.834
11 White      Female F2       10  5.7   4.97      4 0.872
12 White      Female F3       10 13.5  11.5      12 0.851
13 White      Male   F0        9 13.6  20.9       7 1.54 
14 White      Male   F1        9  5.56  5.39      5 0.970
15 White      Male   F2       10 15.2  12.9      12 0.848
16 White      Male   F3        7 27.3  22.9      27 0.840

Model

model_pois <- glm(Days ~ Eth + Sex + Age, family = poisson, data = quine)

model_pois %>%
  glm_coef(labels = model_labels(model_pois), se_rob = TRUE)
ParameterRate ratioPr(>|z|)
Constant17.66 (10.66, 29.24)< 0.001
Ethnicity: White0.59 (0.39, 0.88)0.01
Sex: Male1.11 (0.77, 1.6)0.57
Age group: F10.8 (0.42, 1.5)0.482
Age group: F21.42 (0.85, 2.36)0.181
Age group: F31.35 (0.77, 2.34)0.292
model_pois %>% glance()
null.deviancedf.nulllogLikAICBICdeviancedf.residual
2073.53145-1165.492342.982360.881742.50140

Negative-binomial

deviance(model_pois) / df.residual(model_pois)
[1] 12.44646

Thus, we have over-dispersion. One option is to use a negative binomial distribution.

library(MASS)

Attaching package: 'MASS'
The following objects are masked _by_ '.GlobalEnv':

    birthwt, housing, quine
The following object is masked from 'package:dplyr':

    select
model_negbin <- glm.nb(Days ~ Eth + Sex + Age, data = quine)

model_negbin %>%
  glm_coef(labels = model_labels(model_negbin), se_rob = TRUE)
ParameterRate ratioPr(>|z|)
Constant20.24 (12.24, 33.47)< 0.001
Ethnicity: White0.57 (0.38, 0.84)0.005
Sex: Male1.07 (0.74, 1.53)0.73
Age group: F10.69 (0.39, 1.23)0.212
Age group: F21.2 (0.7, 2.03)0.507
Age group: F31.29 (0.73, 2.28)0.385
model_negbin %>% glance()
null.deviancedf.nulllogLikAICBICdeviancedf.residual
192.24145-547.831109.651130.54167.86140

age group is a factor with more than two levels and is significant:

model_negbin %>% Anova()
LR ChisqDfPr(>Chisq)
12.66 1.00 0.00
0.15 1.00 0.70
9.48 3.00 0.02
model_negbin %>%
  plot_model(type = "pred", terms = c("Age", "Eth"), 
           dot.size = 1.5, title = "")

emmip(model_negbin, Eth ~ Age|Sex) %>%
  gf_labs(y = "Number of absent days", x = "Age group", col = "Ethnicity")

## Adjusting CIs and p-values for multiple comparisons

multiple(model_negbin, ~ Age|Eth)$df
contrastEthratioSEz.ratiop.valuelower.CLupper.CL
F0 / F1Aboriginal 1.44 0.34 1.57 0.40 0.79 2.62
F0 / F2Aboriginal 0.84 0.19-0.77 0.86 0.46 1.52
F0 / F3Aboriginal 0.78 0.19-1.04 0.72 0.42 1.45
F1 / F2Aboriginal 0.58 0.12-2.65 0.04 0.34 0.98
F1 / F3Aboriginal 0.54 0.12-2.89 0.02 0.31 0.93
F2 / F3Aboriginal 0.93 0.20-0.34 0.99 0.53 1.63
F0 / F1White 1.44 0.34 1.57 0.40 0.79 2.62
F0 / F2White 0.84 0.19-0.77 0.86 0.46 1.52
F0 / F3White 0.78 0.19-1.04 0.72 0.42 1.45
F1 / F2White 0.58 0.12-2.65 0.04 0.34 0.98
F1 / F3White 0.54 0.12-2.89 0.02 0.31 0.93
F2 / F3White 0.93 0.20-0.34 0.99 0.53 1.63
multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
  gf_labs(x = "IRR")

multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
  gf_labs(x = "IRR")

Survival Analysis

effect of thiotepa versus placebo on the development of bladder cancer.

library(survival)
data(bladder)
bladder <- bladder %>%
  mutate(times = stop,
         rx = factor(rx, labels=c("Placebo", "Thiotepa"))
         ) %>%
  var_labels(times = "Survival time",
             rx = "Treatment")

Parametric method

model_surv <- survreg(Surv(times, event) ~ rx, data = bladder)
model_surv %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo", "Scale"), se_rob = TRUE)
ParameterSurvival time ratioPr(>|z|)
Treatment: Thiotepa/Placebo1.64 (0.89, 3.04)0.116
Scale1 (0.85, 1.18)0.992
model_exp <- survreg(Surv(times, event) ~ rx, data = bladder, dist = "exponential")
model_exp %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"), se_rob = TRUE)

ParameterSurvival time ratioPr(>|z|)
Treatment: Thiotepa/Placebo1.64 (0.85, 3.16)0.139
Interpretation: Patients receiving Thiotepa live on average 64% more than those in the Placebo group.

Using naive standard errors

model_exp %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"))
ParameterSurvival time ratioPr(>|z|)
Treatment: Thiotepa/Placebo1.64 (0.85, 3.16)0.139
model_exp %>%
  plot_model(type = "pred", terms = ~ rx, dot.size = 1.5, title = "") %>%
  gf_labs(y = "Survival time", x = "Treatment", title = "")

### Cox proportional hazards regression

model_cox <-  coxph(Surv(times, event) ~ rx, data = bladder)
model_cox %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"))
ParameterHazard ratioPr(>|z|)
Treatment: Thiotepa/Placebo0.64 (0.44, 0.94) 0.02
model_cox %>%
  plot_model(type = "pred", terms = ~ rx, dot.size = 1.5, 
           title = "") %>%
  gf_labs(x = "Treatment", title = "")

Mixed Linear Regression Models

Continuous outcomes

relationship between sex and age on the distance from the pituitary to the pterygomaxillary fissure (mm).

library(nlme)

Attaching package: 'nlme'
The following objects are masked from 'package:ordinal':

    ranef, VarCorr
The following object is masked from 'package:dplyr':

    collapse
data(Orthodont)
Orthodont <- Orthodont %>%
  var_labels(
    distance = "Pituitary distance (mm)",
    age = "Age (years)"
    )
model_lme <- lme(distance ~ Sex * I(age - mean(age, na.rm = TRUE)), random = ~ 1|Subject, 
                 method = "ML", data = Orthodont)
model_lme %>%
  glm_coef(labels = c(
    "Constant", 
    "Sex: female-male", 
    "Age (years)",
    "Sex:Age interaction"
    ))
ParameterCoefficientPr(>|t|)
Constant24.97 (24.03, 25.9)< 0.001
Sex: female-male-2.32 (-3.78, -0.86)0.005
Age (years)0.78 (0.63, 0.94)< 0.001
Sex:Age interaction-0.3 (-0.54, -0.07)0.015
model_lme %>%
  plot_model("pred", terms = age ~ Sex, 
           show.data = TRUE, jitter = 0.1, dot.size = 1.5) %>%
  gf_labs(y = get_label(Orthodont$distance), x = "Age (years)", title = "")

comments powered by Disqus