Skip to contents

R setup

output_directory <- file.path(tempdir(), "models-diagnostics")
dir.create(output_directory, recursive = TRUE, showWarnings = FALSE, mode = "0775")

library(eggla)
library(growthcleanr)

library(data.table)
library(nlme)
library(stats)

library(grDevices)
library(ggplot2)
library(ggtext)
library(patchwork)
library(scales)

library(gt)
library(performance)
library(see)
library(qqplotr)

library(future)
library(future.apply)
library(future.callr)
plan("callr", workers = availableCores())
message(sprintf("Number of workers: %d", nbrOfWorkers()))
theme_update(
  plot.title = element_markdown(),
  plot.subtitle = element_markdown(face = "italic"),
  plot.caption = element_markdown(face = "italic"),
  axis.title.x = element_markdown(),
  axis.text.x = element_markdown(),
  axis.title.y = element_markdown(),
  axis.text.y = element_markdown()
)
okabe_ito_palette <- c(
  "#E69F00FF", "#56B4E9FF", "#009E73FF", "#F0E442FF", "#0072B2FF",
  "#D55E00FF", "#CC79A7FF", "#999999FF"
)

Data

# pheno_dt <- setDT(get(data("bmigrowth", package = "eggla")))
pheno_dt <- fread("pheno_file")

Tidy data for Daymont’s quality control

  • age - age in years (double).
  • agedays - age in days (integer).
  • WEIGHTKG - weight in kilograms (double).
  • HEIGHTCM - height in centimetres (double).
is_male_zero <- FALSE
pheno_dt[
  j = `:=`(
    "agedays" = floor(age * 365.25), # convert to age in days and as integers ...
    "WEIGHTKG" = as.numeric(weight),
    "HEIGHTCM" = as.numeric(height),
    "sex_daymont" = {
      if (is_male_zero) {
        as.character(sex)
      } else {
        c("0" = "1", "1" = "0")[as.character(sex)]
      }
    }
  )
]

Add Daymont’s quality control tags

visits_long <- melt(
  data = pheno_dt,
  id.vars = c("ID", "age", "sex", "agedays", "sex_daymont"),
  measure.vars = c("WEIGHTKG", "HEIGHTCM"),
  variable.name = "param",
  value.name = "measurement"
)[
  j = clean := cleangrowth(
    subjid = ID,
    param = param,
    agedays = agedays,
    sex = sex_daymont,
    measurement = measurement,
    quietly = FALSE
  )
]

Compute/Update BMI

  • BMI based on raw measures

    visits_raw <- dcast(
      data = visits_long,
      formula = ... ~ param,
      value.var = "measurement"
    )[
      j = "bmi" := WEIGHTKG / (HEIGHTCM / 100)^2 # recompute bmi based on QC variables
    ][
      !is.na(bmi) # exclude missing BMI related to measurements exclusion
    ]
  • BMI based on Clean measures

    visits_clean <- dcast(
      data = visits_long[clean %in% "Include"], # Exclude all flags
      formula = ... ~ param,
      value.var = "measurement"
    )[
      j = "bmi" := WEIGHTKG / (HEIGHTCM / 100)^2 # recompute bmi based on QC variables
    ][
      !is.na(bmi) # exclude missing BMI related to measurements exclusion
    ]

Model diagnostics

With Daymont’s code for sex, Female: 1 and Male: 0.

Setup all models to be tested

random_effect <- c(
  "~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3)) | ID",
  "~ 1 | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))[, 1:3] | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(2, 4), smooth = rep(2, 3)) | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(1, 4), smooth = rep(2, 3)) | ID",
  "~ 1 | ID",
  "~ poly(age, degree = 3) | ID",
  "~ poly(age, degree = 2) | ID",
  "~ age | ID",
  "~ 1 | ID"
)

fixed_effect <- c(
  rep("log(bmi) ~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3))", 2),
  rep("log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))", 5),
  rep("log(bmi) ~ poly(age, degree = 3)", 4)
)
all_models <- setDT(
  expand.grid(
    sex = c(0, 1),
    dataset = c(
      "visits_raw",
      "visits_clean",
      "visits_clean[between(age, 1, 18)]"
    ),
    correlation_structure = c("NULL", "corCAR1(form = ~ 1 | ID)"),
    models = sprintf("fixed = %s, random = %s", fixed_effect, random_effect),
    default = paste0(
      c(
        "na.action = na.omit",
        "method = \"ML\"",
        "control = lmeControl(opt = \"optim\", maxIter = 500, msMaxIter = 500)"
      ),
      collapse = ", "
    ),
    lme = list(NULL),
    results = list("Good"),
    perf_check_model = list(NULL),
    perf_model_performance = list(NULL),
    stringsAsFactors = FALSE
  )
)

Run all models

In parallel using future.

all_models[["lme"]] <- future_mapply(
  FUN = function(models, dataset, sex, correlation_structure, default) {
    m <- sprintf(
      "list(lme(%s, data = %s[sex_daymont == %s], correlation = %s, %s))",
      models,
      dataset,
      sex,
      correlation_structure,
      default
    )
    tryCatch(
      expr = eval(parse(text = m)),
      error = function(e) sprintf("Error: %s", e$message),
      warning = function(w) sprintf("Warning: %s", w$message)
    )
  },
  models = all_models[["models"]],
  dataset = all_models[["dataset"]],
  sex = all_models[["sex"]],
  correlation_structure = all_models[["correlation_structure"]],
  default = all_models[["default"]],
  future.globals = c("visits_raw", "visits_clean"),
  future.packages = c("eggla", "nlme", "data.table"),
  USE.NAMES = FALSE
)

all_models[["is_good"]] <- future_mapply(
  FUN = function(lme) inherits(summary(lme), "lme"),
  lme = all_models[["lme"]],
  future.globals = FALSE,
  future.packages = c("nlme"),
  USE.NAMES = FALSE
)

all_models[
  i = !(is_good),
  j = results := list(lme)
]

all_models[
  i = (is_good),
  j = `:=`(
    perf_check_model = lapply(
      X = lme,
      FUN = function(x) {
        devnull <- capture.output(
          out <- check_model(
            x = x,
            panel = FALSE,
            check = c("normality", "linearity", "qq"),
            verbose = FALSE
          )
        )
        out
      }
    ),
    perf_model_performance = lapply(
      X = lme,
      FUN = function(x) {
        performance::model_performance(
          model = x,
          metrics = "all",
          estimator = "ML"
        )[c("AIC", "BIC", "R2_conditional", "R2_marginal", "ICC", "RMSE", "Sigma")]
      }
    )
  )
]

saveRDS(
  object = all_models,
  file = file.path(output_directory, "summary_models.rds"),
  compress = FALSE
)

Models results

best_models <- paste(
  "fixed = log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)),",
  "random = ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID"
)
best_dataset <- "visits_clean"
good_res_models <- readRDS(file.path(output_directory, "summary_models.rds"))[
  correlation_structure != "NULL" &
    models %in% best_models &
    dataset %in% best_dataset
]
wrap_plots(
  lapply(
    X = good_res_models[j = perf_check_model],
    FUN = function(.x) {
      if (is.null(.x)) {
        wrap_plots(rep(list(ggplot() + geom_blank()), 3), nrow = 1, tag_level = "new")
      } else {
        wrap_plots(print(.x + theme_get()), nrow = 1, tag_level = "new")
      }
    }
  ),
  nrow = 2
) +
  plot_annotation(
    title = "Cubic Splines (Fixed Effects = Random Effects)",
    subtitle = "With A) sex = 0, and with B) sex = 1",
    tag_levels = c("A", "1")
  )
ggplot(
  data = good_res_models[
    j = rbindlist(lapply(
      X = lme,
      FUN = function(.lme) {
        if (inherits(.lme, "lme")) {
          .lme[["data"]][j = .SD, .SDcols = !"sex"]
        }
      }
    )),
    by = list(sex, dataset, models)
  ]
) +
  aes(
    x = age,
    y = bmi,
    linetype = c("0" = "Male", "1" = "Female")[as.character(sex)]
  ) +
  geom_path( # comment/remove to not draw individual trajectories
    mapping = aes(group = ID),
    alpha = 0.10,
    show.legend = FALSE
  ) +
  stat_smooth(
    mapping = aes(colour = "On observed values"),
    se = FALSE,
    method = "gam",
    formula = y ~ s(x, bs = "cr")
  ) +
  stat_smooth(
    data = setnames(
      x = good_res_models[
        j = rbindlist(lapply(
          X = lme,
          FUN = function(.lme) {
            if (inherits(.lme, "lme")) {
              predict_bmi(
                fit = .lme,
                start = min(.lme[["data"]][["age"]]),
                end = max(.lme[["data"]][["age"]])
              )
            }
          }
        )),
        by = list(sex, dataset, models)
      ],
      old = c("egg_ageyears", "egg_bmi"),
      new = c("age", "bmi")
    ),
    mapping = aes(colour = "On predicted values"),
    se = FALSE,
    method = "gam",
    formula = y ~ s(x, bs = "cr")
  ) +
  scale_x_sqrt(
    expand = c(0, 0),
    breaks = c(0, 0.5, 1, 1.5, 2, 5, 10, 15),
    limits = c(0, NA)
  ) +
  scale_colour_manual(values = c(okabe_ito_palette[1], okabe_ito_palette[3])) +
  labs(
    title = "Cubic Splines (Fixed Effects = Random Effects)",
    x = "AGE (years)",
    y = "BMI (kg/m\u00B2)",
    colour = "GAM: y ~ s(x, bs = \"cr\")",
    linetype = "Sex"
  ) +
  theme(
    legend.position = c(0.01, 0.99),
    legend.justification = c("left", "top")
  )
fmt_models <- all_models[
  j = list(
    sex = c("0" = "Male", "1" = "Female")[as.character(sex)],
    dataset = c(
      "visits_raw" = "Raw",
      "visits_clean" = "Clean",
      "visits_clean[between(age, 1, 18)]" = "Clean<br><i style='font-size: 6pt'>(1 ≤ age ≤ 18)</i>"
    )[dataset],
    correlation_structure,
    model = setNames(
      object = c(
        sprintf("<b style='color: %s'>Linear Splines</b> - Random Linear Splines", okabe_ito_palette[4]),
        sprintf("<b style='color: %s'>Linear Splines</b> - Random Intercepts", okabe_ito_palette[4]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines (1:3)", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Quadratic Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Linear Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Intercepts", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Cubic Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Quadratic Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Linear Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Intercepts", okabe_ito_palette[6])
      ),
      nm = sprintf("fixed = %s, random = %s", fixed_effect, random_effect)
    )[models],
    results = sub(", block [0-9]+$| = [0-9.e-]+$", "", results),
    perf = perf_model_performance
  )
][
  j = c("model", "random") := tstrsplit(model, " - ")
][
  j = `:=`(
    y = paste0(
      model,
      fifelse(correlation_structure == "NULL", "<br>", " -- C-AR1<br>"),
      "<i style='font-size: 8pt'>", random, "</i>"
    )
  )
][
  j = `:=`(
    x = factor(x = dataset, levels = unique(dataset)),
    y = factor(x = y, levels = rev(unique(y))),
    fill = factor(
      x = sub(":.*", "", results),
      levels = c("Good", "Warning", "Error")
    )
  )
]

fwrite(
  x = fmt_models[
    j = unlist(perf, recursive = FALSE),
    by = list(
      sex,
      dataset = gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", x),
      model = gsub("<br>", " -- ", gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", y)),
      status = sub("TRUE", "Good", results)
    )
  ],
  file = file.path(output_directory, "models-performance.csv")
)

Models status

ggplot(data = fmt_models) +
  aes(x = x, y = y, fill = fill) +
  facet_grid(cols = vars(sex)) +
  geom_tile(colour = "white") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_manual(
    values = c(
      "Good" = okabe_ito_palette[2],
      "Warning" = okabe_ito_palette[3],
      "Error" = okabe_ito_palette[1]
    )
  ) +
  labs(x = NULL, y = NULL, fill = NULL) +
  theme(legend.position = "top")
if (fmt_models[fill != "Good", .N > 0]) {
  diag_tab <- gt(
    data = fmt_models[
      i = fill != "Good",
      j = list(
        Model = gsub("<i [^>]*>|</i>", "", gsub("<br>", " -- ", gsub("8pt", "0.75em", as.character(y)))),
        Dataset = gsub("<br>", " ", gsub("|<i [^>]*>|</i>", "", gsub("6pt", "0.75em", as.character(x)))),
        Result = results
      )
    ]
  ) %>%
    fmt(columns = everything(), fns = identity) %>%
    tab_header(title = "Models Diagnostics") %>%
    data_color(
      columns = "Result",
      colors = function(x) {
        c(
          "Good" = okabe_ito_palette[2],
          "Warning" = okabe_ito_palette[3],
          "Error" = okabe_ito_palette[1]
        )[sub("TRUE", "Good", sub(":.*", "", x))]
      },
      apply_to = "text"
    ) %>%
    tab_options(
      table.font.size = "10pt",
      data_row.padding = "2pt"
    ) %>%
    opt_all_caps() %>%
    opt_row_striping()

  gtsave(diag_tab, file.path(output_directory, "models-diagnostics.html"))
}

Models performance

perf_tab <- gt(
  data = fmt_models[
    i = fill %in% "Good",
    j = list(
      Sex = sex,
      Model = gsub("<i [^>]*>|</i>", "", gsub("<br>", " -- ", gsub("8pt", "0.75em", as.character(y)))),
      Dataset = gsub("<br>", " ", gsub("|<i [^>]*>|</i>", "", gsub("6pt", "0.75em", as.character(x)))),
      rbindlist(perf, fill = TRUE)
    )
  ],
  groupname_col = c("Sex", "Dataset")
) %>%
  fmt(columns = c("Model", "Sex", "Dataset"), fns = identity) %>%
  fmt_number(
    columns = c("AIC", "BIC"),
    decimals = 1,
    drop_trailing_zeros = FALSE
  ) %>%
  fmt_number(
    columns = c("R2_conditional", "R2_marginal"),
    n_sigfig = 3,
    drop_trailing_zeros = FALSE
  ) %>%
  fmt_number(
    columns = c("RMSE", "Sigma"),
    n_sigfig = 3,
    drop_trailing_zeros = FALSE
  ) %>%
  fmt_number(
    columns = "ICC",
    rows = ICC >= 0.10,
    decimals = 3,
    n_sigfig = 3,
    drop_trailing_zeros = FALSE
  ) %>%
  fmt_scientific(
    columns = "ICC",
    rows = ICC < 0.10,
    decimals = 2,
    drop_trailing_zeros = FALSE
  ) %>%
  tab_header(title = "Models Performance") %>%
  data_color(
    columns = c("AIC", "BIC", "R2_conditional", "R2_marginal", "ICC", "RMSE", "Sigma"),
    colors = function(x) {
      sx <- rescale(x, to = c(0, 1))
      col_numeric(palette = "plasma", domain = c(0, 1))(sx)
    }
  ) %>%
  tab_options(
    table.font.size = "10pt",
    data_row.padding = "2pt"
  ) %>%
  opt_all_caps() %>%
  opt_row_striping()

gtsave(perf_tab, file.path(output_directory, "models-performance.html"))

Update performance metrics

Load, update, and write new performance metrics

models_file_rds <- file.path(output_directory, "summary_models.rds")
all_models <- readRDS(models_file_rds)
all_models[
  i = (is_good),
  j = `:=`(
    perf_check_model = lapply(
      X = lme,
      FUN = function(x) {
        devnull <- capture.output(
          out <- check_model(
            x = x,
            panel = FALSE,
            check = c("normality", "linearity", "qq"),
            verbose = FALSE
          )
        )
        out
      }
    ),
    perf_model_performance = lapply(lme, model_performance)
  )
]
saveRDS(object = all_models, file = models_file_rds, compress = FALSE)

Export performance as CSV

random_effect <- c(
  "~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3)) | ID",
  "~ 1 | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))[, 1:3] | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(2, 4), smooth = rep(2, 3)) | ID",
  "~ gsp(age, knots = c(2, 8, 12), degree = rep(1, 4), smooth = rep(2, 3)) | ID",
  "~ 1 | ID",
  "~ poly(age, degree = 3) | ID",
  "~ poly(age, degree = 2) | ID",
  "~ age | ID",
  "~ 1 | ID"
)

fixed_effect <- c(
  rep("log(bmi) ~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3))", 2),
  rep("log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))", 5),
  rep("log(bmi) ~ poly(age, degree = 3)", 4)
)
fmt_models <- all_models[
  j = list(
    sex = c("0" = "Male", "1" = "Female")[as.character(sex)],
    dataset = c(
      "visits_raw" = "Raw",
      "visits_clean" = "Clean",
      "visits_clean[between(age, 1, 18)]" = "Clean<br><i style='font-size: 6pt'>(1 ≤ age ≤ 18)</i>"
    )[dataset],
    correlation_structure,
    model = setNames(
      object = c(
        sprintf("<b style='color: %s'>Linear Splines</b> - Random Linear Splines", okabe_ito_palette[4]),
        sprintf("<b style='color: %s'>Linear Splines</b> - Random Intercepts", okabe_ito_palette[4]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines (1:3)", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Quadratic Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Linear Splines", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Splines</b> - Random Intercepts", okabe_ito_palette[5]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Cubic Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Quadratic Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Linear Slopes", okabe_ito_palette[6]),
        sprintf("<b style='color: %s'>Cubic Slope</b> - Random Intercepts", okabe_ito_palette[6])
      ),
      nm = sprintf("fixed = %s, random = %s", fixed_effect, random_effect)
    )[models],
    results = sub(", block [0-9]+$| = [0-9.e-]+$", "", results),
    perf = perf_model_performance
  )
][
  j = c("model", "random") := tstrsplit(model, " - ")
][
  j = `:=`(
    y = paste0(
      model,
      fifelse(correlation_structure == "NULL", "<br>", " -- C-AR1<br>"),
      "<i style='font-size: 8pt'>", random, "</i>"
    )
  )
][
  j = `:=`(
    x = factor(x = dataset, levels = unique(dataset)),
    y = factor(x = y, levels = rev(unique(y))),
    fill = factor(
      x = sub(":.*", "", results),
      levels = c("Good", "Warning", "Error")
    )
  )
]

fwrite(
  x = fmt_models[
    j = unlist(perf, recursive = FALSE),
    by = list(
      sex,
      dataset = gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", x),
      model = gsub("<br>", " -- ", gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", y)),
      status = sub("TRUE", "Good", results)
    )
  ],
  file = file.path(output_directory, "models-performance.csv")
)

Tidy performance data (subset)

performance_file_csv <- file.path(output_directory, "models-performance.csv")
fwrite(
  file = file.path(output_directory, "models-performance-tidy.csv"),
  scipen = 10,
  x = dcast(
    data = melt(
      data = fread(performance_file_csv)[
        dataset %in% "Clean" & !grepl("Intercepts", model)
      ][
        j = .SD,
        .SDcols = -c("dataset", "status")
      ][
        j = model := factor(
          x = model,
          levels = c(
            "Cubic Slope -- Random Cubic Slopes",
            "Cubic Slope -- Random Quadratic Slopes",
            "Cubic Slope -- Random Linear Slopes",
            "Cubic Slope -- C-AR1 -- Random Cubic Slopes",
            "Cubic Slope -- C-AR1 -- Random Quadratic Slopes",
            "Cubic Slope -- C-AR1 -- Random Linear Slopes",
            "Linear Splines -- Random Linear Splines",
            "Linear Splines -- C-AR1 -- Random Linear Splines",
            "Cubic Splines -- Random Cubic Splines",
            "Cubic Splines -- Random Cubic Splines (1:3)",
            "Cubic Splines -- Random Quadratic Splines",
            "Cubic Splines -- Random Linear Splines",
            "Cubic Splines -- C-AR1 -- Random Cubic Splines",
            "Cubic Splines -- C-AR1 -- Random Cubic Splines (1:3)",
            "Cubic Splines -- C-AR1 -- Random Quadratic Splines",
            "Cubic Splines -- C-AR1 -- Random Linear Splines"
          )
        )
      ][
        order(model)
      ],
      id.vars = c("model", "sex"),
      variable.name = "perf_metrics_name",
      value.name = "perf_metrics_value"
    )[
      j = perf_metrics_name := factor(
        x = perf_metrics_name,
        levels = c(
          "AIC", "BIC", "R2_conditional", "R2_marginal",
          "ICC", "RMSE", "Sigma"
        )
      )
    ][
      order(perf_metrics_name)
    ],
    formula = model + perf_metrics_name ~ sex,
    value.var = "perf_metrics_value"
  )
)