Models Diagnostics
Mickaël Canouil, Ph.D. (mickael.canouil@cnrs.fr)
Source:vignettes/articles/models-diagnostics.Rmd
models-diagnostics.Rmd
R setup
output_directory <- file.path(tempdir(), "models-diagnostics")
dir.create(output_directory, recursive = TRUE, showWarnings = FALSE, mode = "0775")
library(eggla)
library(growthcleanr)
library(data.table)
library(nlme)
library(stats)
library(grDevices)
library(ggplot2)
library(ggtext)
library(patchwork)
library(scales)
library(gt)
library(performance)
library(see)
library(qqplotr)
library(future)
library(future.apply)
library(future.callr)
plan("callr", workers = availableCores())
message(sprintf("Number of workers: %d", nbrOfWorkers()))
theme_update(
plot.title = element_markdown(),
plot.subtitle = element_markdown(face = "italic"),
plot.caption = element_markdown(face = "italic"),
axis.title.x = element_markdown(),
axis.text.x = element_markdown(),
axis.title.y = element_markdown(),
axis.text.y = element_markdown()
)
okabe_ito_palette <- c(
"#E69F00FF", "#56B4E9FF", "#009E73FF", "#F0E442FF", "#0072B2FF",
"#D55E00FF", "#CC79A7FF", "#999999FF"
)
Data
# pheno_dt <- setDT(get(data("bmigrowth", package = "eggla")))
pheno_dt <- fread("pheno_file")
Tidy data for Daymont’s quality control
-
age
- age in years (double). -
agedays
- age in days (integer). -
WEIGHTKG
- weight in kilograms (double). -
HEIGHTCM
- height in centimetres (double).
is_male_zero <- FALSE
pheno_dt[
j = `:=`(
"agedays" = floor(age * 365.25), # convert to age in days and as integers ...
"WEIGHTKG" = as.numeric(weight),
"HEIGHTCM" = as.numeric(height),
"sex_daymont" = {
if (is_male_zero) {
as.character(sex)
} else {
c("0" = "1", "1" = "0")[as.character(sex)]
}
}
)
]
Add Daymont’s quality control tags
visits_long <- melt(
data = pheno_dt,
id.vars = c("ID", "age", "sex", "agedays", "sex_daymont"),
measure.vars = c("WEIGHTKG", "HEIGHTCM"),
variable.name = "param",
value.name = "measurement"
)[
j = clean := cleangrowth(
subjid = ID,
param = param,
agedays = agedays,
sex = sex_daymont,
measurement = measurement,
quietly = FALSE
)
]
Model diagnostics
With Daymont’s code for sex, Female: 1 and Male: 0.
Setup all models to be tested
random_effect <- c(
"~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3)) | ID",
"~ 1 | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))[, 1:3] | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(2, 4), smooth = rep(2, 3)) | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(1, 4), smooth = rep(2, 3)) | ID",
"~ 1 | ID",
"~ poly(age, degree = 3) | ID",
"~ poly(age, degree = 2) | ID",
"~ age | ID",
"~ 1 | ID"
)
fixed_effect <- c(
rep("log(bmi) ~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3))", 2),
rep("log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))", 5),
rep("log(bmi) ~ poly(age, degree = 3)", 4)
)
all_models <- setDT(
expand.grid(
sex = c(0, 1),
dataset = c(
"visits_raw",
"visits_clean",
"visits_clean[between(age, 1, 18)]"
),
correlation_structure = c("NULL", "corCAR1(form = ~ 1 | ID)"),
models = sprintf("fixed = %s, random = %s", fixed_effect, random_effect),
default = paste0(
c(
"na.action = na.omit",
"method = \"ML\"",
"control = lmeControl(opt = \"optim\", maxIter = 500, msMaxIter = 500)"
),
collapse = ", "
),
lme = list(NULL),
results = list("Good"),
perf_check_model = list(NULL),
perf_model_performance = list(NULL),
stringsAsFactors = FALSE
)
)
Run all models
In parallel using future
.
all_models[["lme"]] <- future_mapply(
FUN = function(models, dataset, sex, correlation_structure, default) {
m <- sprintf(
"list(lme(%s, data = %s[sex_daymont == %s], correlation = %s, %s))",
models,
dataset,
sex,
correlation_structure,
default
)
tryCatch(
expr = eval(parse(text = m)),
error = function(e) sprintf("Error: %s", e$message),
warning = function(w) sprintf("Warning: %s", w$message)
)
},
models = all_models[["models"]],
dataset = all_models[["dataset"]],
sex = all_models[["sex"]],
correlation_structure = all_models[["correlation_structure"]],
default = all_models[["default"]],
future.globals = c("visits_raw", "visits_clean"),
future.packages = c("eggla", "nlme", "data.table"),
USE.NAMES = FALSE
)
all_models[["is_good"]] <- future_mapply(
FUN = function(lme) inherits(summary(lme), "lme"),
lme = all_models[["lme"]],
future.globals = FALSE,
future.packages = c("nlme"),
USE.NAMES = FALSE
)
all_models[
i = !(is_good),
j = results := list(lme)
]
all_models[
i = (is_good),
j = `:=`(
perf_check_model = lapply(
X = lme,
FUN = function(x) {
devnull <- capture.output(
out <- check_model(
x = x,
panel = FALSE,
check = c("normality", "linearity", "qq"),
verbose = FALSE
)
)
out
}
),
perf_model_performance = lapply(
X = lme,
FUN = function(x) {
performance::model_performance(
model = x,
metrics = "all",
estimator = "ML"
)[c("AIC", "BIC", "R2_conditional", "R2_marginal", "ICC", "RMSE", "Sigma")]
}
)
)
]
saveRDS(
object = all_models,
file = file.path(output_directory, "summary_models.rds"),
compress = FALSE
)
Models results
best_models <- paste(
"fixed = log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)),",
"random = ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID"
)
best_dataset <- "visits_clean"
good_res_models <- readRDS(file.path(output_directory, "summary_models.rds"))[
correlation_structure != "NULL" &
models %in% best_models &
dataset %in% best_dataset
]
wrap_plots(
lapply(
X = good_res_models[j = perf_check_model],
FUN = function(.x) {
if (is.null(.x)) {
wrap_plots(rep(list(ggplot() + geom_blank()), 3), nrow = 1, tag_level = "new")
} else {
wrap_plots(print(.x + theme_get()), nrow = 1, tag_level = "new")
}
}
),
nrow = 2
) +
plot_annotation(
title = "Cubic Splines (Fixed Effects = Random Effects)",
subtitle = "With A) sex = 0, and with B) sex = 1",
tag_levels = c("A", "1")
)
ggplot(
data = good_res_models[
j = rbindlist(lapply(
X = lme,
FUN = function(.lme) {
if (inherits(.lme, "lme")) {
.lme[["data"]][j = .SD, .SDcols = !"sex"]
}
}
)),
by = list(sex, dataset, models)
]
) +
aes(
x = age,
y = bmi,
linetype = c("0" = "Male", "1" = "Female")[as.character(sex)]
) +
geom_path( # comment/remove to not draw individual trajectories
mapping = aes(group = ID),
alpha = 0.10,
show.legend = FALSE
) +
stat_smooth(
mapping = aes(colour = "On observed values"),
se = FALSE,
method = "gam",
formula = y ~ s(x, bs = "cr")
) +
stat_smooth(
data = setnames(
x = good_res_models[
j = rbindlist(lapply(
X = lme,
FUN = function(.lme) {
if (inherits(.lme, "lme")) {
predict_bmi(
fit = .lme,
start = min(.lme[["data"]][["age"]]),
end = max(.lme[["data"]][["age"]])
)
}
}
)),
by = list(sex, dataset, models)
],
old = c("egg_ageyears", "egg_bmi"),
new = c("age", "bmi")
),
mapping = aes(colour = "On predicted values"),
se = FALSE,
method = "gam",
formula = y ~ s(x, bs = "cr")
) +
scale_x_sqrt(
expand = c(0, 0),
breaks = c(0, 0.5, 1, 1.5, 2, 5, 10, 15),
limits = c(0, NA)
) +
scale_colour_manual(values = c(okabe_ito_palette[1], okabe_ito_palette[3])) +
labs(
title = "Cubic Splines (Fixed Effects = Random Effects)",
x = "AGE (years)",
y = "BMI (kg/m\u00B2)",
colour = "GAM: y ~ s(x, bs = \"cr\")",
linetype = "Sex"
) +
theme(
legend.position = c(0.01, 0.99),
legend.justification = c("left", "top")
)
fmt_models <- all_models[
j = list(
sex = c("0" = "Male", "1" = "Female")[as.character(sex)],
dataset = c(
"visits_raw" = "Raw",
"visits_clean" = "Clean",
"visits_clean[between(age, 1, 18)]" = "Clean<br><i style='font-size: 6pt'>(1 ≤ age ≤ 18)</i>"
)[dataset],
correlation_structure,
model = setNames(
object = c(
sprintf("<b style='color: %s'>Linear Splines</b> - Random Linear Splines", okabe_ito_palette[4]),
sprintf("<b style='color: %s'>Linear Splines</b> - Random Intercepts", okabe_ito_palette[4]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines (1:3)", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Quadratic Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Linear Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Intercepts", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Cubic Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Quadratic Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Linear Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Intercepts", okabe_ito_palette[6])
),
nm = sprintf("fixed = %s, random = %s", fixed_effect, random_effect)
)[models],
results = sub(", block [0-9]+$| = [0-9.e-]+$", "", results),
perf = perf_model_performance
)
][
j = c("model", "random") := tstrsplit(model, " - ")
][
j = `:=`(
y = paste0(
model,
fifelse(correlation_structure == "NULL", "<br>", " -- C-AR1<br>"),
"<i style='font-size: 8pt'>", random, "</i>"
)
)
][
j = `:=`(
x = factor(x = dataset, levels = unique(dataset)),
y = factor(x = y, levels = rev(unique(y))),
fill = factor(
x = sub(":.*", "", results),
levels = c("Good", "Warning", "Error")
)
)
]
fwrite(
x = fmt_models[
j = unlist(perf, recursive = FALSE),
by = list(
sex,
dataset = gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", x),
model = gsub("<br>", " -- ", gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", y)),
status = sub("TRUE", "Good", results)
)
],
file = file.path(output_directory, "models-performance.csv")
)
Models status
ggplot(data = fmt_models) +
aes(x = x, y = y, fill = fill) +
facet_grid(cols = vars(sex)) +
geom_tile(colour = "white") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
scale_fill_manual(
values = c(
"Good" = okabe_ito_palette[2],
"Warning" = okabe_ito_palette[3],
"Error" = okabe_ito_palette[1]
)
) +
labs(x = NULL, y = NULL, fill = NULL) +
theme(legend.position = "top")
if (fmt_models[fill != "Good", .N > 0]) {
diag_tab <- gt(
data = fmt_models[
i = fill != "Good",
j = list(
Model = gsub("<i [^>]*>|</i>", "", gsub("<br>", " -- ", gsub("8pt", "0.75em", as.character(y)))),
Dataset = gsub("<br>", " ", gsub("|<i [^>]*>|</i>", "", gsub("6pt", "0.75em", as.character(x)))),
Result = results
)
]
) %>%
fmt(columns = everything(), fns = identity) %>%
tab_header(title = "Models Diagnostics") %>%
data_color(
columns = "Result",
colors = function(x) {
c(
"Good" = okabe_ito_palette[2],
"Warning" = okabe_ito_palette[3],
"Error" = okabe_ito_palette[1]
)[sub("TRUE", "Good", sub(":.*", "", x))]
},
apply_to = "text"
) %>%
tab_options(
table.font.size = "10pt",
data_row.padding = "2pt"
) %>%
opt_all_caps() %>%
opt_row_striping()
gtsave(diag_tab, file.path(output_directory, "models-diagnostics.html"))
}
Models performance
perf_tab <- gt(
data = fmt_models[
i = fill %in% "Good",
j = list(
Sex = sex,
Model = gsub("<i [^>]*>|</i>", "", gsub("<br>", " -- ", gsub("8pt", "0.75em", as.character(y)))),
Dataset = gsub("<br>", " ", gsub("|<i [^>]*>|</i>", "", gsub("6pt", "0.75em", as.character(x)))),
rbindlist(perf, fill = TRUE)
)
],
groupname_col = c("Sex", "Dataset")
) %>%
fmt(columns = c("Model", "Sex", "Dataset"), fns = identity) %>%
fmt_number(
columns = c("AIC", "BIC"),
decimals = 1,
drop_trailing_zeros = FALSE
) %>%
fmt_number(
columns = c("R2_conditional", "R2_marginal"),
n_sigfig = 3,
drop_trailing_zeros = FALSE
) %>%
fmt_number(
columns = c("RMSE", "Sigma"),
n_sigfig = 3,
drop_trailing_zeros = FALSE
) %>%
fmt_number(
columns = "ICC",
rows = ICC >= 0.10,
decimals = 3,
n_sigfig = 3,
drop_trailing_zeros = FALSE
) %>%
fmt_scientific(
columns = "ICC",
rows = ICC < 0.10,
decimals = 2,
drop_trailing_zeros = FALSE
) %>%
tab_header(title = "Models Performance") %>%
data_color(
columns = c("AIC", "BIC", "R2_conditional", "R2_marginal", "ICC", "RMSE", "Sigma"),
colors = function(x) {
sx <- rescale(x, to = c(0, 1))
col_numeric(palette = "plasma", domain = c(0, 1))(sx)
}
) %>%
tab_options(
table.font.size = "10pt",
data_row.padding = "2pt"
) %>%
opt_all_caps() %>%
opt_row_striping()
gtsave(perf_tab, file.path(output_directory, "models-performance.html"))
Update performance metrics
Load, update, and write new performance metrics
models_file_rds <- file.path(output_directory, "summary_models.rds")
all_models <- readRDS(models_file_rds)
all_models[
i = (is_good),
j = `:=`(
perf_check_model = lapply(
X = lme,
FUN = function(x) {
devnull <- capture.output(
out <- check_model(
x = x,
panel = FALSE,
check = c("normality", "linearity", "qq"),
verbose = FALSE
)
)
out
}
),
perf_model_performance = lapply(lme, model_performance)
)
]
saveRDS(object = all_models, file = models_file_rds, compress = FALSE)
Export performance as CSV
random_effect <- c(
"~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3)) | ID",
"~ 1 | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3)) | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))[, 1:3] | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(2, 4), smooth = rep(2, 3)) | ID",
"~ gsp(age, knots = c(2, 8, 12), degree = rep(1, 4), smooth = rep(2, 3)) | ID",
"~ 1 | ID",
"~ poly(age, degree = 3) | ID",
"~ poly(age, degree = 2) | ID",
"~ age | ID",
"~ 1 | ID"
)
fixed_effect <- c(
rep("log(bmi) ~ gsp(age, knots = c(0.75, 5.5, 11), degree = rep(1, 4), smooth = rep(0, 3))", 2),
rep("log(bmi) ~ gsp(age, knots = c(2, 8, 12), degree = rep(3, 4), smooth = rep(2, 3))", 5),
rep("log(bmi) ~ poly(age, degree = 3)", 4)
)
fmt_models <- all_models[
j = list(
sex = c("0" = "Male", "1" = "Female")[as.character(sex)],
dataset = c(
"visits_raw" = "Raw",
"visits_clean" = "Clean",
"visits_clean[between(age, 1, 18)]" = "Clean<br><i style='font-size: 6pt'>(1 ≤ age ≤ 18)</i>"
)[dataset],
correlation_structure,
model = setNames(
object = c(
sprintf("<b style='color: %s'>Linear Splines</b> - Random Linear Splines", okabe_ito_palette[4]),
sprintf("<b style='color: %s'>Linear Splines</b> - Random Intercepts", okabe_ito_palette[4]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Cubic Splines (1:3)", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Quadratic Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Linear Splines", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Splines</b> - Random Intercepts", okabe_ito_palette[5]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Cubic Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Quadratic Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Linear Slopes", okabe_ito_palette[6]),
sprintf("<b style='color: %s'>Cubic Slope</b> - Random Intercepts", okabe_ito_palette[6])
),
nm = sprintf("fixed = %s, random = %s", fixed_effect, random_effect)
)[models],
results = sub(", block [0-9]+$| = [0-9.e-]+$", "", results),
perf = perf_model_performance
)
][
j = c("model", "random") := tstrsplit(model, " - ")
][
j = `:=`(
y = paste0(
model,
fifelse(correlation_structure == "NULL", "<br>", " -- C-AR1<br>"),
"<i style='font-size: 8pt'>", random, "</i>"
)
)
][
j = `:=`(
x = factor(x = dataset, levels = unique(dataset)),
y = factor(x = y, levels = rev(unique(y))),
fill = factor(
x = sub(":.*", "", results),
levels = c("Good", "Warning", "Error")
)
)
]
fwrite(
x = fmt_models[
j = unlist(perf, recursive = FALSE),
by = list(
sex,
dataset = gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", x),
model = gsub("<br>", " -- ", gsub("<b [^>]*>|</b>|<i [^>]*>|</i>", "", y)),
status = sub("TRUE", "Good", results)
)
],
file = file.path(output_directory, "models-performance.csv")
)
Tidy performance data (subset)
performance_file_csv <- file.path(output_directory, "models-performance.csv")
fwrite(
file = file.path(output_directory, "models-performance-tidy.csv"),
scipen = 10,
x = dcast(
data = melt(
data = fread(performance_file_csv)[
dataset %in% "Clean" & !grepl("Intercepts", model)
][
j = .SD,
.SDcols = -c("dataset", "status")
][
j = model := factor(
x = model,
levels = c(
"Cubic Slope -- Random Cubic Slopes",
"Cubic Slope -- Random Quadratic Slopes",
"Cubic Slope -- Random Linear Slopes",
"Cubic Slope -- C-AR1 -- Random Cubic Slopes",
"Cubic Slope -- C-AR1 -- Random Quadratic Slopes",
"Cubic Slope -- C-AR1 -- Random Linear Slopes",
"Linear Splines -- Random Linear Splines",
"Linear Splines -- C-AR1 -- Random Linear Splines",
"Cubic Splines -- Random Cubic Splines",
"Cubic Splines -- Random Cubic Splines (1:3)",
"Cubic Splines -- Random Quadratic Splines",
"Cubic Splines -- Random Linear Splines",
"Cubic Splines -- C-AR1 -- Random Cubic Splines",
"Cubic Splines -- C-AR1 -- Random Cubic Splines (1:3)",
"Cubic Splines -- C-AR1 -- Random Quadratic Splines",
"Cubic Splines -- C-AR1 -- Random Linear Splines"
)
)
][
order(model)
],
id.vars = c("model", "sex"),
variable.name = "perf_metrics_name",
value.name = "perf_metrics_value"
)[
j = perf_metrics_name := factor(
x = perf_metrics_name,
levels = c(
"AIC", "BIC", "R2_conditional", "R2_marginal",
"ICC", "RMSE", "Sigma"
)
)
][
order(perf_metrics_name)
],
formula = model + perf_metrics_name ~ sex,
value.var = "perf_metrics_value"
)
)