Model construction
library(h2o)
nfolds = 5
# GLM grid parameters
glm_params <- list(lambda = seq(from = 0, to = 1, by = 0.1))
# DRF grid parameters
rf_params <- list(ntrees = 10000, max_depth = c(5, 10, 15, 20),
nbins = c(30, 100, 300), nbins_cats = c(64, 256, 1024), sample_rate = c(0.7,
0.8, 0.9, 1))
# GBM grid parameters
gbm_params <- list(ntrees = 10000, max_depth = c(5, 10, 15, 20),
learn_rate = c(0.01, 0.1), sample_rate = c(0.7, 0.8, 0.9,
1), col_sample_rate = c(0.7, 0.8, 0.9, 1), nbins = c(30,
100, 300), nbins_cats = c(64, 256, 1024))
# XGboost grid parameters
xgboost_params <- list(max_depth = c(5, 10, 15, 20), ntrees = 10000,
sample_rate = c(0.7, 0.8, 0.9, 1))
# Deep learning grid parameters
deeplearning_params <- list(activation = c("Rectifier", "Maxout",
"Tanh", "RectifierWithDropout", "MaxoutWithDropout", "TanhWithDropout"),
hidden = list(c(50, 50, 50, 50), c(200, 200), c(200, 200,
200), c(200, 200, 200, 200)), epochs = c(50, 100, 200),
adaptive_rate = c(TRUE, FALSE), rate = c(0, 0.1, 0.005, 0.001),
input_dropout_ratio = c(0, 0.1, 0.2))
# Train a super learner model on a
# split of data
train_SL <- function(train_data, split_num,
iter_num) {
models_path = paste0("./models/", classes,
"/split", split_num, "/iter", iter_num)
train = as.h2o(train_data)
y = "response"
x = setdiff(names(train), y)
train[y] <- as.factor(train[y])
res <- as.data.frame(train_data$response)
samp_factors <- as.vector(mean(table(res))/table(res))
# train XGboost base models
xgboost <- h2o.grid(algorithm = "xgboost",
x = x, y = y, training_frame = train,
nfolds = nfolds, keep_cross_validation_predictions = TRUE,
hyper_params = xgboost_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete",
max_models = 100), keep_cross_validation_models = FALSE,
fold_assignment = "Modulo", parallelism = 0)
# train GLM base models
glm <- h2o.grid(algorithm = "glm", x = x,
y = y, training_frame = train, nfolds = nfolds,
keep_cross_validation_predictions = TRUE,
hyper_params = glm_params, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete",
max_models = 100), keep_cross_validation_models = FALSE,
fold_assignment = "Modulo", parallelism = 0)
# train DFR base models
rf <- h2o.grid(algorithm = "randomForest",
x = x, y = y, training_frame = train,
nfolds = nfolds, keep_cross_validation_predictions = TRUE,
hyper_params = rf_params, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete",
max_models = 100), keep_cross_validation_models = FALSE,
fold_assignment = "Modulo", parallelism = 0)
# train GBM base models
gbm <- h2o.grid(algorithm = "gbm", x = x,
y = y, training_frame = train, nfolds = nfolds,
keep_cross_validation_predictions = TRUE,
hyper_params = gbm_params, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete",
max_models = 100), keep_cross_validation_models = FALSE,
fold_assignment = "Modulo", parallelism = 0)
# train Deep learning base models
deeplearning <- h2o.grid(algorithm = "deeplearning",
x = x, y = y, training_frame = train,
nfolds = nfolds, keep_cross_validation_predictions = TRUE,
hyper_params = deeplearning_params,
stopping_rounds = 3, balance_classes = TRUE,
class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete",
max_models = 100), keep_cross_validation_models = FALSE,
fold_assignment = "Modulo", parallelism = 0)
# list base models
base_models <- as.list(c(unlist(glm@model_ids),
unlist(rf@model_ids), unlist(gbm@model_ids),
unlist(xgboost@model_ids), unlist(deeplearning@model_ids)))
# get prediction of each base mode
for (model_id in base_models) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_",
model_id))$predict, col.names = model_id))
h2o.saveModel(h2o.getModel(model_id),
path = models_path, force = TRUE)
}
# build SL model
sl <- h2o.stackedEnsemble(x = x, y = y,
model_id = "superlearner", training_frame = train,
base_models = base_models, metalearner_algorithm = "glm",
metalearner_nfolds = nfolds, keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE,
keep_cross_validation_predictions = TRUE))
# save metalearner predictions
model_id = sl@model[["metalearner"]][["name"]]
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_",
model_id))$predict, col.names = model_id))
# save SL model
h2o.saveModel(sl, path = models_path,
force = TRUE)
h2o.saveModel(h2o.getModel(sl@model[["metalearner"]][["name"]]),
path = models_path, force = TRUE)
# save all models predictions
write.csv(res, file = paste0(models_path,
"/cv_holdout_predictions.csv"), row.names = FALSE)
}
# Build a SL model for all shuffles and splits of a data
build_SL <- function(data, all_splits, setup_name) {
seq = 1:5
# loop over all shuffles
for (split_num in 1:5) {
split = all_splits[[split_num]]
split_preds = data.frame(matrix(nrow = 0, ncol = 4))
colnames(split_preds) = c("predict", "p0", "p1", "PDGP")
# loop over splits of a shuffle
for (i in 1:5) {
test = split[[i]]
train_indx = seq[seq != i]
train = data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(train) = colnames(data)
for (indx in train_indx) {
train = rbind(train, split[[indx]])
}
train_PDGP = train$PDGP
test_PDGP = test$PDGP
train = subset(train, select = -PDGP)
test = subset(test, select = -PDGP)
# train the SL model
train_SL(train, split_num = split_num, iter_num = i)
# predict on test data
sl = h2o.loadModel(paste0("./models/", setup_name,
"/split", split_num, "/iter", i, "/superlearner"))
preds = as.data.frame(h2o.predict(sl, as.h2o(test)))
preds$PDGP = test_PDGP
# combine splits predictions
split_preds = rbind(split_preds, preds)
}
# save predictions of the shuffled data
write.csv(split_preds, file = paste0("./models/", setup_name,
"/split", split_num, "/sl_predictions.csv"), row.names = FALSE)
}
}
PD participants and controls
load("./rdata/var_reduct_PD_control_splits.RData")
build_SL(data, all_splits, "PD_HC")
Mild PD participants and controls
load("./rdata/var_reduct_HY_early_HC_splits.RData")
build_SL(data, all_splits, "HY_early_HC")
Moderate PD participants and controls
load("./rdata/var_reduct_HY_mild_HC_splits.RData")
build_SL(data, all_splits, "HY_mild_HC")
Severe PD participants and controls
load("./rdata/var_reduct_HY_severe_HC_splits.RData")
build_SL(data, all_splits, "HY_severe_HC")