Section 4 Model construction

library(h2o)

nfolds = 5
# GLM grid parameters
glm_params <- list(lambda = seq(from = 0, to = 1, by = 0.1))
# DRF grid parameters
rf_params <- list(ntrees = 10000, max_depth = c(5, 10, 15, 20),
    nbins = c(30, 100, 300), nbins_cats = c(64, 256, 1024), sample_rate = c(0.7,
        0.8, 0.9, 1))
# GBM grid parameters
gbm_params <- list(ntrees = 10000, max_depth = c(5, 10, 15, 20),
    learn_rate = c(0.01, 0.1), sample_rate = c(0.7, 0.8, 0.9,
        1), col_sample_rate = c(0.7, 0.8, 0.9, 1), nbins = c(30,
        100, 300), nbins_cats = c(64, 256, 1024))
# XGboost grid parameters
xgboost_params <- list(max_depth = c(5, 10, 15, 20), ntrees = 10000,
    sample_rate = c(0.7, 0.8, 0.9, 1))
# Deep learning grid parameters
deeplearning_params <- list(activation = c("Rectifier", "Maxout",
    "Tanh", "RectifierWithDropout", "MaxoutWithDropout", "TanhWithDropout"),
    hidden = list(c(50, 50, 50, 50), c(200, 200), c(200, 200,
        200), c(200, 200, 200, 200)), epochs = c(50, 100, 200),
    adaptive_rate = c(TRUE, FALSE), rate = c(0, 0.1, 0.005, 0.001),
    input_dropout_ratio = c(0, 0.1, 0.2))
# Train a super learner model on a
# split of data
train_SL <- function(train_data, split_num,
    iter_num) {
    models_path = paste0("./models/", classes,
        "/split", split_num, "/iter", iter_num)
    train = as.h2o(train_data)
    y = "response"
    x = setdiff(names(train), y)
    train[y] <- as.factor(train[y])
    res <- as.data.frame(train_data$response)
    samp_factors <- as.vector(mean(table(res))/table(res))
    # train XGboost base models
    xgboost <- h2o.grid(algorithm = "xgboost",
        x = x, y = y, training_frame = train,
        nfolds = nfolds, keep_cross_validation_predictions = TRUE,
        hyper_params = xgboost_params, stopping_rounds = 3,
        search_criteria = list(strategy = "RandomDiscrete",
            max_models = 100), keep_cross_validation_models = FALSE,
        fold_assignment = "Modulo", parallelism = 0)
    # train GLM base models
    glm <- h2o.grid(algorithm = "glm", x = x,
        y = y, training_frame = train, nfolds = nfolds,
        keep_cross_validation_predictions = TRUE,
        hyper_params = glm_params, stopping_rounds = 3,
        balance_classes = TRUE, class_sampling_factors = samp_factors,
        search_criteria = list(strategy = "RandomDiscrete",
            max_models = 100), keep_cross_validation_models = FALSE,
        fold_assignment = "Modulo", parallelism = 0)
    # train DFR base models
    rf <- h2o.grid(algorithm = "randomForest",
        x = x, y = y, training_frame = train,
        nfolds = nfolds, keep_cross_validation_predictions = TRUE,
        hyper_params = rf_params, stopping_rounds = 3,
        balance_classes = TRUE, class_sampling_factors = samp_factors,
        search_criteria = list(strategy = "RandomDiscrete",
            max_models = 100), keep_cross_validation_models = FALSE,
        fold_assignment = "Modulo", parallelism = 0)
    # train GBM base models
    gbm <- h2o.grid(algorithm = "gbm", x = x,
        y = y, training_frame = train, nfolds = nfolds,
        keep_cross_validation_predictions = TRUE,
        hyper_params = gbm_params, stopping_rounds = 3,
        balance_classes = TRUE, class_sampling_factors = samp_factors,
        search_criteria = list(strategy = "RandomDiscrete",
            max_models = 100), keep_cross_validation_models = FALSE,
        fold_assignment = "Modulo", parallelism = 0)
    # train Deep learning base models
    deeplearning <- h2o.grid(algorithm = "deeplearning",
        x = x, y = y, training_frame = train,
        nfolds = nfolds, keep_cross_validation_predictions = TRUE,
        hyper_params = deeplearning_params,
        stopping_rounds = 3, balance_classes = TRUE,
        class_sampling_factors = samp_factors,
        search_criteria = list(strategy = "RandomDiscrete",
            max_models = 100), keep_cross_validation_models = FALSE,
        fold_assignment = "Modulo", parallelism = 0)
    # list base models
    base_models <- as.list(c(unlist(glm@model_ids),
        unlist(rf@model_ids), unlist(gbm@model_ids),
        unlist(xgboost@model_ids), unlist(deeplearning@model_ids)))
    # get prediction of each base mode
    for (model_id in base_models) {
        res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_",
            model_id))$predict, col.names = model_id))
        h2o.saveModel(h2o.getModel(model_id),
            path = models_path, force = TRUE)
    }
    # build SL model
    sl <- h2o.stackedEnsemble(x = x, y = y,
        model_id = "superlearner", training_frame = train,
        base_models = base_models, metalearner_algorithm = "glm",
        metalearner_nfolds = nfolds, keep_levelone_frame = TRUE,
        metalearner_params = list(standardize = TRUE,
            keep_cross_validation_predictions = TRUE))
    # save metalearner predictions
    model_id = sl@model[["metalearner"]][["name"]]
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_",
        model_id))$predict, col.names = model_id))
    # save SL model
    h2o.saveModel(sl, path = models_path,
        force = TRUE)
    h2o.saveModel(h2o.getModel(sl@model[["metalearner"]][["name"]]),
        path = models_path, force = TRUE)
    # save all models predictions
    write.csv(res, file = paste0(models_path,
        "/cv_holdout_predictions.csv"), row.names = FALSE)
}
# Build a SL model for all shuffles and splits of a data
build_SL <- function(data, all_splits, setup_name) {
    seq = 1:5
    # loop over all shuffles
    for (split_num in 1:5) {
        split = all_splits[[split_num]]
        split_preds = data.frame(matrix(nrow = 0, ncol = 4))
        colnames(split_preds) = c("predict", "p0", "p1", "PDGP")
        # loop over splits of a shuffle
        for (i in 1:5) {
            test = split[[i]]
            train_indx = seq[seq != i]
            train = data.frame(matrix(nrow = 0, ncol = length(data)))
            colnames(train) = colnames(data)
            for (indx in train_indx) {
                train = rbind(train, split[[indx]])
            }
            train_PDGP = train$PDGP
            test_PDGP = test$PDGP
            train = subset(train, select = -PDGP)
            test = subset(test, select = -PDGP)
            # train the SL model
            train_SL(train, split_num = split_num, iter_num = i)
            # predict on test data
            sl = h2o.loadModel(paste0("./models/", setup_name,
                "/split", split_num, "/iter", i, "/superlearner"))
            preds = as.data.frame(h2o.predict(sl, as.h2o(test)))
            preds$PDGP = test_PDGP
            # combine splits predictions
            split_preds = rbind(split_preds, preds)
        }
        # save predictions of the shuffled data
        write.csv(split_preds, file = paste0("./models/", setup_name,
            "/split", split_num, "/sl_predictions.csv"), row.names = FALSE)
    }
}

4.1 PD participants and controls

load("./rdata/var_reduct_PD_control_splits.RData")
build_SL(data, all_splits, "PD_HC")

4.2 Mild PD participants and controls

load("./rdata/var_reduct_HY_early_HC_splits.RData")
build_SL(data, all_splits, "HY_early_HC")

4.3 Moderate PD participants and controls

load("./rdata/var_reduct_HY_mild_HC_splits.RData")
build_SL(data, all_splits, "HY_mild_HC")

4.4 Severe PD participants and controls

load("./rdata/var_reduct_HY_severe_HC_splits.RData")
build_SL(data, all_splits, "HY_severe_HC")