Section 5 Evaluation

library(stringr)
library(MLmetrics)
library(h2o)
library(reshape2)
library(ggplot2)
library(readxl)
library(stringr)
library(ggforce)
library(cowplot)
library(plotly)
library(ggrepel)
library(gridExtra)
library(scales)

5.1 Performance evaluation

5.1.1 PD participants and controls

load("./rdata/var_reduct_PD_control_splits.RData")

sl_preds = read.csv("./files/PD_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)

num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
    sl_preds = read.csv(paste0("./files/PD_HC/split", i,
        "/sl_predictions.csv"))
    sl_preds <- sl_preds[order(sl_preds$PDGP), ]
    sl_preds = subset(sl_preds, select = -PDGP)
    colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
        i)
    all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
    "predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
    function(x) {
        uniqx <- unique(na.omit(x))
        uniqx[which.max(tabulate(match(x, uniqx)))]
    })

cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
    levels = c(1, 0)), factor(data$response, levels = c(1,
    0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
    0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
    0)))

print(cf$table)
##           Reference
## Prediction   1   0
##          1 248   9
##          0  14  41
cat("Accuracy = ", round(cf$overall[["Accuracy"]], 4) *
    100, "%\n")
## Accuracy =  92.63 %
cat("Sensitivity = ", round(sens, 3), "\n")
## Sensitivity =  0.947
cat("Specificity = ", round(spec, 3), "\n")
## Specificity =  0.82
cat("F1 score = ", round(f1_score, 3), "\n")
## F1 score =  0.956

5.1.2 Mild PD participants and controls

load("./rdata/var_reduct_HY_early_HC_splits.RData")

sl_preds = read.csv("./files/HY_early_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)

num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
    sl_preds = read.csv(paste0("./files/HY_early_HC/split",
        i, "/sl_predictions.csv"))
    sl_preds <- sl_preds[order(sl_preds$PDGP), ]
    sl_preds = subset(sl_preds, select = -PDGP)
    colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
        i)
    all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
    "predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
    function(x) {
        uniqx <- unique(na.omit(x))
        uniqx[which.max(tabulate(match(x, uniqx)))]
    })

cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
    levels = c(1, 0)), factor(data$response, levels = c(1,
    0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
    0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
    0)))

print(cf$table)
##           Reference
## Prediction   1   0
##          1 174  14
##          0  11  36
cat("Accuracy = ", round(cf$overall[["Accuracy"]], 4) *
    100, "%\n")
## Accuracy =  89.36 %
cat("Sensitivity = ", round(sens, 3), "\n")
## Sensitivity =  0.941
cat("Specificity = ", round(spec, 3), "\n")
## Specificity =  0.72
cat("F1 score = ", round(f1_score, 3), "\n")
## F1 score =  0.933

5.1.3 Moderate PD participants and controls

load("./rdata/var_reduct_HY_mild_HC_splits.RData")

sl_preds = read.csv("./files/HY_mild_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)

num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
    sl_preds = read.csv(paste0("./files/HY_mild_HC/split",
        i, "/sl_predictions.csv"))
    sl_preds <- sl_preds[order(sl_preds$PDGP), ]
    sl_preds = subset(sl_preds, select = -PDGP)
    colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
        i)
    all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
    "predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
    function(x) {
        uniqx <- unique(na.omit(x))
        uniqx[which.max(tabulate(match(x, uniqx)))]
    })

cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
    levels = c(1, 0)), factor(data$response, levels = c(1,
    0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
    0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
    0)))

print(cf$table)
##           Reference
## Prediction  1  0
##          1 55  6
##          0  5 44
cat("Accuracy = ", round(cf$overall[["Accuracy"]], 4) *
    100, "%\n")
## Accuracy =  90 %
cat("Sensitivity = ", round(sens, 3), "\n")
## Sensitivity =  0.917
cat("Specificity = ", round(spec, 3), "\n")
## Specificity =  0.88
cat("F1 score = ", round(f1_score, 3), "\n")
## F1 score =  0.909

5.1.4 Severe PD participants and controls

load("./rdata/var_reduct_HY_severe_HC_splits.RData")

sl_preds = read.csv("./files/HY_severe_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)

num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
    sl_preds = read.csv(paste0("./files/HY_severe_HC/split",
        i, "/sl_predictions.csv"))
    sl_preds <- sl_preds[order(sl_preds$PDGP), ]
    sl_preds = subset(sl_preds, select = -PDGP)
    colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
        i)
    all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
    "predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
    function(x) {
        uniqx <- unique(na.omit(x))
        uniqx[which.max(tabulate(match(x, uniqx)))]
    })

cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
    levels = c(1, 0)), factor(data$response, levels = c(1,
    0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
    factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
    0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
    0)))

print(cf$table)
##           Reference
## Prediction  1  0
##          1 15  0
##          0  2 50
cat("Accuracy = ", round(cf$overall[["Accuracy"]], 4) *
    100, "%\n")
## Accuracy =  97.01 %
cat("Sensitivity = ", round(sens, 3), "\n")
## Sensitivity =  0.882
cat("Specificity = ", round(spec, 3), "\n")
## Specificity =  1
cat("F1 score = ", round(f1_score, 3), "\n")
## F1 score =  0.938

5.2 Model interpretation

# Calculate and save feature importance of a data using top
# model of each iteration
save_var_imp <- function(data, all_splits, top_models, setup_name) {
    h2o.init(nthreads = -1)
    h2o_var_imp = data.frame(matrix(nrow = 0, ncol = 2))
    colnames(h2o_var_imp) = c("Variable", "avg_score")
    seq = 1:5
    # loop over shuffles and splits
    for (i in 0:24) {
        split_num = floor(i/5) + 1
        iter_num = i%%5 + 1
        top_model = top_models[i + 1]
        split = all_splits[[split_num]]
        train_indx = seq[seq != iter_num]
        train = data.frame(matrix(nrow = 0, ncol = length(data)))
        colnames(train) = colnames(data)
        for (indx in train_indx) {
            train = rbind(train, split[[indx]])
        }
        train = subset(train, select = -PDGP)
        # load top model
        model = h2o.loadModel(paste0("./models/", setup_name,
            "/split", split_num, "/iter", iter_num, "/", top_model))
        # calculate feature importance
        imp = h2o.permutation_importance(model, as.h2o(train),
            n_repeats = 10)
        imp$avg_score = rowMeans(imp[, 2:11])
        imp = subset(imp, select = c(Variable, avg_score))
        h2o_var_imp = rbind(h2o_var_imp, imp)
    }
    # average importance from shuffles and splits
    variables = unique(h2o_var_imp$Variable)
    var_imp = data.frame(matrix(nrow = length(variables), ncol = 2))
    colnames(var_imp) = c("variable", "relative_importance")
    scores = c()
    for (variable in variables) {
        scores = c(scores, mean(h2o_var_imp[h2o_var_imp$Variable ==
            variable, ]$avg_score))
    }
    var_imp$variable = variables
    var_imp$relative_importance = scores
    write.csv(var_imp, paste0("./files/", setup_name, "/top_imp_scores.csv"))
    h2o.removeAll()
}
import h2o
import pandas as pd
import numpy as np
import shap
import math

# Calculate and save feature SHAP values of a data using top model of each iteration
class H2OProbWrapper:
    def __init__(self, h2o_model, feature_names):
        self.h2o_model = h2o_model
        self.feature_names = feature_names
        
    def predict_binary_prob(self, X):
        if isinstance(X, pd.Series):
            X = X.values.reshape(1,-1)
        self.dataframe= pd.DataFrame(X, columns=self.feature_names)
        self.dataframe_hex  = h2o.H2OFrame(self.dataframe)
        self.predictions = self.h2o_model.predict(self.dataframe_hex).as_data_frame().values
        return self.predictions.astype('float64')[:,-1] 

def save_var_shap(data_columns, top_models, setup_name):
  h2o.init(nthreads=-1)
  shap_res = []
  for i in range(25):
      split_num = math.floor(i/5)+1
      iter_num = i%5+1
      top_model = top_models[i]
      # load train and test data
      train = pd.read_csv('./files/'+setup_name+'/train_test_files/train_split' + \
                str(split_num) + '_iter' + str(iter_num) + '.csv', index_col=0)
      X_train = train.loc[:, train.columns != 'PDGP']
      X_train['response'] = X_train['response'].astype('category')
      test = pd.read_csv('./files/'+setup_name+'/train_test_files/test_split' + \
                str(split_num) + '_iter' + str(iter_num) + '.csv', index_col=0)
      X_test = test.loc[:, test.columns != 'PDGP']
      X_test['response'] = X_test['response'].astype('category')
      # load top model
      model = h2o.load_model('./models/'+setup_name+'/split'+ str(split_num) +\
              '/iter' + str(iter_num) + '/'+ model_names[i])
      # calculate SHAP values
      h2o_wrapper = H2OProbWrapper(model,data_columns)
      explainer = shap.KernelExplainer(h2o_wrapper.predict_binary_prob, X_train, \
              nsamples=100)
      shap_values = explainer.shap_values(X_test)
      shap_res.append(shap_values)
  shap_res = pd.DataFrame(np.vstack(shap_res), columns=X_train.columns)
  shap_res.to_csv('./files/'+setup_name+'/top_shap_values.csv', index=False)
  h2o.remove_all()

5.2.1 PD participants and controls

load("./rdata/var_reduct_PD_control_splits.RData")
top_models = c("GBM_model_1683058139810_399641", "GBM_model_1683058139810_464011",
    "GBM_model_1683058139810_526922", "GBM_model_1683058139810_586894",
    "GBM_model_1683058139810_645861", "GBM_model_1683058139810_997263",
    "GLM_model_1683058139810_1043946", "GBM_model_1683058139810_1118072",
    "GBM_model_1683058139810_1177862", "GBM_model_1683058139810_1238334",
    "GBM_model_1683058139810_1581860", "DeepLearning_model_1683058139810_1673920",
    "GBM_model_1683058139810_1692858", "DRF_model_1683058139810_1738496",
    "DeepLearning_model_1683058139810_1848898", "GBM_model_1683058139810_5261909",
    "GBM_model_1683058139810_5369141", "GLM_model_1683058139810_5465253",
    "DRF_model_1683058139810_5567905", "DeepLearning_model_1683058139810_5724832",
    "GBM_model_1683058139810_5792244", "GBM_model_1683058139810_5904724",
    "GBM_model_1683297158524_88077", "GBM_model_1683297158524_205274",
    "DRF_model_1683297158524_303765")
save_var_imp(data, all_splits, top_models, "PD_HC")
top_models=["GBM_model_1683058139810_399641","GBM_model_1683058139810_464011",\
"GBM_model_1683058139810_526922","GBM_model_1683058139810_586894",\
"GBM_model_1683058139810_645861","GBM_model_1683058139810_997263",\
"GLM_model_1683058139810_1043946","GBM_model_1683058139810_1118072",\
"GBM_model_1683058139810_1177862","GBM_model_1683058139810_1238334",\
"GBM_model_1683058139810_1581860","DeepLearning_model_1683058139810_1673920",\
"GBM_model_1683058139810_1692858","DRF_model_1683058139810_1738496",\
"DeepLearning_model_1683058139810_1848898","GBM_model_1683058139810_5261909",\
"GBM_model_1683058139810_5369141","GLM_model_1683058139810_5465253",\
"DRF_model_1683058139810_5567905","DeepLearning_model_1683058139810_5724832",\
"GBM_model_1683058139810_5792244","GBM_model_1683058139810_5904724",\
"GBM_model_1683297158524_88077","GBM_model_1683297158524_205274",\
"DRF_model_1683297158524_303765"]
data_columns = ["gyfSR_walk1.t6","axtVFD_Turn1.t7","aytMYOP_Turn2.t7","gytMIN_walk1.t7",\
"aytSSC_sTs2.t6t7","az_fft1_sTs2.t6t7","aztSSC_sTs2.t6t7","gxtMAVS_sTs2.t6t7","gztWL.t3",\
"response"]
save_var_shap(data_columns, top_models, "PD_HC")
imp_data = read.csv("./files/PD_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
    decreasing = TRUE), ]

load("./rdata/var_reduct_PD_control_splits.RData")

rownames(imp_data) = imp_data$variable
top_features_names = c("Mean binarized values of 2nd turn",
    "Variation fractal dimension of 1st turn", "Slope sign change of stand-to-sit",
    "Waveform length", "Slope sign change of stand-to-sit",
    "Mean absolute value slope of stand-to-sit", "Frequency at 1st peak of DFT of stand-to-sit",
    "DFT spectral roll-off of 1st walk", "Minimum value of 1st walk")
tasks_channels = c("CogTUG2 (ay)", "CogTUG2 (ax)", "Mean CogTUG (az)",
    "Stand eyes open (gz)", "Mean CogTUG (ay)", "Mean CogTUG (gx)",
    "Mean CogTUG (az)", "CogTUG1 (gy)", "CogTUG2 (gy)")
task_pos = c(1.15, 2.15, 3.13, 4.3, 5.15, 6, 7.15, 8.15,
    9.17)

imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
    split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
    y = relative_importance)) + geom_bar(stat = "identity",
    width = 0.5, fill = "#A4A2A6") + coord_flip(ylim = c(0.0025,
    max(imp_data$relative_importance) + 0.005), xlim = c(1,
    9), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
        size = 0.3, linetype = 2), strip.background = element_blank(),
    panel.border = element_rect(color = "black"), plot.margin = ggplot2::margin(0,
        65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
        size = 15, family = "sans"), axis.title.y = element_text(color = "black",
        size = 15, family = "sans"), axis.text.x = element_text(color = "black",
        size = 12, family = "sans"), axis.text.y = element_text(color = "black",
        size = 12, family = "sans"), legend.title = element_blank(),
    legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
    scale_y_continuous(breaks = seq(0.01, 0.05, 0.01)) +
    scale_x_discrete(labels = lapply(rev(top_features_names),
        str_wrap, width = 22)) + xlab("") + ylab("Relative importance") +
    ggtitle("") + annotate("text", label = rev(tasks_channels),
    x = task_pos, y = -0.105, fontface = "bold", family = "sans",
    hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
    x = -0.2, y = -0.09, fontface = "bold", family = "sans",
    size = 4)  #+



test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
    for (iter_num in 1:5) {
        test = read.csv(paste0("./files/PD_HC/train_test_files/test_split",
            split_num, "_iter", iter_num, ".csv"))
        test_data = rbind(test_data, test)
    }
}
shap_data = read.csv("./files/PD_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
    100)))

var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
    variable_value = melt(data_)$value)

top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
    score = imp_data$relative_importance[imp_data$variable ==
        variable]
    imp_scores = c(imp_scores, score)
}


df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
    labels = top_vars[order(imp_scores)])

p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
    linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
    color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
    color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
    position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
    mid = "white", high = "red", midpoint = 50, breaks = c(0,
        100), labels = c("Low", "High")) + theme_bw() +
    theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_line(color = "grey80",
            size = 0.3, linetype = 2), strip.background = element_blank(),
        panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
        axis.title.x = element_text(colour = "black", size = 15,
            family = "sans"), axis.title.y = element_text(colour = "black",
            size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
            size = 12, family = "sans"), axis.text.y = element_blank(),
        legend.title = element_text(size = 15, family = "sans"),
        legend.text = element_text(colour = "black", size = 12,
            family = "sans"), legend.title.align = 0.5) +
    guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
        barheight = 10, barwidth = 0.5), color = "none") +
    xlab("SHAP value") + ylab("")

plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/PD_HC_imp_plot_bar.png",
    dpi = 500, width = 14, height = 5)

5.2.2 Mild PD participants and controls

load("./rdata/var_reduct_HY_early_HC_splits.RData")
top_models = c("GBM_model_1684292205921_1340782", "DRF_model_1684292205921_1431707",
    "GBM_model_1684292205921_1572491", "GBM_model_1684292205921_1687878",
    "GBM_model_1684292205921_1810644", "GBM_model_1684334187433_82010",
    "GBM_model_1684334187433_210422", "GBM_model_1684334187433_340011",
    "DRF_model_1684334187433_455088", "GBM_model_1684334187433_592881",
    "GBM_model_1684334187433_717214", "GBM_model_1684334187433_844597",
    "DeepLearning_model_1684334187433_999941", "GBM_model_1684334187433_1060851",
    "GBM_model_1684334187433_1138483", "DeepLearning_model_1684334187433_1242659",
    "GBM_model_1684334187433_1299542", "XGBoost_model_1684334187433_1320794",
    "GBM_model_1684334187433_1485635", "GBM_model_1684334187433_1591419",
    "DRF_model_1684334187433_1694185", "GBM_model_1684334187433_1832852",
    "DeepLearning_model_1684334187433_1918047", "DeepLearning_model_1684334187433_1973941",
    "GBM_model_1684334187433_1995264")
save_var_imp(data, all_splits, top_models, "HY_early_HC")
top_models=["GBM_model_1684292205921_1340782", "DRF_model_1684292205921_1431707", \
"GBM_model_1684292205921_1572491", "GBM_model_1684292205921_1687878", \
"GBM_model_1684292205921_1810644", "GBM_model_1684334187433_82010",\
 "GBM_model_1684334187433_210422", "GBM_model_1684334187433_340011", \
 "DRF_model_1684334187433_455088", "GBM_model_1684334187433_592881",\
  "GBM_model_1684334187433_717214", "GBM_model_1684334187433_844597", \
  "DeepLearning_model_1684334187433_999941", "GBM_model_1684334187433_1060851", \
  "GBM_model_1684334187433_1138483", "DeepLearning_model_1684334187433_1242659",\
   "GBM_model_1684334187433_1299542", "XGBoost_model_1684334187433_1320794", \
   "GBM_model_1684334187433_1485635", "GBM_model_1684334187433_1591419", \
   "DRF_model_1684334187433_1694185", "GBM_model_1684334187433_1832852", \
   "DeepLearning_model_1684334187433_1918047", "DeepLearning_model_1684334187433_1973941", \
   "GBM_model_1684334187433_1995264"]
data_columns = ["gyfTTP_walk1.t6","axtVFD_Turn1.t7","aytMYOP_Turn2.t7","axtSSC_sTs2.t6t7",\
"aytSSC_sTs2.t6t7","az_fft1_sTs2.t6t7","aztSSC_sTs2.t6t7","gxfSS_sTs2.t6t7",\
"gxtWAMP_sTs2.t6t7","gxtSSC_sTs2.t6t7","gytSSC_sTs2.t6t7","gztSSC_sTs2.t6t7",\
"az_fft3_Turn2.t6t7","gxfFR.t2" ,"response"]
save_var_shap(data_columns, top_models, "HY_early_HC")
imp_data = read.csv("./files/HY_early_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
    decreasing = TRUE), ]
load("./rdata/var_reduct_HY_early_HC_splits.RData")

rownames(imp_data) = imp_data$variable
top_features_names = c("Variation fractal dimension of 1st turn",
    "Mean binarized values of 2nd turn", "Total power of 1st walk",
    "DFT frequency ratio", "Frequency at 3rd peak of DFT of 2nd turn",
    "Frequency at 1st peak of DFT of stand-to-sit", "Slope sign change of stand-to-sit",
    "Skewness of DFT power of stand-to-sit", "Slope sign change of stand-to-sit",
    "Slope sign change of stand-to-sit", "Slope sign change of stand-to-sit",
    "Rate of change of stand-to-sit", "Slope sign change of stand-to-sit",
    "Slope sign change of stand-to-sit")
tasks_channels = c("CogTUG2 (ax)", "CogTUG2 (ay)", "CogTUG1 (gy)",
    "Stand eyes closed (gx)", "Mean CogTUG (az)", "Mean CogTUG (az)",
    "Mean CogTUG (gz)", "Mean CogTUG (gx)", "Mean CogTUG (ay)",
    "Mean CogTUG (az)", "Mean CogTUG (gy)", "Mean CogTUG (gx)",
    "Mean CogTUG (gx)", "Mean CogTUG (ax)")
task_pos = c(1.15, 2.2, 3.15, 4.2, 5.17, 6.15, 7.15, 8.18,
    9.21, 10.18, 11.05, 12, 13.18, 14.22)


imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
    split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
    y = relative_importance)) + geom_bar(stat = "identity",
    width = 0.5, fill = "#A4A2A6") + coord_flip(ylim = c(0.003,
    max(imp_data$relative_importance) + 0.01), xlim = c(1,
    14), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
        size = 0.3, linetype = 2), strip.background = element_blank(),
    panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
        size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
        65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
        size = 15, family = "sans"), axis.title.y = element_text(color = "black",
        size = 15, family = "sans"), axis.text.x = element_text(color = "black",
        size = 12, family = "sans"), axis.text.y = element_text(color = "black",
        size = 12, family = "sans"), legend.title = element_blank(),
    legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
    scale_y_continuous(breaks = seq(0.01, 0.06, 0.02)) +
    scale_x_discrete(labels = lapply(rev(top_features_names),
        str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
    ggtitle("") + annotate("text", label = rev(tasks_channels),
    x = task_pos, y = -0.182, fontface = "bold", family = "sans",
    hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
    x = 0, y = -0.155, fontface = "bold", family = "sans",
    size = 4)  #+


test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
    for (iter_num in 1:5) {
        test = read.csv(paste0("./files/HY_early_HC/train_test_files/test_split",
            split_num, "_iter", iter_num, ".csv"))
        test_data = rbind(test_data, test)
    }
}
shap_data = read.csv("./files/HY_early_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
    100)))

var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
    variable_value = melt(data_)$value)

top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
    score = imp_data$relative_importance[imp_data$variable ==
        variable]
    imp_scores = c(imp_scores, score)
}


df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
    labels = top_vars[order(imp_scores)])

p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
    linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
    color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
    color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
    position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
    mid = "white", high = "red", midpoint = 50, breaks = c(0,
        100), labels = c("Low", "High")) + theme_bw() +
    theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_line(color = "grey80",
            size = 0.3, linetype = 2), strip.background = element_blank(),
        panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
        axis.title.x = element_text(colour = "black", size = 15,
            family = "sans"), axis.title.y = element_text(colour = "black",
            size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
            size = 12, family = "sans"), axis.text.y = element_blank(),
        legend.title = element_text(size = 15, family = "sans"),
        legend.text = element_text(colour = "black", size = 12,
            family = "sans"), legend.title.align = 0.5) +
    guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
        barheight = 10, barwidth = 0.5), color = "none") +
    xlab("SHAP value") + ylab("")

plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_early_HC_imp_plot_bar.png",
    dpi = 500, width = 14, height = 7)

5.2.3 Moderate PD participants and controls

load("./rdata/var_reduct_HY_mild_HC_splits.RData")
top_models = c("DeepLearning_model_1684292205921_1319234", "DRF_model_1684292205921_1381265",
    "DeepLearning_model_1684292205921_1554632", "DeepLearning_model_1684292205921_1670051",
    "DeepLearning_model_1684292205921_1793667", "DRF_model_1684334187433_12138",
    "DRF_model_1684334187433_123741", "DeepLearning_model_1684334187433_322304",
    "GBM_model_1684334187433_393672", "DRF_model_1684334187433_511971",
    "DRF_model_1684334187433_636647", "DRF_model_1684334187433_759936",
    "DeepLearning_model_1684334187433_948250", "DeepLearning_model_1684334187433_1043046",
    "DeepLearning_model_1684334187433_1120152", "DeepLearning_model_1684334187433_1207087",
    "DeepLearning_model_1684334187433_1281223", "DRF_model_1684334187433_1321702",
    "DeepLearning_model_1684334187433_1468319", "DRF_model_1684334187433_1528798",
    "DeepLearning_model_1684431909716_60276", "DeepLearning_model_1684431909716_140703",
    "DRF_model_1684431909716_148995", "DRF_model_1684431909716_226293",
    "DRF_model_1684431909716_297316")
save_var_imp(data, all_splits, top_models, "HY_mild_HC")
top_models=["DeepLearning_model_1684292205921_1319234", "DRF_model_1684292205921_1381265", \
"DeepLearning_model_1684292205921_1554632", "DeepLearning_model_1684292205921_1670051", \
"DeepLearning_model_1684292205921_1793667", "DRF_model_1684334187433_12138", \
"DRF_model_1684334187433_123741", "DeepLearning_model_1684334187433_322304", \
"GBM_model_1684334187433_393672", "DRF_model_1684334187433_511971", \
"DRF_model_1684334187433_636647", "DRF_model_1684334187433_759936", \
"DeepLearning_model_1684334187433_948250", "DeepLearning_model_1684334187433_1043046", \
"DeepLearning_model_1684334187433_1120152", "DeepLearning_model_1684334187433_1207087",\
"DeepLearning_model_1684334187433_1281223", "DRF_model_1684334187433_1321702", \
"DeepLearning_model_1684334187433_1468319", "DRF_model_1684334187433_1528798", \
"DeepLearning_model_1684431909716_60276", "DeepLearning_model_1684431909716_140703", \
"DRF_model_1684431909716_148995", "DRF_model_1684431909716_226293", \
"DRF_model_1684431909716_297316"]
data_columns = ["gytIAV_walk2.t5","aztSSC_sTs2.t7","gxfMNP_sTs2.t7","gytSSI_walk2.t7",\
"az_lsp5_sTs1.t6t7","oars_a.demo","oars_i.demo","oars.t.demo",\
"response"]
save_var_shap(data_columns, top_models, "HY_mild_HC")
imp_data = read.csv("./files/HY_mild_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
    decreasing = TRUE), ]
load("./rdata/var_reduct_HY_mild_HC_splits.RData")

rownames(imp_data) = imp_data$variable
top_features_names = c("Frequency at 5th peak of LSP of sit-to-stand",
    "Slope sign change of stand-to-sit", "Simple square interval of 2nd walk",
    "DFT mean power of stand-to-sit", "OARS total score",
    "Integrated absolute value of 2nd walk", "OARS total ADLs score",
    "OARS total IADLs score")
tasks_channels = c("Mean CogTUG (az)", "CogTUG2 (az)", "CogTUG2 (gy)",
    "CogTUG2 (gx)", "---", "TUG2 (gy)", "---", "---")
task_pos = c(1, 2, 3.1, 4, 5.12, 6.14, 7.15, 8.14)


imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
    split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
    y = relative_importance)) + geom_bar(stat = "identity",
    width = 0.3, fill = "#A4A2A6") + coord_flip(ylim = c(0.002,
    max(imp_data$relative_importance) + 0.005), xlim = c(1,
    8), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
        size = 0.3, linetype = 2), strip.background = element_blank(),
    panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
        size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
        65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
        size = 15, family = "sans"), axis.title.y = element_text(color = "black",
        size = 15, family = "sans"), axis.text.x = element_text(color = "black",
        size = 12, family = "sans"), axis.text.y = element_text(color = "black",
        size = 12, family = "sans"), legend.title = element_blank(),
    legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
    scale_y_continuous(breaks = seq(0.01, 0.05, 0.01)) +
    scale_x_discrete(labels = lapply(rev(top_features_names),
        str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
    ggtitle("") + annotate("text", label = rev(tasks_channels),
    x = task_pos, y = -0.14, fontface = "bold", family = "sans",
    hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
    x = 0, y = -0.12, fontface = "bold", family = "sans",
    size = 4)  #+


test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
    for (iter_num in 1:5) {
        test = read.csv(paste0("./files/HY_mild_HC/train_test_files/test_split",
            split_num, "_iter", iter_num, ".csv"))
        test_data = rbind(test_data, test)
    }
}
shap_data = read.csv("./files/HY_mild_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
    100)))

var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
    variable_value = melt(data_)$value)

top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
    score = imp_data$relative_importance[imp_data$variable ==
        variable]
    imp_scores = c(imp_scores, score)
}


df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
    labels = top_vars[order(imp_scores)])

p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
    linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
    color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
    color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
    position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
    mid = "white", high = "red", midpoint = 50, breaks = c(0,
        100), labels = c("Low", "High")) + theme_bw() +
    theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_line(color = "grey80",
            size = 0.3, linetype = 2), strip.background = element_blank(),
        panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
        axis.title.x = element_text(colour = "black", size = 15,
            family = "sans"), axis.title.y = element_text(colour = "black",
            size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
            size = 12, family = "sans"), axis.text.y = element_blank(),
        legend.title = element_text(size = 15, family = "sans"),
        legend.text = element_text(colour = "black", size = 12,
            family = "sans"), legend.title.align = 0.5) +
    guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
        barheight = 10, barwidth = 0.5), color = "none") +
    xlab("SHAP value") + ylab("")

plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_mild_HC_imp_plot_bar.png",
    dpi = 500, width = 14, height = 7)

5.2.4 Severe PD participants and controls

load("./rdata/var_reduct_HY_severe_HC_splits.RData")
top_models = c("GBM_model_1684381314577_68957", "DeepLearning_model_1684381314577_261294",
    "DRF_model_1684381314577_323560", "DeepLearning_model_1684381314577_524854",
    "GBM_model_1684381314577_603296", "DRF_model_1684381314577_717893",
    "DRF_model_1684381314577_847473", "DeepLearning_model_1684381314577_1060632",
    "DeepLearning_model_1684381314577_1192576", "DeepLearning_model_1684381314577_1317450",
    "DeepLearning_model_1684381314577_1446169", "DRF_model_1684381314577_1504662",
    "DRF_model_1684381314577_1622170", "GBM_model_1684381314577_1753250",
    "GBM_model_1684381314577_1869370", "DeepLearning_model_1684421828027_111476",
    "DeepLearning_model_1684421828027_206688", "GBM_model_1684421828027_271687",
    "GBM_model_1684421828027_408646", "GLM_model_1684421828027_529561",
    "DeepLearning_model_1684421828027_734593", "DeepLearning_model_1684421828027_847931",
    "DeepLearning_model_1684421828027_963294", "DeepLearning_model_1684421828027_1095195",
    "GLM_model_1684421828027_1158212")
save_var_imp(data, all_splits, top_models, "HY_severe_HC")
top_models=["GBM_model_1684381314577_68957", "DeepLearning_model_1684381314577_261294",\
 "DRF_model_1684381314577_323560", "DeepLearning_model_1684381314577_524854", \
 "GBM_model_1684381314577_603296", "DRF_model_1684381314577_717893", \
 "DRF_model_1684381314577_847473", "DeepLearning_model_1684381314577_1060632",\
  "DeepLearning_model_1684381314577_1192576", "DeepLearning_model_1684381314577_1317450",\
   "DeepLearning_model_1684381314577_1446169", "DRF_model_1684381314577_1504662",\
    "DRF_model_1684381314577_1622170", "GBM_model_1684381314577_1753250", \
    "GBM_model_1684381314577_1869370", "DeepLearning_model_1684421828027_111476", \
    "DeepLearning_model_1684421828027_206688", "GBM_model_1684421828027_271687", \
    "GBM_model_1684421828027_408646", "GLM_model_1684421828027_529561", \
    "DeepLearning_model_1684421828027_734593", "DeepLearning_model_1684421828027_847931",\
     "DeepLearning_model_1684421828027_963294", "DeepLearning_model_1684421828027_1095195", \
     "GLM_model_1684421828027_1158212"]
data_columns = ["aztSSI_sTs2.t6","gxtIAV_walk1.t7","aztIAV_sTs2.t6t7","oars.t.demo",\
"response"]
save_var_shap(data_columns, top_models, "HY_severe_HC")
imp_data = read.csv("./files/HY_severe_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
    decreasing = TRUE), ]
load("./rdata/var_reduct_HY_severe_HC_splits.RData")

rownames(imp_data) = imp_data$variable
top_features_names = c("Integrated absolute value of 1st walk",
    "OARS total score", "Simple square interval of stand-to-sit",
    "Integrated absolute value of stand-to-sit")
tasks_channels = c("CogTUG2 (gx)", "---", "CogTUG1 (az)",
    "Mean CogTUG (az)")
task_pos = c(1.05, 2, 3.05, 4.05)


imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
    split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
    y = relative_importance)) + geom_bar(stat = "identity",
    width = 0.2, fill = "#A4A2A6") + coord_flip(ylim = c(0.005,
    max(imp_data$relative_importance) + 0.005), xlim = c(1,
    4), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
        size = 0.3, linetype = 2), strip.background = element_blank(),
    panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
        size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
        65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
        size = 15, family = "sans"), axis.title.y = element_text(color = "black",
        size = 15, family = "sans"), axis.text.x = element_text(color = "black",
        size = 12, family = "sans"), axis.text.y = element_text(color = "black",
        size = 12, family = "sans"), legend.title = element_blank(),
    legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
    scale_y_continuous(breaks = seq(0.01, 0.15, 0.03)) +
    scale_x_discrete(labels = lapply(rev(top_features_names),
        str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
    ggtitle("") + annotate("text", label = rev(tasks_channels),
    x = task_pos, y = -0.4, fontface = "bold", family = "sans",
    hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
    x = 0.22, y = -0.34, fontface = "bold", family = "sans",
    size = 4)  #+


test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
    for (iter_num in 1:5) {
        test = read.csv(paste0("./files/HY_severe_HC/train_test_files/test_split",
            split_num, "_iter", iter_num, ".csv"))
        test_data = rbind(test_data, test)
    }
}
shap_data = read.csv("./files/HY_severe_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
    100)))

var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
    variable_value = melt(data_)$value)

top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
    score = imp_data$relative_importance[imp_data$variable ==
        variable]
    imp_scores = c(imp_scores, score)
}


df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
    labels = top_vars[order(imp_scores)])

p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
    linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
    color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
    color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
    position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
    mid = "white", high = "red", midpoint = 50, breaks = c(0,
        100), labels = c("Low", "High")) + theme_bw() +
    theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_line(color = "grey80",
            size = 0.3, linetype = 2), strip.background = element_blank(),
        panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
        axis.title.x = element_text(colour = "black", size = 15,
            family = "sans"), axis.title.y = element_text(colour = "black",
            size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
            size = 12, family = "sans"), axis.text.y = element_blank(),
        legend.title = element_text(size = 15, family = "sans"),
        legend.text = element_text(colour = "black", size = 12,
            family = "sans"), legend.title.align = 0.5) +
    guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
        barheight = 10, barwidth = 0.5), color = "none") +
    xlab("SHAP value") + ylab("")

plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_severe_HC_imp_plot_bar.png",
    dpi = 500, width = 14, height = 7)