Section 5 Evaluation
library(stringr)
library(MLmetrics)
library(h2o)
library(reshape2)
library(ggplot2)
library(readxl)
library(stringr)
library(ggforce)
library(cowplot)
library(plotly)
library(ggrepel)
library(gridExtra)
library(scales)5.1 Performance evaluation
5.1.1 PD participants and controls
load("./rdata/var_reduct_PD_control_splits.RData")
sl_preds = read.csv("./files/PD_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)
num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
sl_preds = read.csv(paste0("./files/PD_HC/split", i,
"/sl_predictions.csv"))
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
sl_preds = subset(sl_preds, select = -PDGP)
colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
i)
all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
"predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
function(x) {
uniqx <- unique(na.omit(x))
uniqx[which.max(tabulate(match(x, uniqx)))]
})
cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
levels = c(1, 0)), factor(data$response, levels = c(1,
0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
0)))
print(cf$table)## Reference
## Prediction 1 0
## 1 248 9
## 0 14 41
## Accuracy = 92.63 %
## Sensitivity = 0.947
## Specificity = 0.82
## F1 score = 0.956
5.1.2 Mild PD participants and controls
load("./rdata/var_reduct_HY_early_HC_splits.RData")
sl_preds = read.csv("./files/HY_early_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)
num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
sl_preds = read.csv(paste0("./files/HY_early_HC/split",
i, "/sl_predictions.csv"))
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
sl_preds = subset(sl_preds, select = -PDGP)
colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
i)
all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
"predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
function(x) {
uniqx <- unique(na.omit(x))
uniqx[which.max(tabulate(match(x, uniqx)))]
})
cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
levels = c(1, 0)), factor(data$response, levels = c(1,
0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
0)))
print(cf$table)## Reference
## Prediction 1 0
## 1 174 14
## 0 11 36
## Accuracy = 89.36 %
## Sensitivity = 0.941
## Specificity = 0.72
## F1 score = 0.933
5.1.3 Moderate PD participants and controls
load("./rdata/var_reduct_HY_mild_HC_splits.RData")
sl_preds = read.csv("./files/HY_mild_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)
num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
sl_preds = read.csv(paste0("./files/HY_mild_HC/split",
i, "/sl_predictions.csv"))
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
sl_preds = subset(sl_preds, select = -PDGP)
colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
i)
all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
"predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
function(x) {
uniqx <- unique(na.omit(x))
uniqx[which.max(tabulate(match(x, uniqx)))]
})
cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
levels = c(1, 0)), factor(data$response, levels = c(1,
0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
0)))
print(cf$table)## Reference
## Prediction 1 0
## 1 55 6
## 0 5 44
## Accuracy = 90 %
## Sensitivity = 0.917
## Specificity = 0.88
## F1 score = 0.909
5.1.4 Severe PD participants and controls
load("./rdata/var_reduct_HY_severe_HC_splits.RData")
sl_preds = read.csv("./files/HY_severe_HC/split1/sl_predictions.csv")
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
PDGP = sl_preds$PDGP
sl_preds = subset(sl_preds, select = -PDGP)
num_splits = 5
all_sl_preds = sl_preds
colnames(all_sl_preds) = paste0(colnames(sl_preds), "_split1")
for (i in 2:num_splits) {
sl_preds = read.csv(paste0("./files/HY_severe_HC/split",
i, "/sl_predictions.csv"))
sl_preds <- sl_preds[order(sl_preds$PDGP), ]
sl_preds = subset(sl_preds, select = -PDGP)
colnames(sl_preds) = paste0(colnames(sl_preds), "_split",
i)
all_sl_preds = cbind(all_sl_preds, sl_preds)
}
all_sl_pred_cls = all_sl_preds[, str_detect(colnames(all_sl_preds),
"predict")]
all_sl_pred_cls$predict_mode = apply(all_sl_pred_cls, 1,
function(x) {
uniqx <- unique(na.omit(x))
uniqx[which.max(tabulate(match(x, uniqx)))]
})
cf = caret::confusionMatrix(data = factor(all_sl_pred_cls$predict_mode,
levels = c(1, 0)), factor(data$response, levels = c(1,
0)), positive = "1")
sens = Sensitivity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
spec = Specificity(factor(data$response, levels = c(1, 0)),
factor(all_sl_pred_cls$predict_mode, levels = c(1, 0)))
f1_score = F1_Score(factor(data$response, levels = c(1,
0)), factor(all_sl_pred_cls$predict_mode, levels = c(1,
0)))
print(cf$table)## Reference
## Prediction 1 0
## 1 15 0
## 0 2 50
## Accuracy = 97.01 %
## Sensitivity = 0.882
## Specificity = 1
## F1 score = 0.938
5.2 Model interpretation
# Calculate and save feature importance of a data using top
# model of each iteration
save_var_imp <- function(data, all_splits, top_models, setup_name) {
h2o.init(nthreads = -1)
h2o_var_imp = data.frame(matrix(nrow = 0, ncol = 2))
colnames(h2o_var_imp) = c("Variable", "avg_score")
seq = 1:5
# loop over shuffles and splits
for (i in 0:24) {
split_num = floor(i/5) + 1
iter_num = i%%5 + 1
top_model = top_models[i + 1]
split = all_splits[[split_num]]
train_indx = seq[seq != iter_num]
train = data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(train) = colnames(data)
for (indx in train_indx) {
train = rbind(train, split[[indx]])
}
train = subset(train, select = -PDGP)
# load top model
model = h2o.loadModel(paste0("./models/", setup_name,
"/split", split_num, "/iter", iter_num, "/", top_model))
# calculate feature importance
imp = h2o.permutation_importance(model, as.h2o(train),
n_repeats = 10)
imp$avg_score = rowMeans(imp[, 2:11])
imp = subset(imp, select = c(Variable, avg_score))
h2o_var_imp = rbind(h2o_var_imp, imp)
}
# average importance from shuffles and splits
variables = unique(h2o_var_imp$Variable)
var_imp = data.frame(matrix(nrow = length(variables), ncol = 2))
colnames(var_imp) = c("variable", "relative_importance")
scores = c()
for (variable in variables) {
scores = c(scores, mean(h2o_var_imp[h2o_var_imp$Variable ==
variable, ]$avg_score))
}
var_imp$variable = variables
var_imp$relative_importance = scores
write.csv(var_imp, paste0("./files/", setup_name, "/top_imp_scores.csv"))
h2o.removeAll()
}import h2o
import pandas as pd
import numpy as np
import shap
import math
# Calculate and save feature SHAP values of a data using top model of each iteration
class H2OProbWrapper:
def __init__(self, h2o_model, feature_names):
self.h2o_model = h2o_model
self.feature_names = feature_names
def predict_binary_prob(self, X):
if isinstance(X, pd.Series):
X = X.values.reshape(1,-1)
self.dataframe= pd.DataFrame(X, columns=self.feature_names)
self.dataframe_hex = h2o.H2OFrame(self.dataframe)
self.predictions = self.h2o_model.predict(self.dataframe_hex).as_data_frame().values
return self.predictions.astype('float64')[:,-1]
def save_var_shap(data_columns, top_models, setup_name):
h2o.init(nthreads=-1)
shap_res = []
for i in range(25):
split_num = math.floor(i/5)+1
iter_num = i%5+1
top_model = top_models[i]
# load train and test data
train = pd.read_csv('./files/'+setup_name+'/train_test_files/train_split' + \
str(split_num) + '_iter' + str(iter_num) + '.csv', index_col=0)
X_train = train.loc[:, train.columns != 'PDGP']
X_train['response'] = X_train['response'].astype('category')
test = pd.read_csv('./files/'+setup_name+'/train_test_files/test_split' + \
str(split_num) + '_iter' + str(iter_num) + '.csv', index_col=0)
X_test = test.loc[:, test.columns != 'PDGP']
X_test['response'] = X_test['response'].astype('category')
# load top model
model = h2o.load_model('./models/'+setup_name+'/split'+ str(split_num) +\
'/iter' + str(iter_num) + '/'+ model_names[i])
# calculate SHAP values
h2o_wrapper = H2OProbWrapper(model,data_columns)
explainer = shap.KernelExplainer(h2o_wrapper.predict_binary_prob, X_train, \
nsamples=100)
shap_values = explainer.shap_values(X_test)
shap_res.append(shap_values)
shap_res = pd.DataFrame(np.vstack(shap_res), columns=X_train.columns)
shap_res.to_csv('./files/'+setup_name+'/top_shap_values.csv', index=False)
h2o.remove_all()5.2.1 PD participants and controls
load("./rdata/var_reduct_PD_control_splits.RData")
top_models = c("GBM_model_1683058139810_399641", "GBM_model_1683058139810_464011",
"GBM_model_1683058139810_526922", "GBM_model_1683058139810_586894",
"GBM_model_1683058139810_645861", "GBM_model_1683058139810_997263",
"GLM_model_1683058139810_1043946", "GBM_model_1683058139810_1118072",
"GBM_model_1683058139810_1177862", "GBM_model_1683058139810_1238334",
"GBM_model_1683058139810_1581860", "DeepLearning_model_1683058139810_1673920",
"GBM_model_1683058139810_1692858", "DRF_model_1683058139810_1738496",
"DeepLearning_model_1683058139810_1848898", "GBM_model_1683058139810_5261909",
"GBM_model_1683058139810_5369141", "GLM_model_1683058139810_5465253",
"DRF_model_1683058139810_5567905", "DeepLearning_model_1683058139810_5724832",
"GBM_model_1683058139810_5792244", "GBM_model_1683058139810_5904724",
"GBM_model_1683297158524_88077", "GBM_model_1683297158524_205274",
"DRF_model_1683297158524_303765")
save_var_imp(data, all_splits, top_models, "PD_HC")top_models=["GBM_model_1683058139810_399641","GBM_model_1683058139810_464011",\
"GBM_model_1683058139810_526922","GBM_model_1683058139810_586894",\
"GBM_model_1683058139810_645861","GBM_model_1683058139810_997263",\
"GLM_model_1683058139810_1043946","GBM_model_1683058139810_1118072",\
"GBM_model_1683058139810_1177862","GBM_model_1683058139810_1238334",\
"GBM_model_1683058139810_1581860","DeepLearning_model_1683058139810_1673920",\
"GBM_model_1683058139810_1692858","DRF_model_1683058139810_1738496",\
"DeepLearning_model_1683058139810_1848898","GBM_model_1683058139810_5261909",\
"GBM_model_1683058139810_5369141","GLM_model_1683058139810_5465253",\
"DRF_model_1683058139810_5567905","DeepLearning_model_1683058139810_5724832",\
"GBM_model_1683058139810_5792244","GBM_model_1683058139810_5904724",\
"GBM_model_1683297158524_88077","GBM_model_1683297158524_205274",\
"DRF_model_1683297158524_303765"]
data_columns = ["gyfSR_walk1.t6","axtVFD_Turn1.t7","aytMYOP_Turn2.t7","gytMIN_walk1.t7",\
"aytSSC_sTs2.t6t7","az_fft1_sTs2.t6t7","aztSSC_sTs2.t6t7","gxtMAVS_sTs2.t6t7","gztWL.t3",\
"response"]
save_var_shap(data_columns, top_models, "PD_HC")imp_data = read.csv("./files/PD_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
decreasing = TRUE), ]
load("./rdata/var_reduct_PD_control_splits.RData")
rownames(imp_data) = imp_data$variable
top_features_names = c("Mean binarized values of 2nd turn",
"Variation fractal dimension of 1st turn", "Slope sign change of stand-to-sit",
"Waveform length", "Slope sign change of stand-to-sit",
"Mean absolute value slope of stand-to-sit", "Frequency at 1st peak of DFT of stand-to-sit",
"DFT spectral roll-off of 1st walk", "Minimum value of 1st walk")
tasks_channels = c("CogTUG2 (ay)", "CogTUG2 (ax)", "Mean CogTUG (az)",
"Stand eyes open (gz)", "Mean CogTUG (ay)", "Mean CogTUG (gx)",
"Mean CogTUG (az)", "CogTUG1 (gy)", "CogTUG2 (gy)")
task_pos = c(1.15, 2.15, 3.13, 4.3, 5.15, 6, 7.15, 8.15,
9.17)
imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
y = relative_importance)) + geom_bar(stat = "identity",
width = 0.5, fill = "#A4A2A6") + coord_flip(ylim = c(0.0025,
max(imp_data$relative_importance) + 0.005), xlim = c(1,
9), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), plot.margin = ggplot2::margin(0,
65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
size = 15, family = "sans"), axis.title.y = element_text(color = "black",
size = 15, family = "sans"), axis.text.x = element_text(color = "black",
size = 12, family = "sans"), axis.text.y = element_text(color = "black",
size = 12, family = "sans"), legend.title = element_blank(),
legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
scale_y_continuous(breaks = seq(0.01, 0.05, 0.01)) +
scale_x_discrete(labels = lapply(rev(top_features_names),
str_wrap, width = 22)) + xlab("") + ylab("Relative importance") +
ggtitle("") + annotate("text", label = rev(tasks_channels),
x = task_pos, y = -0.105, fontface = "bold", family = "sans",
hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
x = -0.2, y = -0.09, fontface = "bold", family = "sans",
size = 4) #+
test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
for (iter_num in 1:5) {
test = read.csv(paste0("./files/PD_HC/train_test_files/test_split",
split_num, "_iter", iter_num, ".csv"))
test_data = rbind(test_data, test)
}
}
shap_data = read.csv("./files/PD_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
100)))
var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
variable_value = melt(data_)$value)
top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
score = imp_data$relative_importance[imp_data$variable ==
variable]
imp_scores = c(imp_scores, score)
}
df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
labels = top_vars[order(imp_scores)])
p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
mid = "white", high = "red", midpoint = 50, breaks = c(0,
100), labels = c("Low", "High")) + theme_bw() +
theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
axis.title.x = element_text(colour = "black", size = 15,
family = "sans"), axis.title.y = element_text(colour = "black",
size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
size = 12, family = "sans"), axis.text.y = element_blank(),
legend.title = element_text(size = 15, family = "sans"),
legend.text = element_text(colour = "black", size = 12,
family = "sans"), legend.title.align = 0.5) +
guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
barheight = 10, barwidth = 0.5), color = "none") +
xlab("SHAP value") + ylab("")
plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/PD_HC_imp_plot_bar.png",
dpi = 500, width = 14, height = 5)
5.2.2 Mild PD participants and controls
load("./rdata/var_reduct_HY_early_HC_splits.RData")
top_models = c("GBM_model_1684292205921_1340782", "DRF_model_1684292205921_1431707",
"GBM_model_1684292205921_1572491", "GBM_model_1684292205921_1687878",
"GBM_model_1684292205921_1810644", "GBM_model_1684334187433_82010",
"GBM_model_1684334187433_210422", "GBM_model_1684334187433_340011",
"DRF_model_1684334187433_455088", "GBM_model_1684334187433_592881",
"GBM_model_1684334187433_717214", "GBM_model_1684334187433_844597",
"DeepLearning_model_1684334187433_999941", "GBM_model_1684334187433_1060851",
"GBM_model_1684334187433_1138483", "DeepLearning_model_1684334187433_1242659",
"GBM_model_1684334187433_1299542", "XGBoost_model_1684334187433_1320794",
"GBM_model_1684334187433_1485635", "GBM_model_1684334187433_1591419",
"DRF_model_1684334187433_1694185", "GBM_model_1684334187433_1832852",
"DeepLearning_model_1684334187433_1918047", "DeepLearning_model_1684334187433_1973941",
"GBM_model_1684334187433_1995264")
save_var_imp(data, all_splits, top_models, "HY_early_HC")top_models=["GBM_model_1684292205921_1340782", "DRF_model_1684292205921_1431707", \
"GBM_model_1684292205921_1572491", "GBM_model_1684292205921_1687878", \
"GBM_model_1684292205921_1810644", "GBM_model_1684334187433_82010",\
"GBM_model_1684334187433_210422", "GBM_model_1684334187433_340011", \
"DRF_model_1684334187433_455088", "GBM_model_1684334187433_592881",\
"GBM_model_1684334187433_717214", "GBM_model_1684334187433_844597", \
"DeepLearning_model_1684334187433_999941", "GBM_model_1684334187433_1060851", \
"GBM_model_1684334187433_1138483", "DeepLearning_model_1684334187433_1242659",\
"GBM_model_1684334187433_1299542", "XGBoost_model_1684334187433_1320794", \
"GBM_model_1684334187433_1485635", "GBM_model_1684334187433_1591419", \
"DRF_model_1684334187433_1694185", "GBM_model_1684334187433_1832852", \
"DeepLearning_model_1684334187433_1918047", "DeepLearning_model_1684334187433_1973941", \
"GBM_model_1684334187433_1995264"]
data_columns = ["gyfTTP_walk1.t6","axtVFD_Turn1.t7","aytMYOP_Turn2.t7","axtSSC_sTs2.t6t7",\
"aytSSC_sTs2.t6t7","az_fft1_sTs2.t6t7","aztSSC_sTs2.t6t7","gxfSS_sTs2.t6t7",\
"gxtWAMP_sTs2.t6t7","gxtSSC_sTs2.t6t7","gytSSC_sTs2.t6t7","gztSSC_sTs2.t6t7",\
"az_fft3_Turn2.t6t7","gxfFR.t2" ,"response"]
save_var_shap(data_columns, top_models, "HY_early_HC")imp_data = read.csv("./files/HY_early_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
decreasing = TRUE), ]
load("./rdata/var_reduct_HY_early_HC_splits.RData")
rownames(imp_data) = imp_data$variable
top_features_names = c("Variation fractal dimension of 1st turn",
"Mean binarized values of 2nd turn", "Total power of 1st walk",
"DFT frequency ratio", "Frequency at 3rd peak of DFT of 2nd turn",
"Frequency at 1st peak of DFT of stand-to-sit", "Slope sign change of stand-to-sit",
"Skewness of DFT power of stand-to-sit", "Slope sign change of stand-to-sit",
"Slope sign change of stand-to-sit", "Slope sign change of stand-to-sit",
"Rate of change of stand-to-sit", "Slope sign change of stand-to-sit",
"Slope sign change of stand-to-sit")
tasks_channels = c("CogTUG2 (ax)", "CogTUG2 (ay)", "CogTUG1 (gy)",
"Stand eyes closed (gx)", "Mean CogTUG (az)", "Mean CogTUG (az)",
"Mean CogTUG (gz)", "Mean CogTUG (gx)", "Mean CogTUG (ay)",
"Mean CogTUG (az)", "Mean CogTUG (gy)", "Mean CogTUG (gx)",
"Mean CogTUG (gx)", "Mean CogTUG (ax)")
task_pos = c(1.15, 2.2, 3.15, 4.2, 5.17, 6.15, 7.15, 8.18,
9.21, 10.18, 11.05, 12, 13.18, 14.22)
imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
y = relative_importance)) + geom_bar(stat = "identity",
width = 0.5, fill = "#A4A2A6") + coord_flip(ylim = c(0.003,
max(imp_data$relative_importance) + 0.01), xlim = c(1,
14), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
size = 15, family = "sans"), axis.title.y = element_text(color = "black",
size = 15, family = "sans"), axis.text.x = element_text(color = "black",
size = 12, family = "sans"), axis.text.y = element_text(color = "black",
size = 12, family = "sans"), legend.title = element_blank(),
legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
scale_y_continuous(breaks = seq(0.01, 0.06, 0.02)) +
scale_x_discrete(labels = lapply(rev(top_features_names),
str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
ggtitle("") + annotate("text", label = rev(tasks_channels),
x = task_pos, y = -0.182, fontface = "bold", family = "sans",
hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
x = 0, y = -0.155, fontface = "bold", family = "sans",
size = 4) #+
test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
for (iter_num in 1:5) {
test = read.csv(paste0("./files/HY_early_HC/train_test_files/test_split",
split_num, "_iter", iter_num, ".csv"))
test_data = rbind(test_data, test)
}
}
shap_data = read.csv("./files/HY_early_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
100)))
var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
variable_value = melt(data_)$value)
top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
score = imp_data$relative_importance[imp_data$variable ==
variable]
imp_scores = c(imp_scores, score)
}
df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
labels = top_vars[order(imp_scores)])
p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
mid = "white", high = "red", midpoint = 50, breaks = c(0,
100), labels = c("Low", "High")) + theme_bw() +
theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
axis.title.x = element_text(colour = "black", size = 15,
family = "sans"), axis.title.y = element_text(colour = "black",
size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
size = 12, family = "sans"), axis.text.y = element_blank(),
legend.title = element_text(size = 15, family = "sans"),
legend.text = element_text(colour = "black", size = 12,
family = "sans"), legend.title.align = 0.5) +
guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
barheight = 10, barwidth = 0.5), color = "none") +
xlab("SHAP value") + ylab("")
plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_early_HC_imp_plot_bar.png",
dpi = 500, width = 14, height = 7)
5.2.3 Moderate PD participants and controls
load("./rdata/var_reduct_HY_mild_HC_splits.RData")
top_models = c("DeepLearning_model_1684292205921_1319234", "DRF_model_1684292205921_1381265",
"DeepLearning_model_1684292205921_1554632", "DeepLearning_model_1684292205921_1670051",
"DeepLearning_model_1684292205921_1793667", "DRF_model_1684334187433_12138",
"DRF_model_1684334187433_123741", "DeepLearning_model_1684334187433_322304",
"GBM_model_1684334187433_393672", "DRF_model_1684334187433_511971",
"DRF_model_1684334187433_636647", "DRF_model_1684334187433_759936",
"DeepLearning_model_1684334187433_948250", "DeepLearning_model_1684334187433_1043046",
"DeepLearning_model_1684334187433_1120152", "DeepLearning_model_1684334187433_1207087",
"DeepLearning_model_1684334187433_1281223", "DRF_model_1684334187433_1321702",
"DeepLearning_model_1684334187433_1468319", "DRF_model_1684334187433_1528798",
"DeepLearning_model_1684431909716_60276", "DeepLearning_model_1684431909716_140703",
"DRF_model_1684431909716_148995", "DRF_model_1684431909716_226293",
"DRF_model_1684431909716_297316")
save_var_imp(data, all_splits, top_models, "HY_mild_HC")top_models=["DeepLearning_model_1684292205921_1319234", "DRF_model_1684292205921_1381265", \
"DeepLearning_model_1684292205921_1554632", "DeepLearning_model_1684292205921_1670051", \
"DeepLearning_model_1684292205921_1793667", "DRF_model_1684334187433_12138", \
"DRF_model_1684334187433_123741", "DeepLearning_model_1684334187433_322304", \
"GBM_model_1684334187433_393672", "DRF_model_1684334187433_511971", \
"DRF_model_1684334187433_636647", "DRF_model_1684334187433_759936", \
"DeepLearning_model_1684334187433_948250", "DeepLearning_model_1684334187433_1043046", \
"DeepLearning_model_1684334187433_1120152", "DeepLearning_model_1684334187433_1207087",\
"DeepLearning_model_1684334187433_1281223", "DRF_model_1684334187433_1321702", \
"DeepLearning_model_1684334187433_1468319", "DRF_model_1684334187433_1528798", \
"DeepLearning_model_1684431909716_60276", "DeepLearning_model_1684431909716_140703", \
"DRF_model_1684431909716_148995", "DRF_model_1684431909716_226293", \
"DRF_model_1684431909716_297316"]
data_columns = ["gytIAV_walk2.t5","aztSSC_sTs2.t7","gxfMNP_sTs2.t7","gytSSI_walk2.t7",\
"az_lsp5_sTs1.t6t7","oars_a.demo","oars_i.demo","oars.t.demo",\
"response"]
save_var_shap(data_columns, top_models, "HY_mild_HC")imp_data = read.csv("./files/HY_mild_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
decreasing = TRUE), ]
load("./rdata/var_reduct_HY_mild_HC_splits.RData")
rownames(imp_data) = imp_data$variable
top_features_names = c("Frequency at 5th peak of LSP of sit-to-stand",
"Slope sign change of stand-to-sit", "Simple square interval of 2nd walk",
"DFT mean power of stand-to-sit", "OARS total score",
"Integrated absolute value of 2nd walk", "OARS total ADLs score",
"OARS total IADLs score")
tasks_channels = c("Mean CogTUG (az)", "CogTUG2 (az)", "CogTUG2 (gy)",
"CogTUG2 (gx)", "---", "TUG2 (gy)", "---", "---")
task_pos = c(1, 2, 3.1, 4, 5.12, 6.14, 7.15, 8.14)
imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
y = relative_importance)) + geom_bar(stat = "identity",
width = 0.3, fill = "#A4A2A6") + coord_flip(ylim = c(0.002,
max(imp_data$relative_importance) + 0.005), xlim = c(1,
8), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
size = 15, family = "sans"), axis.title.y = element_text(color = "black",
size = 15, family = "sans"), axis.text.x = element_text(color = "black",
size = 12, family = "sans"), axis.text.y = element_text(color = "black",
size = 12, family = "sans"), legend.title = element_blank(),
legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
scale_y_continuous(breaks = seq(0.01, 0.05, 0.01)) +
scale_x_discrete(labels = lapply(rev(top_features_names),
str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
ggtitle("") + annotate("text", label = rev(tasks_channels),
x = task_pos, y = -0.14, fontface = "bold", family = "sans",
hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
x = 0, y = -0.12, fontface = "bold", family = "sans",
size = 4) #+
test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
for (iter_num in 1:5) {
test = read.csv(paste0("./files/HY_mild_HC/train_test_files/test_split",
split_num, "_iter", iter_num, ".csv"))
test_data = rbind(test_data, test)
}
}
shap_data = read.csv("./files/HY_mild_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
100)))
var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
variable_value = melt(data_)$value)
top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
score = imp_data$relative_importance[imp_data$variable ==
variable]
imp_scores = c(imp_scores, score)
}
df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
labels = top_vars[order(imp_scores)])
p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
mid = "white", high = "red", midpoint = 50, breaks = c(0,
100), labels = c("Low", "High")) + theme_bw() +
theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
axis.title.x = element_text(colour = "black", size = 15,
family = "sans"), axis.title.y = element_text(colour = "black",
size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
size = 12, family = "sans"), axis.text.y = element_blank(),
legend.title = element_text(size = 15, family = "sans"),
legend.text = element_text(colour = "black", size = 12,
family = "sans"), legend.title.align = 0.5) +
guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
barheight = 10, barwidth = 0.5), color = "none") +
xlab("SHAP value") + ylab("")
plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_mild_HC_imp_plot_bar.png",
dpi = 500, width = 14, height = 7)
5.2.4 Severe PD participants and controls
load("./rdata/var_reduct_HY_severe_HC_splits.RData")
top_models = c("GBM_model_1684381314577_68957", "DeepLearning_model_1684381314577_261294",
"DRF_model_1684381314577_323560", "DeepLearning_model_1684381314577_524854",
"GBM_model_1684381314577_603296", "DRF_model_1684381314577_717893",
"DRF_model_1684381314577_847473", "DeepLearning_model_1684381314577_1060632",
"DeepLearning_model_1684381314577_1192576", "DeepLearning_model_1684381314577_1317450",
"DeepLearning_model_1684381314577_1446169", "DRF_model_1684381314577_1504662",
"DRF_model_1684381314577_1622170", "GBM_model_1684381314577_1753250",
"GBM_model_1684381314577_1869370", "DeepLearning_model_1684421828027_111476",
"DeepLearning_model_1684421828027_206688", "GBM_model_1684421828027_271687",
"GBM_model_1684421828027_408646", "GLM_model_1684421828027_529561",
"DeepLearning_model_1684421828027_734593", "DeepLearning_model_1684421828027_847931",
"DeepLearning_model_1684421828027_963294", "DeepLearning_model_1684421828027_1095195",
"GLM_model_1684421828027_1158212")
save_var_imp(data, all_splits, top_models, "HY_severe_HC")top_models=["GBM_model_1684381314577_68957", "DeepLearning_model_1684381314577_261294",\
"DRF_model_1684381314577_323560", "DeepLearning_model_1684381314577_524854", \
"GBM_model_1684381314577_603296", "DRF_model_1684381314577_717893", \
"DRF_model_1684381314577_847473", "DeepLearning_model_1684381314577_1060632",\
"DeepLearning_model_1684381314577_1192576", "DeepLearning_model_1684381314577_1317450",\
"DeepLearning_model_1684381314577_1446169", "DRF_model_1684381314577_1504662",\
"DRF_model_1684381314577_1622170", "GBM_model_1684381314577_1753250", \
"GBM_model_1684381314577_1869370", "DeepLearning_model_1684421828027_111476", \
"DeepLearning_model_1684421828027_206688", "GBM_model_1684421828027_271687", \
"GBM_model_1684421828027_408646", "GLM_model_1684421828027_529561", \
"DeepLearning_model_1684421828027_734593", "DeepLearning_model_1684421828027_847931",\
"DeepLearning_model_1684421828027_963294", "DeepLearning_model_1684421828027_1095195", \
"GLM_model_1684421828027_1158212"]
data_columns = ["aztSSI_sTs2.t6","gxtIAV_walk1.t7","aztIAV_sTs2.t6t7","oars.t.demo",\
"response"]
save_var_shap(data_columns, top_models, "HY_severe_HC")imp_data = read.csv("./files/HY_severe_HC/top_imp_scores.csv")
imp_data = subset(imp_data, select = -X)
imp_data <- imp_data[order(imp_data$relative_importance,
decreasing = TRUE), ]
load("./rdata/var_reduct_HY_severe_HC_splits.RData")
rownames(imp_data) = imp_data$variable
top_features_names = c("Integrated absolute value of 1st walk",
"OARS total score", "Simple square interval of stand-to-sit",
"Integrated absolute value of stand-to-sit")
tasks_channels = c("CogTUG2 (gx)", "---", "CogTUG1 (az)",
"Mean CogTUG (az)")
task_pos = c(1.05, 2, 3.05, 4.05)
imp_data$Subset <- factor(sapply(rownames(imp_data), function(x) strsplit(x,
split = "\\.")[[1]][2]))
p_imp = ggplot(imp_data, aes(x = reorder(variable, relative_importance),
y = relative_importance)) + geom_bar(stat = "identity",
width = 0.2, fill = "#A4A2A6") + coord_flip(ylim = c(0.005,
max(imp_data$relative_importance) + 0.005), xlim = c(1,
4), clip = "off") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), legend.text = element_text(color = "black",
size = 10, family = "sans"), plot.margin = ggplot2::margin(0,
65, 0, 150, "pt"), axis.title.x = element_text(color = "black",
size = 15, family = "sans"), axis.title.y = element_text(color = "black",
size = 15, family = "sans"), axis.text.x = element_text(color = "black",
size = 12, family = "sans"), axis.text.y = element_text(color = "black",
size = 12, family = "sans"), legend.title = element_blank(),
legend.position = "top", legend.margin = ggplot2::margin(b = -10)) +
scale_y_continuous(breaks = seq(0.01, 0.15, 0.03)) +
scale_x_discrete(labels = lapply(rev(top_features_names),
str_wrap, width = 25)) + xlab("") + ylab("Relative importance") +
ggtitle("") + annotate("text", label = rev(tasks_channels),
x = task_pos, y = -0.4, fontface = "bold", family = "sans",
hjust = 0, size = 4) + annotate("text", label = "Task (channel)",
x = 0.22, y = -0.34, fontface = "bold", family = "sans",
size = 4) #+
test_data = as.data.frame(matrix(nrow = 0, ncol = length(data)))
colnames(test_data) = colnames(data)
for (split_num in 1:5) {
for (iter_num in 1:5) {
test = read.csv(paste0("./files/HY_severe_HC/train_test_files/test_split",
split_num, "_iter", iter_num, ".csv"))
test_data = rbind(test_data, test)
}
}
shap_data = read.csv("./files/HY_severe_HC/top_shap_values.csv")
shap_data = subset(shap_data, select = -c(response))
data_ = subset(test_data, select = -c(PDGP, response))
data_ <- apply(data_, 2, function(x) rank(x))
data_ <- apply(data_, 2, function(x) rescale(x, to = c(0,
100)))
var_diff <- apply(shap_data, 2, function(x) max(x) - min(x))
var_diff <- var_diff[order(-var_diff)]
df <- data.frame(variable = melt(shap_data)$variable, shap = melt(shap_data)$value,
variable_value = melt(data_)$value)
top_vars <- factor(names(var_diff))
imp_scores = c()
for (variable in top_vars) {
score = imp_data$relative_importance[imp_data$variable ==
variable]
imp_scores = c(imp_scores, score)
}
df <- df[df$variable %in% top_vars, ]
df$variable <- factor(df$variable, levels = top_vars[order(imp_scores)],
labels = top_vars[order(imp_scores)])
p_shap = ggplot(df, aes(x = shap, y = variable)) + geom_hline(yintercept = top_vars,
linetype = "dotted", color = "grey80") + geom_vline(xintercept = 0,
color = "grey80", size = 1) + geom_point(aes(fill = variable_value),
color = "#6C6C82", shape = 21, alpha = 0.5, size = 2,
position = "auto", stroke = 0.1) + scale_fill_gradient2(low = "#2B26CB",
mid = "white", high = "red", midpoint = 50, breaks = c(0,
100), labels = c("Low", "High")) + theme_bw() +
theme(plot.margin = ggplot2::margin(18, 0, 0, -60, "pt"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(color = "grey80",
size = 0.3, linetype = 2), strip.background = element_blank(),
panel.border = element_rect(color = "black"), axis.line = element_line(color = "black"),
axis.title.x = element_text(colour = "black", size = 15,
family = "sans"), axis.title.y = element_text(colour = "black",
size = 15, family = "sans"), axis.text.x = element_text(colour = "black",
size = 12, family = "sans"), axis.text.y = element_blank(),
legend.title = element_text(size = 15, family = "sans"),
legend.text = element_text(colour = "black", size = 12,
family = "sans"), legend.title.align = 0.5) +
guides(fill = guide_colourbar("Variable\nvalue\n", ticks = FALSE,
barheight = 10, barwidth = 0.5), color = "none") +
xlab("SHAP value") + ylab("")
plot_all = arrangeGrob(p_imp, p_shap, ncol = 2)
ggsave(plot_all, file = "figures/HY_severe_HC_imp_plot_bar.png",
dpi = 500, width = 14, height = 7)