# Section 3 Train/Test split
# 3.1 PD and parkinsonism participants
# Build the modelling table for the PD vs. parkinsonism comparison: load the
# sensor features, label participants, keep the variables listed in the
# importance table of `rf`, clean/impute the predictors and normalise them.

# set.seed(NULL) re-initialises the RNG as if no seed had been set, so the
# random splits drawn later differ between runs. Use a fixed integer seed here
# if reproducible splits are required.
set.seed(NULL)
# NOTE(review): `rf` (used below) and `sensor_df` are not defined in this
# section; presumably they are provided by these .RData files -- confirm.
load("./rdata/PD_PDism_tug.RData")
load("./rdata/sensor_features_all_tasks.RData")
data <- sensor_df
# "_turn" is rewritten to "_Turn" first, so the case-sensitive "_t" -> ".t"
# rewrite on the next line no longer matches it.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")
parkinsonism_ids <- c(
  38, 168, 194, 199, 207, 212, 214, 223, 224,
  235, 241, 259, 261, 263, 265, 268, 297, 298
)
excluded_PD <- c(246, 252, 293)
# Keep participants with PDGP below 600, minus the explicitly excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]
# Class labels: 1 by default, 2 for the listed parkinsonism IDs.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2
# Keep only PDGP and the variables named in the importance table of `rf`.
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"), drop = FALSE]
data$response <- as.factor(labels)
predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP
# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars,
  stringsAsFactors = TRUE,
  check.names = FALSE
)
# Convert non-integer and non-numeric variables to factor variables.
# is.numeric() is used instead of comparing class() strings so that columns
# with more than one class still yield a single TRUE/FALSE per column.
factor_cols <- vapply(
  predictor_vars,
  function(x) !is.numeric(x),
  logical(1)
)
if (any(factor_cols)) {
  predictor_vars[factor_cols] <- lapply(
    predictor_vars[factor_cols],
    function(x) factor(as.character(x))
  )
}
# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
  select(where(~ n_distinct(.) > 1))
# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)
# Remove variables having infinite values
finite_cols <- vapply(
  predictor_vars,
  function(x) is.factor(x) || is.finite(sum(x)),
  logical(1)
)
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]
# Select rows with no NAs in the response variable. The mask is computed once,
# before `response_var` itself is filtered, so that predictors, response and
# PDGP are all subset with the same rows.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]
# Normalize
# NOTE(review): the transformation is estimated on all participants before any
# train/test split is drawn -- confirm this is intended.
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# The label column of the combined table is named `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)
#' Randomly partition PD and parkinsonism rows into stratified folds.
#'
#' Each of the first `num_folds - 1` folds receives up to
#' `ceiling(nrow / num_folds)` randomly drawn rows from each class table; the
#' last fold receives whatever rows are left in both tables. Rows are drawn
#' without replacement, so every input row ends up in exactly one fold.
#'
#' @param num_folds Number of folds to create (a single number >= 1).
#' @param pd_data Data frame of PD rows. Defaults to the `PD_data` object
#'   visible from where this function is defined, which is what the original
#'   implementation read implicitly.
#' @param pdism_data Data frame of parkinsonism rows. Defaults to `PDism_data`
#'   in the same way.
#' @return A list of length `num_folds`; each element is a data frame holding
#'   the PD rows followed by the parkinsonism rows of that fold.
create_groups <- function(num_folds, pd_data = PD_data,
                          pdism_data = PDism_data) {
  stopifnot(
    is.numeric(num_folds), length(num_folds) == 1L,
    !is.na(num_folds), num_folds >= 1
  )

  # Draw up to `size` random rows from `pool`; return the drawn rows and the
  # rows that remain. Capping at nrow(pool) avoids sampling from an exhausted
  # pool, which would otherwise produce all-NA rows or an error.
  take_rows <- function(pool, size) {
    n_take <- min(size, nrow(pool))
    if (n_take == 0) {
      return(list(fold = pool[0, , drop = FALSE], rest = pool))
    }
    picked <- sample.int(nrow(pool), size = n_take)
    in_fold <- seq_len(nrow(pool)) %in% picked
    list(
      fold = pool[picked, , drop = FALSE],
      rest = pool[!in_fold, , drop = FALSE]
    )
  }

  pd_size <- ceiling(nrow(pd_data) / num_folds)
  pdism_size <- ceiling(nrow(pdism_data) / num_folds)

  groups <- vector("list", num_folds)
  # seq_len() yields an empty sequence for num_folds == 1 (1:0 would not).
  for (i in seq_len(num_folds - 1L)) {
    pd <- take_rows(pd_data, pd_size)
    pd_data <- pd$rest
    pdism <- take_rows(pdism_data, pdism_size)
    pdism_data <- pdism$rest
    groups[[i]] <- rbind(pd$fold, pdism$fold)
  }
  # The last fold takes all rows not assigned so far.
  groups[[num_folds]] <- rbind(pd_data, pdism_data)
  groups
}
# Draw 5 independent random 3-fold partitions of the two classes and save them
# together with the full table (ordered by PDGP).
all_splits <- list()
# The label column created by cbind() is named `response_var`; it is addressed
# by its full name here rather than through `$` partial matching
# (`data$response`), which silently breaks if another column name starts with
# "response".
PD_data <- data[data$response_var == 1, ]
PDism_data <- data[data$response_var == 2, ]
num_folds <- 3
for (i in seq_len(5)) {
  groups <- create_groups(num_folds = num_folds)
  all_splits[[i]] <- groups
}
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_PD_PDism_splits_tug.RData")
# 3.2 Mild PD and parkinsonism participants
# Build the modelling table for this cohort: load the sensor features, label
# participants, keep the variables listed in the importance table of `rf`,
# clean/impute the predictors and normalise them.
# NOTE(review): the heading for this section says "Mild PD" while the input
# files and objects are named "early" -- confirm this is the intended cohort.

# set.seed(NULL) re-initialises the RNG as if no seed had been set, so the
# random splits drawn later differ between runs. Use a fixed integer seed here
# if reproducible splits are required.
set.seed(NULL)
# NOTE(review): `rf` (used below) and `data_early` are not defined in this
# section; presumably they are provided by these .RData files -- confirm.
load("./rdata/HY_PDism_early_tug.RData")
load("./rdata/HY_early_sensor_features_all_tasks.RData")
data <- data_early
# "_turn" is rewritten to "_Turn" first, so the case-sensitive "_t" -> ".t"
# rewrite on the next line no longer matches it.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")
parkinsonism_ids <- c(
  38, 168, 194, 199, 207, 212, 214, 223, 224,
  235, 241, 259, 261, 263, 265, 268, 297, 298
)
excluded_PD <- c(246, 252, 293)
# Keep participants with PDGP below 600, minus the explicitly excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]
# Class labels: 1 by default, 2 for the listed parkinsonism IDs.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2
# Keep only PDGP and the variables named in the importance table of `rf`.
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"), drop = FALSE]
data$response <- as.factor(labels)
predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP
# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars,
  stringsAsFactors = TRUE,
  check.names = FALSE
)
# Convert non-integer and non-numeric variables to factor variables.
# is.numeric() is used instead of comparing class() strings so that columns
# with more than one class still yield a single TRUE/FALSE per column.
factor_cols <- vapply(
  predictor_vars,
  function(x) !is.numeric(x),
  logical(1)
)
if (any(factor_cols)) {
  predictor_vars[factor_cols] <- lapply(
    predictor_vars[factor_cols],
    function(x) factor(as.character(x))
  )
}
# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
  select(where(~ n_distinct(.) > 1))
# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)
# Remove variables having infinite values
finite_cols <- vapply(
  predictor_vars,
  function(x) is.factor(x) || is.finite(sum(x)),
  logical(1)
)
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]
# Select rows with no NAs in the response variable. The mask is computed once,
# before `response_var` itself is filtered, so that predictors, response and
# PDGP are all subset with the same rows.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]
# Normalize
# NOTE(review): the transformation is estimated on all participants before any
# train/test split is drawn -- confirm this is intended.
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# The label column of the combined table is named `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)
#' Randomly partition PD and parkinsonism rows into stratified folds.
#'
#' Each of the first `num_folds - 1` folds receives up to
#' `ceiling(nrow / num_folds)` randomly drawn rows from each class table; the
#' last fold receives whatever rows are left in both tables. Rows are drawn
#' without replacement, so every input row ends up in exactly one fold.
#'
#' @param num_folds Number of folds to create (a single number >= 1).
#' @param pd_data Data frame of PD rows. Defaults to the `PD_data` object
#'   visible from where this function is defined, which is what the original
#'   implementation read implicitly.
#' @param pdism_data Data frame of parkinsonism rows. Defaults to `PDism_data`
#'   in the same way.
#' @return A list of length `num_folds`; each element is a data frame holding
#'   the PD rows followed by the parkinsonism rows of that fold.
create_groups <- function(num_folds, pd_data = PD_data,
                          pdism_data = PDism_data) {
  stopifnot(
    is.numeric(num_folds), length(num_folds) == 1L,
    !is.na(num_folds), num_folds >= 1
  )

  # Draw up to `size` random rows from `pool`; return the drawn rows and the
  # rows that remain. Capping at nrow(pool) avoids sampling from an exhausted
  # pool, which would otherwise produce all-NA rows or an error.
  take_rows <- function(pool, size) {
    n_take <- min(size, nrow(pool))
    if (n_take == 0) {
      return(list(fold = pool[0, , drop = FALSE], rest = pool))
    }
    picked <- sample.int(nrow(pool), size = n_take)
    in_fold <- seq_len(nrow(pool)) %in% picked
    list(
      fold = pool[picked, , drop = FALSE],
      rest = pool[!in_fold, , drop = FALSE]
    )
  }

  pd_size <- ceiling(nrow(pd_data) / num_folds)
  pdism_size <- ceiling(nrow(pdism_data) / num_folds)

  groups <- vector("list", num_folds)
  # seq_len() yields an empty sequence for num_folds == 1 (1:0 would not).
  for (i in seq_len(num_folds - 1L)) {
    pd <- take_rows(pd_data, pd_size)
    pd_data <- pd$rest
    pdism <- take_rows(pdism_data, pdism_size)
    pdism_data <- pdism$rest
    groups[[i]] <- rbind(pd$fold, pdism$fold)
  }
  # The last fold takes all rows not assigned so far.
  groups[[num_folds]] <- rbind(pd_data, pdism_data)
  groups
}
# Draw 5 independent random 3-fold partitions of the two classes and save them
# together with the full table (ordered by PDGP).
all_splits <- list()
# The label column created by cbind() is named `response_var`; it is addressed
# by its full name here rather than through `$` partial matching
# (`data$response`), which silently breaks if another column name starts with
# "response".
PD_data <- data[data$response_var == 1, ]
PDism_data <- data[data$response_var == 2, ]
num_folds <- 3
for (i in seq_len(5)) {
  groups <- create_groups(num_folds = num_folds)
  all_splits[[i]] <- groups
}
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_early_PDism_splits_tug.RData")
# 3.3 Moderate PD and parkinsonism participants
# Build the modelling table for this cohort: load the sensor features, label
# participants, keep the variables listed in the importance table of `rf`,
# clean/impute the predictors and normalise them.
# NOTE(review): the heading for this section says "Moderate PD" while the
# input files and objects are named "mild" -- confirm this is the intended
# cohort.

# set.seed(NULL) re-initialises the RNG as if no seed had been set, so the
# random splits drawn later differ between runs. Use a fixed integer seed here
# if reproducible splits are required.
set.seed(NULL)
# NOTE(review): `rf` (used below) and `data_mild` are not defined in this
# section; presumably they are provided by these .RData files -- confirm.
load("./rdata/HY_PDism_mild_tug.RData")
load("./rdata/HY_mild_sensor_features_all_tasks.RData")
data <- data_mild
# "_turn" is rewritten to "_Turn" first, so the case-sensitive "_t" -> ".t"
# rewrite on the next line no longer matches it.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")
parkinsonism_ids <- c(
  38, 168, 194, 199, 207, 212, 214, 223, 224,
  235, 241, 259, 261, 263, 265, 268, 297, 298
)
excluded_PD <- c(246, 252, 293)
# Keep participants with PDGP below 600, minus the explicitly excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]
# Class labels: 1 by default, 2 for the listed parkinsonism IDs.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2
# Keep only PDGP and the variables named in the importance table of `rf`.
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"), drop = FALSE]
data$response <- as.factor(labels)
predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP
# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars,
  stringsAsFactors = TRUE,
  check.names = FALSE
)
# Convert non-integer and non-numeric variables to factor variables.
# is.numeric() is used instead of comparing class() strings so that columns
# with more than one class still yield a single TRUE/FALSE per column.
factor_cols <- vapply(
  predictor_vars,
  function(x) !is.numeric(x),
  logical(1)
)
if (any(factor_cols)) {
  predictor_vars[factor_cols] <- lapply(
    predictor_vars[factor_cols],
    function(x) factor(as.character(x))
  )
}
# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
  select(where(~ n_distinct(.) > 1))
# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)
# Remove variables having infinite values
finite_cols <- vapply(
  predictor_vars,
  function(x) is.factor(x) || is.finite(sum(x)),
  logical(1)
)
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]
# Select rows with no NAs in the response variable. The mask is computed once,
# before `response_var` itself is filtered, so that predictors, response and
# PDGP are all subset with the same rows.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]
# Normalize
# NOTE(review): the transformation is estimated on all participants before any
# train/test split is drawn -- confirm this is intended.
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# The label column of the combined table is named `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)
#' Randomly partition PD and parkinsonism rows into stratified folds.
#'
#' Each of the first `num_folds - 1` folds receives up to
#' `ceiling(nrow / num_folds)` randomly drawn rows from each class table; the
#' last fold receives whatever rows are left in both tables. Rows are drawn
#' without replacement, so every input row ends up in exactly one fold.
#'
#' @param num_folds Number of folds to create (a single number >= 1).
#' @param pd_data Data frame of PD rows. Defaults to the `PD_data` object
#'   visible from where this function is defined, which is what the original
#'   implementation read implicitly.
#' @param pdism_data Data frame of parkinsonism rows. Defaults to `PDism_data`
#'   in the same way.
#' @return A list of length `num_folds`; each element is a data frame holding
#'   the PD rows followed by the parkinsonism rows of that fold.
create_groups <- function(num_folds, pd_data = PD_data,
                          pdism_data = PDism_data) {
  stopifnot(
    is.numeric(num_folds), length(num_folds) == 1L,
    !is.na(num_folds), num_folds >= 1
  )

  # Draw up to `size` random rows from `pool`; return the drawn rows and the
  # rows that remain. Capping at nrow(pool) avoids sampling from an exhausted
  # pool, which would otherwise produce all-NA rows or an error.
  take_rows <- function(pool, size) {
    n_take <- min(size, nrow(pool))
    if (n_take == 0) {
      return(list(fold = pool[0, , drop = FALSE], rest = pool))
    }
    picked <- sample.int(nrow(pool), size = n_take)
    in_fold <- seq_len(nrow(pool)) %in% picked
    list(
      fold = pool[picked, , drop = FALSE],
      rest = pool[!in_fold, , drop = FALSE]
    )
  }

  pd_size <- ceiling(nrow(pd_data) / num_folds)
  pdism_size <- ceiling(nrow(pdism_data) / num_folds)

  groups <- vector("list", num_folds)
  # seq_len() yields an empty sequence for num_folds == 1 (1:0 would not).
  for (i in seq_len(num_folds - 1L)) {
    pd <- take_rows(pd_data, pd_size)
    pd_data <- pd$rest
    pdism <- take_rows(pdism_data, pdism_size)
    pdism_data <- pdism$rest
    groups[[i]] <- rbind(pd$fold, pdism$fold)
  }
  # The last fold takes all rows not assigned so far.
  groups[[num_folds]] <- rbind(pd_data, pdism_data)
  groups
}
# Draw 5 independent random 3-fold partitions of the two classes and save them
# together with the full table (ordered by PDGP).
all_splits <- list()
# The label column created by cbind() is named `response_var`; it is addressed
# by its full name here rather than through `$` partial matching
# (`data$response`), which silently breaks if another column name starts with
# "response".
PD_data <- data[data$response_var == 1, ]
PDism_data <- data[data$response_var == 2, ]
num_folds <- 3
for (i in seq_len(5)) {
  groups <- create_groups(num_folds = num_folds)
  all_splits[[i]] <- groups
}
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_mild_PDism_splits_tug.RData")
# 3.4 Severe PD and parkinsonism participants
# Build the modelling table for the severe-PD vs. parkinsonism comparison:
# load the sensor features, label participants, keep the variables listed in
# the importance table of `rf`, clean/impute the predictors and normalise them.

# set.seed(NULL) re-initialises the RNG as if no seed had been set, so the
# random splits drawn later differ between runs. Use a fixed integer seed here
# if reproducible splits are required.
set.seed(NULL)
# NOTE(review): `rf` (used below) and `data_severe` are not defined in this
# section; presumably they are provided by these .RData files -- confirm.
load("./rdata/HY_PDism_severe_tug.RData")
load("./rdata/HY_severe_sensor_features_all_tasks.RData")
data <- data_severe
# "_turn" is rewritten to "_Turn" first, so the case-sensitive "_t" -> ".t"
# rewrite on the next line no longer matches it.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")
parkinsonism_ids <- c(
  38, 168, 194, 199, 207, 212, 214, 223, 224,
  235, 241, 259, 261, 263, 265, 268, 297, 298
)
excluded_PD <- c(246, 252, 293)
# Keep participants with PDGP below 600, minus the explicitly excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]
# Class labels: 1 by default, 2 for the listed parkinsonism IDs.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2
# Keep only PDGP and the variables named in the importance table of `rf`.
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"), drop = FALSE]
data$response <- as.factor(labels)
predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP
# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars,
  stringsAsFactors = TRUE,
  check.names = FALSE
)
# Convert non-integer and non-numeric variables to factor variables.
# is.numeric() is used instead of comparing class() strings so that columns
# with more than one class still yield a single TRUE/FALSE per column.
factor_cols <- vapply(
  predictor_vars,
  function(x) !is.numeric(x),
  logical(1)
)
if (any(factor_cols)) {
  predictor_vars[factor_cols] <- lapply(
    predictor_vars[factor_cols],
    function(x) factor(as.character(x))
  )
}
# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
  select(where(~ n_distinct(.) > 1))
# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)
# Remove variables having infinite values
finite_cols <- vapply(
  predictor_vars,
  function(x) is.factor(x) || is.finite(sum(x)),
  logical(1)
)
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]
# Select rows with no NAs in the response variable. The mask is computed once,
# before `response_var` itself is filtered, so that predictors, response and
# PDGP are all subset with the same rows.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]
# Normalize
# NOTE(review): the transformation is estimated on all participants before any
# train/test split is drawn -- confirm this is intended.
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# The label column of the combined table is named `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)
#' Randomly partition PD and parkinsonism rows into stratified folds.
#'
#' Each of the first `num_folds - 1` folds receives up to
#' `ceiling(nrow / num_folds)` randomly drawn rows from each class table; the
#' last fold receives whatever rows are left in both tables. Rows are drawn
#' without replacement, so every input row ends up in exactly one fold.
#'
#' @param num_folds Number of folds to create (a single number >= 1).
#' @param pd_data Data frame of PD rows. Defaults to the `PD_data` object
#'   visible from where this function is defined, which is what the original
#'   implementation read implicitly.
#' @param pdism_data Data frame of parkinsonism rows. Defaults to `PDism_data`
#'   in the same way.
#' @return A list of length `num_folds`; each element is a data frame holding
#'   the PD rows followed by the parkinsonism rows of that fold.
create_groups <- function(num_folds, pd_data = PD_data,
                          pdism_data = PDism_data) {
  stopifnot(
    is.numeric(num_folds), length(num_folds) == 1L,
    !is.na(num_folds), num_folds >= 1
  )

  # Draw up to `size` random rows from `pool`; return the drawn rows and the
  # rows that remain. Capping at nrow(pool) avoids sampling from an exhausted
  # pool, which would otherwise produce all-NA rows or an error.
  take_rows <- function(pool, size) {
    n_take <- min(size, nrow(pool))
    if (n_take == 0) {
      return(list(fold = pool[0, , drop = FALSE], rest = pool))
    }
    picked <- sample.int(nrow(pool), size = n_take)
    in_fold <- seq_len(nrow(pool)) %in% picked
    list(
      fold = pool[picked, , drop = FALSE],
      rest = pool[!in_fold, , drop = FALSE]
    )
  }

  pd_size <- ceiling(nrow(pd_data) / num_folds)
  pdism_size <- ceiling(nrow(pdism_data) / num_folds)

  groups <- vector("list", num_folds)
  # seq_len() yields an empty sequence for num_folds == 1 (1:0 would not).
  for (i in seq_len(num_folds - 1L)) {
    pd <- take_rows(pd_data, pd_size)
    pd_data <- pd$rest
    pdism <- take_rows(pdism_data, pdism_size)
    pdism_data <- pdism$rest
    groups[[i]] <- rbind(pd$fold, pdism$fold)
  }
  # The last fold takes all rows not assigned so far.
  groups[[num_folds]] <- rbind(pd_data, pdism_data)
  groups
}
# Draw 5 independent random 3-fold partitions of the two classes and save them
# together with the full table (ordered by PDGP).
all_splits <- list()
# The label column created by cbind() is named `response_var`; it is addressed
# by its full name here rather than through `$` partial matching
# (`data$response`), which silently breaks if another column name starts with
# "response".
PD_data <- data[data$response_var == 1, ]
PDism_data <- data[data$response_var == 2, ]
num_folds <- 3
for (i in seq_len(5)) {
  groups <- create_groups(num_folds = num_folds)
  all_splits[[i]] <- groups
}
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_severe_PDism_splits_tug.RData")