Section 3 Train/Test split

library(UBL)
library(DMwR)
library(stringr)
library(dplyr)
library(randomForest)
library(caret)

3.1 PD and parkinsonism participants

# Re-initialise the RNG as if no seed had been set, so the splits drawn
# later differ from run to run.
set.seed(NULL)
# NOTE(review): these files are expected to provide `rf` and `sensor_df`,
# which are used below but never created in this script -- confirm.
load("./rdata/PD_PDism_tug.RData")
load("./rdata/sensor_features_all_tasks.RData")

data <- sensor_df
# Rename feature columns ("_turn" -> "_Turn", then every "_t" -> ".t");
# presumably so they match the row names of the importance table of `rf`.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")

# Participant IDs (PDGP) labelled as parkinsonism, and IDs dropped entirely.
parkinsonism_ids <- c(38, 168, 194, 199, 207, 212, 214, 223, 224,
    235, 241, 259, 261, 263, 265, 268, 297, 298)
excluded_PD <- c(246, 252, 293)

# Keep participants with PDGP below 600 and drop the excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]

# Class labels: 2 for the listed parkinsonism IDs, 1 for everyone else.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2

# Keep only the features listed in the importance table of `rf` (plus PDGP).
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"),
    drop = FALSE]
data$response <- as.factor(labels)

predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP

# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars, stringsAsFactors = TRUE,
    check.names = FALSE)

# Convert non-integer and non-numeric variables to factor variables.
# Done column by column: apply() would first coerce the data frame to a
# matrix via format(), which pads values (e.g. " TRUE") and corrupts levels.
factor_cols <- vapply(predictor_vars, function(x) !is.numeric(x),
    logical(1))
if (any(factor_cols)) {
    predictor_vars[factor_cols] <- lapply(predictor_vars[factor_cols],
        function(x) factor(as.character(x)))
}

# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
    select(where(~n_distinct(.) > 1))

# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)

# Remove variables having infinite values; drop = FALSE keeps a data frame
# even when a single column survives.
finite_cols <- vapply(predictor_vars, function(x) {
    if (is.factor(x)) TRUE else is.finite(sum(x))
}, logical(1))
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]

# Select rows with no NAs in the response variable. The mask is computed
# once, before any vector is filtered, so predictors, response and PDGP
# stay aligned.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]

# Normalize (caret::preProcess with its default method)
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# Resulting columns: PDGP, the predictors, and the label as `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)

# Split the PD and parkinsonism rows into `num_folds` disjoint groups, each
# holding a random share of both classes.
#
# @param num_folds  Number of groups to create (single number >= 1).
# @param pd_rows    Data frame of PD rows; defaults to the global `PD_data`.
# @param pdism_rows Data frame of parkinsonism rows; defaults to the global
#                   `PDism_data`.
# @return A list of `num_folds` data frames. The first `num_folds - 1` groups
#         receive up to ceiling(n / num_folds) rows per class, sampled without
#         replacement; the last group receives whatever is left.
create_groups <- function(num_folds, pd_rows = PD_data,
    pdism_rows = PDism_data) {
    stopifnot(is.numeric(num_folds), length(num_folds) == 1,
        num_folds >= 1)

    # Draw up to `k` row indices from a pool of `n` rows; never asks for more
    # rows than remain and returns nothing for an empty pool.
    pick <- function(n, k) {
        if (n > 0) sample.int(n, size = min(k, n)) else integer(0)
    }

    groups <- vector("list", num_folds)
    pd_size <- ceiling(nrow(pd_rows) / num_folds)
    pdism_size <- ceiling(nrow(pdism_rows) / num_folds)

    # seq_len() yields an empty loop for num_folds == 1 (1:0 would not).
    for (i in seq_len(num_folds - 1)) {
        pd_idx <- pick(nrow(pd_rows), pd_size)
        pdism_idx <- pick(nrow(pdism_rows), pdism_size)

        groups[[i]] <- rbind(pd_rows[pd_idx, , drop = FALSE],
            pdism_rows[pdism_idx, , drop = FALSE])

        # Remove the drawn rows from the pools so groups never overlap.
        pd_rows <- pd_rows[setdiff(seq_len(nrow(pd_rows)), pd_idx), ,
            drop = FALSE]
        pdism_rows <- pdism_rows[setdiff(seq_len(nrow(pdism_rows)),
            pdism_idx), , drop = FALSE]
    }

    # The final group takes all remaining rows of both classes.
    groups[[num_folds]] <- rbind(pd_rows, pdism_rows)
    groups
}

# Build 5 independent random partitions of the data into `num_folds` groups
# and save them together with the full, PDGP-ordered data set.
#
# The label column produced by cbind() above is named `response_var`; it is
# addressed by its exact name instead of relying on `$` partial matching.
PD_data <- data[data[["response_var"]] == 1, ]
PDism_data <- data[data[["response_var"]] == 2, ]
num_folds <- 3
num_repeats <- 5

all_splits <- vector("list", num_repeats)
for (i in seq_len(num_repeats)) {
    groups <- create_groups(num_folds = num_folds)
    all_splits[[i]] <- groups
}

# Recombine both classes and sort by participant ID before saving.
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_PD_PDism_splits_tug.RData")

3.2 Mild PD and parkinsonism participants

# Re-initialise the RNG as if no seed had been set, so the splits drawn
# later differ from run to run.
set.seed(NULL)
# NOTE(review): these files are expected to provide `rf` and `data_early`,
# which are used below but never created in this script -- confirm.
load("./rdata/HY_PDism_early_tug.RData")
load("./rdata/HY_early_sensor_features_all_tasks.RData")

data <- data_early
# Rename feature columns ("_turn" -> "_Turn", then every "_t" -> ".t");
# presumably so they match the row names of the importance table of `rf`.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")

# Participant IDs (PDGP) labelled as parkinsonism, and IDs dropped entirely.
parkinsonism_ids <- c(38, 168, 194, 199, 207, 212, 214, 223, 224,
    235, 241, 259, 261, 263, 265, 268, 297, 298)
excluded_PD <- c(246, 252, 293)

# Keep participants with PDGP below 600 and drop the excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]

# Class labels: 2 for the listed parkinsonism IDs, 1 for everyone else.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2

# Keep only the features listed in the importance table of `rf` (plus PDGP).
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"),
    drop = FALSE]
data$response <- as.factor(labels)

predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP

# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars, stringsAsFactors = TRUE,
    check.names = FALSE)

# Convert non-integer and non-numeric variables to factor variables.
# Done column by column: apply() would first coerce the data frame to a
# matrix via format(), which pads values (e.g. " TRUE") and corrupts levels.
factor_cols <- vapply(predictor_vars, function(x) !is.numeric(x),
    logical(1))
if (any(factor_cols)) {
    predictor_vars[factor_cols] <- lapply(predictor_vars[factor_cols],
        function(x) factor(as.character(x)))
}

# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
    select(where(~n_distinct(.) > 1))

# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)

# Remove variables having infinite values; drop = FALSE keeps a data frame
# even when a single column survives.
finite_cols <- vapply(predictor_vars, function(x) {
    if (is.factor(x)) TRUE else is.finite(sum(x))
}, logical(1))
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]

# Select rows with no NAs in the response variable. The mask is computed
# once, before any vector is filtered, so predictors, response and PDGP
# stay aligned.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]

# Normalize (caret::preProcess with its default method)
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# Resulting columns: PDGP, the predictors, and the label as `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)

# Split the PD and parkinsonism rows into `num_folds` disjoint groups, each
# holding a random share of both classes.
#
# @param num_folds  Number of groups to create (single number >= 1).
# @param pd_rows    Data frame of PD rows; defaults to the global `PD_data`.
# @param pdism_rows Data frame of parkinsonism rows; defaults to the global
#                   `PDism_data`.
# @return A list of `num_folds` data frames. The first `num_folds - 1` groups
#         receive up to ceiling(n / num_folds) rows per class, sampled without
#         replacement; the last group receives whatever is left.
create_groups <- function(num_folds, pd_rows = PD_data,
    pdism_rows = PDism_data) {
    stopifnot(is.numeric(num_folds), length(num_folds) == 1,
        num_folds >= 1)

    # Draw up to `k` row indices from a pool of `n` rows; never asks for more
    # rows than remain and returns nothing for an empty pool.
    pick <- function(n, k) {
        if (n > 0) sample.int(n, size = min(k, n)) else integer(0)
    }

    groups <- vector("list", num_folds)
    pd_size <- ceiling(nrow(pd_rows) / num_folds)
    pdism_size <- ceiling(nrow(pdism_rows) / num_folds)

    # seq_len() yields an empty loop for num_folds == 1 (1:0 would not).
    for (i in seq_len(num_folds - 1)) {
        pd_idx <- pick(nrow(pd_rows), pd_size)
        pdism_idx <- pick(nrow(pdism_rows), pdism_size)

        groups[[i]] <- rbind(pd_rows[pd_idx, , drop = FALSE],
            pdism_rows[pdism_idx, , drop = FALSE])

        # Remove the drawn rows from the pools so groups never overlap.
        pd_rows <- pd_rows[setdiff(seq_len(nrow(pd_rows)), pd_idx), ,
            drop = FALSE]
        pdism_rows <- pdism_rows[setdiff(seq_len(nrow(pdism_rows)),
            pdism_idx), , drop = FALSE]
    }

    # The final group takes all remaining rows of both classes.
    groups[[num_folds]] <- rbind(pd_rows, pdism_rows)
    groups
}

# Build 5 independent random partitions of the data into `num_folds` groups
# and save them together with the full, PDGP-ordered data set.
#
# The label column produced by cbind() above is named `response_var`; it is
# addressed by its exact name instead of relying on `$` partial matching.
PD_data <- data[data[["response_var"]] == 1, ]
PDism_data <- data[data[["response_var"]] == 2, ]
num_folds <- 3
num_repeats <- 5

all_splits <- vector("list", num_repeats)
for (i in seq_len(num_repeats)) {
    groups <- create_groups(num_folds = num_folds)
    all_splits[[i]] <- groups
}

# Recombine both classes and sort by participant ID before saving.
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_early_PDism_splits_tug.RData")

3.3 Moderate PD and parkinsonism participants

# Re-initialise the RNG as if no seed had been set, so the splits drawn
# later differ from run to run.
set.seed(NULL)
# NOTE(review): these files are expected to provide `rf` and `data_mild`,
# which are used below but never created in this script -- confirm.
load("./rdata/HY_PDism_mild_tug.RData")
load("./rdata/HY_mild_sensor_features_all_tasks.RData")

data <- data_mild
# Rename feature columns ("_turn" -> "_Turn", then every "_t" -> ".t");
# presumably so they match the row names of the importance table of `rf`.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")

# Participant IDs (PDGP) labelled as parkinsonism, and IDs dropped entirely.
parkinsonism_ids <- c(38, 168, 194, 199, 207, 212, 214, 223, 224,
    235, 241, 259, 261, 263, 265, 268, 297, 298)
excluded_PD <- c(246, 252, 293)

# Keep participants with PDGP below 600 and drop the excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]

# Class labels: 2 for the listed parkinsonism IDs, 1 for everyone else.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2

# Keep only the features listed in the importance table of `rf` (plus PDGP).
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"),
    drop = FALSE]
data$response <- as.factor(labels)

predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP

# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars, stringsAsFactors = TRUE,
    check.names = FALSE)

# Convert non-integer and non-numeric variables to factor variables.
# Done column by column: apply() would first coerce the data frame to a
# matrix via format(), which pads values (e.g. " TRUE") and corrupts levels.
factor_cols <- vapply(predictor_vars, function(x) !is.numeric(x),
    logical(1))
if (any(factor_cols)) {
    predictor_vars[factor_cols] <- lapply(predictor_vars[factor_cols],
        function(x) factor(as.character(x)))
}

# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
    select(where(~n_distinct(.) > 1))

# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)

# Remove variables having infinite values; drop = FALSE keeps a data frame
# even when a single column survives.
finite_cols <- vapply(predictor_vars, function(x) {
    if (is.factor(x)) TRUE else is.finite(sum(x))
}, logical(1))
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]

# Select rows with no NAs in the response variable. The mask is computed
# once, before any vector is filtered, so predictors, response and PDGP
# stay aligned.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]

# Normalize (caret::preProcess with its default method)
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# Resulting columns: PDGP, the predictors, and the label as `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)

# Split the PD and parkinsonism rows into `num_folds` disjoint groups, each
# holding a random share of both classes.
#
# @param num_folds  Number of groups to create (single number >= 1).
# @param pd_rows    Data frame of PD rows; defaults to the global `PD_data`.
# @param pdism_rows Data frame of parkinsonism rows; defaults to the global
#                   `PDism_data`.
# @return A list of `num_folds` data frames. The first `num_folds - 1` groups
#         receive up to ceiling(n / num_folds) rows per class, sampled without
#         replacement; the last group receives whatever is left.
create_groups <- function(num_folds, pd_rows = PD_data,
    pdism_rows = PDism_data) {
    stopifnot(is.numeric(num_folds), length(num_folds) == 1,
        num_folds >= 1)

    # Draw up to `k` row indices from a pool of `n` rows; never asks for more
    # rows than remain and returns nothing for an empty pool.
    pick <- function(n, k) {
        if (n > 0) sample.int(n, size = min(k, n)) else integer(0)
    }

    groups <- vector("list", num_folds)
    pd_size <- ceiling(nrow(pd_rows) / num_folds)
    pdism_size <- ceiling(nrow(pdism_rows) / num_folds)

    # seq_len() yields an empty loop for num_folds == 1 (1:0 would not).
    for (i in seq_len(num_folds - 1)) {
        pd_idx <- pick(nrow(pd_rows), pd_size)
        pdism_idx <- pick(nrow(pdism_rows), pdism_size)

        groups[[i]] <- rbind(pd_rows[pd_idx, , drop = FALSE],
            pdism_rows[pdism_idx, , drop = FALSE])

        # Remove the drawn rows from the pools so groups never overlap.
        pd_rows <- pd_rows[setdiff(seq_len(nrow(pd_rows)), pd_idx), ,
            drop = FALSE]
        pdism_rows <- pdism_rows[setdiff(seq_len(nrow(pdism_rows)),
            pdism_idx), , drop = FALSE]
    }

    # The final group takes all remaining rows of both classes.
    groups[[num_folds]] <- rbind(pd_rows, pdism_rows)
    groups
}

# Build 5 independent random partitions of the data into `num_folds` groups
# and save them together with the full, PDGP-ordered data set.
#
# The label column produced by cbind() above is named `response_var`; it is
# addressed by its exact name instead of relying on `$` partial matching.
PD_data <- data[data[["response_var"]] == 1, ]
PDism_data <- data[data[["response_var"]] == 2, ]
num_folds <- 3
num_repeats <- 5

all_splits <- vector("list", num_repeats)
for (i in seq_len(num_repeats)) {
    groups <- create_groups(num_folds = num_folds)
    all_splits[[i]] <- groups
}

# Recombine both classes and sort by participant ID before saving.
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_mild_PDism_splits_tug.RData")

3.4 Severe PD and parkinsonism participants

# Re-initialise the RNG as if no seed had been set, so the splits drawn
# later differ from run to run.
set.seed(NULL)
# NOTE(review): these files are expected to provide `rf` and `data_severe`,
# which are used below but never created in this script -- confirm.
load("./rdata/HY_PDism_severe_tug.RData")
load("./rdata/HY_severe_sensor_features_all_tasks.RData")

data <- data_severe
# Rename feature columns ("_turn" -> "_Turn", then every "_t" -> ".t");
# presumably so they match the row names of the importance table of `rf`.
colnames(data) <- str_replace_all(colnames(data), "_turn", "_Turn")
colnames(data) <- str_replace_all(colnames(data), "_t", ".t")

# Participant IDs (PDGP) labelled as parkinsonism, and IDs dropped entirely.
parkinsonism_ids <- c(38, 168, 194, 199, 207, 212, 214, 223, 224,
    235, 241, 259, 261, 263, 265, 268, 297, 298)
excluded_PD <- c(246, 252, 293)

# Keep participants with PDGP below 600 and drop the excluded IDs.
data <- data[data$PDGP < 600, ]
data <- data[!data$PDGP %in% excluded_PD, ]

# Class labels: 2 for the listed parkinsonism IDs, 1 for everyone else.
labels <- rep(1, nrow(data))
labels[data$PDGP %in% parkinsonism_ids] <- 2

# Keep only the features listed in the importance table of `rf` (plus PDGP).
imp_data <- data.frame(randomForest::importance(rf)[, 1, drop = FALSE])
data <- data[, colnames(data) %in% c(rownames(imp_data), "PDGP"),
    drop = FALSE]
data$response <- as.factor(labels)

predictor_vars <- subset(data, select = -c(PDGP, response))
response_var <- data$response
PDGP <- data$PDGP

# Convert predictors to data frame
predictor_vars <- data.frame(predictor_vars, stringsAsFactors = TRUE,
    check.names = FALSE)

# Convert non-integer and non-numeric variables to factor variables.
# Done column by column: apply() would first coerce the data frame to a
# matrix via format(), which pads values (e.g. " TRUE") and corrupts levels.
factor_cols <- vapply(predictor_vars, function(x) !is.numeric(x),
    logical(1))
if (any(factor_cols)) {
    predictor_vars[factor_cols] <- lapply(predictor_vars[factor_cols],
        function(x) factor(as.character(x)))
}

# Remove variables with constant values (including all NAs)
predictor_vars <- predictor_vars %>%
    select(where(~n_distinct(.) > 1))

# Impute missing values
predictor_vars <- na.roughfix(predictor_vars)

# Remove variables having infinite values; drop = FALSE keeps a data frame
# even when a single column survives.
finite_cols <- vapply(predictor_vars, function(x) {
    if (is.factor(x)) TRUE else is.finite(sum(x))
}, logical(1))
predictor_vars <- predictor_vars[, finite_cols, drop = FALSE]

# Select rows with no NAs in the response variable. The mask is computed
# once, before any vector is filtered, so predictors, response and PDGP
# stay aligned.
keep_rows <- !is.na(response_var)
predictor_vars <- predictor_vars[keep_rows, , drop = FALSE]
response_var <- response_var[keep_rows]
PDGP <- PDGP[keep_rows]

# Normalize (caret::preProcess with its default method)
norm <- preProcess(predictor_vars)
predictor_vars <- predict(norm, predictor_vars)
# Resulting columns: PDGP, the predictors, and the label as `response_var`.
data <- cbind(PDGP, predictor_vars, response_var)

# Split the PD and parkinsonism rows into `num_folds` disjoint groups, each
# holding a random share of both classes.
#
# @param num_folds  Number of groups to create (single number >= 1).
# @param pd_rows    Data frame of PD rows; defaults to the global `PD_data`.
# @param pdism_rows Data frame of parkinsonism rows; defaults to the global
#                   `PDism_data`.
# @return A list of `num_folds` data frames. The first `num_folds - 1` groups
#         receive up to ceiling(n / num_folds) rows per class, sampled without
#         replacement; the last group receives whatever is left.
create_groups <- function(num_folds, pd_rows = PD_data,
    pdism_rows = PDism_data) {
    stopifnot(is.numeric(num_folds), length(num_folds) == 1,
        num_folds >= 1)

    # Draw up to `k` row indices from a pool of `n` rows; never asks for more
    # rows than remain and returns nothing for an empty pool.
    pick <- function(n, k) {
        if (n > 0) sample.int(n, size = min(k, n)) else integer(0)
    }

    groups <- vector("list", num_folds)
    pd_size <- ceiling(nrow(pd_rows) / num_folds)
    pdism_size <- ceiling(nrow(pdism_rows) / num_folds)

    # seq_len() yields an empty loop for num_folds == 1 (1:0 would not).
    for (i in seq_len(num_folds - 1)) {
        pd_idx <- pick(nrow(pd_rows), pd_size)
        pdism_idx <- pick(nrow(pdism_rows), pdism_size)

        groups[[i]] <- rbind(pd_rows[pd_idx, , drop = FALSE],
            pdism_rows[pdism_idx, , drop = FALSE])

        # Remove the drawn rows from the pools so groups never overlap.
        pd_rows <- pd_rows[setdiff(seq_len(nrow(pd_rows)), pd_idx), ,
            drop = FALSE]
        pdism_rows <- pdism_rows[setdiff(seq_len(nrow(pdism_rows)),
            pdism_idx), , drop = FALSE]
    }

    # The final group takes all remaining rows of both classes.
    groups[[num_folds]] <- rbind(pd_rows, pdism_rows)
    groups
}

# Build 5 independent random partitions of the data into `num_folds` groups
# and save them together with the full, PDGP-ordered data set.
#
# The label column produced by cbind() above is named `response_var`; it is
# addressed by its exact name instead of relying on `$` partial matching.
PD_data <- data[data[["response_var"]] == 1, ]
PDism_data <- data[data[["response_var"]] == 2, ]
num_folds <- 3
num_repeats <- 5

all_splits <- vector("list", num_repeats)
for (i in seq_len(num_repeats)) {
    groups <- create_groups(num_folds = num_folds)
    all_splits[[i]] <- groups
}

# Recombine both classes and sort by participant ID before saving.
data <- rbind(PD_data, PDism_data)
data <- data[order(data$PDGP), ]
save(data, all_splits, file = "./rdata/var_reduct_HY_severe_PDism_splits_tug.RData")