Section 2 Feature engineering
library(lomb)
library(infotheo)
library(moments)
library(signal)
library(pracma)
library(seewave)
library(e1071)
library(Metrics)
library(nonlinearTseries)
library(musclesyneRgies)
library(fractaldim)
library(Rdimtools)
library(readxl)
library(psych)
library(randomForest)
library(stringr)
sampling_rate = 100
max_freq = sampling_rate/2
num_pairs = 6
fft_peaks = 10 # fft or lsp
frequency_features = 24
time_features = 47
cross_features = 4
fft_features = fft_peaks * 2 # for fft and lsp
num_channels = 6
features_per_channel = fft_features + frequency_features + time_features
features_per_task = num_channels * features_per_channel + num_pairs *
cross_features
num_sub_tasks_TUGs = 6
num_sub_tasks_walk = 7
num_cases = 3002.1 Define time-domain and frequency-domain features
# Get top 10 FFT peaks
fourier_peaks <- function(signal) {
sig_fft <- fft(signal)
L = length(signal)
amplitude <- abs(sig_fft/L)[1:(L/2 + 1)]
frequencies <- sampling_rate * (0:(L/2))/L
sorted <- sort.int(amplitude, decreasing = TRUE, index.return = TRUE)
top <- sorted$ix[1:fft_peaks] # indexes of the largest n components
return(frequencies[top]) # convert indexes to frequencies
}
# Get top 10 LSP peaks
lsp_peaks <- function(signal) {
X.k <- lsp(signal, plot = FALSE)
power = X.k$power
frequencies = X.k$scanned * 100
sorted <- sort.int(power, decreasing = TRUE, index.return = TRUE)
top <- sorted$ix[1:fft_peaks] # indexes of the largest n components
return(frequencies[top]) # convert indexes to frequencies
}
# Transform to frequency domain
to_fft <- function(signal) {
sig_fft <- fft(signal)
L = length(signal)
sig_fft = abs(sig_fft/L)[1:(L/2 + 1)]
f = sampling_rate * (0:(L/2))/L
return(cbind(f, sig_fft))
}
# Calculate frequency domain features
frequency_measures <- function(signal) {
# return frequencies and power amplitudes
signal_to_freq = to_fft(signal)
freq = signal_to_freq[, 1]
ampl = signal_to_freq[, 2]
# mean frequency
MNF = sum(freq * ampl)/sum(ampl)
# median frequency
median_ampl = median(ampl)
MDF = freq[which.min(abs(ampl - median_ampl))]
# maximum to minimum drop in power density
N = 13
dpr_filter = rep(1/N, N)
mean_psd = na.omit(stats::filter(ampl, dpr_filter, sides = 2))
DPR = max(mean_psd)/min(mean_psd)
# signal to noise ratio
N = round(length(freq)/5)
index_f_upper = (length(freq) - N + 1):length(freq)
N = length(index_f_upper)
noise_power = sum(ampl[index_f_upper])/N * length(ampl)
total_power = sum(ampl)
SNR = total_power/noise_power
# power spectrum deformation
M0 = sum(ampl)
M1 = sum(ampl * freq)
M2 = sum(ampl * freq^2)
PSD = sqrt(M2/M0)/(M1/M0)
# freeze index
j_half = which.min(abs(freq - 0.5))
j_three = which.min(abs(freq - 3))
j_eight = which.min(abs(freq - 8))
FI = sum(ampl[j_three:j_eight])/sum(ampl[j_half:j_three])
# entropy
prob = ampl/sum(ampl)
ENT = -sum(prob * log(prob))
# total power
TTP = sum(ampl)
# mean power
MNP = sum(ampl)/length(ampl)
# peak frequency
PKF = freq[which.max(ampl)]
# peak Power
PKP = max(ampl)
# frequency Ratio
non_zero_ampl = (ampl >= 0.001)
last_non_zero_indx = length(ampl) - match(TRUE, rev(non_zero_ampl)) +
1
max_freq = freq[last_non_zero_indx]
min_freq = freq[2]
FR = max_freq/min_freq
# power spectrum ratio
max_indx = which.max(ampl)
n = ifelse(max_indx > 10, 10, max_indx - 1)
n = ifelse(n > 1, n, 0)
PSR = sum(ampl[max_indx - n:max_indx + n])/sum(ampl)
# spectral moments
SM1 = sum(ampl * freq)
SM2 = sum(ampl * freq^2)
SM3 = sum(ampl * freq^3)
# variance
VR = var(ampl)
# standard deviation
SD = sd(ampl)
# skewness
SS = skewness(ampl)
# kurtosis
SK = kurtosis(ampl)
# spectral bandwidth
SBW = sum((freq - MNF)^2 * ampl)/sum(ampl)
# spectral roll-off
non_zero_ampl = ampl[ampl > 0]
SR = 0.95 * sum(non_zero_ampl)
# variance of central frequency
SM0 = sum(ampl)
VCF = (SM2/SM0) - (SM1/SM0)^2
# mean spectral energy
MSE = mean((abs(ampl))^2)
return(c(MNF, MDF, DPR, SNR, PSD, FI, ENT, TTP, MNP, PKF,
PKP, FR, PSR, SM1, SM2, SM3, VR, SD, SS, SK, SBW, SR,
VCF, MSE))
}
# Calculate time domain features
time_measures <- function(signal) {
signal = scale(signal, center = TRUE, scale = FALSE)
N = length(signal)
# mean
MN = mean(signal)
# variance
VR = var(signal)
# standard deviation
SD = sd(signal)
# skewness
SS = skewness(signal)
# kurtosis
SK = kurtosis(signal)
# integrated absolute value
IAV = sum(abs(signal))
# mean absolute value
MAV = mean(abs(signal))
# simple square interval
SSI = sum((abs(signal))^2)
# root mean square
RMS = sqrt(mean(signal^2))
# V-order 2 and 3
V2 = (mean(signal^2))^(1/2)
V3 = (mean((abs(signal))^3))^(1/3)
# waveform length
WL = sum(abs(diff(signal)))
# average amplitude change
AAC = mean(abs(diff(signal)))
# difference absolute standard deviation value
DASDV = sqrt((sum((diff(signal))^2))/(N - 1))
# maximum fractal length
MFL = log(sqrt(sum((diff(signal))^2)))
# zero crossing
ZC = sum(diff(sign(signal)) != 0)
# rate of change
RC = sum(abs(diff(signal)) >= 0)
# slope sign change
SSC = sum((diff(signal)[1:(N - 2)] * diff(signal[2:N]) *
-1) >= 0)
# data range
DR = max(signal) - min(signal)
# entropy
ENT = entropy(entropy::discretize(signal, numBins = 10))
# log detector
LOG = exp(mean(log(abs(signal))))
# mean absolute deviation
MAD = mean(abs(signal - MN))
# percentiles
Qs = quantile(signal, names = FALSE)
Q1 = Qs[2]
Q2 = Qs[3]
Q3 = Qs[4]
# inter-quartile range
IQR = Q3 - Q1
# CV
CV = (Q3 - Q1)/Q2
# median
MED = median(signal)
# mode
MOD <- Mode(signal)
# Teager-Kaiser energy operator
TKEO = mean(TKEO(signal, f = sampling_rate, plot = F)[, 2],
na.rm = T)
# auto-regressive coefficients
ar_model = ar(signal, aic = FALSE, order.max = 4)
ar_coeff = ar_model$ar
AR_1 = ar_coeff[1]
AR_2 = ar_coeff[2]
AR_3 = ar_coeff[3]
AR_4 = ar_coeff[4]
# box-counting dimension
box_dim = est.boxcount(signal)$estdim
# detrended fluctuation Analysis
dfa_model = dfa(time.series = signal, do.plot = FALSE)
DFA = estimate(dfa_model, do.plot = FALSE)
# Higuchi's fractal dimension
HFD = HFD(signal)$Higuchi
# Katz’s fractal dimension
T = 1/sampling_rate
t = (0:(length(signal) - 1)) * T
L = sum(sqrt(diff(t)^2 + diff(signal)^2))
a = mean(sqrt(diff(t)^2 + diff(signal)^2))
d = max(sapply(signal, function(x) sqrt((x - signal[1])^2 +
T^2)))
KATZ = log(L/a)/log(d/a)
# modified mean absolute value type 1
N = length(signal)
w_i = rep(0.5, N)
w_i[(0.25 * N):(0.75 * N)] = 1
MAV1 = mean(abs(w_i * signal))
# modified mean absolute value type 2
N = length(signal)
w_i = vector()
for (i in 1:N) {
if (i >= 0.25 * N && i <= 0.75 * N) {
w = 1
} else if (i < 0.25 * N) {
w = 4 * i/N
} else {
w = 4 * (i - N)/N
}
w_i = c(w_i, w)
}
MAV2 = mean(abs(w_i * signal))
# mean absolute value slope
channel_sub1 = signal[1:(N/2)]
channel_sub2 = signal[(N/2 + 1):N]
MAV_sub1 = mean(abs(channel_sub1))
MAV_sub2 = mean(abs(channel_sub2))
MAVS = MAV_sub2 - MAV_sub1
# mean binarized values
threshold = 0.5
f = rep(0, length(signal))
f[abs(signal) >= threshold] = 1
MBV = sum(f)/N
# absolute temporal moment
TM4 = mean(signal^4)
# variation fractal dimension
VFD = fd.estim.variation(signal)$fd
# maximum
MAX = max(signal)
# minimum
MIN = min(signal)
# geometric mean
GMN = exp(mean(log(signal[signal > 0])))
# harmonic mean
HMN = harmonic.mean(signal)
# median absolute deviation
MDAD = median(abs(signal - MED))
return(c(MN, VR, SD, SS, SK, IAV, MAV, SSI, RMS, V2, V3,
WL, AAC, DASDV, MFL, ZC, RC, SSC, DR, ENT, LOG, MAD,
Q1, Q3, IQR, CV, MED, MOD, TKEO, AR_1, AR_2, AR_3, AR_4,
DFA, HFD, KATZ, MAV1, MAV2, MAVS, MBV, TM4, VFD, MAX,
MIN, GMN, HMN, MDAD))
}
# Calculate cross time domain features
cross_measures <- function(signal1, signal2) {
dis1 = entropy::discretize(signal1, numBins = 5)
dis2 = entropy::discretize(signal2, numBins = 5)
# cross entropy
prob1 = dis1/sum(dis1)
prob2 = dis2/sum(dis2)
ENT = -sum(prob1 * log(prob2))
# cross correlation function
cc = ccf(signal1, signal2, plot = FALSE)
acf = cc$acf
CCP = acf[which.max(abs(acf))]
CCL = cc$lag[which.max(abs(acf))]
# Mutulal information
MI = mutinformation(dis1, dis2)
return(c(ENT, CCP, CCL, MI))
}
# Get cross measures between each pair of x,y,z channels
get_cross_measures <- function(xyz) {
cross_measures_vec = cross_measures(xyz[, 1], xyz[, 2])
cross_measures_vec = c(cross_measures_vec, cross_measures(xyz[,
1], xyz[, 3]))
cross_measures_vec = c(cross_measures_vec, cross_measures(xyz[,
2], xyz[, 3]))
return(cross_measures_vec)
}2.2 Build features table
# Extract time domain and frequency domain features for
# each subject and a complex task
calculate_features_complex <- function(segmented_folder) {
if (segmented_folder == "walk_seg") {
num_sub_tasks = num_sub_tasks_walk
} else {
# TUG
num_sub_tasks = num_sub_tasks_TUGs
}
# create empty data frame
feature_mat = data.frame(matrix(nrow = 0, ncol = features_per_task *
num_sub_tasks + 1))
indx = 0
# loop over all cases
subj_start = 1
subj_end = num_cases
start_id = 0
# loop over subjects
for (i in subj_start:subj_end) {
current_path = "./data/"
sub_folder = paste0("s", i)
current_path = paste0(current_path, sub_folder, "/",
segmented_folder)
files <- list.files(path = current_path, pattern = "*.txt",
full.names = TRUE, recursive = FALSE)
if (length(files) > 0) {
indx = indx + 1
current_row = i + start_id
# loop over components of a task and calculate
# features
for (f in 1:num_sub_tasks) {
file_name = files[f]
channels <- read.table(file_name, header = FALSE) # load file
names(channels) <- c("ax", "ay", "az", "gx",
"gy", "gz")
# loop over channels
for (j in 1:num_channels) {
channel = channels[, j]
# get FFT and LSP features
fft_top = fourier_peaks(channel)
lsp_top = lsp_peaks(channel)
fft_top = as.vector(t(fft_top))
lsp_top = as.vector(t(lsp_top))
sig_fft_lsp = c(fft_top, lsp_top)
current_row = c(current_row, sig_fft_lsp)
# get frequency domain features
freq_measures = frequency_measures(channel)
current_row = c(current_row, freq_measures)
# get time domain features
t_measures = time_measures(channel)
current_row = c(current_row, t_measures)
}
# get cross time domain features
channel_acc = channels[, 1:3]
channel_gyro = channels[, 4:6]
current_row = c(current_row, get_cross_measures(channel_acc))
current_row = c(current_row, get_cross_measures(channel_gyro))
}
# add subject features to the data frame
feature_mat = rbind(feature_mat, current_row)
}
}
return(feature_mat)
}
# Define column names for complex tasks
get_col_names_complex <- function(task_name) {
col_names = c("PDGP")
freq_names = c("fMNF", "fMDF", "fDPR", "fSN", "fOHM", "fFI",
"fENT", "fTTP", "fMNP", "fPKF", "fPKP", "fFR", "fPSR",
"fSM1", "fSM2", "fSM3", "fVR", "fSD", "fSS", "fSK", "fSBW",
"fSR", "fVCF", "fMSE")
time_names = c("tMN", "tVR", "tSD", "tSS", "tSK", "tIAV",
"tMAV", "tSSI", "tRMS", "tV2", "tV3", "tWL", "tAAC",
"tDASDV", "tMFL", "tZC", "tRC", "tSSC", "tDR", "tENT",
"tLOG", "tMAD", "tQ1", "tQ3", "tIQR", "tCV", "tMED",
"tMOD", "tTKEO", "tAR_1", "tAR_2", "tAR_3", "tAR_4",
"tDFA", "tHFD", "tKATZ", "tMAV1", "tMAV2", "tMAVS", "tMBV",
"tTM4", "tVFD", "tMAX", "tMIN", "tGMN", "tHMN", "tMDAD")
channel_names = c("ax", "ay", "az", "gx", "gy", "gz")
cross_acc = c("axy", "axz", "ayz")
cross_gyro = c("gxy", "gxz", "gyz")
cross_names = c("ENT", "CCP", "CCL", "MI")
if (task_name == "walk") {
num_sub_tasks = num_sub_tasks_walk
tasks_names = c("_turn1", "_turn2", "_turn3", "_walk1",
"_walk2", "_walk3", "_walk4")
} else {
num_sub_tasks = num_sub_tasks_TUGs
tasks_names = c("_sTs1", "_sTs2", "_turn1", "_turn2",
"_walk1", "_walk2")
}
# loop over task components
for (t in 1:num_sub_tasks) {
task_name = tasks_names[t]
for (c in 1:num_channels) {
ch_name = channel_names[c]
for (j in 1:fft_peaks) {
peak_name = paste0("_fft", j)
col_name = paste0(ch_name, peak_name)
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
for (j in 1:fft_peaks) {
peak_name = paste0("_lsp", j)
col_name = paste0(ch_name, peak_name)
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
for (j in 1:frequency_features) {
col_name = paste0(ch_name, freq_names[j])
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
for (j in 1:time_features) {
col_name = paste0(ch_name, time_names[j])
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
}
for (j in 1:(num_pairs/2)) {
for (k in 1:cross_features) {
col_name = paste0(cross_acc[j], "_")
col_name = paste0(col_name, cross_names[k])
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
}
for (j in 1:(num_pairs/2)) {
for (k in 1:cross_features) {
col_name = paste0(cross_gyro[j], "_")
col_name = paste0(col_name, cross_names[k])
col_name = paste0(col_name, task_name)
col_names <- c(col_names, col_name)
}
}
}
return(col_names)
}
# Extract time domain and frequency domain features for
# each subject and a simple task
calculate_features <- function(task_name) {
# create empty data frame
feature_mat = data.frame(matrix(nrow = 0, ncol = features_per_task +
1))
indx = 0
subj_start = 1
subj_end = 300
start_id = 0
# loop over subjects
for (i in subj_start:subj_end) {
current_path = "./data/"
sub_folder = paste0("s", i)
current_path = paste0(current_path, sub_folder)
files <- list.files(path = current_path, pattern = "*.txt",
full.names = TRUE, recursive = FALSE)
found = FALSE
file_name = ""
if (length(files) > 0) {
if (task_name == "EyesClosed" && (length(grep("Eyes closed",
files)) > 0 || length(grep("EyesClosed", files)) >
0)) {
found = TRUE
if (length(grep("Eyes closed", files)) > 0) {
file_name = files[grep("Eyes closed", files)]
} else {
file_name = files[grep("EyesClosed", files)]
}
} else if (task_name == "EyesOpen" && (length(grep("Eyes open",
files)) > 0 || length(grep("EyesOpen", files)) >
0)) {
found = TRUE
if (length(grep("Eyes open", files)) > 0) {
file_name = files[grep("Eyes open", files)]
} else {
file_name = files[grep("EyesOpen", files)]
}
} else if (task_name == "TUG1" && length(grep("_TUG_1",
files)) > 0) {
found = TRUE
file_name = files[grep("_TUG_1", files)]
}
}
if (found) {
indx = indx + 1
print(file_name)
# read 6 channels data
channels = read.table(file_name, header = FALSE) # load file
names(channels) <- c("ax", "ay", "az", "gx", "gy",
"gz")
current_row = i + start_id
# loop over channels
for (j in 1:num_channels) {
channel = channels[, j]
if (sum(is.na(channel)) == nrow(channels)) {
current_row = c(current_row, rep(NA, features_per_channel))
} else {
# get FFT and LSP features
fft_top = fourier_peaks(channel)
lsp_top = lsp_peaks(channel)
fft_top = as.vector(t(fft_top))
lsp_top = as.vector(t(lsp_top))
sig_fft_lsp = c(fft_top, lsp_top)
current_row = c(current_row, sig_fft_lsp)
# get frequency domain features
freq_measures = frequency_measures(channel)
current_row = c(current_row, freq_measures)
# get time domain features
t_measures = time_measures(channel)
current_row = c(current_row, t_measures)
}
}
# get cross time domain features
channel_acc = channels[, 1:3]
channel_gyro = channels[, 4:6]
current_row = c(current_row, get_cross_measures(channel_acc))
current_row = c(current_row, get_cross_measures(channel_gyro))
# add subject features to the data frame
feature_mat = rbind(feature_mat, current_row)
}
}
return(feature_mat)
}
# Define column names for simple tasks
get_col_names <- function() {
col_names = c("PDGP")
freq_names = c("fMNF", "fMDF", "fDPR", "fSN", "fOHM", "fFI",
"fENT", "fTTP", "fMNP", "fPKF", "fPKP", "fFR", "fPSR",
"fSM1", "fSM2", "fSM3", "fVR", "fSD", "fSS", "fSK", "fSBW",
"fSR", "fVCF", "fMSE")
time_names = c("tMN", "tVR", "tSD", "tSS", "tSK", "tIAV",
"tMAV", "tSSI", "tRMS", "tV2", "tV3", "tWL", "tAAC",
"tDASDV", "tMFL", "tZC", "tRC", "tSSC", "tDR", "tENT",
"tLOG", "tMAD", "tQ1", "tQ3", "tIQR", "tCV", "tMED",
"tMOD", "tTKEO", "tAR_1", "tAR_2", "tAR_3", "tAR_4",
"tDFA", "tHFD", "tKATZ", "tMAV1", "tMAV2", "tMAVS", "tMBV",
"tTM4", "tVFD", "tMAX", "tMIN", "tGMN", "tHMN", "tMDAD")
cross_names = c("ENT", "CCP", "CCL", "MI")
channel_names = c("ax", "ay", "az", "gx", "gy", "gz")
cross_acc = c("axy", "axz", "ayz")
cross_gyro = c("gxy", "gxz", "gyz")
for (c in 1:num_channels) {
ch_name = channel_names[c]
for (j in 1:fft_peaks) {
peak_name = paste0("_fft", j)
col_name = paste0(ch_name, peak_name)
col_names <- c(col_names, col_name)
}
for (j in 1:fft_peaks) {
peak_name = paste0("_lsp", j)
col_name = paste0(ch_name, peak_name)
col_names <- c(col_names, col_name)
}
for (j in 1:frequency_features) {
col_name = paste0(ch_name, freq_names[j])
col_names <- c(col_names, col_name)
}
for (j in 1:time_features) {
col_name = paste0(ch_name, time_names[j])
col_names <- c(col_names, col_name)
}
}
for (j in 1:(num_pairs/2)) {
for (k in 1:cross_features) {
col_name = paste0(cross_acc[j], "_")
col_name = paste0(col_name, cross_names[k])
col_names <- c(col_names, col_name)
}
}
for (j in 1:(num_pairs/2)) {
for (k in 1:cross_features) {
col_name = paste0(cross_gyro[j], "_")
col_name = paste0(col_name, cross_names[k])
col_names <- c(col_names, col_name)
}
}
return(col_names)
}# Build features tables for simple tasks
names_tasks = c("EyesClosed", "EyesOpen")
tasks_feature_mat = list()
for (i in 1:length(names_tasks)) {
task = names_tasks[i]
feature_mat = calculate_features(task)
colnames(feature_mat) = get_col_names()
tasks_feature_mat[[i]] = feature_mat
}
# Build features tables for complex tasks
task_names = c("TUG1", "TUG2", "CogTUG1", "CogTUG2")
seg_folders = paste0(task_names, "_seg")
for (i in 1:length(seg_folders)) {
task = seg_folders[i]
feature_mat = calculate_features_complex(task)
colnames(feature_mat) = get_col_names_complex(task_names[i])
tasks_feature_mat[[i + 2]] = feature_mat
}
feature_mat = calculate_features_complex("walk_seg")
colnames(feature_mat) = get_col_names_complex("walk")
tasks_feature_mat[[7]] = feature_mat
# save features table
save(tasks_feature_mat, file = "./rdata/sensor_features.RData")load("./rdata/sensor_features.RData")
# Add demographics and rating scales variables to data
# table
add_demographics_cc <- function(data_table) {
demo_table = read_excel("./data/dynaportbase2020Nov_no PID_updated_imputed.xlsx")
demo_table$disease_dur = as.numeric(format(as.Date(demo_table$DOV,
format = "%m/%d/%Y"), "%Y")) - as.numeric(demo_table$dxs_dxyr)
demo_table = subset(demo_table, select = -c(DOV, rdate, edu,
datediff, dxs_dxyr, dxs_sxyr, deltadate, absdiff, rdate1_moca,
`MoCA date rvc`, Medication))
demo_table = demo_table[, !str_detect(colnames(demo_table),
"adf_OLD")]
demo_table = demo_table[, !str_detect(colnames(demo_table),
"kmoca")]
demo_table[, str_detect(colnames(demo_table), "updrs") &
!str_detect(colnames(demo_table), "updrs_piii") & !str_detect(colnames(demo_table),
"updrs_se") & !str_detect(colnames(demo_table), "updrs_t")] = data.frame(apply(demo_table[,
str_detect(colnames(demo_table), "updrs") & !str_detect(colnames(demo_table),
"updrs_piii") & !str_detect(colnames(demo_table),
"updrs_se") & !str_detect(colnames(demo_table), "updrs_t")],
2, as.factor), stringsAsFactors = TRUE)
demo_table[, str_detect(colnames(demo_table), "moca") & !str_detect(colnames(demo_table),
"moca_t")] = data.frame(apply(demo_table[, str_detect(colnames(demo_table),
"moca") & !str_detect(colnames(demo_table), "moca_t")],
2, as.factor), stringsAsFactors = TRUE)
demo_table[, str_detect(colnames(demo_table), "oars") & !str_detect(colnames(demo_table),
"oars_a") & !str_detect(colnames(demo_table), "oars_i") &
!str_detect(colnames(demo_table), "oars_t")] = data.frame(apply(demo_table[,
str_detect(colnames(demo_table), "oars") & !str_detect(colnames(demo_table),
"oars_a") & !str_detect(colnames(demo_table), "oars_i") &
!str_detect(colnames(demo_table), "oars_t")], 2,
as.factor), stringsAsFactors = TRUE)
demo_table[, str_detect(colnames(demo_table), "adf")] = data.frame(apply(demo_table[,
str_detect(colnames(demo_table), "adf")], 2, as.factor),
stringsAsFactors = TRUE)
demo_table$HYstg = as.factor(demo_table$HYstg)
demo_table$Race = as.factor(demo_table$Race)
demo_table$maxgender = as.factor(demo_table$maxgender)
missing_PDGP = setdiff(data_table$PDGP, demo_table$PDGP)
extended_mat = demo_table
for (id in missing_PDGP) {
insert_indx = grep(paste0("\\b", id - 1, "\\b"), extended_mat$PDGP)
extended_mat = rbind(extended_mat[1:insert_indx, ], c(id,
rep(NA, ncol(demo_table) - 1)), extended_mat[-(1:insert_indx),
])
}
extended_mat = extended_mat[extended_mat$PDGP %in% data_table$PDGP,
]
extended_mat = subset(extended_mat, select = -PDGP)
colnames(extended_mat) = paste0(colnames(extended_mat), ".demo")
vec_data = cbind(data_table, extended_mat)
return(vec_data)
}
# Exclude subjects with missing CogTUGs, then merge data
# frames of all tasks
PDGP_CogTUGs = intersect(tasks_feature_mat[[5]]$PDGP, tasks_feature_mat[[6]]$PDGP)
sensor_df = data.frame(matrix(nrow = length(PDGP_CogTUGs), ncol = 0))
sensor_df$PDGP = PDGP_CogTUGs
tasks_order = c(7, 1, 2, 3, 4, 5, 6)
for (k in 1:length(tasks_feature_mat)) {
i = tasks_order[k]
current_mat = tasks_feature_mat[[i]]
missing_PDGP = setdiff(PDGP_CogTUGs, current_mat$PDGP)
# set missing tasks with NA
extended_mat = current_mat
for (id in missing_PDGP) {
insert_indx = grep(paste0("\\b", id - 1, "\\b"), extended_mat$PDGP)
extended_mat = rbind(extended_mat[1:insert_indx, ], c(id,
rep(NA, ncol(current_mat) - 1)), extended_mat[-(1:insert_indx),
])
}
extended_mat = extended_mat[extended_mat$PDGP %in% PDGP_CogTUGs,
]
extended_mat = subset(extended_mat, select = -PDGP)
colnames(extended_mat) = paste0(colnames(extended_mat), "_t",
k)
sensor_df = cbind(sensor_df, extended_mat)
}
# Extend feature table with mean and difference columns of
# TUG and CogTUG
tug1 = sensor_df[, grep("_t4", colnames(sensor_df))]
tug2 = sensor_df[, grep("_t5", colnames(sensor_df))]
mean_tug = (tug1 + tug2)/2
colnames(mean_tug) = paste0(colnames(mean_tug), "t5")
diff_tug = tug1 - tug2
colnames(diff_tug) = paste0(colnames(diff_tug), "t5_diff")
cogtug1 = sensor_df[, grep("_t6", colnames(sensor_df))]
cogtug2 = sensor_df[, grep("_t7", colnames(sensor_df))]
mean_cogtug = (cogtug1 + cogtug2)/2
colnames(mean_cogtug) = paste0(colnames(mean_cogtug), "t7")
diff_cogtug = cogtug1 - cogtug2
colnames(diff_cogtug) = paste0(colnames(diff_cogtug), "t7_diff")
sensor_df = cbind(sensor_df, mean_tug, diff_tug, mean_cogtug,
diff_cogtug)
# Add demographics and rating scales variables
sensor_df = add_demographics_cc(sensor_df)
# save merged features table
save(sensor_df, file = "./rdata/sensor_features_all_tasks.RData")2.3 Feature reduction
# AIC calculations
crossEntropy = function(y_true, y_pred, eps = .Machine$double.xmin) {
y_pred = y_pred/apply(y_pred, 1, sum)
y_pred = pmax(pmin(y_pred, 1 - eps), eps)
y_true = as.numeric(y_true == levels(y_true)[-1])
H = -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
return(H)
}
clfAIC = function(y_true, y_pred, k, eps = .Machine$double.xmin) {
H = crossEntropy(y_true, y_pred, eps)
AIC = 2 * length(y_true) * H + 2 * k
return(AIC)
}
# Find elbow point
elbowRule = function(x, y = NULL, norm = F, nonegative = T) {
# account for extreme input cases of having constant
# values or no values
if (length(unique(x)) == 1 || length(x) == 0) {
return(NA)
}
n = length(x)
if (is.null(y)) {
x = sort(x)
y = 1:n
} else {
stopifnot(n == length(y))
y = y[order(x)]
x = sort(x)
}
if (nonegative) {
n = sum(x >= 0)
y = y[x >= 0]
x = x[x >= 0]
}
if (norm) {
x0 = (x - min(x))/(max(x) - min(x))
y0 = (y - min(y))/(max(y) - min(y))
} else {
x0 = x
y0 = y
}
m = (y0[n] - y0[1])/(x0[n] - x0[1]) # calculate slope
b = y0[1] - m * x0[1] # calculate intercept
dist_norm = sapply(1:n, function(i) {
abs(-1 * m * x0[i] + y0[i] - b)
})
return(x[which.max(dist_norm)])
}# Apply forward selection and return RF with selected
# features
forward_select <- function(predictor_vars, response_var) {
n = nrow(predictor_vars)
K = ncol(predictor_vars)
# downsample to the smaller class size
count = t(as.data.frame(table(response_var))[, 2])
min_size = min(count)
num_classes = length(unique(response_var))
sampsize = rep(min_size, num_classes)
# build initial RF
rf = randomForest(x = predictor_vars, y = response_var, ntree = 1e+05,
importance = TRUE, sampsize = sampsize, proximity = TRUE)
y_pred = rf$votes
aic = clfAIC(y_true = response_var, y_pred = y_pred, k = K)
min_aic = aic
# get initial variable importance
var_imp = getVarImp(rf)
var_imp_ = var_imp[, 1]
best_features = rownames(var_imp)
best_rf = rf
# find elbow point
threshold = which(var_imp_ == elbowRule(var_imp_))[1]
# train multiple RF using subsets of important
# variables before the elbow point
res = for (i in 1:threshold) {
predictor_vars_ = predictor_vars[, colnames(predictor_vars) %in%
rownames(var_imp)[1:i], drop = FALSE]
rf = randomForest(x = predictor_vars_, y = response_var,
ntree = 1000, importance = TRUE, proximity = TRUE)
y_pred = rf$votes
aic = clfAIC(y_true = response_var, y_pred = y_pred,
k = ncol(predictor_vars_))
# find model with min AIC
if (aic < min_aic) {
min_aic = aic
var_imp = getVarImp(rf)
best_features = rownames(var_imp)
best_rf = rf
}
}
return(best_rf)
}2.3.1 PD and parkinsonism participants
load("./rdata/sensor_features_all_tasks.RData")
parkinsonism_ids = c(38, 168, 194, 199, 207, 212, 214, 223, 224,
235, 241, 259, 261, 263, 265, 268, 297, 298)
excluded_PD = c(246, 252, 293)
## All tasks model
sensor_df_ = sensor_df[!sensor_df$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(sensor_df_))
labels[sensor_df_$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = subset(sensor_df_, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/PD_PDism.RData")
## TUG-only model
sensor_df_ = sensor_df[!sensor_df$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(sensor_df_))
labels[sensor_df_$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = sensor_df_
predictor_vars = predictor_vars[, str_detect(colnames(predictor_vars),
"PDGP") | str_detect(colnames(predictor_vars), "_t4") | str_detect(colnames(predictor_vars),
"_t5") | str_detect(colnames(predictor_vars), "_t4t5") |
str_detect(colnames(predictor_vars), "_t4t5_diff") | str_detect(colnames(predictor_vars),
".demo")]
predictor_vars = subset(predictor_vars, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/PD_PDism_tug.RData")# Split PD patient according to HY stage
data = sensor_df
case_demos = read_excel("./data/dynaportbase2020Nov_no PID_updated_imputed.xlsx")
PD_demos = case_demos[case_demos$PDGP %in% data$PDGP, c("PDGP",
"HYstg")]
PD_demos$HYstg = as.factor(PD_demos$HYstg)
missing_PDGP = setdiff(data$PDGP, PD_demos$PDGP)
extended_PD_demos = PD_demos
for (id in missing_PDGP) {
insert_indx = grep(paste0("\\b", id - 1, "\\b"), extended_PD_demos$PDGP)
extended_PD_demos = rbind(extended_PD_demos[1:insert_indx,
], c(id, rep(NA, ncol(PD_demos) - 1)), extended_PD_demos[-(1:insert_indx),
])
}
PD_demos = extended_PD_demos
PD_demos = na.roughfix(PD_demos)
PD_data = data
PD_early = PD_data[PD_demos$HYstg == 1 | PD_demos$HYstg == 1.5 |
PD_demos$HYstg == 2, ]
PD_mild = PD_data[PD_demos$HYstg == 2.5 | PD_demos$HYstg == 3,
]
PD_severe = PD_data[PD_demos$HYstg == 4, ]
data_early = PD_early
data_mild = PD_mild
data_severe = PD_severe
save(data_early, file = "./rdata/HY_early_sensor_features_all_tasks.RData")
save(data_mild, file = "./rdata/HY_mild_sensor_features_all_tasks.RData")
save(data_severe, file = "./rdata/HY_severe_sensor_features_all_tasks.RData")2.3.2 Mild PD and parkinsonism participants
load("./rdata/HY_early_sensor_features_all_tasks.RData")
## All tasks model
data_early = data_early[!data_early$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(data_early))
labels[data_early$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = subset(data_early, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_early.RData")
## TUG-only model
data_early = data_early[!data_early$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(data_early))
labels[data_early$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = data_early
predictor_vars = predictor_vars[, str_detect(colnames(predictor_vars),
"PDGP") | str_detect(colnames(predictor_vars), "_t4") | str_detect(colnames(predictor_vars),
"_t5") | str_detect(colnames(predictor_vars), "_t4t5") |
str_detect(colnames(predictor_vars), "_t4t5_diff") | str_detect(colnames(predictor_vars),
".demo")]
predictor_vars = subset(predictor_vars, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_early_tug.RData")2.3.3 Moderate PD and parkinsonism participants
load("./rdata/HY_mild_sensor_features_all_tasks.RData")
## All tasks model
data_mild = data_mild[!data_mild$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(data_mild))
labels[data_mild$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = subset(data_mild, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_mild.RData")
## TUG-only model
data_mild = data_mild[!data_mild$PDGP %in% excluded_PD, ]
labels = rep("PD", nrow(data_mild))
labels[data_mild$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = data_mild
predictor_vars = predictor_vars[, str_detect(colnames(predictor_vars),
"PDGP") | str_detect(colnames(predictor_vars), "_t4") | str_detect(colnames(predictor_vars),
"_t5") | str_detect(colnames(predictor_vars), "_t4t5") |
str_detect(colnames(predictor_vars), "_t4t5_diff") | str_detect(colnames(predictor_vars),
".demo")]
predictor_vars = subset(predictor_vars, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_mild_tug.RData")2.3.4 Severe PD and parkinsonism participants
load("./rdata/HY_severe_sensor_features_all_tasks.RData")
## All tasks model
data_severe = data_severe[!data_severe$PDGP %in% excluded_PD,
]
labels = rep("PD", nrow(data_severe))
labels[data_severe$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = subset(data_severe, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_severe.RData")
## TUG-only model
data_severe = data_severe[!data_severe$PDGP %in% excluded_PD,
]
labels = rep("PD", nrow(data_severe))
labels[data_severe$PDGP %in% parkinsonism_ids] = "PDism"
response_var = as.factor(labels)
predictor_vars = data_severe
predictor_vars = predictor_vars[, str_detect(colnames(predictor_vars),
"PDGP") | str_detect(colnames(predictor_vars), "_t4") | str_detect(colnames(predictor_vars),
"_t5") | str_detect(colnames(predictor_vars), "_t4t5") |
str_detect(colnames(predictor_vars), "_t4t5_diff") | str_detect(colnames(predictor_vars),
".demo")]
predictor_vars = subset(predictor_vars, select = -PDGP)
# Save RF with features corresponding to min AIC
best_rf = forward_select(predictor_vars, response_var)
save(best_rf, file = "./rdata/HY_PDism_severe_tug.RData")