# EMIT Main Quarantine Paper Data Analysis
# Program Objective: Using the authoritative, publishable dataset for the EMIT Quarantine study, to produce the three study tables and the figure and other supporting analysis for the manuscript
# Author: Jacob Bueno de Mesquita
# Date: September 23, 2018 - March 2019
# Summary: This script creates the Tables 1-3 (all the tables in the paper) and Figure 3 for the EMIT Main Quarantine Paper. Also generates other tables and figures that support statements made in the manuscript text, and potentially useful in SI

#### Load required packages, set working directory, and read in data file ####
library(tidyverse)
library(RcppRoll)
library(readxl)
library(knitr)
library(data.table)
library(lubridate)
library(devtools)
library(xtable)
library(DT)
library(kableExtra)
library(magrittr)
library(qwraps2)
library(arsenal)

# Print the session information (session_info() is available through devtools, loaded above) so the package versions used for this run are recorded
session_info()


# There was an issue in creating the markdown report of this file if we set this working directory because the working directory of this R script is different.
# Therefore, we are eliminating the need to use this /Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine working directory and will load in each file using its full directory. 

# Read the merged quarantine dataset; the analyses below all start from Qdata
Qdata <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/QuarantineMergedData.csv")


#### * TABLE 1 ---------------------------####
#### Overview of columns to produce for Table 1 (Donor Status) in the paper ####

# Table 1 is "Donor status". It gives for each of the 3 quarantines:
# a) Number of infected/ number of inoculated (and %)
# b) Number of symptomatic (and % of infected)
# c) Number of symptomatic, non-ILI (and % of infected)
# d) Number of ILI (and % of infected)
# e) Number of febrile (and % of infected)
# f) Number of PCR confirmed infection (and % of infected)
# g) Number of PCR confirmed infection and seroconversion (and % of infected)
# h) Number of seroconversion by HAI: MN: Either (and % of infected)
# i) Number of seroconversion prior to quarantine by HAI: MN: Both (and % of infected)

#### Table 1 (donors): a) Number of infected, number of inoculated (and % infected of inoculated) ####

# number of inoculated donors
# All Qdata rows belonging to the donor arm (Randomization_DorIRorCR == "D")
Qdata_inoculated_donors <- filter(Qdata, Randomization_DorIRorCR == "D")

# One row per quarantine, holding the number of distinct donor SubjectIDs
Qdata_inoculated_donors_table1 <- summarise(
  group_by(Qdata_inoculated_donors, QuarantineNumber),
  Number_Inoculated_Donors = n_distinct(SubjectID)
)
print(Qdata_inoculated_donors_table1)

# number of infected donors

# positive by PCR (seroconversion, or PCR positive on more than 1 day)
# Start with the donors that have at least one PCR-positive day; the seroconversion lists are merged in further down.
# A row counts as PCR positive when InfA_Ct is below 38 and is not 0
# (rows with a missing InfA_Ct fail these comparisons and are dropped by filter()).
# Result: one row per donor with the number of distinct study days that had a positive row.
Qdata_pcr_pos1_or_more_days <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", InfA_Ct < 38, InfA_Ct != 0) %>%
  group_by(SubjectID) %>%
  summarise(NumberDaysPosPCR = n_distinct(StudyDay))
print(Qdata_pcr_pos1_or_more_days)

# let's get the list with seroconversion by Microneuts (CDC serology)

# First only select the subjectIDs that were serosusceptible by MN at baseline (<80 at baseline)
# Revision, even if they were less serosusceptible, there was still a chance for seroconversion so do not filter out the <80 at baseline
#Qdata_Microneut_susceptible <- Qdata %>%
 #filter(Randomization_DorIRorCR == "D" & Microneut_VisitType == "Q baseline" & Microneutralization.Titer.to.A.Wisconsin.67.2005 < 80) %>%
 #distinct(SubjectID, .keep_all = FALSE)

# Donors flagged as seroconverted by MN (Microneut_Seroconvert == 1) on a follow-up ("F/up") row;
# keep SubjectID and QuarantineNumber from the first matching row of each donor
Qdata_Microneut_pos <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "D",
    Microneut_VisitType == "F/up",
    Microneut_Seroconvert == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_Microneut_pos)
# let's get the list with seroconversion by HAI (Glasgow serology)

# First only select the subjectIDs that were serosusceptible by HAI at baseline (<=10 at baseline)
# Revision, even if they were less serosusceptible, there was still a chance for seroconversion so do not filter out the <=10 at baseline
#Qdata_HAI_susceptible <- Qdata %>%
  #filter(Randomization_DorIRorCR == "D" & HAI_dayminus2_recodeNDA <= 10) %>%
  #distinct(SubjectID, .keep_all = FALSE)

# Donors flagged as seroconverted by HAI (HAI_Seroconversion == 1);
# keep SubjectID and QuarantineNumber from the first matching row of each donor
Qdata_HAI_pos <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "D",
    HAI_Seroconversion == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_HAI_pos)

# Merge the three evidence lists (HAI, MN, PCR) to get every donor that meets at least one positivity criterion.
# Both serology lists carry a QuarantineNumber column, so after the joins QuarantineNumber.x comes from the HAI list
# and QuarantineNumber.y from the MN list; a donor missing from a list gets NA in that list's columns.
Qdata_infected <- full_join(Qdata_HAI_pos, Qdata_Microneut_pos, by = "SubjectID") %>%
  full_join(Qdata_pcr_pos1_or_more_days, by = "SubjectID") %>%
  arrange(SubjectID)
print(Qdata_infected)

# Identify which of the donors with only a single PCR-positive day did not also seroconvert to confirm infection.
# A donor absent from the HAI / MN seroconversion lists has NA in QuarantineNumber.x / QuarantineNumber.y.
Qdata_1pcrpos_nosero <- Qdata_infected %>%
  filter(
    NumberDaysPosPCR == 1,
    is.na(QuarantineNumber.x),
    is.na(QuarantineNumber.y)
  )
print(Qdata_1pcrpos_nosero)

# Among the individuals that were PCR positive on only a single day, which study day was the positive day?
# Step 1: one row per donor and PCR-positive study day (count = number of positive rows on that day)
Qdata_pcr_pos1_or_more_days_studydays <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", InfA_Ct < 38, InfA_Ct != 0) %>%
  group_by(SubjectID, StudyDay) %>%
  summarise(count = n())
# Step 2: attach the positive study day to every donor with exactly one PCR-positive day
Qdata_pcr_pos1_day <- Qdata_infected %>%
  filter(NumberDaysPosPCR == 1) %>%
  left_join(Qdata_pcr_pos1_or_more_days_studydays, by = "SubjectID") %>%
  select(-count)
print(Qdata_pcr_pos1_day)
# Note on this outcome: all subjects that were only PCR on one day, were positive on day 2 or day 4 (not day 1)
# This may be useful if the group is interested in changing the criteria for PCR positivity such that a single PCR test would count as criteria for infection as long as it wasn't on day 0 or day 1.
# Conversation with Alex Mann leads me to believe we should keep the 2 separate day PCR positivity criteria for now.

# Protocol criteria for positivity: drop the donors whose only evidence of infection is a single PCR-positive day without seroconversion
Qdata_infected <- anti_join(Qdata_infected, Qdata_1pcrpos_nosero, by = "SubjectID")
print(Qdata_infected)

# Summarize number of infected (by any criteria) for each Q
# Qdata_infected only carries quarantine numbers from the serology lists, so first build a
# SubjectID -> QuarantineNumber lookup (first row per non-missing SubjectID) from the full dataset
Qdata_QuarantineNumbers <- Qdata %>%
  filter(!is.na(SubjectID)) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber)
# Attach the complete QuarantineNumber column to every infected donor
Qdata_infected_donors <- left_join(Qdata_infected, Qdata_QuarantineNumbers, by = "SubjectID")
# Number of distinct infected donors in each quarantine
Qdata_infected_donors_table1 <- summarise(
  group_by(Qdata_infected_donors, QuarantineNumber),
  Number_Infected_Donors = n_distinct(SubjectID)
)
print(Qdata_infected_donors_table1)

# Write the Qdata_infected_donors df out because it is needed in the data preparation for the Natural_vs_Artificial Inoculation project
# (write.csv with default arguments, so the row names are written as the first column)
write.csv(Qdata_infected_donors, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/Natural_vs_Artificial_Infection/Analytical Datasets/Qdata_Infected_Donors_by_PCR_or_Serology.csv")

# Also write the same df out to the Quarantine Transmission project
write.csv(Qdata_infected_donors, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine_Transmission/Analytical Datasets/Qdata_Infected_Donors_by_PCR_or_Serology.csv")


#### Generation of Table 1 for paper ####

# Summary table per quarantine: number infected, number inoculated, and the fraction infected of inoculated.
# The infected counts are the left side of the join, so only quarantines with at least one infected donor appear.
Qdata_table1 <- left_join(
  Qdata_infected_donors_table1,
  Qdata_inoculated_donors_table1,
  by = "QuarantineNumber"
) %>%
  mutate(Fraction_Infected_of_Inoculated = Number_Infected_Donors / Number_Inoculated_Donors)
print(Qdata_table1)

#### Table 1 (donors): b) Number of symptomatic (and % of infected) ####

# How to define "symptomatic"? (Includes both "symptomatic non-ILI" and "symptomatic ILI")
# Ben Killingley: "Symptomatic are the ones with symptoms (had to be more than 1 symptom on 1 day) but who did not reach ILI def"
# Really, this means having evidence of 2 or more symptoms that occurred together over at least 2 consecutive days, or fever at least once
# Figured out this classification criteria using information from Alex Mann and from reverse engineering the criteria from the CDC final report, which classified each volunteer (compared CDC final report with symptom profile of each volunteer)

## Note: The symptomatic, symptomatic non-ILI, ILI, and febrile counts are meant to be displayed as a fraction of the infected. However, there are symptomatic, symptomatic non-ILI, ILI, and perhaps even febrile volunteers who were not infected. Perhaps we should report those who had symptoms but were not infected in their own supplementary table?

# Based on email correspondence with Don Milton and the EMIT team on October 3, 2018 we will forgo the "symptomatic" classification in the table
# Thus, the next two sections of code (implementing versions 1 and 2) will not be further pursued and published at this time.
# Revision: Later email correspondence from October 4, 2018 resolves that we will use a milder criterion for symptomatic (and afebrile). 
# Thus, Version 2 of Symptomatic will be used and is coded in this script somewhere below.
# This was later revised and a new version called version 3 was used
# I have commented out the versions that have become obsolete over time. 

#### ## Implementing Version 1 of "Symptomatic" ####

# “Evidence of at least 2 symptoms of any grade that persist for least 2 consecutive study days, where at least two of those consecutive study days are the same days; or have fever at least once.”

# First, manipulate the data to prepare for a loop that can classify symptomatic by using self report and DPE symptoms
# Combine symptom severity measures (grades 1, 2, and 3) because grade >1 doesn't matter for this definition of symptomatic afebrile
# Symptomatic_donors_infected_grade123 <- Qdata_infected_donors %>%
#   left_join(Qdata) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | StudyDay == 3 |
#            StudyDay == 4 | StudyDay == 5 | StudyDay == 6) %>%
#   mutate(URI = runnyNose+stuffyNose+sneezing+soreThroat+DPENasalDischarge+DPEOtits+DPESinusTenderness+DPEPharyngitis,
#          LRI = cough+SOB,
#          SystemicI = headache+muscleAches+malaise) %>%
#   mutate(Febrile = Tympanic.temp..degrees.C.>37.9) %>%
#   mutate(runnyNose123 = runnyNose==1 | runnyNose==2 | runnyNose==3, 
#          stuffyNose123 = stuffyNose==1 | stuffyNose==2 | stuffyNose==3, 
#          sneezing123 = sneezing==1 | sneezing==2 | sneezing==3, 
#          soreThroat123 = soreThroat==1 | soreThroat==2 | soreThroat==3,
#          DPENasalDischarge123 = DPENasalDischarge==1 | DPENasalDischarge==2 | DPENasalDischarge==3, 
#          DPEOtits123 = DPEOtits==1 | DPEOtits==2 | DPEOtits==3, 
#          DPESinusTenderness123 = DPESinusTenderness==1 | DPESinusTenderness==2 | DPESinusTenderness==3, 
#          DPEPharyngitis123 = DPEPharyngitis==1 | DPEPharyngitis==2 | DPEPharyngitis==3,
#          cough123 = cough==1 | cough==2 | cough==3, 
#          SOB123 = SOB==1 | SOB==2 | SOB==3,
#          headache123 = headache==1 | headache==2 | headache==3, 
#          muscleAches123 = muscleAches==1 | muscleAches==2 | muscleAches==3, 
#          malaise123 = malaise==1 | malaise==2 | malaise==3) %>%
#   mutate(Febrile = as.numeric(Febrile),
#          runnyNose123 = as.numeric(runnyNose123), 
#          stuffyNose123 = as.numeric(stuffyNose123), 
#          sneezing123 = as.numeric(sneezing123), 
#          soreThroat123 = as.numeric(soreThroat123),
#          DPENasalDischarge123 = as.numeric(DPENasalDischarge123), 
#          DPEOtits123 = as.numeric(DPEOtits123), 
#          DPESinusTenderness123 = as.numeric(DPESinusTenderness123), 
#          DPEPharyngitis123 = as.numeric(DPEPharyngitis123),
#          cough123 = as.numeric(cough123), 
#          SOB123 = as.numeric(SOB123),
#          headache123 = as.numeric(headache123), 
#          muscleAches123 = as.numeric(muscleAches123), 
#          malaise123 = as.numeric(malaise123)) %>%
#   group_by(SubjectID, StudyDay, QuarantineNumber) %>%
#   summarize(Febrile = max(Febrile),
#             runnyNose123 = max(runnyNose123), 
#             stuffyNose123 = max(stuffyNose123), 
#             sneezing123 = max(sneezing123), 
#             soreThroat123 = max(soreThroat123),
#             DPENasalDischarge123 = max(DPENasalDischarge123), 
#             DPEOtits123 = max(DPEOtits123), 
#             DPESinusTenderness123 = max(DPESinusTenderness123), 
#             DPEPharyngitis123 = max(DPEPharyngitis123),
#             cough123 = max(cough123), 
#             SOB123 = max(SOB123),
#             headache123 = max(headache123), 
#             muscleAches123 = max(muscleAches123), 
#             malaise123 = max(malaise123)) %>%
#   select(SubjectID, QuarantineNumber, StudyDay, Febrile, runnyNose123, stuffyNose123, sneezing123, soreThroat123,
#          cough123, SOB123, headache123, muscleAches123, malaise123) %>%
#   ungroup()
# # The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was...
# # ... at least one symptoms (of any grade) detection per study day
# 
# # Now to select which of the subjects were symptomatic (version1) (excluding the febrile criteria for now)
# # Note: Using the breaks in the loops for efficiency. If a subject is detected as symptomatic, ...
# # ... the loop restarts on the next subjectID
# # But for this we only want to include symptoms for study days 1 to 6 so we need to cut a new df
# Symptomatic_donors_infected_grade123_day1to6 <- Symptomatic_donors_infected_grade123 %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 | StudyDay == 6)
# 
# sub <- unique(Symptomatic_donors_infected_grade123_day1to6$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_donors_infected_grade123_day1to6[Symptomatic_donors_infected_grade123_day1to6$SubjectID == subid, ]
#   temp1<-temp[,4:13]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp)-1)) {
#     for (k in 4:12) {
#       for (l in (k+1):13){
#         if (!is.na(temp[j, k]) & !is.na(temp[j, l]) & !is.na(temp[j+1, k]) & !is.na(temp[j+1, l])) {
#           if (temp[j, k] + temp[j, l] + temp[j+1, k] + temp[j+1, l] == 4) {
#             if (temp$StudyDay[j+1] == temp$StudyDay[j]+1) {
#               c_sub <- rbind(c_sub, subid)
#               token<-1
#               break  
#               }
#             }
#           }
#         if (token==1){
#           break
#         } 
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # Note that the above loop did not take into account symptoms that may have occurred before study day 1
# # One way of looking at this is that is there was a symptom that occured before day 1, ...
# # ... then that symptom should not contribute to classification criteria for symptomatic.
# # Will do another loop that implements this to see if this makes a difference at all
# sub <- unique(Symptomatic_donors_infected_grade123$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_donors_infected_grade123[Symptomatic_donors_infected_grade123$SubjectID == subid, ]
#   temp1<-temp[,4:13]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp)-1)) {
#     for (k in 4:12) {
#       for (l in (k+1):13){
#         if (!is.na(temp[j, k]) & !is.na(temp[j, l]) & !is.na(temp[j+1, k]) & !is.na(temp[j+1, l])) {
#           if (temp[j, k] + temp[j, l] + temp[j+1, k] + temp[j+1, l] == 4) {
#             if (temp$StudyDay[j+1] == temp$StudyDay[j]+1) {
#               sum1<-0
#               sum2<-0
#               for (m in 1:(tail(which(temp$StudyDay==0), n=1))){
#                 sum1<-sum1+temp[m,k]
#                 sum2<-sum1+temp[m,l]
#               }
#               if (sum1==0 & sum2==0) {
#                 c_sub2 <- rbind(c_sub2, subid)
#                 token<-1
#                 break
#               }
#             }
#           }
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # Both this loop and the less stringent one both yieleded n=26 study IDs, thus we move ahead using either version for now
# # We will have to discuss how we should work the definition in the paper. 
# 
# # The above loop prints a vector with 26 subject IDs who meet the criteria for symptomatic as...
# # ... having at least 2 symptoms on at least the same two consecutive study days
# # Note that febrile could count here as one of the two symptoms, but the only donor subjectID for where that occurred, 
# # ... also had other symptoms that would have classified them as symptomatic without looking at the febrile symptom
# 
# # But having fever at least once would also be considered as part of the symptomatic definition, so...
# # ... we will run some more lines that search for febrile cases and add them to this set of symptomatic subjectIDs...
# # ... in order to get a full list of the symptomatics
# 
# # First checking to see if there was fever in any of the donors prior to study day 1
# Febrile_before_day1 <- Symptomatic_donors_infected_grade123 %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 |StudyDay == 0) %>%
#   filter(Febrile == 1)
# # None of the infected donors had fever before day1
# # Now can look at who might have had fever during study days 1 to 6
# 
# Symptomatic_by_fever <- Symptomatic_donors_infected_grade123_day1to6 %>%
#   filter(Febrile == 1) %>%
#   select(SubjectID)
# print(Symptomatic_by_fever)
# 
# Symptomatic_V1 <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") %>%
#   full_join(Symptomatic_by_fever, by = "SubjectID") %>%
#   distinct(SubjectID, .keep_all = TRUE)
# 
# # Now adding the QuarantineNumber on to the Symptomatic_V1 df 
# # Then we can sort by Q for the table1
# Symptomatic_V1_QuarantineNumber_table1 <- Symptomatic_V1 %>%
#   left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_Symptomatic_V1 = n_distinct(SubjectID))
# 
# # Add onto Table1 the number of symptomatic by version 1 criteria and % of infected
# # Note: we are commenting this out, because the team decided on a version 3 for symptomatic to use instead.
# #Qdata_table1 <- Qdata_table1 %>%
#   #left_join(Symptomatic_V1_QuarantineNumber_table1, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_Symptomatic_V1_of_Infected = Number_Symptomatic_V1/Number_Infected_Donors)
# #print(Qdata_table1)

#### ## Implementing Version 2 of "Symptomatic" ####

# # Symptomatic_V2: “Evidence of at least 2 symptoms of any grade that do not necessarily...
# # ...need to persist for consecutive study days, nor persist for the same consecutive study days, but ... 
# # ...where each of the symptoms appeared on at least two different study days.”
# # Note: the "Symptomatic_donors_infected_grade123" df needed for this code was created in the Version 1 of Symptomatic code
# 
# # Going to implement Symptomatic_V2 for afebrile to make a well-defined milder criteria for "symptomatic afebrile"
# sub <- unique(Symptomatic_donors_infected_grade123_day1to6$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_donors_infected_grade123[Symptomatic_donors_infected_grade123$SubjectID == subid, ]
#   temp1<-temp[,4:13]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:12) {
#       for (l in (k+1):13){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#           c_sub <- rbind(c_sub, subid)
#           token<-1
#           break
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # produces a c_sub with 29 subjectIDs
# # Note that the above loop did not take into account symptoms that may have occurred before study day 1
# # One way of looking at this is that is there was a symptom that occured before day 1, ...
# # ... then that symptom should not contribute to classification criteria for symptomatic.
# # Will do another loop that implements this to see if this makes a difference at all
# sub <- unique(Symptomatic_donors_infected_grade123$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_donors_infected_grade123[Symptomatic_donors_infected_grade123$SubjectID == subid, ]
#   temp1<-temp[,4:13]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:12) {
#       for (l in (k+1):13){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#           sum1<-0
#           sum2<-0
#           for (m in 1:(tail(which(temp$StudyDay==0), n=1))){
#             sum1<-sum1+temp[m,k]
#             sum2<-sum1+temp[m,l]
#           }
#           if (sum1==0 & sum2==0){
#             c_sub2 <- rbind(c_sub2, subid)
#             token<-1
#             break
#           }
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # produces c_sub2 with 27 subjectIDs, therefore it matters when we implement the more stringent criteria
# # For now we will use the more less stringent criteria, unless we are given the go ahead to use the more stringent criteria
# 
# # Now adding the c_sub vector of studyIDs to the table1 (donors)
# # Remember this is symptomatic version 2: a milder criteria for symptomatic, however it is symptomatic afebrile (unlike in Version 1)
# Symptomatic_V2 <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Eliminate the ones that had fever
# # First find the SubjectIDs from among the infected, that had fever
# Qdata_infected_febrile <- Qdata_infected_donors %>%
#   left_join(Qdata) %>%
#   filter(Randomization_DorIRorCR == "D" & Tympanic.temp..degrees.C. >37.9) %>%
#   distinct(SubjectID, .keep_all = FALSE)
# 
# Symptomatic_V2 <- Symptomatic_V2 %>%
#   anti_join(Qdata_infected_febrile)
# # This reduced the number of Symptomatic_V2 from 29 to 22
# 
# # Now adding the QuarantineNumber on to the Symptomatic df 
# # Then we can sort by Q for the table1
# Symptomatic_V2_QuarantineNumber_table1 <- Symptomatic_V2 %>%
#   left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_Symptomatic_V2 = n_distinct(SubjectID))
# 
# # Add onto Table1 the number of symptomatic by version 2 criteria and % of infected
# # Note: we are commenting this out, because the team decided on a version 3 for symptomatic to use instead.
# #Qdata_table1 <- Qdata_table1 %>%
#   #left_join(Symptomatic_V2_QuarantineNumber_table1, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_Symptomatic_V2_of_Infected = Number_Symptomatic_V2/Number_Infected_Donors)
# #print(Qdata_table1)

#### ## Implementing Version 3 of "Symptomatic"  ####
## The purpose of this version of symptomatic is so that we are consistent with the definitions from the proof-of-concept study (Killingley, 2012 JID)

# Thus, this version 3 of symptomatic is:
# "Any respiratory symptom that occurs at all over 2 consecutive days, or occurs for 3/3 (am, early pm, late pm) symptom measurements on a single day, where respiratory symptoms include: runny nose, stuffy nose, sneezing, sore throat, cough, and shortness of breath"

# First we are going to cut the new df that has only the 6 respiratory symptoms of interest 
# (and also to include fever, just in case of future analyses)
#
# Steps:
#   1. attach all Qdata rows to each infected donor; the join keys are named explicitly
#      (SubjectID and QuarantineNumber), the same keys used for the before-day-1 frame below
#      NOTE(review): this assumes SubjectID and QuarantineNumber are the only columns the two frames share - confirm
#   2. keep study days -3 to 6 and only the rows with Microneut_VisitType == "Q baseline"
#   3. recode Febrile (tympanic temperature > 37.9) and each respiratory symptom to 0/1,
#      where 1 means the symptom was reported with grade 1, 2 or 3 (a missing grade stays NA)
#   4. keep one row per SubjectID, StudyDay, QuarantineNumber and SDC_time (the first one encountered)
# Only the columns listed in select() leave this step, so no other derived columns are computed here.
Symptomatic_donors_infected_V3_days1to6 <- Qdata_infected_donors %>%
  left_join(Qdata, by = c("SubjectID" = "SubjectID", "QuarantineNumber" = "QuarantineNumber")) %>%
  filter(StudyDay %in% (-3:6)) %>%
  filter(Microneut_VisitType == "Q baseline") %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
         stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
         sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
         soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
         cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
         SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3)) %>%
  # The loops further down address these columns by position (6 = Febrile, 7 to 12 = symptoms), so keep this order
  select(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# Because of the way the data is put together, the frame cut above leaves out the day -3 through day 0 data.
# As a quick fix, cut a second dataset holding only the day -3 through day 0 rows, with the same
# columns in the same order: identifiers, 0/1 Febrile flag (tympanic temperature > 37.9) and a 0/1 flag
# per respiratory symptom (1 = reported with grade 1, 2 or 3; a missing grade stays NA).
# It is bound back to the "Symptomatic_donors_infected_V3" frame in the next step.
Symptomatic_donors_infected_before_day1 <- Qdata_infected_donors %>%
  left_join(Qdata, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(StudyDay %in% (-3:0)) %>%
  transmute(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber,
            Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
            runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
            stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
            sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
            soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
            cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
            SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3))
# Stack the two frames and sort by subject, then study day
# (the stringent loop further down expects each subject's rows in study-day order)
Symptomatic_donors_infected_V3_dayneg3to6 <- bind_rows(
  Symptomatic_donors_infected_V3_days1to6,
  Symptomatic_donors_infected_before_day1
) %>%
  arrange(SubjectID, StudyDay)

# Find the donors with three positive measurements in a single study day for any of the respiratory symptoms.
# For each subject and each study day, the 0/1 flags of every respiratory symptom are summed over that
# day's rows (missing flags count as 0); the subject qualifies as soon as one symptom sums to exactly 3.
# Each qualifying SubjectID is appended once to c_sub (a one-column matrix built with rbind).
# The symptom columns are addressed by name; Febrile is deliberately not part of the check.
resp_symptom_cols <- c("runnyNose123", "stuffyNose123", "sneezing123",
                       "soreThroat123", "cough123", "SOB123")
sub <- unique(Symptomatic_donors_infected_V3_days1to6$SubjectID)
c_sub <- c()
# seq_along() makes the loop a no-op when there are no subjects
for (i in seq_along(sub)) {
  subid <- sub[i]
  subject_rows <- Symptomatic_donors_infected_V3_days1to6[Symptomatic_donors_infected_V3_days1to6$SubjectID == subid, ]
  symptom_flags <- subject_rows[, resp_symptom_cols]
  symptom_flags[is.na(symptom_flags)] <- 0
  for (studyday in unique(subject_rows$StudyDay)) {
    # per-symptom number of positive measurements on this study day
    day_totals <- colSums(symptom_flags[subject_rows$StudyDay == studyday, ])
    if (any(day_totals == 3)) {
      c_sub <- rbind(c_sub, subid)
      # one qualifying day is enough; move on to the next subject
      break
    }
  }
}
# But what if we want to disqualify a symptom from the classification scheme if it already appeared before study day 1
# We will compare results using this more stringent criteria, with the results using the less stringent criteria and select one (or both) for publication
#
# Same single-day rule as the less stringent loop (a respiratory symptom whose 0/1 flags sum to exactly 3
# on one study day, missing flags counted as 0), with one extra condition: that same symptom must have
# no positive flag on any row with StudyDay <= 0. Each qualifying SubjectID is appended once to c_sub2.
# The pre-day-1 rows are selected by StudyDay value, so a subject without a day-0 row is handled
# (the symptom then only needs to be absent from whatever earlier rows exist).
resp_symptom_cols <- c("runnyNose123", "stuffyNose123", "sneezing123",
                       "soreThroat123", "cough123", "SOB123")
sub <- unique(Symptomatic_donors_infected_V3_dayneg3to6$SubjectID)
c_sub2 <- c()
# seq_along() makes the loop a no-op when there are no subjects
for (i in seq_along(sub)) {
  subid <- sub[i]
  subject_rows <- Symptomatic_donors_infected_V3_dayneg3to6[Symptomatic_donors_infected_V3_dayneg3to6$SubjectID == subid, ]
  symptom_flags <- subject_rows[, resp_symptom_cols]
  symptom_flags[is.na(symptom_flags)] <- 0
  # TRUE for each symptom that was never reported on study day 0 or earlier
  before_day1 <- subject_rows$StudyDay <= 0
  not_seen_before_day1 <- colSums(symptom_flags[before_day1, ]) == 0
  for (studyday in unique(subject_rows$StudyDay)) {
    # per-symptom number of positive measurements on this study day
    day_totals <- colSums(symptom_flags[subject_rows$StudyDay == studyday, ])
    if (any(day_totals == 3 & not_seen_before_day1)) {
      c_sub2 <- rbind(c_sub2, subid)
      # one qualifying day is enough; move on to the next subject
      break
    }
  }
}
# Note that this loop with the stringent criterion (not counting symptoms that appear before day 1 toward the symptomatic_V3 classification) yields c_sub2 with n=19 subjectIDs (5 fewer than in the less stringent loop above)
# For now we will use the version with the less stringent criterion, but we will keep the other version on deck in case we decide to change or publish both
# Get the list of subject IDs from c_sub into a df
# c_sub was accumulated with rbind(), so as.data.frame() names its only column "V1";
# relabel that column as SubjectID
Symptomatic_V3_donors_infected_singleday <- rename(as.data.frame(c_sub), SubjectID = "V1")

# Now use a loop to classify those with any sort of respiratory symptom on two consecutive days
# For this we should use a "Symptomatic_donors_infected_grade123" df that marks with indicator of 1 when any of the 3 symptom measurements in a day showed evidence of symptoms of any grade.
# Helpers, defined here so that this step is self-contained.
# flag_grade123(): 1 when a symptom grade is 1, 2 or 3; 0 for any other non-missing grade; NA when missing.
flag_grade123 <- function(x) {
  as.numeric(x == 1 | x == 2 | x == 3)
}
# max_or_na(): daily maximum that ignores missing observations, and is NA only when
# every observation of the day is missing. A plain max() returns NA as soon as one
# observation is missing, which (after NA -> 0 in the loops below) would erase a symptom
# or fever that WAS recorded at another observation of the same day.
max_or_na <- function(x) {
  if (all(is.na(x))) {
    NA_real_
  } else {
    max(x, na.rm = TRUE)
  }
}

# One row per SubjectID x StudyDay (study days -3 to 6) for the infected donors.
# Each *123 column is 1 when at least one observation of that day recorded the symptom
# at grade 1-3; Febrile is 1 when at least one tympanic temperature of that day was > 37.9 C.
# NOTE(review): subject counts quoted in later comments were obtained with the earlier
# NA-propagating max(); re-check them if the data contain days with partially missing observations.
# The URI/LRI/SystemicI sums and the DPE*123 flags computed previously were never part
# of the output (they were dropped by summarize()/select()), so they are no longer computed.
Symptomatic_donors_infected_grade123 <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% (-3:6)) %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = flag_grade123(runnyNose),
         stuffyNose123 = flag_grade123(stuffyNose),
         sneezing123 = flag_grade123(sneezing),
         soreThroat123 = flag_grade123(soreThroat),
         cough123 = flag_grade123(cough),
         SOB123 = flag_grade123(SOB),
         headache123 = flag_grade123(headache),
         muscleAches123 = flag_grade123(muscleAches),
         malaise123 = flag_grade123(malaise)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  summarize(Febrile = max_or_na(Febrile),
            runnyNose123 = max_or_na(runnyNose123),
            stuffyNose123 = max_or_na(stuffyNose123),
            sneezing123 = max_or_na(sneezing123),
            soreThroat123 = max_or_na(soreThroat123),
            cough123 = max_or_na(cough123),
            SOB123 = max_or_na(SOB123),
            headache123 = max_or_na(headache123),
            muscleAches123 = max_or_na(muscleAches123),
            malaise123 = max_or_na(malaise123)) %>%
  # Column order matters: the loops below address these columns by position
  select(SubjectID, QuarantineNumber, StudyDay, Febrile, runnyNose123, stuffyNose123, sneezing123, soreThroat123,
         cough123, SOB123, headache123, muscleAches123, malaise123) %>%
  ungroup()
# The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was at least one symptoms (of any grade) detection per study day

# Now to select which of the subjects were symptomatic (version1) (excluding the febrile criteria for now)
# Note: Using the breaks in the loops for efficiency. If a subject is detected as symptomatic, the loop restarts on the next subjectID
# But for this we only want to include symptoms for study days 1 to 6 so we need to cut a new df
# Keep only the post-inoculation observation window (study days 1 through 6)
Symptomatic_donors_infected_grade123_day1to6 <- filter(Symptomatic_donors_infected_grade123,
                                                       StudyDay %in% 1:6)

# Two-consecutive-day classification (study days 1-6 only).
# A subject qualifies when one of the six respiratory flags (columns 5:10: runnyNose123,
# stuffyNose123, sneezing123, soreThroat123, cough123, SOB123) is 1 on two rows whose
# StudyDay values differ by exactly 1.
sub <- unique(Symptomatic_donors_infected_grade123_day1to6$SubjectID)
c_sub <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty subject list runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_donors_infected_grade123_day1to6[Symptomatic_donors_infected_grade123_day1to6$SubjectID == subid, ]
  # Missing daily flags are treated as 0 (symptom not reported)
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # seq_len() yields no iterations for a subject with a single row; 1:(nrow - 1) would
  # give c(1, 0) there and index past the end of the data
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Only pairs of rows that are consecutive study days can qualify
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    for (k in 5:10) {
      if (temp[j, k] + temp[j + 1, k] == 2) {
        c_sub <- rbind(c_sub, subid)
        token <- 1
        break
      }
    }
    # One qualifying pair of days is enough; move on to the next subject
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub of 32 subjectIDs but let's now apply the more stringent version (eliminating Sx if appear before day 1)
# Stringent two-consecutive-day classification (study days -3 to 6).
# As the less stringent loop, but a respiratory flag (columns 5:10) only counts when it
# was never positive on or before study day 0.
# The pre-day-1 rows are selected by StudyDay <= 0 instead of by row position
# (rows 1 .. last day-0 row): identical for rows sorted by StudyDay with a day-0 row
# present, but it does not error for a subject without a day-0 row.
sub <- unique(Symptomatic_donors_infected_grade123$SubjectID)
c_sub2 <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty subject list runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_donors_infected_grade123[Symptomatic_donors_infected_grade123$SubjectID == subid, ]
  # Missing daily flags are treated as 0 (symptom not reported)
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  pre_day1 <- which(temp$StudyDay <= 0)
  # seq_len() yields no iterations for a subject with a single row
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Only pairs of rows that are consecutive study days can qualify
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    for (k in 5:10) {
      # Positive on both days, and never positive before day 1
      if (temp[j, k] + temp[j + 1, k] == 2 && sum(temp[pre_day1, k]) == 0) {
        c_sub2 <- rbind(c_sub2, subid)
        token <- 1
        break
      }
    }
    # One qualifying pair of days is enough; move on to the next subject
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub2 of 30 subjectIDs, so it cut out 2 
# For now, however we will stick to using the less stringent criteria with regards to Sx before day1

# Rename "V1" as SubjectID
# c_sub (less stringent two-day loop) becomes a one-column data frame; its default
# column name "V1" is relabelled as SubjectID
Symptomatic_V3_donors_infected_twodays <- rename(as.data.frame(c_sub), SubjectID = "V1")

# Combine the Symptomatic_V3_donors_infected_singleday df and the Symptomatic_V3_donors_infected_twodays df
# Union of the subjects flagged by the two-day rule and by the single-day rule,
# matched on SubjectID and listed in SubjectID order
Symptomatic_V3_donors_infected_combined <- arrange(
  full_join(Symptomatic_V3_donors_infected_twodays,
            Symptomatic_V3_donors_infected_singleday,
            by = "SubjectID"),
  SubjectID
)
print(Symptomatic_V3_donors_infected_combined)

# Writing out this object to the Box Sync/EMIT/EMIT_Data_Analysis_Jake/Natural_vs_Artificial_Infection/Analytical Datasets directory so that it can be used in the Natural vs Artificial Infection analysis. 
# write.csv(Symptomatic_V3_donors_infected_combined, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/Natural_vs_Artificial_Infection/Analytical Datasets/Qdata_Infected_Donors_Symptomatic.csv")

 # But the above definition of symptomatic (V3) doesn't make any mention of febrile illness
# Let's check to see if the febrile are already accounted for among the group of symptomatic version 3
# SubjectIDs of subject-days flagged febrile whose subject is NOT in the combined
# symptomatic (V3) set; an empty result means every febrile subject is already counted
Symptomatic_by_fever <- anti_join(
  Symptomatic_donors_infected_grade123 %>%
    filter(Febrile == 1) %>%
    select(SubjectID),
  Symptomatic_V3_donors_infected_combined,
  by = "SubjectID"
)
print(Symptomatic_by_fever)
# Important to note that 0 subjectIDs were febrile but not symptomatic - that is, all subjects who were febrile were also symptomatic. 

# ... include febrile (without respiratory Sx) along with the other respiratory symptoms as part of the symptomatic V3 set
# Attach each symptomatic subject's quarantine number, then count the distinct
# symptomatic subjects within each quarantine
Symptomatic_V3_donors_infected_combined_table1 <- summarize(
  group_by(
    left_join(Symptomatic_V3_donors_infected_combined, Qdata_QuarantineNumbers, by = "SubjectID"),
    QuarantineNumber
  ),
  Number_Symptomatic_V3 = n_distinct(SubjectID)
)
print(Symptomatic_V3_donors_infected_combined_table1)

# Add onto Table1 the number of symptomatic by version 3 criteria and % of infected
# Append the per-quarantine symptomatic (V3) count to Table 1 ...
Qdata_table1 <- left_join(Qdata_table1,
                          Symptomatic_V3_donors_infected_combined_table1,
                          by = "QuarantineNumber")
# ... and express it as a fraction of the infected donors in that quarantine
Qdata_table1 <- mutate(Qdata_table1,
                       Fraction_Symptomatic_V3_of_Infected = Number_Symptomatic_V3 / Number_Infected_Donors)
print(Qdata_table1)

#### Table 1 (donors): c) Number of symptomatic, non-ILI (and % of infected) ####

# We will ignore this category based on discussion above about removing "symptomatic" classification from the final report
# Find note under "Table 1 (donors): b)..." section of code

#### Table 1 (donors): d) Number of ILI (and % of infected) ####

## Definition of ILI by CDC: "Case definitions for influenza-like illness are nonspecific for influenza and vary depending on the purpose for which they are used. A case definition of fever 100°F or greater, oral or equivalent, and cough and/or sore throat is used by CDC in its U.S."

# We will create 2 versions of this definition: one including fever (as written above), and one not including fever
# We will eliminate a symptom from contributing to classification criteria if it appeared during D-1 or D-2
# Later a third version was added based on teleconference on October 12, 2018 and the desire to be consistent with terminology following the previously published Killingley et al., 2012 paper

# Later discussion has revealed an ILI Version 3 that we should use. 
# I have commented out the other versions of classifying ILI that have become obsolete.

#### ## First version of classification of ILI (febrile ILI, exact CDC definition) (and % of infected) ####

# # Operationally, this means evidence of fever >100F (>37.9C) & any evidence of cough or sore throat or DPE Pharyngitis
# # Technically we should be using >=37.8 since 37.8 == 100.04 but since Killingley et al., 2012 and...
# # ... other sources from the EMIT consortium believe in using the >37.9 criteria, we will implement that here.
# 
# # First, cut the dataset to only the infected donors who meet the definition for fever
# # Note, none of the volunteers registered a fever on any of the study days prior to inoculation day
# ILI_V1_data <- Qdata_infected_donors %>%
#   left_join(Qdata) %>%
#   select(SubjectID, QuarantineNumber, StudyDay, Sx_Date, Tympanic.temp..degrees.C., cough, soreThroat, DPEPharyngitis) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | 
#            StudyDay == 4 | StudyDay == 5 | StudyDay == 6)
# 
# # Let's consolidate the sore throat and pharyngitis variables to make one cumulative variable (soreThroat or DPEPharyngitis) called st
# ILI_V1_data <- ILI_V1_data %>%
#   mutate(st = cough>=1 | soreThroat>=1 | DPEPharyngitis>=1, st = as.numeric(st))
# 
# # Before using the loop we need to cut a df that only looks at sx from day1-6
# ILI_V1_data_day1to6 <- ILI_V1_data %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 | StudyDay == 6)
# 
# # Applying loop to select subjectIDs that have fever (>37.9C) and either cough or sore throat
# sub <- unique(ILI_V1_data_day1to6$SubjectID)
# c_sub <- c()
# token_t<-0
# token_s<-0
# for (i in 1:length(sub)) {
#   token_t<-0
#   token_s<-0
#   subid <- sub[i]
#   temp <- ILI_V1_data_day1to6[ILI_V1_data_day1to6$SubjectID == subid, ]
#   for (j in 1:(nrow(temp))) {
#     if (!is.na(temp$Tympanic.temp..degrees.C.[j])) {
#       if (temp$Tympanic.temp..degrees.C.[j] > 37.9) {
#         token_t<-1
#       }
#     }
#     if (sum(temp$cough[j], temp$st[j], na.rm = TRUE) >=1) {
#         token_s<-1
#     }
#     if (token_t==1 & token_s==1){
#       c_sub <- rbind(c_sub, subid)
#       break
#     }
#   }
# }
# # Note: the above code does not check for the case that someone had fever, cough, or sore throat...
# # ... prior to inoculation day. Would take some effort to figure out and since there were only 4 instances. 
# # I checked the raw data and see that none of the 4 had temp>37.9, cough or st symptoms prior to inoculation
# # Thus these 4 can be printed into the table
# # Future iterations of this code would do well to implement logic that would not count symptoms ...
# # ... that appeared before inoculation day in the classification criteria
# 
# # Now adding this vector of studyIDs to the table1 (donors)
# ILI_V1_febrile <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Now adding the QuarantineNumber on to the Febrile ILI df 
# # Then we can sort by Q for the table1
# ILI_V1_febrile_table1 <- ILI_V1_febrile %>%
#   left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_ILI_V1_Febrile = n_distinct(SubjectID))
# 
# # Add onto Table1 the number of symptomatic by version 1 criteria and % of infected
# # Note: we are commenting this out, because the team decided on a version 3 for ILI to use instead.
# #Qdata_table1 <- Qdata_table1 %>%
#   #left_join(ILI_V1_febrile_table1, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_V1_Febrile_of_Infected = Number_ILI_V1_Febrile/Number_Infected_Donors)
# #print(Qdata_table1)

#### ## Second version of classification of ILI (afebrile ILI) (and % of infected) ####
# 
# # This definition of afebrile ILI is: symptom of grade >=1 for cough or soreThroat (or DPEPharyngitis)
# # First we will do a loop that doesn't restrict symptoms that appear before study day1 from contributing to criteria
# sub <- unique(ILI_V1_data_day1to6$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token_c<-0
#   token_st<-0
#   subid <- sub[i]
#   temp <- ILI_V1_data_day1to6[ILI_V1_data_day1to6$SubjectID == subid, ]
#   temp1<-temp[,6:9]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:5],temp1)
#   for (j in 1:(nrow(temp))) {
#     if (temp$cough[j] >=1) {
#           token_c<-1
#     }
#     if (temp$st[j] >= 1) {
#           token_st<-1
#     }
#   }
#   if (token_c + token_st >=1) {
#     c_sub <- rbind(c_sub, subid)
#   }
# }
# # This yields a c_sub of 24 subjectIDs
# # But this version allows symptoms that appear before day1 to count towards criteria.
# # In contrast, the version below doesn't
# # The study team can make the decision about which version to use
# sub <- unique(ILI_V1_data$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token_c<-0
#   token_st<-0
#   subid <- sub[i]
#   temp <- ILI_V1_data[ILI_V1_data$SubjectID == subid, ]
#   temp1<-temp[,6:9]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:5],temp1)
#   for (j in 1:(nrow(temp))) {
#     if (temp$cough[j] >=1) {
#       sum1<-0
#       for (k in 1:(tail(which(temp$StudyDay==0), n=1))) {
#         sum1 <- sum1+temp$cough[k]
#         if (sum1 == 0) {
#           token_c<-1
#         }
#       }
#     }
#     if (temp$st[j] >= 1) {
#       sum2<-0
#       for (l in 1:(tail(which(temp$StudyDay==0), n=1))) {
#         sum2 <- sum2+temp$st[l]
#         if (sum2 == 0) {
#           token_st<-1
#         }
#       }
#     }
#   }
#   if (token_c + token_st >=1) {
#     c_sub2 <- rbind(c_sub2, subid)
#   }
# }
# # This version yields a c_sub2 with n=24 subjectIDs (same as c_sub)
# 
# # For now we will include the less stringent criteria version but we have both versions available for use.
# # Now adding the c_sub (less stringent criteria) vector of studyIDs to the table1 (donors)
# ILI_V2_afebrile <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Getting rid of subjectIDs that actually were febrile
# # First find the SubjectIDs from among the infected, that had fever
# Qdata_infected_febrile <- Qdata_infected_donors %>%
#   left_join(Qdata) %>%
#   filter(Randomization_DorIRorCR == "D" & Tympanic.temp..degrees.C. >37.9) %>%
#   distinct(SubjectID, .keep_all = FALSE)
# 
# ILI_V2_afebrile <- ILI_V2_afebrile %>%
#   anti_join(Qdata_infected_febrile)
# 
# # Now adding the QuarantineNumber on to the ILI afebrile df 
# # Then we can sort by Q for the table1
# ILI_V2_afebrile_table1 <- ILI_V2_afebrile %>%
#   left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_ILI_V2_Afebrile = n_distinct(SubjectID))
# 
# # Add onto Table1 the number of ILI afebrile and % of infected
# # Note: we are commenting this out, because the team decided on a version 3 for ILI to use instead.
# #Qdata_table1 <- Qdata_table1 %>%
#   #left_join(ILI_V2_afebrile_table1, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_V2_Afebrile_of_Infected = Number_ILI_V2_Afebrile/Number_Infected_Donors)
# #print(Qdata_table1)

#### ## Third version of classification of ILI (to match Killingley et al., 2012) (and % of infected) ####
# This version came about during the October 12, 2018 conference call with the EMIT team (UK and UMD groups present)
# For now this version sounds like it will be the one that we use for the paper. 

# This definition of ILI is: "an illness lasting >=24 hours with either (1) fever >37.9°C plus at least 1 respiratory symptom or (2) >=2 symptoms, at least 1 of which must be respiratory."
# Where "respiratory symptom" means evidence of any grade of runny nose, stuffy nose, sneeze, sore throat, cough, shortness of breath
# Where "lasting >=24 hours" means evidence of the symptom over all three instances of symptom measurements for a single day, or evidence of the symptom over two days at any frequency (1-3/3 instances of symptom recordings)

# First, let's program the first criteria (fever > 37.9C plus at least 1 respiratory symptom)
# To do this, we can: 
# a) create the set of subject IDs that meet the fever criteria, and then check them for
# b) evidence of three instances during a single day, or
# c) evidence of any frequency of instances >=1 for 2 consecutive days
# Then, we can deal with the second criteria for ILI (>=2 symptoms one of which being a respiratory)

# Find the SubjectIDs from among the infected, that had fever
# First check to see if anyone had fever before day 1. 
# Infected donors (randomization arm "D") with any tympanic temperature above 37.9 C
# on study days -3 to 0, one row per SubjectID
Qdata_infected_febrile_pre_day1 <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% (-3:0),
         Tympanic.temp..degrees.C. > 37.9) %>%
  distinct(SubjectID)
print(Qdata_infected_febrile_pre_day1)
# As it turns out, none of the infected donors had fever before day 1 
# Now we can see who among the infected subject IDs had fever at least once over study days 1-6
# Infected donors (randomization arm "D") with any tympanic temperature above 37.9 C
# on study days 1 to 6, one row per SubjectID
Qdata_infected_febrile_day1to6 <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:6,
         Tympanic.temp..degrees.C. > 37.9) %>%
  distinct(SubjectID)
print(Qdata_infected_febrile_day1to6)

# Now see among the "Qdata_infected_febrile_day1to6" df, who had symptoms all day during a single study day

# First we are going to cut a new df that has the 9 symptoms of interest (this includes the 6 respiratory symptoms) and fever for only those in the "Qdata_infected_febrile_day1to6" df
# Helper, defined here so that this step is self-contained:
# 1 when a symptom grade is 1, 2 or 3; 0 for any other non-missing grade; NA when missing.
flag_grade123 <- function(x) {
  as.numeric(x == 1 | x == 2 | x == 3)
}

# Observation-level (one row per distinct SDC_time within SubjectID x StudyDay x
# QuarantineNumber) fever and symptom flags for the donors who were febrile on days 1-6.
# Only rows with Microneut_VisitType == "Q baseline" are kept.
# NOTE(review): the StudyDay filter admits days -3 to 6; per the comment following this
# block, the visit-type filter is what removes the day -3 to day 0 rows - confirm, because
# the loop below assumes this df only holds days 1-6.
# The URI/LRI/SystemicI sums and the DPE*123 flags computed previously were dropped by
# select() and never reached the output, so they are no longer computed.
ILI_V3_donors_infected_febrile_studyday1to6 <- Qdata_infected_febrile_day1to6 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% (-3:6)) %>%
  filter(Microneut_VisitType == "Q baseline") %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = flag_grade123(runnyNose),
         stuffyNose123 = flag_grade123(stuffyNose),
         sneezing123 = flag_grade123(sneezing),
         soreThroat123 = flag_grade123(soreThroat),
         cough123 = flag_grade123(cough),
         SOB123 = flag_grade123(SOB),
         headache123 = flag_grade123(headache),
         muscleAches123 = flag_grade123(muscleAches),
         malaise123 = flag_grade123(malaise)) %>%
  # Column order matters: the loops below address these columns by position
  select(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
         headache123, muscleAches123, malaise123) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  # Keep the first row for each observation time within a subject-day
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# This is great, but the way the data are put together, this leaves out the day -3 through day 0 data
# Therefore, as a quick fix, we will cut a new dataset that keeps only the data from day -3 through day 0
# Then we will bind it back to the "ILI_V3_donors_infected_febrile_studyday1to6" df that was just created.
# Helper, defined here so that this step is self-contained:
# 1 when a symptom grade is 1, 2 or 3; 0 for any other non-missing grade; NA when missing.
flag_grade123 <- function(x) {
  as.numeric(x == 1 | x == 2 | x == 3)
}

# Fever and symptom flags on study days -3 to 0 for the donors who were febrile on days 1-6.
# Unlike the day 1-6 df, no visit-type filter and no de-duplication by SDC_time is applied here.
# The URI/LRI/SystemicI sums and the DPE*123 flags computed previously were dropped by
# select() and never reached the output, so they are no longer computed.
ILI_V3_donors_infected_before_day1_febrile <- Qdata_infected_febrile_day1to6 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% (-3:0)) %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = flag_grade123(runnyNose),
         stuffyNose123 = flag_grade123(stuffyNose),
         sneezing123 = flag_grade123(sneezing),
         soreThroat123 = flag_grade123(soreThroat),
         cough123 = flag_grade123(cough),
         SOB123 = flag_grade123(SOB),
         headache123 = flag_grade123(headache),
         muscleAches123 = flag_grade123(muscleAches),
         malaise123 = flag_grade123(malaise)) %>%
  # Same columns, in the same order, as the day 1-6 df so the two can be row-bound
  select(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
         headache123, muscleAches123, malaise123)
# Now binding together and sorting
# Stack the day 1-6 rows on top of the pre-day-1 rows, then order by subject and study day
ILI_V3_donors_infected_febrile <- ILI_V3_donors_infected_febrile_studyday1to6 %>%
  bind_rows(ILI_V3_donors_infected_before_day1_febrile) %>%
  arrange(SubjectID, StudyDay)
# But this definition is just for post day0 so we will filter just day1-6 so use the "ILI_V3_donors_infected_febrile_studyday1to6" df
# ILI criterion 1, single-day part (less stringent): among the febrile donors, a subject
# qualifies when one of the six respiratory flags (columns 7:12: runnyNose123,
# stuffyNose123, sneezing123, soreThroat123, cough123, SOB123) sums to 3 within a single
# study day, i.e. it was reported at all three daily observations.
sub <- unique(ILI_V3_donors_infected_febrile_studyday1to6$SubjectID)
c_sub <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty febrile group runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_febrile_studyday1to6[ILI_V3_donors_infected_febrile_studyday1to6$SubjectID == subid, ]
  # Missing values in columns 6:15 are treated as 0 (not reported)
  temp1 <- temp[, 6:15]
  temp1[is.na(temp1)] <- 0
  temp <- bind_cols(temp[, 1:5], temp1)
  stud <- unique(temp$StudyDay)
  for (studyday in stud) {
    temp2 <- temp[temp$StudyDay == studyday, ]
    # Per-flag total over the observations of this study day
    if (any(colSums(temp2[, 7:12]) == 3)) {
      c_sub <- rbind(c_sub, subid)
      token <- 1
      # One qualifying day is enough; move on to the next subject
      break
    }
  }
}
# This yields a c_sub vector with 7 subjectIDs
# If we want to do the same loop, but exclude symptoms that were positive before study day 1 we use ILI_V3_donors_infected_febrile
# ILI criterion 1, single-day part (stringent): as the less stringent loop, but a
# respiratory flag (columns 7:12) only counts when it was never positive on or before
# study day 0.
# The pre-day-1 rows are selected by StudyDay <= 0 instead of by row position
# (rows 1 .. last day-0 row): identical for rows sorted by StudyDay with a day-0 row
# present, but it does not error for a subject without a day-0 row.
sub <- unique(ILI_V3_donors_infected_febrile$SubjectID)
c_sub2 <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty febrile group runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_febrile[ILI_V3_donors_infected_febrile$SubjectID == subid, ]
  # Missing values in columns 6:12 are treated as 0 (not reported)
  temp1 <- temp[, 6:12]
  temp1[is.na(temp1)] <- 0
  temp <- bind_cols(temp[, 1:5], temp1)
  # Per flag: was it positive at any observation on or before day 0?
  seen_pre_day1 <- colSums(temp[which(temp$StudyDay <= 0), 7:12]) > 0
  stud <- unique(temp$StudyDay)
  for (studyday in stud) {
    temp2 <- temp[temp$StudyDay == studyday, ]
    # Per flag: was it positive at all three observations of this study day?
    all_day <- colSums(temp2[, 7:12]) == 3
    if (any(all_day & !seen_pre_day1)) {
      c_sub2 <- rbind(c_sub2, subid)
      token <- 1
      # One qualifying day is enough; move on to the next subject
      break
    }
  }
}
# This yields a c_sub2 vector with 7 subjectIDs, which matches what the less stringent loop from above produced
# For now we will use the less stringent criteria
# Now get the df of subject IDs from the c_sub vector
# c_sub (less stringent single-day loop) becomes a one-column data frame; its default
# column name "V1" is relabelled as SubjectID
ILI_V3_infected_donors_criteria1_singleday <- rename(as.data.frame(c_sub), SubjectID = "V1")
print(ILI_V3_infected_donors_criteria1_singleday)
# This is the output for the first criterion for ILI (fever and 1 respiratory symptom for >=24 hours), where the symptom occurred at all three observations in the same day (counts as >= 24 hours)

# Moving to the second part of criteria 1 (febrile plus 2 consecutive study days of a resp. Sx at any freq >=1)
# Now to implement the criteria 1 (febrile plus 1 resp Sx) for those who had symptoms over 2 consecutive study days
# Use the "Symptomatic_donors_infected_grade123" df, cut it to the 6 resp. symptoms, and febrile only
# Remember: the "Symptomatic_donors_infected_grade123" df created under Version 1 of Symptomatic for the infected donors
# Daily flags restricted to the donors who were febrile on days 1-6, keeping the
# identifiers, Febrile and the six respiratory flags (column order is relied on below)
Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days <- select(
  right_join(Symptomatic_donors_infected_grade123, Qdata_infected_febrile_day1to6),
  SubjectID, StudyDay, QuarantineNumber, Febrile, runnyNose123, stuffyNose123, sneezing123,
  soreThroat123, cough123, SOB123
)

# Now implement loop to make this classification of having resp Sx at frequency >=1 over 2 consecutive study days
# First without checking for symptoms prior to study day 1
# Keep only the post-inoculation observation window (study days 1 through 6)
Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days_1to6 <- filter(
  Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days,
  StudyDay %in% 1:6
)

# ILI criterion 1, two-day part (less stringent, study days 1-6 only): among the febrile
# donors, a subject qualifies when one of the six respiratory flags (columns 5:10:
# runnyNose123 ... SOB123) is 1 on two rows whose StudyDay values differ by exactly 1.
sub <- unique(Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days_1to6$SubjectID)
c_sub <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty febrile group runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days_1to6[Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days_1to6$SubjectID == subid, ]
  # Missing daily flags are treated as 0 (symptom not reported)
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # seq_len() yields no iterations for a subject with a single row; 1:(nrow - 1) would
  # give c(1, 0) there and index past the end of the data
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Only pairs of rows that are consecutive study days can qualify
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    for (k in 5:10) {
      if (temp[j, k] + temp[j + 1, k] == 2) {
        c_sub <- rbind(c_sub, subid)
        token <- 1
        break
      }
    }
    # One qualifying pair of days is enough; move on to the next subject
    if (token == 1) {
      break
    }
  }
}
# This yielded a c_sub vector of 7 subjectIDs.
# Check to see if any of the symptoms appeared before day 1 and thus should be considered as disqualifying for that symptom to contribute to classification criteria.
# ILI criterion 1, two-day part (stringent, study days -3 to 6): as the less stringent
# loop, but a respiratory flag (columns 5:10) only counts when it was never positive on
# or before study day 0.
# The pre-day-1 rows are selected by StudyDay <= 0 instead of by row position
# (rows 1 .. last day-0 row): identical for rows sorted by StudyDay with a day-0 row
# present, but it does not error for a subject without a day-0 row.
sub <- unique(Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days$SubjectID)
c_sub2 <- c()
token <- 0
# seq_along() (rather than 1:length()) so that an empty febrile group runs zero iterations
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days[Symptomatic_febrile_donors_grade123_ILI_V3_criteria1_2days$SubjectID == subid, ]
  # Missing daily flags are treated as 0 (symptom not reported)
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  pre_day1 <- which(temp$StudyDay <= 0)
  # seq_len() yields no iterations for a subject with a single row
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Only pairs of rows that are consecutive study days can qualify
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    for (k in 5:10) {
      # Positive on both days, and never positive before day 1
      if (temp[j, k] + temp[j + 1, k] == 2 && sum(temp[pre_day1, k]) == 0) {
        c_sub2 <- rbind(c_sub2, subid)
        token <- 1
        break
      }
    }
    # One qualifying pair of days is enough; move on to the next subject
    if (token == 1) {
      break
    }
  }
}
# This also yielded a vector of 7 subjectIDs
# Thus, using the more stringent criteria here doesn't make any difference
# For now we will use the less stringent criteria
# Create df from the vector of 7 subjectIDs from c_sub 
# Remember this is the classification of febrile with symptoms of >=1 frequency over 2 days
# c_sub (less stringent two-day loop) becomes a one-column data frame; its default
# column name "V1" is relabelled as SubjectID
ILI_V3_infected_donors_criteria1_2days <- rename(as.data.frame(c_sub), SubjectID = "V1")
print(ILI_V3_infected_donors_criteria1_2days)

# Now will merge the 2 parts of criteria 1 for ILI together to get a single set of subjectIDs that meet the 1st ILI criteria
# Reminder that 1st ILI criteria is: 
#  febrile (>37.9C) plus >=24 hours of a respiratory symptom (one of the 6 self-reported resp Sxs)
# The "2 parts of criteria 1" refer to:
# 1) febrile plus 3 observations of self-reported resp. Sx in a single day, and 
# 2) febrile plus 2 consecutive days of at least 1 self-reported resp. Sx at any daily frequency >=1

# Union of the subjects meeting either part of ILI criterion 1 (single-day or two-day)
ILI_V3_infected_donors_criteria1 <- ILI_V3_infected_donors_criteria1_singleday %>%
  full_join(ILI_V3_infected_donors_criteria1_2days)
print(ILI_V3_infected_donors_criteria1)

# Writing out this ILI_V3_infected_donors_criteria1 object for use in the Natural_vs_Artificial analysis
# write.csv(ILI_V3_infected_donors_criteria1, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/Natural_vs_Artificial_Infection/Analytical Datasets/Qdata_Infected_Donors_ILI.csv")

## Plan for implementing the second criteria for ILI_V3:
# That is: >= 2 symptoms for >=24 hours, 1 of which is respiratory, and merging with the first criteria for ILI

# To do this, first we will filter those subject IDs without fever (find the always afebrile group)
# Then we will see who among the always afebrile had:
# >=1 resp. symptom on 1 day plus at least one other symptom on the same single day (grade >=1 for 3/3 observations in a day)
# Then we will see who among the always afebrile had:
# >=1 resp. symptom at frequency >=1 over 2 days plus at least one other resp. symptom at freq >=1 for same 2 days

# Then we will add those subject IDs (from the above 2 criteria) together to form ILI_V3_infected_donors_criteria2
# Then we will merge ILI_V3_infected_donors_criteria1 and ILI_V3_infected_donors_criteria2 together to make ILI_V3_infected_donors

# Build the list of subjectIDs who never had fever (the always afebrile group):
# take every infected donor and remove the ones listed in Qdata_infected_febrile_day1to6.
Qdata_infected_afebrile_day1to6 <- anti_join(
  select(Qdata_infected_donors, SubjectID),
  Qdata_infected_febrile_day1to6
)
print(Qdata_infected_afebrile_day1to6)

# Next step: find who among the always afebrile had a resp. symptom on a single day plus at least
# one other symptom on that same day (grade >=1 for 3/3 observations in a day).

# Cut a new df, restricted to the subjects in "Qdata_infected_afebrile_day1to6", holding 0/1 flags
# (grade 1, 2 or 3 = 1; grade 0 = 0; missing stays NA) for the 6 self-reported respiratory symptoms
# and the 3 self-reported non-respiratory symptoms, plus a Febrile flag (>37.9C) kept for reference.
# URI, LRI, SystemicI and the DPE* flags are computed but dropped again by the select() below.
ILI_V3_donors_infected_afebrile <- Qdata_infected_afebrile_day1to6 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% seq(-3, 6)) %>%
  filter(Microneut_VisitType == "Q baseline") %>%
  mutate(
    URI = runnyNose + stuffyNose + sneezing + soreThroat + DPENasalDischarge + DPEOtits +
      DPESinusTenderness + DPEPharyngitis,
    LRI = cough + SOB,
    SystemicI = headache + muscleAches + malaise,
    Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
    runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
    stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
    sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
    soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
    DPENasalDischarge123 = as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3),
    DPEOtits123 = as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3),
    DPESinusTenderness123 = as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3),
    DPEPharyngitis123 = as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3),
    cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
    SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3),
    headache123 = as.numeric(headache == 1 | headache == 2 | headache == 3),
    muscleAches123 = as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3),
    malaise123 = as.numeric(malaise == 1 | malaise == 2 | malaise == 3)
  ) %>%
  # Column order matters: later loops address these columns by position
  # (1-5 identifiers, 6 Febrile, 7-12 respiratory flags, 13-15 non-respiratory flags).
  select(
    SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
    runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
    headache123, muscleAches123, malaise123
  ) %>%
  # Keep one row per observation time within each subject / study day / quarantine
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# The way the data is put together, the "Q baseline" filter used to build
# "ILI_V3_donors_infected_afebrile" leaves out the day -3 through day 0 data.
# As a quick fix, cut a second dataset holding only study days -3 through 0 and bind it back
# onto "ILI_V3_donors_infected_afebrile".
# The select() keeps the same 15 columns, in the same order, as "ILI_V3_donors_infected_afebrile";
# in particular it keeps headache123, muscleAches123 and malaise123 so that the pre-day-1 rows carry
# real values for the non-respiratory symptoms instead of the NA that bind_rows() would otherwise fill in.
ILI_V3_donors_infected_before_day1_afebrile <- Qdata_infected_afebrile_day1to6 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% seq(-3, 0)) %>%
  mutate(
    # URI, LRI, SystemicI and the DPE* flags are computed for parity with the day 1-6 pipeline
    # but are not kept by the select() below.
    URI = runnyNose + stuffyNose + sneezing + soreThroat + DPENasalDischarge + DPEOtits +
      DPESinusTenderness + DPEPharyngitis,
    LRI = cough + SOB,
    SystemicI = headache + muscleAches + malaise,
    Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
    # 0/1 flags: grade 1, 2 or 3 = 1; grade 0 = 0; missing stays NA
    runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
    stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
    sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
    soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
    DPENasalDischarge123 = as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3),
    DPEOtits123 = as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3),
    DPESinusTenderness123 = as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3),
    DPEPharyngitis123 = as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3),
    cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
    SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3),
    headache123 = as.numeric(headache == 1 | headache == 2 | headache == 3),
    muscleAches123 = as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3),
    malaise123 = as.numeric(malaise == 1 | malaise == 2 | malaise == 3)
  ) %>%
  select(
    SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
    runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
    headache123, muscleAches123, malaise123
  )

# Now binding together and sorting
ILI_V3_donors_infected_afebrile <- ILI_V3_donors_infected_afebrile %>%
  bind_rows(ILI_V3_donors_infected_before_day1_afebrile) %>%
  arrange(SubjectID, StudyDay)

# The current definition only looks at post day 0 data, so keep just study days 1-6
ILI_V3_donors_infected_afebrile_1to6 <- ILI_V3_donors_infected_afebrile %>%
  filter(StudyDay %in% 1:6)

# Loop to get subjectIDs where there were 2 symptoms (one of which respiratory), each observed
# 3 times on the same day.
# Column layout of the df: 1-5 identifiers, 6 Febrile, 7-12 respiratory flags, 13-15 other symptom flags.
# For every subject and study day, each respiratory column l is paired with every later symptom
# column m; the pair qualifies when the two daily column sums add up to 6.
# NOTE(review): a combined sum of 6 only means "3 + 3" if there are at most 3 observations per
# subject per day -- confirm against the data.
# c_sub is grown with rbind() on purpose: as.data.frame(c_sub) must yield a column named "V1".
sub <- unique(ILI_V3_donors_infected_afebrile_1to6$SubjectID)
c_sub <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0 # becomes 1 as soon as this subject qualifies
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_afebrile_1to6[ILI_V3_donors_infected_afebrile_1to6$SubjectID == subid, ]
  # Treat missing flags as "symptom absent"
  temp1 <- temp[, 6:15]
  temp1[is.na(temp1)] <- 0
  temp <- bind_cols(temp[, 1:5], temp1)
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    temp2 <- temp[temp$StudyDay == stud[j], ]
    for (l in 7:12) {
      for (m in (l + 1):15) {
        if (sum(temp2[[l]]) + sum(temp2[[m]]) == 6) {
          c_sub <- rbind(c_sub, subid)
          token <- 1
          break
        }
      }
      if (token == 1) {
        break
      }
    }
    # One qualifying day is enough; move on to the next subject
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub of 11 subjectIDs
# Same loop as for c_sub, but additionally excluding symptoms that were already positive before study day 1.
# Use the "ILI_V3_donors_infected_afebrile" df because it spans study days -3 to 6 for each always
# afebrile subjectID.
# Column layout of that df: 1-5 identifiers, 6 Febrile, 7-12 respiratory flags, 13-15 other symptom flags.
# The loop therefore scans respiratory columns 7:12 paired with any later symptom column up to 15
# (the same index ranges as the c_sub loop on the day 1-6 subset of this df).
sub <- unique(ILI_V3_donors_infected_afebrile$SubjectID)
c_sub2 <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0 # becomes 1 as soon as this subject qualifies
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_afebrile[ILI_V3_donors_infected_afebrile$SubjectID == subid, ]
  # Treat missing flags as "symptom absent"
  temp1 <- temp[, 6:15]
  temp1[is.na(temp1)] <- 0
  temp <- bind_cols(temp[, 1:5], temp1)
  # Baseline = every observation up to and including study day 0.
  # Selecting by value (not by row position) also works when a subject has no day 0 row.
  baseline <- temp[temp$StudyDay <= 0, ]
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    temp2 <- temp[temp$StudyDay == stud[j], ]
    for (l in 7:12) {
      for (m in (l + 1):15) {
        if (sum(temp2[[l]]) + sum(temp2[[m]]) == 6) {
          # Only count the pair when neither symptom was ever reported at baseline
          if (sum(baseline[[l]]) == 0 && sum(baseline[[m]]) == 0) {
            c_sub2 <- rbind(c_sub2, subid)
            token <- 1
            break
          }
        }
      }
      if (token == 1) {
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# NOTE(review): a count of 8 subjectIDs (3 fewer than c_sub) was recorded for this stringent variant
# when the loop scanned columns 5:10 / 6:13; re-run and confirm the count with the corrected columns.
# For now we will take the less stringent criteria that doesn't exclude Sx occurring before day 1
# So this means we will stick to using c_sub and not c_sub2
# Put the c_sub matrix of 11 subjectIDs into a data frame with a single SubjectID column.
# This df is the output for the 1st part (single day) of the ILI_V3 2nd criteria.
ILI_V3_infected_donors_criteria2_2resp_singleday <- rename(as.data.frame(c_sub), SubjectID = "V1")
print(ILI_V3_infected_donors_criteria2_2resp_singleday)

# 2nd part (2 consecutive days) of the ILI_V3 2nd criteria:
# >=2 Sx, with >=1 of those Sx being respiratory, and the >=2 Sx occurring over 2 consecutive days

# Need a df that has, for all of the always afebrile individuals, 1 row of data for each study day -3 to 6.
# Recall: "Symptomatic_donors_infected_grade123" was created in V1 of Symptomatic; it has all the Sx
# we need and spans study days -3 to 6, but we only want to keep the always afebrile subjects.
ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6 <- right_join(
  Symptomatic_donors_infected_grade123,
  Qdata_infected_afebrile_day1to6
)

# In "ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6", columns 5-10 are respiratory symptoms
# and columns 11-13 are the other symptoms that matter for this definition.
# The loops that follow list subjectIDs with 2 or more symptoms, one of which is respiratory, present
# on at least 2 consecutive study days.
# The first loop does not exclude symptoms that were positive before day 1, so it needs a df
# holding just the data from study days 1-6.
ILI_V3_donors_infected_afebrile_1row_per_studyday_day1to6 <-
  ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6 %>%
  filter(StudyDay %in% 1:6)

# Less stringent loop: for every always afebrile subject, look at each pair of adjacent rows (j, j+1)
# in the day 1-6 df and each pair of symptom columns (k respiratory in 5:10, l any later column up to 13).
# The subject qualifies when both symptoms are flagged on both rows and the two rows are
# consecutive study days.
# c_sub is grown with rbind() on purpose: as.data.frame(c_sub) must yield a column named "V1".
sub <- unique(ILI_V3_donors_infected_afebrile_1row_per_studyday_day1to6$SubjectID)
c_sub <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0 # becomes 1 as soon as this subject qualifies
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_afebrile_1row_per_studyday_day1to6[ILI_V3_donors_infected_afebrile_1row_per_studyday_day1to6$SubjectID == subid, ]
  # Treat missing flags as "symptom absent"
  temp1 <- temp[, 4:13]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # seq_len() yields no iterations for a subject with a single row
  # (1:(nrow(temp) - 1) would iterate over c(1, 0) and index rows that do not exist)
  for (j in seq_len(nrow(temp) - 1)) {
    for (k in 5:10) {
      for (l in (k + 1):13) {
        if (temp[j, k] + temp[j, l] + temp[j + 1, k] + temp[j + 1, l] == 4) {
          if (temp$StudyDay[j + 1] == temp$StudyDay[j] + 1) {
            c_sub <- rbind(c_sub, subid)
            token <- 1
            break
          }
        }
      }
      if (token == 1) {
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# This gives a c_sub of 18 subjectIDs
# More stringent loop: same pairing of adjacent rows and symptom columns as the c_sub loop, run on the
# day -3 to 6 df, but a pair of symptoms only counts when neither symptom was flagged at baseline.
# Baseline here = rows 1 through the last row with StudyDay == -1 (rows are taken by position, so this
# relies on the df being ordered by study day within subject).
# NOTE(review): the baseline window stops at day -1 here, whereas the single-day c_sub2 loop uses
# rows through day 0 -- confirm which cut-off is intended.
sub <- unique(ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6$SubjectID)
c_sub2 <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0 # becomes 1 as soon as this subject qualifies
  subid <- sub[i]
  temp <- ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6[ILI_V3_donors_infected_afebrile_1row_per_studyday_neg3to6$SubjectID == subid, ]
  # Treat missing flags as "symptom absent"
  temp1 <- temp[, 4:13]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # Empty baseline (instead of an error) when the subject has no day -1 row
  last_baseline_row <- max(c(0, which(temp$StudyDay == -1)))
  baseline <- temp[seq_len(last_baseline_row), ]
  for (j in seq_len(nrow(temp) - 1)) {
    for (k in 5:10) {
      for (l in (k + 1):13) {
        if (temp[j, k] + temp[j, l] + temp[j + 1, k] + temp[j + 1, l] == 4) {
          if (temp$StudyDay[j + 1] == temp$StudyDay[j] + 1) {
            # Each symptom gets its own baseline total; both must be zero
            sum1 <- sum(baseline[[k]])
            sum2 <- sum(baseline[[l]])
            if (sum1 == 0 && sum2 == 0) {
              c_sub2 <- rbind(c_sub2, subid)
              token <- 1
              break
            }
          }
        }
      }
      if (token == 1) {
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# NOTE(review): 18 subjectIDs (no change versus the less stringent criteria) was recorded when sum2 was
# accumulated from sum1 instead of from itself; re-run and confirm the count with the corrected sum.
# For now we keep the original c_sub that is from the less stringent criteria
# Put the original c_sub (18 subjectIDs, less stringent criteria) into a data frame
ILI_V3_infected_donors_criteria2_part2 <- rename(as.data.frame(c_sub), SubjectID = "V1")
print(ILI_V3_infected_donors_criteria2_part2)

# ILI criteria 2 = union of its part 1 (single day) and part 2 (2 consecutive days)
ILI_V3_infected_donors_criteria2 <- ILI_V3_infected_donors_criteria2_2resp_singleday %>%
  full_join(ILI_V3_infected_donors_criteria2_part2) %>%
  arrange(SubjectID)
print(ILI_V3_infected_donors_criteria2)

# ILI (version 3) = union of ILI criteria 1 and ILI criteria 2
ILI_V3_infected_donors <- ILI_V3_infected_donors_criteria1 %>%
  full_join(ILI_V3_infected_donors_criteria2) %>%
  arrange(SubjectID)
print(ILI_V3_infected_donors)

# Attach the QuarantineNumber to each ILI subjectID and count the distinct ILI subjects per quarantine
ILI_V3_infected_donors_table1 <- ILI_V3_infected_donors %>%
  left_join(Qdata_QuarantineNumbers, by = "SubjectID") %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_ILI_V3 = n_distinct(SubjectID))
print(ILI_V3_infected_donors_table1)

# Add onto Table 1 the number of ILI by version 3 criteria and the fraction of infected donors
Qdata_table1 <- Qdata_table1 %>%
  left_join(ILI_V3_infected_donors_table1, by = "QuarantineNumber") %>%
  mutate(Fraction_ILI_V3_of_Infected = Number_ILI_V3 / Number_Infected_Donors)
print(Qdata_table1)

#### Table 1 (donors): e) Number of febrile (and % of infected)  ####

# Start from the list of infected donors and keep every donor ("D") observation
# with a tympanic temperature above 37.9C
Qdata_infected_febrile <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(Randomization_DorIRorCR == "D", Tympanic.temp..degrees.C. > 37.9)

# Number of distinct febrile infected donors in each quarantine
Qdata_infected_febrile_table1 <- Qdata_infected_febrile %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_Febrile_Infected = n_distinct(SubjectID))
print(Qdata_infected_febrile_table1)

# Add febrile count and fraction febrile (of infected donors) to Table 1
Qdata_table1 <- Qdata_table1 %>%
  left_join(Qdata_infected_febrile_table1, by = "QuarantineNumber") %>%
  mutate(Fraction_Febrile_Infected_of_Total_Infected = Number_Febrile_Infected / Number_Infected_Donors)

#### Asymptomatic -- for the text of the paper ####
# Asymptomatic is defined as not symptomatic
# Since ILI must also be symptomatic they are not included in this definition
# Asymptomatic should probably not be febrile, but let's check to make sure all the febrile fell within the Symptomatic pool
# Then we can take the inverse of the Symptomatic group as the Asymptomatic group.
# Critical question: Are we talking about the 42 infected donors or the 52 total donors?
# Answer: We are talking about out of the 42 infected donors.

# Are all the febrile cases also symptomatic?
Qdata_infected_febrile_subjectID <- Qdata_infected_febrile %>%
  distinct(SubjectID, .keep_all = FALSE)
print(Qdata_infected_febrile_subjectID)
# There are 8 subjectIDs that were febrile among the infected donors
# Now compare against the list of symptomatic subjectIDs.
# This comparison was already done while deriving the symptomatic group, where one febrile subject
# (#148) did not make the list of symptomatic; check that we get the same result here.
Febrile_not_symptomatic <- Qdata_infected_febrile_subjectID %>%
  distinct(SubjectID, .keep_all = FALSE) %>%
  anti_join(Symptomatic_V3_donors_infected_combined)
print(Febrile_not_symptomatic)
# Indeed we get the same result here. There is one subject (#148) among the 42 infected donors who was febrile and not symptomatic
# Which Q was this donor a part of? How high was the fever? How long did the fever persist?
Qdata_148 <- Qdata %>%
  filter(SubjectID == 148) %>%
  filter(Tympanic.temp..degrees.C. > 37.9) %>%
  select(QuarantineNumber, SubjectID, Tympanic.temp..degrees.C., StudyDay, Sx_Date, SDC_time) %>%
  distinct(SubjectID, StudyDay, SDC_time, .keep_all = TRUE)

# Get the list of those among the 42 infected donors who never met symptomatic criteria
never_symptomatic <- Qdata_infected_donors %>%
  select(QuarantineNumber, SubjectID) %>%
  anti_join(Symptomatic_V3_donors_infected_combined)
print(never_symptomatic)
# This generates a list of 10
# Variant that removes the febrile-but-not-symptomatic subject from the list of 10
asymptomatic_without148 <- never_symptomatic %>%
  anti_join(Febrile_not_symptomatic)
print(asymptomatic_without148)

asymptomatic_with148 <- never_symptomatic
print(asymptomatic_with148)
# Since there were no other symptoms with 148 and the fever was low-grade and only appeared during a single instance we will not exclude 148 from the list of asymptomatic

# Aggregate the asymptomatic donors by Q#
asymptomatic_by_Q <- asymptomatic_with148 %>%
  group_by(QuarantineNumber) %>%
  summarize(Asymptomatic = n_distinct(SubjectID))
print(asymptomatic_by_Q)

# Cleaning up "asymptomatic_by_Q" for inclusion in the SI:
# label column renamed to `Quarantine #` and a final "Total" row appended.
# The total row is built explicitly and appended, so it lands after the last quarantine
# however many quarantines are present (no hard-coded row index).
asymptomatic_total <- tibble(
  `Quarantine #` = "Total",
  Asymptomatic = sum(asymptomatic_by_Q$Asymptomatic)
)
asymptomatic_by_Q <- asymptomatic_by_Q %>%
  # The label column must be character to hold "Total" next to the quarantine numbers
  mutate(QuarantineNumber = as.character(QuarantineNumber)) %>%
  rename(`Quarantine #` = QuarantineNumber) %>%
  bind_rows(asymptomatic_total)

# write these files out to the results folder
write_csv(asymptomatic_by_Q, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Asymptomatic_Q.csv")

#### Table 1 (donors): f) Number of PCR confirmed infection (and % of infected) ####

# This repeats the derivation used earlier for the number of infected donors,
# tweaked for the purpose of this column in Table 1.

# List each donor SubjectID with the number of study days on which it was positive by PCR
# (a positive sample has a non-missing InfA_Ct that is below 38 and not 0),
# keeping the donors that were positive on 2 or more days
Qdata_pcr_pos2_or_more_days <- Qdata %>%
  filter(Randomization_DorIRorCR == "D") %>%
  filter(!is.na(InfA_Ct), InfA_Ct < 38, InfA_Ct != 0) %>%
  distinct(SubjectID, StudyDay) %>%
  group_by(SubjectID) %>%
  summarize(NumberDaysPosPCR = n()) %>%
  filter(NumberDaysPosPCR >= 2)
print(Qdata_pcr_pos2_or_more_days)

# Attach the Q numbers to those SubjectIDs and count the distinct subjects per quarantine
Qdata_pcr_pos2_or_more_days_table1 <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata_QuarantineNumbers) %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_PCR_Infected_Donors = n_distinct(SubjectID))
print(Qdata_pcr_pos2_or_more_days_table1)

# Add the count, and its fraction of infected donors, to Table 1
Qdata_table1 <- Qdata_table1 %>%
  left_join(Qdata_pcr_pos2_or_more_days_table1, by = "QuarantineNumber") %>%
  mutate(Fraction_PCR_Infected_Donors_of_Infected = Number_PCR_Infected_Donors / Number_Infected_Donors)
print(Qdata_table1)

#### But the text for the paper also asks for the number of volunteers that were positive on each study day 1-6 ####

# One row per PCR-infected subject per study day (first row found for that day), study days 1-6 only.
# Shared starting point for the sample counts and the positive counts below.
Qdata_pcr_pos2_or_more_days_days1to6_daily <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6)

# Number of samples per study day among the PCR infected subjects
Qdata_pcr_pos2_or_more_days_days1to6_total_samples <- Qdata_pcr_pos2_or_more_days_days1to6_daily %>%
  group_by(StudyDay) %>%
  summarize(Number_samples_each_studyday_positive_subjects = n())
# This shows that indeed there were 36 samples taken (1 for each subjectID) on each of study days 1-6 for each of the PCR infected subjects

# Number and percent of PCR positive subjects per study day
# NOTE(review): positivity here is InfA_Ct > 0, while the infected-donor definition uses
# InfA_Ct < 38 & InfA_Ct != 0 -- confirm that no Ct values of 38 or more reach this step.
Qdata_pcr_pos2_or_more_days_days1to6 <- Qdata_pcr_pos2_or_more_days_days1to6_daily %>%
  filter(InfA_Ct > 0) %>%
  group_by(StudyDay) %>%
  summarize(number_PCR_positive = n_distinct(SubjectID)) %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_total_samples, by = "StudyDay") %>%
  mutate(percent_pos = (number_PCR_positive / Number_samples_each_studyday_positive_subjects) * 100) %>%
  rename(`Study Day` = StudyDay) %>%
  select(-Number_samples_each_studyday_positive_subjects) # Note that this value was 36 for each study day

# Append the totals row. The denominator is the actual number of samples over days 1-6
# (6 * 36 when every subject has one sample per day) rather than a hard-coded constant.
# Note that the 7 in the first column is a place holder for what is later labeled "Total";
# it is kept numeric for now.
pcr_positive_total <- sum(Qdata_pcr_pos2_or_more_days_days1to6$number_PCR_positive)
pcr_samples_total <- sum(Qdata_pcr_pos2_or_more_days_days1to6_total_samples$Number_samples_each_studyday_positive_subjects)
Qdata_pcr_pos2_or_more_days_days1to6 <- bind_rows(
  Qdata_pcr_pos2_or_more_days_days1to6,
  tibble(
    `Study Day` = 7,
    number_PCR_positive = pcr_positive_total,
    percent_pos = (pcr_positive_total / pcr_samples_total) * 100
  )
)

# Version for the line chart: percent rounded to 2 decimals, presentation column names
Qdata_pcr_pos2_or_more_days_days1to6_line <- Qdata_pcr_pos2_or_more_days_days1to6 %>%
  mutate(percent_pos = round(as.numeric(percent_pos), 2)) %>%
  rename(`PCR Positive Subjects` = number_PCR_positive, `Percent Positive` = percent_pos)

# write this out because it is used in the results in text for line chart.
write_csv(Qdata_pcr_pos2_or_more_days_days1to6_line, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Positive_Study_Day_line.csv")

# Now get this data into tabular format in case we choose to present it like that or both ways
# First round to the nearest whole number since this seems to be how the Nottingham team has drafted the paper,
# then wrap the percent value in ()
Qdata_pcr_pos2_or_more_days_days1to6_line_round_parenth <- Qdata_pcr_pos2_or_more_days_days1to6_line %>%
  mutate(`Percent Positive` = paste0("(", round(`Percent Positive`, 0), ")"))

# Merge count and percent into one column and turn the place holder 7 into the "Total" label.
# The Study Day column is converted to character as a whole, so no text is assigned into a numeric column.
Qdata_pcr_pos2_or_more_days_days1to6_table <- Qdata_pcr_pos2_or_more_days_days1to6_line_round_parenth %>%
  unite(`PCR Positive Subjects (%)`, `PCR Positive Subjects`, `Percent Positive`, sep = " ", remove = TRUE) %>%
  mutate(`Study Day` = ifelse(`Study Day` == 7, "Total", as.character(`Study Day`)))

print(Qdata_pcr_pos2_or_more_days_days1to6_table)

# write this out because it is used in the results in text.
write_csv(Qdata_pcr_pos2_or_more_days_days1to6_table, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Positive_Study_Day_Table.csv")
# Note that the percent in parentheses here is the percent of samples from the PCR infected subjects that were positive

#### What if we wanted to compare how the PCR positives compared over the 6 study days between Qs? ####
# Stratify by Quarantine, giving each Q its own column.
# Then we can try some categorical statistics to test if there are any between group differences.

# One row per PCR infected subject per study day (days 1-6) with InfA_Ct > 0;
# spread() then places the InfA_Ct value in the column named after the subject's quarantine
# ("1", "2" or "3") and leaves NA in the other quarantine columns.
Qdata_pcr_pos2_or_more_days_days1to6_byQ <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6, InfA_Ct > 0) %>%
  spread(QuarantineNumber, InfA_Ct)

## Let's get each Q individually and then put them together ##

## First deal with Q1

# Number of distinct Q1 subjects with a positive sample on each study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ1 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ %>%
  select(SubjectID, StudyDay, "1") %>%
  rename(Q1 = "1") %>%
  filter(!is.na(Q1)) %>%
  group_by(StudyDay) %>%
  summarize(Q1_number_PCR_positive = n_distinct(SubjectID, na.rm = TRUE))

# Now create the % positive for table 1
# First need to figure out how many total samples were taken for each of these study days for Q1
Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q1totalsamples <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6, QuarantineNumber == 1) %>%
  group_by(StudyDay) %>%
  summarize(Q1_Number_samples_each_studyday_positive_subjects = n())
# Shows that there were 12 samples taken on each study day (one for each subjectID per day) among the positive subjects for Q1

# Now add the total column derived above to the Q1 df
Qdata_pcr_pos2_or_more_days_days1to6_byQ1 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ1 %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q1totalsamples, by = "StudyDay")

# Append the totals row for the Q1 df (appended as a row, so it does not depend on the df having exactly 6 rows)
# Note that the first column is set to 7 as a place holder; it can be changed to "total" later, but needs to be numeric for now
Qdata_pcr_pos2_or_more_days_days1to6_byQ1 <- bind_rows(
  Qdata_pcr_pos2_or_more_days_days1to6_byQ1,
  tibble(
    StudyDay = 7,
    Q1_number_PCR_positive = sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ1$Q1_number_PCR_positive),
    Q1_Number_samples_each_studyday_positive_subjects =
      sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ1$Q1_Number_samples_each_studyday_positive_subjects)
  )
)

# Add the percent column (rounded to 2 decimals) and switch to presentation column names
Qdata_pcr_pos2_or_more_days_days1to6_byQ1 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ1 %>%
  mutate(Q1_Percent_Positive = round(
    (Q1_number_PCR_positive / Q1_Number_samples_each_studyday_positive_subjects) * 100, 2
  )) %>%
  rename(`Study Day` = StudyDay,
         `Q1 PCR Positive Samples` = Q1_number_PCR_positive,
         `Q1 Total Samples from Positive Subjects` = Q1_Number_samples_each_studyday_positive_subjects,
         `Q1 Percent Positive Samples` = Q1_Percent_Positive)

## Now deal with Q2

# Number of distinct Q2 subjects with a positive sample on each study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ2 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ %>%
  select(SubjectID, StudyDay, "2") %>%
  rename(Q2 = "2") %>%
  filter(!is.na(Q2)) %>%
  group_by(StudyDay) %>%
  summarize(Q2_number_PCR_positive = n_distinct(SubjectID, na.rm = TRUE))

# Now create the % positive for table 1
# First need to figure out how many total samples were taken for each of these study days for Q2
Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q2totalsamples <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6, QuarantineNumber == 2) %>%
  group_by(StudyDay) %>%
  summarize(Q2_Number_samples_each_studyday_positive_subjects = n())
# Shows that there were 10 samples taken on each study day (one for each subjectID per day) among the positive subjects for Q2

# Now add the total column derived above to the Q2 df
Qdata_pcr_pos2_or_more_days_days1to6_byQ2 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ2 %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q2totalsamples, by = "StudyDay")

# Append the totals row for the Q2 df (appended as a row, so it does not depend on the df having exactly 6 rows)
# Note that the first column is set to 7 as a place holder; it can be changed to "total" later, but needs to be numeric for now
Qdata_pcr_pos2_or_more_days_days1to6_byQ2 <- bind_rows(
  Qdata_pcr_pos2_or_more_days_days1to6_byQ2,
  tibble(
    StudyDay = 7,
    Q2_number_PCR_positive = sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ2$Q2_number_PCR_positive),
    Q2_Number_samples_each_studyday_positive_subjects =
      sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ2$Q2_Number_samples_each_studyday_positive_subjects)
  )
)

# Add the percent column (rounded to 2 decimals) and switch to presentation column names
Qdata_pcr_pos2_or_more_days_days1to6_byQ2 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ2 %>%
  mutate(Q2_Percent_Positive = round(
    (Q2_number_PCR_positive / Q2_Number_samples_each_studyday_positive_subjects) * 100, 2
  )) %>%
  rename(`Study Day` = StudyDay,
         `Q2 PCR Positive Samples` = Q2_number_PCR_positive,
         `Q2 Total Samples from Positive Subjects` = Q2_Number_samples_each_studyday_positive_subjects,
         `Q2 Percent Positive Samples` = Q2_Percent_Positive)

## Now deal with Q3

# Number of distinct Q3 subjects with a positive sample on each study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ3 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ %>%
  select(SubjectID, StudyDay, "3") %>%
  rename(Q3 = "3") %>%
  filter(!is.na(Q3)) %>%
  group_by(StudyDay) %>%
  summarize(Q3_number_PCR_positive = n_distinct(SubjectID, na.rm = TRUE))

# Now create the % positive for table 1
# First need to figure out how many total samples were taken for each of these study days for Q3
Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q3totalsamples <- Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6, QuarantineNumber == 3) %>%
  group_by(StudyDay) %>%
  summarize(Q3_Number_samples_each_studyday_positive_subjects = n())
# Shows that there were 14 samples taken on each study day (one for each subjectID per day) among the positive subjects for Q3

# Now add the total column derived above to the Q3 df
Qdata_pcr_pos2_or_more_days_days1to6_byQ3 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ3 %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ_Q3totalsamples, by = "StudyDay")

# Append the totals row for the Q3 df (appended as a row, so it does not depend on the df having exactly 6 rows)
# Note that the first column is set to 7 as a place holder; it can be changed to "total" later, but needs to be numeric for now
Qdata_pcr_pos2_or_more_days_days1to6_byQ3 <- bind_rows(
  Qdata_pcr_pos2_or_more_days_days1to6_byQ3,
  tibble(
    StudyDay = 7,
    Q3_number_PCR_positive = sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ3$Q3_number_PCR_positive),
    Q3_Number_samples_each_studyday_positive_subjects =
      sum(Qdata_pcr_pos2_or_more_days_days1to6_byQ3$Q3_Number_samples_each_studyday_positive_subjects)
  )
)

# Add the percent column (rounded to 2 decimals) and switch to presentation column names
Qdata_pcr_pos2_or_more_days_days1to6_byQ3 <- Qdata_pcr_pos2_or_more_days_days1to6_byQ3 %>%
  mutate(Q3_Percent_Positive = round(
    (Q3_number_PCR_positive / Q3_Number_samples_each_studyday_positive_subjects) * 100, 2
  )) %>%
  rename(`Study Day` = StudyDay,
         `Q3 PCR Positive Samples` = Q3_number_PCR_positive,
         `Q3 Total Samples from Positive Subjects` = Q3_Number_samples_each_studyday_positive_subjects,
         `Q3 Percent Positive Samples` = Q3_Percent_Positive)

## Now merge these Q1, Q2, and Q3 together into one larger df

# Join the three per-quarantine tables on study day and drop their totals rows (place holder day 7).
# Then add columns that sum the number of positive samples and total samples across quarantines and
# give the combined percent positive for each study day.
# Note that the values for all Q combined were already generated, but generating them again here is
# a good, redundant check.
# The sums are plain column additions, so each row is computed on its own and the result is an
# ungrouped df.
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line <- Qdata_pcr_pos2_or_more_days_days1to6_byQ1 %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ2, by = "Study Day") %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ3, by = "Study Day") %>%
  filter(`Study Day` != 7) %>%
  mutate(
    `All Q Total PCR Positive Samples` =
      `Q1 PCR Positive Samples` + `Q2 PCR Positive Samples` + `Q3 PCR Positive Samples`,
    `All Q Total Samples from Positive Subjects` =
      `Q1 Total Samples from Positive Subjects` +
      `Q2 Total Samples from Positive Subjects` +
      `Q3 Total Samples from Positive Subjects`,
    `All Q Percent Positive Samples` = round(
      (`All Q Total PCR Positive Samples` / `All Q Total Samples from Positive Subjects`) * 100, 2
    )
  )
         
# Reshape to long format for ggplot2: one row per study day and quarantine,
# with the study day (x-axis), the percent positive (y-axis), and the
# quarantine as the grouping variable.
# The four percent columns are selected by name, in the order Q1, Q2, Q3, All Q,
# so the reshape does not depend on column positions.
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line_for_plot <- Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line %>%
  gather("Quarantine", "Percent Positive",
         `Q1 Percent Positive Samples`,
         `Q2 Percent Positive Samples`,
         `Q3 Percent Positive Samples`,
         `All Q Percent Positive Samples`) %>%
  select(`Study Day`, `Quarantine`, `Percent Positive`)

# Reduce the Quarantine labels to simply "Q1", "Q2", "Q3", and "All Q" by
# stripping the shared column-name suffix. Deriving the label from the name
# (instead of from fixed row ranges) keeps the labels right even if the number
# of study days per quarantine changes.
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line_for_plot$Quarantine <-
  sub(" Percent Positive Samples$", "",
      Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line_for_plot$Quarantine)

# Quick check that the line plot renders: percent positive by study day, one
# line (with points) per Quarantine label. Four manual colours are supplied,
# one for each of the four labels.
p <- ggplot(
  data = Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line_for_plot,
  mapping = aes(x = `Study Day`, y = `Percent Positive`,
                group = Quarantine, colour = Quarantine)
) +
  geom_line() +
  geom_point() +
  scale_colour_manual(values = c("black", "chartreuse4", "cyan2", "darkorchid1")) +
  theme_bw() +
  labs(x = "Study Day", y = "Percent Positive")
p

# Save the long-format df that backs the line plot
write_csv(
  Qdata_pcr_pos2_or_more_days_days1to6_byQ123_line_for_plot,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Pos_StudyDay_Q123_line.csv"
)

## Table version of the PCR positivity by study day (Q1, Q2, Q3, and all Q)

# Start from the per-quarantine dfs (totals row included this time) and add the
# all-quarantine df built earlier in the script. The joins use the columns the
# dfs have in common (no `by` is given).
# The total-sample count columns are dropped to keep the table presentable, and
# every remaining column is rounded to the nearest whole number.
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_table <- Qdata_pcr_pos2_or_more_days_days1to6_byQ1 %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ2) %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ3) %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_line) %>%
  rename(`All Q PCR Positive Samples` = `PCR Positive Subjects`,
         `All Q Percent Positive` = `Percent Positive`) %>%
  select(`Study Day`,
         `Q1 PCR Positive Samples`, `Q1 Percent Positive Samples`,
         `Q2 PCR Positive Samples`, `Q2 Percent Positive Samples`,
         `Q3 PCR Positive Samples`, `Q3 Percent Positive Samples`,
         `All Q PCR Positive Samples`, `All Q Percent Positive`) %>%
  mutate_all(round, 0)

# Wrap each percent in parentheses, then unite every count with its percent so
# each cell reads "n (pct)".
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_table <- Qdata_pcr_pos2_or_more_days_days1to6_byQ123_table %>%
  mutate_at(vars(`Q1 Percent Positive Samples`,
                 `Q2 Percent Positive Samples`,
                 `Q3 Percent Positive Samples`,
                 `All Q Percent Positive`),
            function(pct) paste0("(", pct, ")")) %>%
  unite(`Q1 PCR Positive Samples (%)`,
        `Q1 PCR Positive Samples`, `Q1 Percent Positive Samples`,
        sep = " ", remove = TRUE) %>%
  unite(`Q2 PCR Positive Samples (%)`,
        `Q2 PCR Positive Samples`, `Q2 Percent Positive Samples`,
        sep = " ", remove = TRUE) %>%
  unite(`Q3 PCR Positive Samples (%)`,
        `Q3 PCR Positive Samples`, `Q3 Percent Positive Samples`,
        sep = " ", remove = TRUE) %>%
  unite(`All Q PCR Positive Samples (%)`,
        `All Q PCR Positive Samples`, `All Q Percent Positive`,
        sep = " ", remove = TRUE)

# Switch the placeholder 7 in the `Study Day` column (row 7) to "Total".
# Modifying the extracted column vector lets base R coerce the whole column to
# character; assigning a string straight into a numeric tibble cell with
# `[7, 1] <-` is rejected by newer tibble releases.
Qdata_pcr_pos2_or_more_days_days1to6_byQ123_table$`Study Day`[7] <- "Total"

# Now write out this table file
write_csv(Qdata_pcr_pos2_or_more_days_days1to6_byQ123_table, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Pos_StudyDay_Q123_table.csv")

#### What if we want to  plot the NPSwab Ct values over the 6 study days (positive swabs only)? ####
# Pull the Ct values from Qdata for the subjects in Qdata_pcr_pos2_or_more_days
# (natural join on shared column names), keep the first row found for each
# subject and study day, and then restrict to study days 1-6 with a Ct value
# above 0.
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT <-
  Qdata_pcr_pos2_or_more_days %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID, StudyDay, InfA_Ct) %>%
  group_by(SubjectID) %>%
  distinct(StudyDay, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(StudyDay %in% 1:6, InfA_Ct > 0)

## Mean and SD of the NPS Ct values by study day: each Q on its own, then all Q

## Q1: mean and sd of Ct per study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q1 <-
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT %>%
  filter(QuarantineNumber == 1) %>%
  group_by(StudyDay) %>%
  summarise(
    Q1meanCT = mean(InfA_Ct),
    Q1sdCT = sd(InfA_Ct)
  )

## Q2: mean and sd of Ct per study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q2 <-
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT %>%
  filter(QuarantineNumber == 2) %>%
  group_by(StudyDay) %>%
  summarise(
    Q2meanCT = mean(InfA_Ct),
    Q2sdCT = sd(InfA_Ct)
  )

## Q3: mean and sd of Ct per study day
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q3 <-
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT %>%
  filter(QuarantineNumber == 3) %>%
  group_by(StudyDay) %>%
  summarise(
    Q3meanCT = mean(InfA_Ct),
    Q3sdCT = sd(InfA_Ct)
  )

## All Q: the overall mean and sd come from the pooled swab-level df, not from
## the per-quarantine summaries
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean_SD <-
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT %>%
  group_by(StudyDay) %>%
  summarize(
    Qall_meanCT = mean(InfA_Ct),
    Qall_sdCT = sd(InfA_Ct)
  )

## Combine Q1, Q2, Q3, Qall side by side. The left joins are chained from the
## Q1 summary, so only study days present in the Q1 summary are kept.
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall <- Reduce(
  left_join,
  list(
    Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q1,
    Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q2,
    Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Q3,
    Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean_SD
  )
)

# Prepare for plotting the lines: convert the wide mean/sd df to long format.
# Columns are selected by name and the Quarantine label is derived from the
# column name, so nothing here depends on column positions or on every
# quarantine contributing exactly six study days.

# First convert the Ct means to long; labels become "Q1", "Q2", "Q3", "Qall"
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean <- Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall %>%
  gather("Quarantine", "CT", Q1meanCT, Q2meanCT, Q3meanCT, Qall_meanCT) %>%
  select(StudyDay, Quarantine, CT)
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean$Quarantine <-
  sub("_?meanCT$", "", Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean$Quarantine)

# Now convert the Ct sds to long, with the same labels as the means
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_SD <- Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall %>%
  gather("Quarantine", "SD", Q1sdCT, Q2sdCT, Q3sdCT, Qall_sdCT) %>%
  select(StudyDay, Quarantine, SD)
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_SD$Quarantine <-
  sub("_?sdCT$", "", Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_SD$Quarantine)

# Merge the mean and sd dfs (they share StudyDay and Quarantine) and clean up
# the names for the plot
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_long <- Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_Mean %>%
  left_join(Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_SD) %>%
  rename(`Study Day` = StudyDay)

# Present the pooled series as "All Q"
Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_long$Quarantine[
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_long$Quarantine == "Qall"
] <- "All Q"
  
# Quick check that the Ct plot renders before trying it in RMarkdown: mean Ct by
# study day, one line per Quarantine label, with error bars of +/- 1 SD.
# The points and error bars are dodged by 0.25; the lines are not.
pd <- position_dodge(0.25)
p <- ggplot(
  data = Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_long,
  mapping = aes(x = `Study Day`, y = `CT`, group = Quarantine, colour = Quarantine)
) +
  geom_line() +
  geom_errorbar(aes(ymin = `CT` - `SD`, ymax = `CT` + `SD`), width = .1, position = pd) +
  geom_point(position = pd) +
  scale_colour_manual(values = c("black", "darkorange3", "cyan2", "darkorchid1")) +
  theme_bw() +
  labs(x = "Study Day", y = "NPS CT Value")
p

# Save the long-format df that backs the Ct line plot
write_csv(
  Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall_long,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_CT_StudyDay_Q123_line.csv"
)

#### If we wanted this Ct line chart in a table format we could print out the following table ####
# Can work with the "Qdata_pcr_pos2_or_more_days_days1to6_byQ_NPS_CT_Qall" df but need to add the totals for each Q and then the overall total
# Would need to work on this separately. For now the figure serves to get the point across. 

#### Table 1 (donors): g) Number of PCR confirmed infection and seroconversion (and % of infected) ####

# Starts from the "Qdata_infected_donors" df created in section a) above.
# Keep donors with at least 2 PCR-positive days who also have a non-missing
# QuarantineNumber.x or QuarantineNumber.y (these two columns are relabelled as
# the HAI and MN seroconversion flags in the SI table section below).
Inf_PCR_and_Sero <- Qdata_infected_donors %>%
  filter(
    NumberDaysPosPCR >= 2,
    !(is.na(QuarantineNumber.x) & is.na(QuarantineNumber.y))
  )

# Count the distinct SubjectIDs meeting this criterion in each quarantine
Inf_PCR_and_Sero_table1 <- Inf_PCR_and_Sero %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_Positive_PCR_and_Seroconversion = n_distinct(SubjectID))

# Add the count to the cumulative table 1, along with its fraction of the
# infected donors
Qdata_table1 <- Qdata_table1 %>%
  left_join(Inf_PCR_and_Sero_table1, by = "QuarantineNumber") %>%
  mutate(
    Fraction_Infected_by_PCR_and_Serology =
      Number_Positive_PCR_and_Seroconversion / Number_Infected_Donors
  )

#### Additional table for SI about positivity by HAI, MN, and/or 2 PCR positive tests ####
# Goal: a full summary, by Q, of how many donors
# ... had HAI, MN, and 2 PCR positive tests,
# ... had HAI or MN, and 2 PCR positive tests,
# ... had neither HAI nor MN, and 2 PCR positive tests,
# ... had HAI and MN, but did not meet PCR positivity,
# ... had HAI or MN, but did not meet PCR positivity.
# The data can be provided two ways: by SubjectID, and aggregated by Q.
# When aggregating by Q, totals and percents (out of the number infected per Q)
# may be good to include.

# This first step builds the per-subject version from the
# "Qdata_infected_donors" df that was created earlier.

# Add an (initially all-NA) `PCR Positive` column to the source df
Qdata_infected_donors[["PCR Positive"]] <- NA

# Recode to 0/1 indicators and apply presentation names:
#  - QuarantineNumber.x / QuarantineNumber.y values >= 1 become 1
#  - `PCR Positive` becomes 1 when there are at least 2 PCR-positive days
#  - every NA from `Seroconversion by HAI` through `PCR Positive` becomes 0
SI_Inf_PCR_and_Sero <- Qdata_infected_donors %>%
  mutate(
    QuarantineNumber.x = ifelse(QuarantineNumber.x >= 1, 1, QuarantineNumber.x),
    QuarantineNumber.y = ifelse(QuarantineNumber.y >= 1, 1, QuarantineNumber.y),
    `PCR Positive` = ifelse(NumberDaysPosPCR >= 2, 1, `PCR Positive`)
  ) %>%
  rename(
    `Seroconversion by HAI` = QuarantineNumber.x,
    `Seroconversion by MN` = QuarantineNumber.y,
    `Number of Days with PCR Positive NPS` = NumberDaysPosPCR,
    `Quarantine #` = QuarantineNumber,
    `Volunteer ID` = SubjectID
  ) %>%
  mutate_at(
    vars(`Seroconversion by HAI`:`PCR Positive`),
    function(col) replace(col, is.na(col), 0)
  )

# Reorder the columns by position so that Quarantine # comes first (lets the
# RMarkdown create blocks for Qs 1, 2, and 3) and the number of PCR-positive
# days sits to the right of the "PCR Positive" column.
# NOTE(review): the positions assume the column order of Qdata_infected_donors
# as built earlier in the script -- confirm if that df changes.
SI_Inf_PCR_and_Sero <- SI_Inf_PCR_and_Sero[, c(5, 1:3, 6, 4)]

# Write this df out for use in the RMarkdown file for SI material
write_csv(
  SI_Inf_PCR_and_Sero,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Positive_HAI_MN_PCR_SubjectID.csv"
)

#### Discussion text: How many donors were PCR positive on at least one day (of days1-4)? ####
# Distinct SubjectIDs in Qdata with a Ct value above 0 on any of study days 1-4
pcr_positive <- Qdata %>%
  filter(InfA_Ct > 0, StudyDay %in% 1:4) %>%
  distinct(SubjectID)
# The original note records 43 donors with at least one day of PCR positivity on
# study days 1-4, out of 52 donors.
# NOTE(review): 43/52 is 82.7%, not the 82.3% previously noted here -- confirm
# the counts and the percentage against the manuscript text.

# Summarize, by Q, the number of subjects falling in each category:
#  - seroconversion by HAI or MN, plus PCR positive
#  - seroconversion by HAI and MN, plus PCR positive
#  - seroconversion by HAI or MN, but not PCR positive
#  - seroconversion by HAI and MN, but not PCR positive
# Here "PCR positive" means meeting the case definition, i.e. having two
# positive tests.

# Each category column is 1 where the subject qualifies and NA otherwise, so
# summing with na.rm = TRUE within a quarantine gives the count of subjects.
# The inoculated and infected donor counts are then joined on for use as
# denominators.
SI_Inf_PCR_and_Sero_by_Q_table1 <- SI_Inf_PCR_and_Sero %>%
  mutate(
    `Seroconversion by HAI or MN, plus PCR Positive` = ifelse(
      `PCR Positive` == 1 & (`Seroconversion by HAI` == 1 | `Seroconversion by MN` == 1),
      1, NA
    ),
    `Seroconversion by HAI and MN, plus PCR Positive` = ifelse(
      `PCR Positive` == 1 & (`Seroconversion by HAI` == 1 & `Seroconversion by MN` == 1),
      1, NA
    ),
    `Seroconversion by HAI or MN, but not PCR Positive` = ifelse(
      `PCR Positive` == 0 & (`Seroconversion by HAI` == 1 | `Seroconversion by MN` == 1),
      1, NA
    ),
    `Seroconversion by HAI and MN, but not PCR Positive` = ifelse(
      `PCR Positive` == 0 & (`Seroconversion by HAI` == 1 & `Seroconversion by MN` == 1),
      1, NA
    )
  ) %>%
  group_by(`Quarantine #`) %>%
  summarise_at(
    vars(`Seroconversion by HAI or MN, plus PCR Positive`,
         `Seroconversion by HAI and MN, plus PCR Positive`,
         `Seroconversion by HAI or MN, but not PCR Positive`,
         `Seroconversion by HAI and MN, but not PCR Positive`),
    sum, na.rm = TRUE
  ) %>%
  left_join(Qdata_inoculated_donors_table1, by = c("Quarantine #" = "QuarantineNumber")) %>%
  left_join(Qdata_infected_donors_table1, by = c("Quarantine #" = "QuarantineNumber"))

# Before moving further, add the totals row (row 4): a placeholder key of 4
# followed by the column sums of columns 2-7. The key stays numeric for now and
# is changed to "Total" once the columns have been united.
# Totals are summed from the counts rather than averaged from percents, because
# the denominators are different for each Q.
# The values are passed as an unnamed list (one element per column) because
# newer tibble releases refuse to spread an atomic vector across the columns of
# a single row; a list is accepted by both tibbles and base data frames.
SI_Inf_PCR_and_Sero_by_Q_table1[4, ] <- as.list(unname(
  c(4, colSums(SI_Inf_PCR_and_Sero_by_Q_table1[, 2:7]))
))

# Now add the percents. Because the totals row is already present, its percents
# are computed from the summed counts, without the Q-size bias that taking the
# column mean of percentages would introduce.
# Columns 8-16 are the nine percent columns created here; they are rounded to
# whole percents.
SI_Inf_PCR_and_Sero_by_Q_table1 <- SI_Inf_PCR_and_Sero_by_Q_table1 %>%
  mutate(`Percent Infected of Inoculated` = (Number_Infected_Donors/Number_Inoculated_Donors)*100,
         `Percent HAI or MN, and PCR of Infected` = (`Seroconversion by HAI or MN, plus PCR Positive`/Number_Infected_Donors)*100,
         `Percent HAI or MN, and PCR of Inoculated` = (`Seroconversion by HAI or MN, plus PCR Positive`/Number_Inoculated_Donors)*100,
         `Percent HAI and MN, and PCR of Infected` = (`Seroconversion by HAI and MN, plus PCR Positive`/Number_Infected_Donors)*100,
         `Percent HAI and MN, and PCR of Inoculated` = (`Seroconversion by HAI and MN, plus PCR Positive`/Number_Inoculated_Donors)*100,
         `Percent HAI or MN, no PCR of Infected` = (`Seroconversion by HAI or MN, but not PCR Positive`/Number_Infected_Donors)*100,
         `Percent HAI or MN, no PCR of Inoculated` = (`Seroconversion by HAI or MN, but not PCR Positive`/Number_Inoculated_Donors)*100,
         `Percent HAI and MN, no PCR of Infected` = (`Seroconversion by HAI and MN, but not PCR Positive`/Number_Infected_Donors)*100,
         `Percent HAI and MN, no PCR of Inoculated` = (`Seroconversion by HAI and MN, but not PCR Positive`/Number_Inoculated_Donors)*100) %>%
  mutate_at(8:16, round, 0)

# Wrap every percent in parentheses. All nine percent columns (and only those)
# have names beginning with "Percent ", so they are picked up by that prefix.
# After this step the percent columns are character, e.g. "(50)".
SI_Inf_PCR_and_Sero_by_Q_table1 <- SI_Inf_PCR_and_Sero_by_Q_table1 %>%
  mutate_at(
    vars(starts_with("Percent ")),
    function(pct) paste0("(", pct, ")")
  )

# Unite each count with its percents so every cell reads
# "n (% of infected) (% of inoculated)": the first set of parentheses is the
# percent of infected, the second set the percent of inoculated.
# The infected and inoculated donor counts become "infected/inoculated (%)".
SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc <- SI_Inf_PCR_and_Sero_by_Q_table1 %>%
  unite(`Infected/Inoculated`,
        Number_Infected_Donors, Number_Inoculated_Donors,
        sep = "/") %>%
  unite(`Infected/Inoculated (%)`,
        `Infected/Inoculated`, `Percent Infected of Inoculated`,
        sep = " ") %>%
  unite(`Seroconversion by HAI or MN, plus PCR Positive (% of Infected) (% of Inoculated)`,
        `Seroconversion by HAI or MN, plus PCR Positive`,
        `Percent HAI or MN, and PCR of Infected`,
        `Percent HAI or MN, and PCR of Inoculated`,
        sep = " ") %>%
  unite(`Seroconversion by HAI and MN, plus PCR Positive (% of Infected) (% of Inoculated)`,
        `Seroconversion by HAI and MN, plus PCR Positive`,
        `Percent HAI and MN, and PCR of Infected`,
        `Percent HAI and MN, and PCR of Inoculated`,
        sep = " ") %>%
  unite(`Seroconversion by HAI or MN, but not PCR Positive (% of Infected) (% of Inoculated)`,
        `Seroconversion by HAI or MN, but not PCR Positive`,
        `Percent HAI or MN, no PCR of Infected`,
        `Percent HAI or MN, no PCR of Inoculated`,
        sep = " ") %>%
  unite(`Seroconversion by HAI and MN, but not PCR Positive (% of Infected) (% of Inoculated)`,
        `Seroconversion by HAI and MN, but not PCR Positive`,
        `Percent HAI and MN, no PCR of Infected`,
        `Percent HAI and MN, no PCR of Inoculated`,
        sep = " ")

# Replace the placeholder key in row 4 with the "Total" marker, then move the
# infected/inoculated column directly after the quarantine number (think about
# the multiple headers for markdown when adjusting the design further).
SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc$`Quarantine #`[4] <- "Total"
SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc <- SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc %>%
  select(`Quarantine #`, `Infected/Inoculated (%)`, everything())

# Write to Box Sync; the markdown file draws from here.
write_csv(
  SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/SI_Inf_PCR_and_Sero_by_Q_table1_Inf_Inoc.csv"
)

#### But what if we don't want to know about the inoculated donors and the percent of the inoculated? ####
# This version of the SI table ("SI_Inf_PCR_and_Sero_by_Q_table1_Inf") reports
# only the percent of infected for each category.

# Unite each count with its "of infected" percent so every cell reads
# "n (% of infected)". The "of inoculated" percent columns are not united here
# and are dropped by the column selection further down.
SI_Inf_PCR_and_Sero_by_Q_table1_Inf <- SI_Inf_PCR_and_Sero_by_Q_table1 %>%
  unite(`Infected/Inoculated`,
        Number_Infected_Donors, Number_Inoculated_Donors,
        sep = "/") %>%
  unite(`Infected/Inoculated (%)`,
        `Infected/Inoculated`, `Percent Infected of Inoculated`,
        sep = " ") %>%
  unite(`Seroconversion by HAI or MN, plus PCR Positive (% of Infected)`,
        `Seroconversion by HAI or MN, plus PCR Positive`,
        `Percent HAI or MN, and PCR of Infected`,
        sep = " ") %>%
  unite(`Seroconversion by HAI and MN, plus PCR Positive (% of Infected)`,
        `Seroconversion by HAI and MN, plus PCR Positive`,
        `Percent HAI and MN, and PCR of Infected`,
        sep = " ") %>%
  unite(`Seroconversion by HAI or MN, but not PCR Positive (% of Infected)`,
        `Seroconversion by HAI or MN, but not PCR Positive`,
        `Percent HAI or MN, no PCR of Infected`,
        sep = " ") %>%
  unite(`Seroconversion by HAI and MN, but not PCR Positive (% of Infected)`,
        `Seroconversion by HAI and MN, but not PCR Positive`,
        `Percent HAI and MN, no PCR of Infected`,
        sep = " ")

# Replace the placeholder key in row 4 with the "Total" marker, then keep only
# the quarantine number, the infected/inoculated column, and the four united
# category columns, in that order.
SI_Inf_PCR_and_Sero_by_Q_table1_Inf$`Quarantine #`[4] <- "Total"
SI_Inf_PCR_and_Sero_by_Q_table1_Inf <- SI_Inf_PCR_and_Sero_by_Q_table1_Inf %>%
  select(`Quarantine #`,
         `Infected/Inoculated (%)`,
         `Seroconversion by HAI or MN, plus PCR Positive (% of Infected)`,
         `Seroconversion by HAI and MN, plus PCR Positive (% of Infected)`,
         `Seroconversion by HAI or MN, but not PCR Positive (% of Infected)`,
         `Seroconversion by HAI and MN, but not PCR Positive (% of Infected)`)

# Write to Box Sync; the markdown file draws from here.
write_csv(
  SI_Inf_PCR_and_Sero_by_Q_table1_Inf,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/SI_Inf_PCR_and_Sero_by_Q_table1_Inf.csv"
)

#### Table 1 (donors): h) Number of seroconversion by HAI: MN: Either ####

# These counts were already derived when identifying the infected donors for
# the first few columns of Table 1; they are reworked here into the columns
# Table 1 needs. For now only the counts are added -- the lines that would
# compute each fraction of infected donors are left commented out.

## HAI

# Qdata_HAI_pos (generated in section a) above) holds the seroconversions by
# HAI (Glasgow serology). Count the distinct SubjectIDs per quarantine.
Qdata_HAI_pos_table1 <- Qdata_HAI_pos %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_HAI_Positive = n_distinct(SubjectID))

# Add the HAI count to the cumulative table 1
Qdata_table1 <- left_join(Qdata_table1, Qdata_HAI_pos_table1, by = "QuarantineNumber")
# Disabled for now:
#   mutate(Fraction_HAI_Positive_of_Infected = Number_HAI_Positive/Number_Infected_Donors)

## Microneuts

# Qdata_Microneut_pos (generated in section a) above) holds the seroconversions
# by Microneuts (CDC serology). Count the distinct SubjectIDs per quarantine.
Qdata_Microneut_pos_table1 <- Qdata_Microneut_pos %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_Microneut_Positive = n_distinct(SubjectID))

# Add the MN count to the cumulative table 1
Qdata_table1 <- left_join(Qdata_table1, Qdata_Microneut_pos_table1, by = "QuarantineNumber")
# Disabled for now:
#   mutate(Fraction_MN_Positive_of_Infected = Number_Microneut_Positive/Number_Infected_Donors)

## Either HAI or MN

# Already available in the Qdata_infected_donors df: keep donors with a
# non-missing QuarantineNumber.x or QuarantineNumber.y and count the distinct
# SubjectIDs per quarantine.
Pos_Either_HAI_or_MN_table1 <- Qdata_infected_donors %>%
  filter(!(is.na(QuarantineNumber.x) & is.na(QuarantineNumber.y))) %>%
  group_by(QuarantineNumber) %>%
  summarize(Positive_By_Either_HAI_or_MN = n_distinct(SubjectID))

# Add the either-HAI-or-MN count to the cumulative table 1
Qdata_table1 <- left_join(Qdata_table1, Pos_Either_HAI_or_MN_table1, by = "QuarantineNumber")
# Disabled for now:
#   mutate(Fraction_Positive_By_Either_HAI_or_MN = Positive_By_Either_HAI_or_MN/Number_Infected_Donors)

#### Table 1 (donors) footnote: i) Number of those with greater immunity than expected prior to quarantine by HAI: MN: Both ####

# The definition of serosusceptible used here (to be included in the footnote of
# table 1) comes from Alex Mann, email correspondence of September 28, 2018:
# "An HI titre of ≤10 and/or an MN titre of <80 at baseline was retrospectively taken to indicate susceptibility to infection"
# This criterion tells us who among the inoculated donors was serosusceptible at
# baseline (entry to quarantine). These individuals are not filtered out;
# instead we note who among those above the thresholds (MN >= 80, HAI > 10)
# seroconverted, since seroconversion is less likely above the thresholds.
# Per the team teleconference on October 12, 2018, the term used is "greater
# than anticipated immunity" upon admission to Q.

# Donors (randomization "D") with an HAI titre above 10, counted per quarantine
HI_susceptibility_table1_footnote <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", HAI_dayminus2_recodeNDA > 10) %>%
  group_by(QuarantineNumber) %>%
  summarize(HI_greater_anticip_immunity_at_baseline = n_distinct(SubjectID))

# Donors with an MN titre of 80 or more on the "Screening" visit, per quarantine
MN_susceptibility_table1_footnote <- Qdata %>%
  filter(
    Microneut_VisitType == "Screening",
    Randomization_DorIRorCR == "D",
    Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80
  ) %>%
  group_by(QuarantineNumber) %>%
  summarize(MN_greater_anticip_immunity_at_baseline = n_distinct(SubjectID))

# Donors flagged as MN seroconverted on the "Q baseline" visit, per quarantine
MN_seroconvert_between_screening_baseline_table1 <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "D",
    Microneut_VisitType == "Q baseline",
    Microneut_Seroconvert == 1
  ) %>%
  group_by(QuarantineNumber) %>%
  summarize(MN_seroconvert_between_screening_baseline = n_distinct(SubjectID))

# Assemble the three counts into the footnote table. The left joins start from
# the HAI counts, so only quarantines present there are kept.
table1_footnote <- HI_susceptibility_table1_footnote %>%
  left_join(MN_susceptibility_table1_footnote, by = "QuarantineNumber") %>%
  left_join(MN_seroconvert_between_screening_baseline_table1, by = "QuarantineNumber")

# Looking in more detail at exactly who might have had greater than anticipated
# immunity, or may have seroconverted, before admission to Q.
# TODO: check the code below against the proper criteria for seroconversion,
# serosusceptible, serosuitable, seropositive, etc. -- Alex Mann has some good
# comments about this.

# Which SubjectIDs had an HAI titre above 10 prior to Q (by HAI, retrospectively)?

# Old version (kept commented out for reference), followed by the corrected version
#Qdata_HAIprior_SubjectIDs <- Qdata %>%
  #filter(Randomization_DorIRorCR == "D" & HAI_dayminus2_recodeNDA > 10 & HAI_Seroconversion != 1) %>%
  #group_by(QuarantineNumber) %>%
  #distinct(SubjectID, .keep_all = TRUE) %>%
  #select(QuarantineNumber, SubjectID, Randomization_DorIRorCR, HAI_dayminus2, HAI_day28, HAI_dayminus2_recodeNDA, 
         #HAI_day28_recodeNDA, HAI_dayminus2_recodeNDA_x4, HAI_Seroconversion, Preliminary.HAI.Classification)

# Corrected version: one row per donor within each quarantine (the first row
# found), with the HAI-related columns for review.
HI_susceptibility_table1_footnote_SubIDs <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", HAI_dayminus2_recodeNDA > 10) %>%
  distinct(QuarantineNumber, SubjectID, .keep_all = TRUE) %>%
  select(
    QuarantineNumber, SubjectID, Randomization_DorIRorCR,
    HAI_dayminus2, HAI_day28,
    HAI_dayminus2_recodeNDA, HAI_day28_recodeNDA, HAI_dayminus2_recodeNDA_x4,
    HAI_Seroconversion, Preliminary.HAI.Classification
  )

# Which SubjectID's were these with greater than antic immunity prior to Q (by MN, retrospectively)?

# 2 old versions commented out below followed by corrected version

#Qdata_MNprior <- Qdata %>%
# filter(Randomization_DorIRorCR == "D" & Microneut_VisitType == "Q baseline" & Microneut_Seroconvert == 1) 
#Qdata_MNprior_table1footnote <- Qdata_MNprior %>%
# group_by(QuarantineNumber) %>%
# summarize(NumberSeroconByMNprior = n_distinct(SubjectID))

# Qdata_MNprior_SubjectIDs <- Qdata %>%
  # filter(Randomization_DorIRorCR == "D" & Microneut_VisitType == "Q baseline" & Microneut_Seroconvert == 1 & Microneutralization.Titer.to.A.Wisconsin.67.2005 >80) %>%
  # group_by(QuarantineNumber) %>%
  # distinct(SubjectID, .keep_all = TRUE) %>%
  # select(QuarantineNumber, SubjectID, Randomization_DorIRorCR, Microneut_DrawDate, Microneut_VisitType, 
    #     Microneutralization.Titer.to.A.Wisconsin.67.2005, Microneut_Seroconvert)

# Donor ("D") rows from the "Screening" MN visit with an MN titer of 80 or more,
# reduced to the first row per SubjectID within each quarantine. The screening
# seroconversion flag is renamed so it can sit next to the follow-up flag later.
MN_low_susceptibility_table1_footnote_SubIDs <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "D",
    Microneut_VisitType == "Screening",
    Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80
  ) %>%
  group_by(QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  ungroup() %>%
  select(QuarantineNumber,
         SubjectID,
         Randomization_DorIRorCR,
         Microneut_DrawDate,
         Microneut_VisitType,
         Microneutralization.Titer.to.A.Wisconsin.67.2005,
         Microneut_Seroconvert_Screening = Microneut_Seroconvert)

# For the donors flagged above (MN titer >= 80 at the "Screening" visit), look up
# their MN seroconversion flag from the "F/up" visit. The right join keeps every
# flagged donor, including any without a follow-up row (their flag is then NA).
MN_susceptibility_table1_footnote_SubIDs <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", Microneut_VisitType == "F/up") %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, Microneut_Seroconvert_Followup = Microneut_Seroconvert) %>%
  right_join(MN_low_susceptibility_table1_footnote_SubIDs)

# Combine the HAI-flagged and MN-flagged donor lists into one table
# (table1 footnote). The six indicator columns are created empty (NA) here so
# that the full-data CSV written out later carries them as placeholders.
Qdata_table1_footnote_subjectIDs <- HI_susceptibility_table1_footnote_SubIDs %>%
  full_join(MN_susceptibility_table1_footnote_SubIDs) %>%
  mutate(HAIandMNprior = NA,
         HAIprior = NA,
         MNprior = NA,
         LowSuscepHAI_converted_anyway = NA,
         LowSuscepMN_converted_anyway = NA,
         EitherHAIorMNprior = NA)

# Fill the indicators with 1 where the condition holds and NA otherwise.
# A non-missing HAI_dayminus2 marks a row that came from the HAI list; a
# non-missing Microneut_Seroconvert_Followup marks a row from the MN list.
# NOTE(review): a donor on the MN list with no follow-up seroconversion value
# would have Microneut_Seroconvert_Followup = NA and would not be flagged as
# MNprior here -- confirm that no such donor exists.
Table1_footnote_sero_SubjectID <- Qdata_table1_footnote_subjectIDs %>%
  mutate(
    HAIprior = ifelse(!is.na(HAI_dayminus2), 1, NA),
    MNprior = ifelse(!is.na(Microneut_Seroconvert_Followup), 1, NA),
    HAIandMNprior = ifelse(!is.na(HAI_dayminus2) & !is.na(Microneut_Seroconvert_Followup), 1, NA),
    EitherHAIorMNprior = ifelse(!is.na(Microneut_Seroconvert_Followup) | !is.na(HAI_dayminus2), 1, NA),
    LowSuscepHAI_converted_anyway = ifelse(HAI_Seroconversion == 1, 1, NA),
    LowSuscepMN_converted_anyway = ifelse(Microneut_Seroconvert_Followup == 1, 1, NA)
  ) %>%
  select(QuarantineNumber, SubjectID,
         HAIprior, MNprior, HAIandMNprior, EitherHAIorMNprior,
         LowSuscepHAI_converted_anyway, LowSuscepMN_converted_anyway) %>%
  arrange(QuarantineNumber, SubjectID)

# We can see that it was always the case that if someone had greater than antic immunity at entry to Q and seroconverted, they seroconverted by the detection method (HAI or MN) by which they had greater than antic immunity at entry to Q
# We can make a note of this and then also consolidate the LowSuscepHAI_converted_anyway and LowSuscepMN_converted_anyway to a single column
Table1_footnote_sero_SubjectID$Seroconverted_anyway <- NA

# Seroconverted_anyway is 1 when either method-specific flag equals 1 and NA
# otherwise. Both sides of the `|` are compared against 1 explicitly; the MN
# side previously relied on implicit numeric-to-logical coercion.
Table1_footnote_sero_anyway_SubjectID <- Table1_footnote_sero_SubjectID %>%
  mutate(Seroconverted_anyway = ifelse(LowSuscepHAI_converted_anyway == 1 |
                                         LowSuscepMN_converted_anyway == 1,
                                       1, Seroconverted_anyway)) %>%
  select(QuarantineNumber, SubjectID, HAIprior, MNprior, HAIandMNprior, EitherHAIorMNprior, Seroconverted_anyway)

# The manuscript text also asks about PCR evidence of infection among those with
# greater than antic. immunity, so count PCR-positive study days per subject.
# Steps: attach the full Qdata rows, restrict to study days 1 through 6, keep the
# first row per SubjectID and StudyDay, keep rows with 0 < InfA_Ct < 38, and
# count the distinct remaining study days for each subject.
Table1_footnote_PCR_Pos_SubjectIDs <- Table1_footnote_sero_anyway_SubjectID %>%
  left_join(Qdata) %>%
  select(QuarantineNumber, SubjectID,
         HAIprior, MNprior, HAIandMNprior, EitherHAIorMNprior, Seroconverted_anyway,
         StudyDay, InfA_Ct) %>%
  filter(StudyDay %in% 1:6) %>%
  distinct(SubjectID, StudyDay, .keep_all = TRUE) %>%
  filter(InfA_Ct > 0, InfA_Ct < 38) %>%
  group_by(SubjectID) %>%
  summarize(Number_StudyDays_PCR_Pos = n_distinct(StudyDay))
  
# Attach the PCR-positive day counts to the serology flags and add two empty
# (NA) indicator columns.
Table1_footnote_SubjectID_sero_PCRpos_days <- Table1_footnote_sero_anyway_SubjectID %>%
  left_join(Table1_footnote_PCR_Pos_SubjectIDs) %>%
  mutate(PCR_positive = NA,
         PCR_positive_and_seroconverted = NA)

# PCR_positive is 1 when a subject has 2 or more PCR-positive study days;
# PCR_positive_and_seroconverted is 1 when that holds and the subject also
# seroconverted. Every remaining NA in the table is then replaced by 0.
Table1_footnote_SubjectID_summary <- Table1_footnote_SubjectID_sero_PCRpos_days %>%
  mutate(PCR_positive = ifelse(Number_StudyDays_PCR_Pos >= 2, 1, NA),
         PCR_positive_and_seroconverted = ifelse(PCR_positive == 1 & Seroconverted_anyway == 1, 1, NA)) %>%
  replace(is.na(.), 0)
# This can be written out as part of SI but we will add some steps to clean up below
  
# Per-quarantine totals of each subject-level indicator column.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
Table1_footnote_summary <- Table1_footnote_SubjectID_summary %>%
  group_by(QuarantineNumber) %>%
  summarize(HAIprior = sum(HAIprior, na.rm = TRUE),
            MNprior = sum(MNprior, na.rm = TRUE),
            HAIandMNprior = sum(HAIandMNprior, na.rm = TRUE),
            EitherHAIorMNprior = sum(EitherHAIorMNprior, na.rm = TRUE),
            Seroconverted_anyway = sum(Seroconverted_anyway, na.rm = TRUE),
            PCR_positive = sum(PCR_positive, na.rm = TRUE),
            PCR_positive_and_seroconverted = sum(PCR_positive_and_seroconverted, na.rm = TRUE)) %>%
  ungroup()
# This can be written out as part of SI but we will add some steps to clean up below

# Prepare the subject-level footnote table for writing out: drop the two
# combined-indicator columns and give the rest reader-friendly names.
Table1_footnote_SubjectID_summary <- Table1_footnote_SubjectID_summary %>%
  select(-c(PCR_positive_and_seroconverted, EitherHAIorMNprior)) %>%
  rename(`Quarantine #` = QuarantineNumber,
         `Subject ID` = SubjectID,
         `Greater than Anticipated HAI` = HAIprior,
         `Greater than Anticipated MN` = MNprior,
         `Greater than Anticipated HAI and MN` = HAIandMNprior,
         `Seroconverted` = Seroconverted_anyway,
         `Days qPCR Positive` = Number_StudyDays_PCR_Pos,
         `Positive by qPCR` = PCR_positive)

# Before writing out the Table1_footnote_summary df, want to add row with totals, and make the column names better

# Column sums over all quarantines. QuarantineNumber is summed as well; that
# cell is overwritten with the label "Total" just below.
Table1_footnote_summary_sums <- colSums(Table1_footnote_summary)
Table1_footnote_summary <- rbind(Table1_footnote_summary, Table1_footnote_summary_sums)
# Label the appended (last) row "Total". The row is located with nrow() rather
# than a fixed index so this still works if the number of quarantines changes.
Table1_footnote_summary$QuarantineNumber[nrow(Table1_footnote_summary)] <- "Total"

# Give the per-quarantine footnote summary reader-friendly column names.
Table1_footnote_summary <- Table1_footnote_summary %>%
  rename(`Quarantine #` = QuarantineNumber,
         `Greater than Anticipated HAI` = HAIprior,
         `Greater than Anticipated MN` = MNprior,
         `Greater than Anticipated HAI and MN` = HAIandMNprior,
         `Greater than Anticipated HAI or MN` = EitherHAIorMNprior,
         `Seroconverted` = Seroconverted_anyway,
         `Positive by qPCR` = PCR_positive,
         `Seroconverted and Positive by qPCR` = PCR_positive_and_seroconverted)

#### Writing out Table 1 Footnote to box sync directory ####

# Four files: the full subject-level data, the original per-quarantine footnote
# summary, and the two reportable (renamed) tables.
write.csv(x = Qdata_table1_footnote_subjectIDs,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table1_Footnote_Full_Data.csv")
write.csv(x = table1_footnote,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table1_Footnote_Summary.csv")
write.csv(x = Table1_footnote_SubjectID_summary,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table1_Footnote_Reportable_Summary_SubjectIDs.csv")
write.csv(x = Table1_footnote_summary,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table1_Footnote_Reportable_Summary.csv")

#### Column means, sums, and rounding for Table 1 ####
table1_manuscript <- Qdata_table1 # use this as a backup because it is arduous to recreate Qdata_table1
table1_manuscript[is.na(table1_manuscript)] <- 0

# Append one row holding the column sums (the "Total" row).
# bind_rows() is used instead of a full_join() on every column: the join matched
# rows on all values (including floating-point fractions) and would silently
# drop the totals row if it happened to equal an existing row.
# `sum` is passed directly because funs() is deprecated in dplyr.
table1_manuscript_sums <- table1_manuscript %>%
  bind_rows(summarise_all(table1_manuscript, sum))
# This gets all of the columns to be summed
# Now let's work on changing the sums in all of the "percent" columns to the proper fraction
# (also perform the *100 transformation to percent)
# (also round everything to the nearest whole number)
table1_manuscript_sums_fractions <- table1_manuscript_sums %>%
  mutate(Fraction_Infected_of_Inoculated = (Number_Infected_Donors/Number_Inoculated_Donors)*100,
         Fraction_Symptomatic_V3_of_Infected = (Number_Symptomatic_V3/Number_Infected_Donors)*100,
         Fraction_ILI_V3_of_Infected = (Number_ILI_V3/Number_Infected_Donors)*100,
         Fraction_Febrile_Infected_of_Total_Infected = (Number_Febrile_Infected/Number_Infected_Donors)*100,
         Fraction_PCR_Infected_Donors_of_Infected = (Number_PCR_Infected_Donors/Number_Infected_Donors)*100,
         Fraction_Infected_by_PCR_and_Serology = (Number_Positive_PCR_and_Seroconversion/Number_Infected_Donors)*100) %>%
  mutate_all(round, digits = 0)

# Wrap each percent column in parentheses, since the manuscript table shows the
# percents as "(NN)" next to the counts. The columns become character.
table1_manuscript_sums_fractions <- table1_manuscript_sums_fractions %>%
  mutate_at(vars(Fraction_Infected_of_Inoculated,
                 Fraction_Symptomatic_V3_of_Infected,
                 Fraction_ILI_V3_of_Infected,
                 Fraction_Febrile_Infected_of_Total_Infected,
                 Fraction_PCR_Infected_Donors_of_Infected,
                 Fraction_Infected_by_PCR_and_Serology),
            function(percent) paste0("(", percent, ")"))

# Now bring columns together into more publishable arrangement of data in the display of the table
# For example, when we have Infected/Inoculated column, we want to take the data from the Infected column and the data from the Inoculated column, and merge them into a single column, separated by a "/"
table1_manuscript_unite <- table1_manuscript_sums_fractions %>%
  unite(`Infected/Inoculated`, Number_Infected_Donors, Number_Inoculated_Donors, sep = "/", remove = TRUE) %>%
  unite(`Infected/Inoculated (%)`, `Infected/Inoculated`, Fraction_Infected_of_Inoculated, sep = " ", remove = TRUE) %>%
  unite(Symptomatic, Number_Symptomatic_V3, Fraction_Symptomatic_V3_of_Infected, sep = " ", remove = TRUE) %>%
  unite(ILI, Number_ILI_V3, Fraction_ILI_V3_of_Infected, sep = " ", remove = TRUE) %>%
  unite(Febrile, Number_Febrile_Infected, Fraction_Febrile_Infected_of_Total_Infected, sep = " ", remove = TRUE) %>%
  unite(`PCR Confirmed Infection`, Number_PCR_Infected_Donors, Fraction_PCR_Infected_Donors_of_Infected, sep = " ", remove = TRUE) %>%
  unite(`PCR Confirmed Infection and Seroconversion`, Number_Positive_PCR_and_Seroconversion, Fraction_Infected_by_PCR_and_Serology, sep = " ", remove = TRUE) %>%
  unite(`Seroconversion by HAI : MN : Either`, Number_HAI_Positive, Number_Microneut_Positive, Positive_By_Either_HAI_or_MN, sep = " : ", remove = TRUE) %>%
  rename(`Quarantine #` = QuarantineNumber)
# Reorder the columns by position: the 4th and 5th columns are swapped.
table1_manuscript_unite <- table1_manuscript_unite[, c(1:3, 5, 4, 6:8)]

# Label the last row (the column sums) "Total" in the Quarantine # column.
# The row is located with nrow() rather than a fixed index of 4, so the label
# lands on the totals row whatever the number of quarantines.
table1_manuscript_unite$`Quarantine #`[nrow(table1_manuscript_unite)] <- "Total"

#### Writing out Table 1 to box sync directory ####

write.csv(x = table1_manuscript_unite,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table1_Manuscript.csv")

#### Writing out Table 1 to latex for direct translation of code to table image for paper

# Striped kable with grouped column headers; left as a bare top-level
# expression so that it is auto-printed.
table1_manuscript_unite %>%
  kable() %>%
  kable_styling("striped") %>%
  add_header_above(c(
    " " = 2,
    "Laboratory Confirmed Infection & Illness (% of Infected)" = 3,
    "Laboratory Confirmed Infection Criteria (% of Infected)" = 3
  ))

# Interactive table with per-column filters above the header.
datatable(
  table1_manuscript_unite,
  filter = "top",
  options = list(pageLength = 10, autoWidth = TRUE)
)

# LaTeX source for the table, without xtable's leading comment line.
print(
  xtable(table1_manuscript_unite),
  comment = FALSE
)

#### * TABLE 2 ---------------------------####
#### Overview of Table 2 for the quarantine main paper ####
# Table 2 is: Donor Exhaled Breath Viral RNA Detection and Copy Number Among Infected Donors by Quarantine Event and Aerosol Fraction
# Remember the denominators are the positive subjects or samples
# a) Number of infected subjects (by any criteria for infected) for each Q 
# b) Number of samples from infected subjects for each Q
# c) Number positive subjects (and %) for coarse aerosol (for each Q)
# d) Number positive samples (and %) for coarse aerosol (for each Q)
# e) Coarse mean copy number of positive samples (for each Q)
# f) Geometric Mean and standard deviation for coarse aerosol samples (Table 2 footnote)
# g) Number positive subjects (and %) for fine aerosol (for each Q)
# h) Number positive samples (and %) for fine aerosol (for each Q)
# i) Fine mean copy number of positive samples (for each Q)
# j) Geometric Mean and standard deviation for fine aerosol samples (Table 2 footnote)

#### Table 2: a) Number of infected subjects (by any criteria for infected) for each Q ####

# Donor rows that have a G2 PCR sample type recorded (a PCR assay that exists
# but was negative is coded as 0; NA means no assay was conducted), right-joined
# to the infected donors so every infected donor is kept, then counted as
# distinct subjects per quarantine.
Exhaled_breath_subjects <- Qdata %>%
  filter(Randomization_DorIRorCR == "D", !is.na(G2.pcr.copies.sample.type)) %>%
  right_join(Qdata_infected_donors) %>%
  group_by(QuarantineNumber) %>%
  summarize(G2_Subjects = n_distinct(SubjectID))
# We can check this by looking at the number of infected subjects reported in Table 1 (donors)
# We know that we got exhaled breath samples on all of the donors (based on execution of the protocol)

#### Table 2: b) number of samples from infected subjects for each Q ####

# One row per subject and study day is counted as one sample:
#  - only the "F/up" MN visit rows are used, because the three Microneut visit
#    types are repeated for each PCR day of data;
#  - only the coarse ("C") rows are used (it was already confirmed that there
#    was a C and an F for each SubjectID and study day).
Exhaled_breath_samples <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         !is.na(G2.pcr.copies.sample.type),
         Microneut_VisitType == "F/up",
         G2.pcr.copies.sample.type == "C") %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  right_join(Qdata_infected_donors) %>%
  group_by(QuarantineNumber) %>%
  summarize(G2_Samples = n())

#### Generation of Table 2 using information from (a) and (b), above ####

# Start Table 2 with the subject and sample counts per quarantine.
Qdata_table2 <- left_join(Exhaled_breath_subjects,
                          Exhaled_breath_samples,
                          by = "QuarantineNumber")

#### Table 2: c) Positive subjects (and %) for coarse aerosol (for each Q) ####

# Infected donors with at least one coarse ("C") row that has a non-missing
# final.copies.replicate, counted as distinct subjects per quarantine.
Coarse_Pos_Subj_table2 <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(G2.pcr.copies.sample.type == "C",
         !is.na(final.copies.replicate)) %>%
  group_by(QuarantineNumber) %>%
  summarize(Subjects_Coarse_Positive = n_distinct(SubjectID))

# Need to check to see if these positives also were serology positive or not?
# Could do this but don't need it necessarily for this analysis to replicate what is in the main quarantine paper.

# Add "Coarse_Pos_Subj_table2" to table 2, with the fraction of G2 subjects
Qdata_table2 <- Qdata_table2 %>%
  left_join(Coarse_Pos_Subj_table2) %>%
  mutate(Fraction_Subjects_Coarse_Positive_of_G2_Subjects = Subjects_Coarse_Positive / G2_Subjects)

#### Table 2: d) Positive samples (and % of samples from infected) for coarse aerosol (for each Q) ####

# Coarse ("C") rows of infected donors with a non-missing final.copies.replicate,
# reduced to one row per SubjectID and StudyDay and counted per quarantine.
# Only "F/up" MN visit rows are used, because the three Microneut visit types
# are repeated for each PCR day of data.
Coarse_Pos_Samples_table2 <- Qdata %>%
  right_join(Qdata_infected_donors) %>%
  filter(Randomization_DorIRorCR == "D",
         !is.na(G2.pcr.copies.sample.type),
         Microneut_VisitType == "F/up",
         G2.pcr.copies.sample.type == "C",
         !is.na(final.copies.replicate)) %>%
  distinct(SubjectID, StudyDay, .keep_all = TRUE) %>%
  group_by(QuarantineNumber) %>%
  summarize(Coarse_Samples_Positive = n())

# Add "Coarse_Pos_Samples_table2" to table 2, with the fraction of G2 samples
Qdata_table2 <- Qdata_table2 %>%
  left_join(Coarse_Pos_Samples_table2) %>%
  mutate(Fraction_Coarse_Samples_Positive_of_G2_Samples = Coarse_Samples_Positive / G2_Samples)

#### Table 2: e) Coarse Mean copy number of positive samples (for each Q) ####

# Working data frame for the copy-number statistics: donor rows with a coarse (C)
# or fine (F) G2 sample type, restricted to the "F/up" MN visit rows, and reduced
# to the identifying columns plus final.copies.replicate.
Aerosol_copies_df <- Qdata %>%
  filter(Randomization_DorIRorCR == "D" & 
           !is.na(G2.pcr.copies.sample.type) & 
           Microneut_VisitType == "F/up") %>% # because the three Microneut visit types are repeated for each PCR day of data, reduce to one set to avoid repeats
  filter(G2.pcr.copies.sample.type == "C" | G2.pcr.copies.sample.type == "F") %>% 
  select(SubjectID, Randomization_DorIRorCR, QuarantineNumber, StudyDay, G2.pcr.copies.sample.type, final.copies.replicate)

# Need to find the subject ID + study day instances where there was one positive replicate and one negative replicate (the negative replicate will be treated as non-detect and we will apply the LOQ*1/sqrt(2) to impute a value to be used for measures of center and spread; but when we get to modelling, this will not be done, and rather we will use tobit regression or some other method to account for censored data)

# Step 1: all rows with a non-missing copy number, de-duplicated with unique().
# The result stays grouped by SubjectID, StudyDay and sample type.
Aerosol_1_replicate_positive_raw_data_all_pos <- Aerosol_copies_df %>%
  filter(!is.na(final.copies.replicate)) %>%
  group_by(SubjectID, StudyDay, G2.pcr.copies.sample.type)
Aerosol_1_replicate_positive_raw_data_all_pos <- unique(Aerosol_1_replicate_positive_raw_data_all_pos)
# Step 2: keep the subject / study day / sample type combinations that have
# exactly one non-missing row, then join back to Aerosol_copies_df to bring in
# every row recorded for those combinations (missing ones included).
Aerosol_1_replicate_positive_count <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  group_by(SubjectID, StudyDay, G2.pcr.copies.sample.type) %>%
  count()  %>%
  filter(n == 1) %>%
  left_join(Aerosol_copies_df)
Aerosol_1_replicate_positive_count <- unique(Aerosol_1_replicate_positive_count)

# Step 3: for those combinations, the rows with a missing copy number get the
# imputed value 2000*1/sqrt(2).
# NOTE(review): 2000 is presumably the LOQ referred to in the comment above -- confirm.
Aerosol_1_replicate_negative_imputed <- Aerosol_1_replicate_positive_count %>%
  filter(is.na(final.copies.replicate)) %>%
  mutate(final.copies.replicate = 2000*1/sqrt(2)) %>%
  select(-n)

# Bind the Aerosol_1_replicate_negative_imputed df into the Aerosol_1_replicate_positive df
# (measured rows plus imputed rows, sorted by subject, study day, sample type and copy number)
Aerosols_both_replicates <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  filter(!is.na(final.copies.replicate)) %>%
  bind_rows(Aerosol_1_replicate_negative_imputed) %>%
  arrange(SubjectID, StudyDay, G2.pcr.copies.sample.type, final.copies.replicate)

# Arithmetic mean copy number of the coarse ("C") rows of infected donors,
# per quarantine, taken over the rows of Aerosols_both_replicates and then
# formatted as a scientific-notation string.
Coarse_Positive_Samples_Arith_Mean_table2 <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors) %>%
  filter(G2.pcr.copies.sample.type == "C") %>%
  group_by(QuarantineNumber) %>%
  summarize(Coarse_Positive_Samples_ArithMean = mean(final.copies.replicate)) %>%
  mutate(Coarse_Positive_Samples_ArithMean = format(Coarse_Positive_Samples_ArithMean, scientific = TRUE))

# Same mean taken over all quarantines together (3 significant digits); this is
# the value used for the "Total" row, since per-quarantine means cannot be summed.
Coarse_pos_arithmetic_mean_all_Q <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors) %>%
  filter(G2.pcr.copies.sample.type == "C") %>%
  ungroup() %>%
  summarize(Coarse_Positive_Samples_ArithMean_total = mean(final.copies.replicate)) %>%
  mutate(Coarse_Positive_Samples_ArithMean_total = format(Coarse_Positive_Samples_ArithMean_total,
                                                          scientific = TRUE, digits = 3))

# Add the per-quarantine coarse arithmetic means to Qdata_table2
Qdata_table2 <- left_join(Qdata_table2, Coarse_Positive_Samples_Arith_Mean_table2)

#### * Table 2: f) Geometric Mean, SD, and maximum for coarse aerosol samples (part 1/2 Table 2 footnote) ####

# Rows are first averaged within each SubjectID and StudyDay to give one value
# per sample; the geometric mean, geometric SD and maximum are then computed on
# the natural-log scale over all samples and transformed back with exp().
Coarse_Pos_Samples_Geom_Mean_table2 <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(G2.pcr.copies.sample.type == "C") %>%
  group_by(SubjectID, StudyDay) %>%
  mutate(average_number = mean(final.copies.replicate)) %>%
  distinct(G2.pcr.copies.sample.type, SubjectID, StudyDay, average_number) %>%
  mutate(ln.final.copies = log(average_number)) %>%
  ungroup() %>%
  summarize(Coarse_Positive_Samples_Geom_Mean = exp(mean(ln.final.copies)),
            Coarse_Positive_Samples_GSD = exp(sd(ln.final.copies)),
            Coarse_Positive_Samples_Max = exp(max(ln.final.copies)),
            Coarse_Positive_Samples_n = n()) %>%
  # the geometric mean and the maximum are reported in scientific notation
  mutate(Coarse_Positive_Samples_Geom_Mean = format(Coarse_Positive_Samples_Geom_Mean, scientific = TRUE),
         Coarse_Positive_Samples_Max = format(Coarse_Positive_Samples_Max, scientific = TRUE))
# Note: these values go in the footnote and thus we don't add the Geom mean and SD to the Table 2.

## Write out the footnote to the box.com directory

write.csv(x = Coarse_Pos_Samples_Geom_Mean_table2,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table2_Coarse_footnote.csv")

#### Table 2: g) Positive subjects (and %) for Fine aerosol (for each Q) ####

# Infected donors with at least one fine ("F") row that has a non-missing
# final.copies.replicate, counted as distinct subjects per quarantine.
Fine_Pos_Subj_table2 <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(G2.pcr.copies.sample.type == "F",
         !is.na(final.copies.replicate)) %>%
  group_by(QuarantineNumber) %>%
  summarize(Fine_Positive_Subjects = n_distinct(SubjectID))

# Need to check to see if these positives also were serology positive or not?
# Could do this but don't need it necessarily for this analysis to replicate what is in the main quarantine paper.

# Add "Fine_Pos_Subj_table2" to table 2, with the fraction of G2 subjects
Qdata_table2 <- Qdata_table2 %>%
  left_join(Fine_Pos_Subj_table2, by = "QuarantineNumber") %>%
  mutate(Fraction_Fine_Positive_Subjects_of_G2_Subjects = Fine_Positive_Subjects / G2_Subjects)

#### Table 2: h) Positive samples (and %) for Fine aerosol (for each Q) #### 

# Fine ("F") rows of infected donors with a non-missing final.copies.replicate,
# reduced to one row per SubjectID and StudyDay and counted per quarantine.
# Only "F/up" MN visit rows are used, because the three Microneut visit types
# are repeated for each PCR day of data.
Fine_Pos_Samples_table2 <- Qdata %>%
  right_join(Qdata_infected_donors) %>%
  filter(Randomization_DorIRorCR == "D",
         !is.na(G2.pcr.copies.sample.type),
         Microneut_VisitType == "F/up",
         G2.pcr.copies.sample.type == "F",
         !is.na(final.copies.replicate)) %>%
  distinct(SubjectID, StudyDay, .keep_all = TRUE) %>%
  group_by(QuarantineNumber) %>%
  summarize(Fine_Positive_Samples = n())

# Add "Fine_Pos_Samples_table2" to table 2, with the fraction of G2 samples
Qdata_table2 <- Qdata_table2 %>%
  left_join(Fine_Pos_Samples_table2) %>%
  mutate(Fraction_Fine_Samples_Positive_of_G2_Samples = Fine_Positive_Samples / G2_Samples)

#### Table 2: i) Fine mean copy number of positive samples (for each Q) ####

# Arithmetic mean copy number of the fine ("F") rows of infected donors,
# per quarantine, formatted as a scientific-notation string.
Fine_Pos_Samples_ArithMean_table2 <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors) %>%
  filter(G2.pcr.copies.sample.type == "F") %>%
  group_by(QuarantineNumber) %>%
  summarize(Fine_Positive_Samples_Arith_Mean = mean(final.copies.replicate)) %>%
  mutate(Fine_Positive_Samples_Arith_Mean = format(Fine_Positive_Samples_Arith_Mean, scientific = TRUE))

# Add the per-quarantine fine arithmetic means to Qdata_table2
Qdata_table2 <- left_join(Qdata_table2,
                          Fine_Pos_Samples_ArithMean_table2,
                          by = "QuarantineNumber")

# Same mean taken over all quarantines together (3 significant digits), used
# later for the "Total" row.
Fine_pos_arithmetic_mean_all_Q <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors) %>%
  filter(G2.pcr.copies.sample.type == "F") %>%
  ungroup() %>%
  summarize(Fine_Positive_Samples_ArithMean_total = mean(final.copies.replicate)) %>%
  mutate(Fine_Positive_Samples_ArithMean_total = format(Fine_Positive_Samples_ArithMean_total,
                                                        scientific = TRUE, digits = 3))

#### * Table 2: j) Geometric Mean, SD, and maximum for fine aerosol samples (part 2/2 Table 2 footnote) ####

# Rows are first averaged within each SubjectID and StudyDay to give one value
# per sample; the geometric mean, geometric SD and maximum are then computed on
# the natural-log scale over all samples and transformed back with exp().
Fine_Pos_Samples_GeomMean_table2 <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(G2.pcr.copies.sample.type == "F") %>%
  group_by(SubjectID, StudyDay) %>%
  mutate(average_number = mean(final.copies.replicate)) %>%
  distinct(G2.pcr.copies.sample.type, SubjectID, StudyDay, average_number) %>%
  mutate(ln.final.copies = log(average_number)) %>%
  ungroup() %>%
  summarize(Fine_Positive_Samples_GeomMean = exp(mean(ln.final.copies)),
            Fine_Positive_Samples_GSD = exp(sd(ln.final.copies)),
            Fine_Positive_Samples_Max = exp(max(ln.final.copies)),
            Fine_Positive_Samples_n = n()) %>%
  # the geometric mean and the maximum are reported in scientific notation
  mutate(Fine_Positive_Samples_GeomMean = format(Fine_Positive_Samples_GeomMean, scientific = TRUE),
         Fine_Positive_Samples_Max = format(Fine_Positive_Samples_Max, scientific = TRUE))
# Note: these values go in the footnote and thus we don't add the Geom mean and SD to the Table 2.

## Write out the footnote to the box.com directory

write.csv(x = Fine_Pos_Samples_GeomMean_table2,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table2_Fine_footnote.csv")

#### Column means, sums, and rounding for Table 2 ####

# First change the character vectors to numeric (the only character vectors were for the sci notation columns)
Qdata_table2[] <- lapply(Qdata_table2, function(x) {
  if (is.character(x)) as.numeric(x) else x
})

# First creating a new df because it is arduous to recreate Qdata_table2
Table2_manuscript <- Qdata_table2

# Append one row holding the column sums (the "Total" row).
# bind_rows() is used instead of a full_join() on every column: the join matched
# rows on all values (including floating-point means and fractions) and would
# silently drop the totals row if it happened to equal an existing row.
# `sum` is passed directly because funs() is deprecated in dplyr.
Table2_manuscript_sums <- Table2_manuscript %>%
  bind_rows(summarise_all(Table2_manuscript, sum))
# This gets all of the columns to be summed
# Now let's work on changing the sums in all of the "percent" columns to the proper fraction
# (also perform the *100 transformation to percent)
# (also round the percent columns to the nearest whole number)
Table2_manuscript_sums_fractions <- Table2_manuscript_sums %>%
  mutate(Fraction_Subjects_Coarse_Positive_of_G2_Subjects = (Subjects_Coarse_Positive/G2_Subjects)*100,
         Fraction_Coarse_Samples_Positive_of_G2_Samples = (Coarse_Samples_Positive/G2_Samples)*100,
         Fraction_Fine_Positive_Subjects_of_G2_Subjects = (Fine_Positive_Subjects/G2_Subjects)*100,
         Fraction_Fine_Samples_Positive_of_G2_Samples = (Fine_Positive_Samples/G2_Samples)*100) %>%
  mutate_at(vars(Fraction_Subjects_Coarse_Positive_of_G2_Subjects,
                 Fraction_Coarse_Samples_Positive_of_G2_Samples,
                 Fraction_Fine_Positive_Subjects_of_G2_Subjects,
                 Fraction_Fine_Samples_Positive_of_G2_Samples),
            round, digits = 0)

# Put the two mean copy-number columns back into scientific notation
# (exponent format, 2 digits after the decimal point), then wrap each percent
# column in parentheses because the manuscript table shows percents as "(NN)".
Table2_manuscript_sums_fractions <- Table2_manuscript_sums_fractions %>%
  mutate_at(vars(Coarse_Positive_Samples_ArithMean,
                 Fine_Positive_Samples_Arith_Mean),
            formatC, format = "e", digits = 2) %>%
  mutate_at(vars(Fraction_Subjects_Coarse_Positive_of_G2_Subjects,
                 Fraction_Coarse_Samples_Positive_of_G2_Samples,
                 Fraction_Fine_Positive_Subjects_of_G2_Subjects,
                 Fraction_Fine_Samples_Positive_of_G2_Samples),
            function(percent) paste0("(", percent, ")"))

# Merge each count with its percent into a single "N (%)" display column and
# give the remaining columns their manuscript names.
Table2_manuscript_unite <- Table2_manuscript_sums_fractions %>%
  unite(`Coarse Positive Subjects (%)`, Subjects_Coarse_Positive, Fraction_Subjects_Coarse_Positive_of_G2_Subjects, sep = " ", remove = TRUE) %>%
  unite(`Coarse Positive Samples (%)`, Coarse_Samples_Positive, Fraction_Coarse_Samples_Positive_of_G2_Samples, sep = " ", remove = TRUE) %>%
  unite(`Fine Positive Subjects (%)`, Fine_Positive_Subjects, Fraction_Fine_Positive_Subjects_of_G2_Subjects, sep = " ", remove = TRUE) %>%
  unite(`Fine Positive Samples (%)`, Fine_Positive_Samples, Fraction_Fine_Samples_Positive_of_G2_Samples, sep = " ", remove = TRUE) %>%
  rename(`Quarantine #` = QuarantineNumber,
         `Mean of Positive Coarse Samples*` = Coarse_Positive_Samples_ArithMean,
         `Mean of Positive Fine Samples*` = Fine_Positive_Samples_Arith_Mean,
         `N Subjects` = G2_Subjects,
         `N Samples` = G2_Samples)

# Label the last row (the column sums) "Total" in the Quarantine # column.
# The row is located with nrow() rather than a fixed index of 4.
Table2_manuscript_unite$`Quarantine #`[nrow(Table2_manuscript_unite)] <- "Total"


## Adjust the arithmetic mean total because we can't compute the total arithmetic means by taking the sum of the arithmetic means for each quarantine.
# The overall means are stored in one-row, one-column data frames, so the single
# value is extracted before assignment. Assigning the data frame itself turned
# the whole column into a list.

Table2_manuscript_unite$`Mean of Positive Coarse Samples*`[nrow(Table2_manuscript_unite)] <-
  Coarse_pos_arithmetic_mean_all_Q$Coarse_Positive_Samples_ArithMean_total

Table2_manuscript_unite$`Mean of Positive Fine Samples*`[nrow(Table2_manuscript_unite)] <-
  Fine_pos_arithmetic_mean_all_Q$Fine_Positive_Samples_ArithMean_total


#### Writing out Table 2 to box sync directory ####

# There are issues with kable not using the scientific notation from the table2_manuscript df
# To fix this, the two variables that use sci notation are converted to character
Table2_manuscript_unite <- Table2_manuscript_unite %>%
  mutate_at(vars(`Mean of Positive Coarse Samples*`,
                 `Mean of Positive Fine Samples*`),
            as.character)

write.csv(x = Table2_manuscript_unite,
          file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table2_Manuscript.csv")

#### Writing out Table 2 to latex for direct translation of code to table image for paper ####

# Striped kable with grouped headers for the coarse and fine aerosol columns;
# left as a bare top-level expression so that it is auto-printed.
Table2_manuscript_unite %>%
  kable() %>%
  kable_styling("striped") %>%
  add_header_above(c(
    " " = 3,
    "Coarse Aerosol (>5um)" = 3,
    "Fine Aerosol (≤5um)" = 3
  ))

# Interactive table with per-column filters above the header.
datatable(
  Table2_manuscript_unite,
  filter = "top",
  options = list(pageLength = 10, autoWidth = TRUE)
)

# LaTeX source for the table, without xtable's leading comment line.
print(
  xtable(Table2_manuscript_unite),
  comment = FALSE
)

#### Findings for results in text reporting in paper ####

# For each Q, how many days did each volunteer provide breath samples?
# Count the distinct study days with a G2 sample per donor, then tabulate per
# quarantine how many donors gave samples on 1, 2, 3 or 4 days.
# The quarantine column is converted to character up front so that the "Total"
# label can share the column; assigning a string into a numeric tibble column
# is an error in current tibble versions.
Exhaled_breath_samples_by_subject_by_Q <- Qdata %>%
  filter(Randomization_DorIRorCR == "D" & !is.na(G2.pcr.copies.sample.type)) %>% #PCR assay that exists but was negative coded as 0, NA means no assay conducted
  group_by(QuarantineNumber, SubjectID) %>%
  summarize(Person_Days_Breath_Samples = n_distinct(StudyDay)) %>%
  group_by(QuarantineNumber) %>%
  summarize(`1 Day` = sum(Person_Days_Breath_Samples == 1),
            `2 Days` = sum(Person_Days_Breath_Samples == 2),
            `3 Days` = sum(Person_Days_Breath_Samples == 3),
            `4 Days` = sum(Person_Days_Breath_Samples == 4)) %>%
  rename(`Quarantine #` = QuarantineNumber) %>%
  mutate(`Quarantine #` = as.character(`Quarantine #`))

# Append a "Total" row with the column sums. The row is appended with
# bind_rows() rather than written to a fixed row index of 4, and `sum` is passed
# directly because funs() is deprecated in dplyr.
Exhaled_breath_samples_by_subject_by_Q <- bind_rows(
  Exhaled_breath_samples_by_subject_by_Q,
  Exhaled_breath_samples_by_subject_by_Q %>%
    summarise_at(vars(`1 Day`:`4 Days`), sum) %>%
    mutate(`Quarantine #` = "Total")
)

# write out to the in text results
write_csv(Exhaled_breath_samples_by_subject_by_Q, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/G2_visits_by_person_Q.csv")

# How many subjects gave G2 samples on each of the study days 1-4?
# Result: one row per study day with the number of distinct donors sampled that
# day, plus a final "Total" row. Note that the Total is the sum of the per-day
# counts (a donor sampled on several days is counted once per day).
Exhaled_breath_subjects_total52 <- Qdata %>%
  filter(Randomization_DorIRorCR == "D" & !is.na(G2.pcr.copies.sample.type)) %>% #PCR assay that exists but was negative coded as 0, NA means no assay conducted
  group_by(StudyDay) %>%
  summarize(`Volunteers Who Gave a G2 Sample` = n_distinct(SubjectID)) %>%
  # The label column has to be character so that it can also hold "Total";
  # writing a string into a numeric tibble column is an error in current tibble.
  mutate(StudyDay = as.character(StudyDay)) %>%
  rename(`Study Day` = StudyDay)

# Append the Total row by binding a one-row summary, instead of assigning to a
# hard-coded row index (which silently assumed exactly four study days).
Exhaled_breath_subjects_total52 <- bind_rows(
  Exhaled_breath_subjects_total52,
  Exhaled_breath_subjects_total52 %>%
    summarise_at(vars(`Volunteers Who Gave a G2 Sample`), sum) %>%
    mutate(`Study Day` = "Total")
)

# write out to the in text results
write_csv(Exhaled_breath_subjects_total52, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/G2_visits_by_studyday.csv")

# How many person-days of G2 sampling did we get for each Q?
# Result: one row per quarantine with the number of distinct (SubjectID,
# StudyDay) combinations that have a G2 sample, plus a final "Total" row.
Exhaled_breath_person_days_by_Q <- Qdata %>%
  filter(Randomization_DorIRorCR == "D" & !is.na(G2.pcr.copies.sample.type)) %>% #PCR assay that exists but was negative coded as 0, NA means no assay conducted
  group_by(QuarantineNumber) %>%
  summarize(`G2 Person-Day Samples` = n_distinct(SubjectID, StudyDay)) %>%
  # The label column has to be character so that it can also hold "Total";
  # writing a string into a numeric tibble column is an error in current tibble.
  mutate(QuarantineNumber = as.character(QuarantineNumber)) %>%
  rename(`Quarantine #` = QuarantineNumber)

# Append the Total row by binding a one-row summary, instead of assigning to a
# hard-coded row index (which silently assumed exactly three quarantines).
Exhaled_breath_person_days_by_Q <- bind_rows(
  Exhaled_breath_person_days_by_Q,
  Exhaled_breath_person_days_by_Q %>%
    summarise_at(vars(`G2 Person-Day Samples`), sum) %>%
    mutate(`Quarantine #` = "Total")
)

# write out to the in text results
write_csv(Exhaled_breath_person_days_by_Q, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Exhaled_breath_person_days_by_Q.csv")

# Which study days had positive coarse, fine, both, and either coarse or fine?
# Each table below keeps one row per SubjectID and StudyDay among the infected
# donors, restricted to one aerosol fraction ("C" or "F") and to records whose
# final.copies.replicate is not missing.

# Coarse fraction
Coarse_Pos_Subject_Days <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(G2.pcr.copies.sample.type == "C", !is.na(final.copies.replicate)) %>%
  select(QuarantineNumber, SubjectID, StudyDay, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  distinct(SubjectID, StudyDay, .keep_all = TRUE)

# Fine fraction
Fine_Pos_Subject_Days <- Qdata_infected_donors %>%
  left_join(Qdata) %>%
  filter(G2.pcr.copies.sample.type == "F", !is.na(final.copies.replicate)) %>%
  select(QuarantineNumber, SubjectID, StudyDay, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  distinct(SubjectID, StudyDay, .keep_all = TRUE)

# Stack the coarse and fine subject-days, spread the fraction into two columns
# (C and F, holding the copy number or NA), then add indicator columns that are
# 1 when the stated condition holds and NA otherwise.
Coarse_Fine_Pos_Subject_Days <- Coarse_Pos_Subject_Days %>%
  full_join(Fine_Pos_Subject_Days) %>%
  arrange(SubjectID, StudyDay, G2.pcr.copies.sample.type) %>%
  spread(key = G2.pcr.copies.sample.type, value = final.copies.replicate) %>%
  mutate(
    Fine_Pos = ifelse(!is.na(F), 1, NA),
    Coarse_Pos = ifelse(!is.na(C), 1, NA),
    Fine_and_Coarse_Pos = ifelse(!is.na(F) & !is.na(C), 1, NA),
    Fine_or_Coarse_Pos = ifelse(!is.na(F) | !is.na(C), 1, NA),
    Fine_Pos_Only = ifelse(!is.na(F) & is.na(C), 1, NA),
    Coarse_Pos_Only = ifelse(is.na(F) & !is.na(C), 1, NA)
  )

# Sum each indicator within SubjectID and StudyDay (NA stays NA because the
# sums are taken without na.rm).
Coarse_Fine_Pos_Subject_Days_Summary <- Coarse_Fine_Pos_Subject_Days %>%
  group_by(SubjectID, StudyDay) %>%
  summarise_at(
    vars(Fine_Pos, Coarse_Pos, Fine_and_Coarse_Pos,
         Fine_or_Coarse_Pos, Fine_Pos_Only, Coarse_Pos_Only),
    sum
  ) %>%
  ungroup()
# Author's note: this step may be worth revising. Each SubjectID appears at most once per StudyDay, so every sum is 1 whenever it exists,
# and this summary is therefore no different from selecting the same columns of Coarse_Fine_Pos_Subject_Days.

# Subject-day counts over the whole summary table (NA indicators are ignored).

# Fine positive, coarse not positive
Fine_Pos_Only_Instances <- summarize(
  Coarse_Fine_Pos_Subject_Days_Summary,
  Fine_Pos_Only_Days = sum(Fine_Pos_Only, na.rm = TRUE)
)

# Coarse positive, fine not positive
Coarse_Pos_Only_Instances <- summarize(
  Coarse_Fine_Pos_Subject_Days_Summary,
  Coarse_Pos_Only_Days = sum(Coarse_Pos_Only, na.rm = TRUE)
)

# Either fraction positive
Fine_or_Coarse_Pos_Days <- summarize(
  Coarse_Fine_Pos_Subject_Days_Summary,
  Fine_or_Coarse_Pos = sum(Fine_or_Coarse_Pos, na.rm = TRUE)
)

# Both fractions positive
Fine_and_Coarse_Pos_Days <- summarize(
  Coarse_Fine_Pos_Subject_Days_Summary,
  Fine_and_Coarse_Pos = sum(Fine_and_Coarse_Pos, na.rm = TRUE)
)

# How many donors were positive of the 42 infected?
# Counts the distinct SubjectIDs present in the subject-day summary and reports
# that count as a percentage of 42.
# NOTE(review): the denominator 42 is hard-coded; confirm it still equals the
# number of infected donors whenever the dataset changes.
Fine_or_Coarse_Pos_Subjects <- Coarse_Fine_Pos_Subject_Days_Summary %>%
  summarize(Fine_or_Coarse_Pos_Subjects = n_distinct(SubjectID)) %>%
  mutate(Percent_Positive_Subjects_of_Infected = (Fine_or_Coarse_Pos_Subjects/42)*100)


####  What are the copy numbers of fine and coarse over the 4 study days where aerosols were collected?  ####

# We limit this to infected donors, but note that subject 109 had a positive coarse aerosol sample on a single study day although they were not classified as infected. 
# In looking at 109's NPS records, we see that they were never positive by PCR on any study day (1-6).

# First cut the right df
# Table_2_Copies holds, for the infected donors, the mean copy number per
# SubjectID, StudyDay and aerosol fraction (G2.pcr.copies.sample.type), keeping
# only records with a non-missing final.copies.replicate.
Table_2_Copies <- Aerosols_both_replicates %>%
  right_join(Qdata_infected_donors) %>% 
  filter(!is.na(final.copies.replicate)) %>%
  select(SubjectID, QuarantineNumber, StudyDay, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  # Average within each aerosol fraction. Grouping on SubjectID and StudyDay
  # alone pooled coarse and fine copies from the same subject-day into a single
  # mean, which then appeared under both fractions in the boxplots below.
  group_by(SubjectID, StudyDay, G2.pcr.copies.sample.type) %>%
  mutate(final.copies.replicate.avg = mean(final.copies.replicate)) %>%
  select(-final.copies.replicate) %>%
  # Collapse the now-identical replicate rows to one row per group
  distinct() %>%
  # Keep the grouping the object has always carried (SubjectID, StudyDay) so
  # that any later code relying on it sees the same structure.
  group_by(SubjectID, StudyDay)
  
# Let's write this out because we need this df to produce the boxplots (see below) for the RMarkdown. 
write.csv(Table_2_Copies, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Donor_Aerosol_Copies.csv")

# Start by looking at each quarantine separately (small sample size
# notwithstanding) to check whether the quarantines show similar trends;
# the final plot will likely pool all quarantines.

# Number of G2 sample rows per quarantine after joining to the infected donors.
# - Only the "F/up" microneut visit rows are kept, because the three microneut
#   visit types are repeated for every PCR day and would otherwise duplicate rows.
# - Only the coarse ("C") rows are kept; it was already confirmed that every
#   SubjectID and study day has both a C and an F record.
Exhaled_breath_samples <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "D",
    !is.na(G2.pcr.copies.sample.type),
    Microneut_VisitType == "F/up",
    G2.pcr.copies.sample.type == "C"
  ) %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  right_join(Qdata_infected_donors) %>%
  group_by(QuarantineNumber) %>%
  summarize(G2_Samples = n())

# plotting
# To be developed further if time allows; everything critical to the manuscript submission comes first.

## Boxplot of the fine and coarse aerosol samples
# There were 14 positive fine and 6 positive coarse samples
f <- ggplot(
  data = Table_2_Copies,
  mapping = aes(x = G2.pcr.copies.sample.type, y = final.copies.replicate.avg)
)
f + geom_boxplot()

# A log10 y axis may read better, so add a log10-transformed copy of the
# averaged copy number and plot that instead.
Table_2_Copies_log <- mutate(
  Table_2_Copies,
  log10.copies = log10(final.copies.replicate.avg)
)

f <- ggplot(
  data = Table_2_Copies_log,
  mapping = aes(x = G2.pcr.copies.sample.type, y = log10.copies)
)
f +
  geom_boxplot() +
  theme_bw() +
  labs(x = "Aerosol Fraction", y = "Log10 RNA Copies")

#### * TABLE 3 ---------------------------####
#### Overview of Table 3 in the paper ####
# Table 3 is "Recipient status". It gives for each of the 3 quarantines and for IR and CR:
# a) Number of infected/ number of exposed (and %)
# b) Number of symptomatic (and % of exposed)
# c) Number of symptomatic, non-ILI (and % of exposed)
# d) Number of ILI (and % of exposed)
# e) Number of febrile (and % of exposed)
# f) Number of PCR confirmed infection (and % of exposed)
# g) Number of PCR confirmed infection and seroconversion (and % of exposed)
# h) Number of seroconversion by HAI: MN: Either (and % of exposed)

#### Table 3: a1) IR: Number of infected/ number of exposed (and %) ####

# number of exposed IR
# Exposed_IR keeps the first Qdata row of every IR subject; the table then
# counts the distinct subjects in each quarantine.

Exposed_IR <- distinct(
  filter(Qdata, Randomization_DorIRorCR == "IR"),
  SubjectID,
  .keep_all = TRUE
)
Exposed_IR_table3 <- summarize(
  group_by(Exposed_IR, QuarantineNumber),
  NumberExposedIR = n_distinct(SubjectID)
)
print(Exposed_IR_table3)

# number of infected IR

# Infection criteria: seroconversion, or PCR positive on more than 1 day.
# Start with every IR subject having at least one PCR-positive day; this list is
# merged with the seroconversion lists further down.
# A record counts as PCR positive when InfA_Ct is present, below 38 and not 0.
# Result: one row per subject with the number of distinct positive study days.
Qdata_pcr_pos1_or_more_days_IR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "IR",
    !is.na(InfA_Ct),
    InfA_Ct < 38,
    InfA_Ct != 0
  ) %>%
  group_by(SubjectID) %>%
  summarize(NumberDaysPosPCR_IR = n_distinct(StudyDay))
print(Qdata_pcr_pos1_or_more_days_IR)

# let's get the list with seroconversion by Microneuts (CDC serology)

# First only select the subjectIDs that were serosusceptible by MN at baseline (<80 at baseline)
# Upon the October 12, 2018 conference call with the team, decided to not exclude based on this criteria
#Qdata_Microneut_susceptible <- Qdata %>%
  #filter(Randomization_DorIRorCR == "IR" & Microneut_VisitType == "Q baseline" & Microneutralization.Titer.to.A.Wisconsin.67.2005 < 80) %>%
  #distinct(SubjectID, .keep_all = FALSE)

# IR subjects flagged as microneutralization seroconverters
# (Microneut_Seroconvert == 1 on the "F/up" visit rows), one row per subject.
Qdata_Microneut_pos_IR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "IR",
    Microneut_VisitType == "F/up",
    Microneut_Seroconvert == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_Microneut_pos_IR)

# let's get the list with seroconversion by HAI (Glasgow serology)

# First only select the subjectIDs that were serosusceptible by HAI at baseline (<=10 at baseline)
# Following the October 12, 2018 conference call with the team, decided to not exclude based on this criteria
#Qdata_HAI_susceptible <- Qdata %>%
  #filter(Randomization_DorIRorCR == "IR" & HAI_dayminus2_recodeNDA <= 10) %>%
  #distinct(SubjectID, .keep_all = FALSE)

# IR subjects flagged as HAI seroconverters (HAI_Seroconversion == 1),
# one row per subject.
Qdata_HAI_pos_IR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "IR",
    HAI_Seroconversion == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_HAI_pos_IR)

# Combine the HAI, microneutralization and PCR lists into one list of IR
# volunteers meeting any positivity criterion. After the joins,
# QuarantineNumber.x comes from the HAI list and QuarantineNumber.y from the
# microneutralization list; each is NA when the subject is not on that list.
Qdata_infected_IR <- arrange(
  full_join(
    full_join(Qdata_HAI_pos_IR, Qdata_Microneut_pos_IR, by = "SubjectID"),
    Qdata_pcr_pos1_or_more_days_IR,
    by = "SubjectID"
  ),
  SubjectID
)
print(Qdata_infected_IR)

# Subjects PCR positive on exactly one day who appear on neither seroconversion
# list, i.e. whose infection was not confirmed by serology
Qdata_1pcrpos_nosero_IR <- Qdata_infected_IR %>%
  filter(
    NumberDaysPosPCR_IR == 1,
    is.na(QuarantineNumber.x),
    is.na(QuarantineNumber.y)
  )

# For the subjects with a single PCR-positive day: which study day was it?
# First list every PCR-positive SubjectID / StudyDay pair among the IR (same
# positivity rule as above: InfA_Ct present, below 38 and not 0) ...
Qdata_pcr_pos1_or_more_days_studydays_IR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "IR",
    !is.na(InfA_Ct),
    InfA_Ct < 38,
    InfA_Ct != 0
  ) %>%
  group_by(SubjectID, StudyDay) %>%
  summarize(count = n())
# ... then attach the study day to the single-day subjects; the helper count
# column is dropped again.
Qdata_pcr_pos1_day_IR <- Qdata_infected_IR %>%
  filter(NumberDaysPosPCR_IR == 1) %>%
  left_join(Qdata_pcr_pos1_or_more_days_studydays_IR, by = "SubjectID") %>%
  select(-count)

# Protocol criteria for positivity: drop the subjects who were PCR positive on
# one day only and did not seroconvert.
Qdata_infected_IR <- anti_join(
  Qdata_infected_IR,
  Qdata_1pcrpos_nosero_IR,
  by = "SubjectID"
)

# Number of infected IR (by any criterion) in each quarantine.
# The merged list only carries quarantine numbers for the seroconverters, so a
# complete SubjectID -> QuarantineNumber lookup is attached first.
Qdata_QuarantineNumbers <- Qdata %>%
  select(SubjectID, QuarantineNumber) %>%
  filter(!is.na(SubjectID)) %>%
  distinct(SubjectID, .keep_all = TRUE)
Qdata_infected_IR <- left_join(
  Qdata_infected_IR,
  Qdata_QuarantineNumbers,
  by = "SubjectID"
)
Qdata_infected_IR_table3 <- summarize(
  group_by(Qdata_infected_IR, QuarantineNumber),
  NumberInfectedIR = n_distinct(SubjectID)
)
print(Qdata_infected_IR_table3)

#### Generation of Table3_IR for paper ####

# Summary table for the IR: number exposed, number infected, and the fraction
# infected of exposed, one row per quarantine.
# A quarantine with no infected IR has no row in Qdata_infected_IR_table3, so
# the left join yields NA for it; that NA is a true count of zero and is
# recorded as 0 so that both the count and the fraction are defined.
Qdata_table3_IR <- Exposed_IR_table3  %>%
  left_join(Qdata_infected_IR_table3, by = "QuarantineNumber") %>%
  mutate(NumberInfectedIR = ifelse(is.na(NumberInfectedIR), 0L, NumberInfectedIR),
         Fraction_Inf_over_ExpIR = NumberInfectedIR/NumberExposedIR)
print(Qdata_table3_IR)

#### Table 3: a2) CR: Number of infected/ number of exposed (and %) ####

# number of exposed CR
# Exposed_CR keeps the first Qdata row of every CR subject; the table then
# counts the distinct subjects in each quarantine.

Exposed_CR <- distinct(
  filter(Qdata, Randomization_DorIRorCR == "CR"),
  SubjectID,
  .keep_all = TRUE
)
Exposed_CR_table3 <- summarize(
  group_by(Exposed_CR, QuarantineNumber),
  NumberExposedCR = n_distinct(SubjectID)
)
print(Exposed_CR_table3)

# number of infected CR

# Infection criteria: seroconversion, or PCR positive on more than 1 day.
# Start with every CR subject having at least one PCR-positive day; this list is
# merged with the seroconversion lists further down.
# A record counts as PCR positive when InfA_Ct is present, below 38 and not 0.
# Result: one row per subject with the number of distinct positive study days.
Qdata_pcr_pos1_or_more_days_CR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    !is.na(InfA_Ct),
    InfA_Ct < 38,
    InfA_Ct != 0
  ) %>%
  group_by(SubjectID) %>%
  summarize(NumberDaysPosPCR_CR = n_distinct(StudyDay))
print(Qdata_pcr_pos1_or_more_days_CR)

# let's get the list with seroconversion by Microneuts (CDC serology)

# First only select the subjectIDs that were serosusceptible by MN at baseline (<80 at baseline)
# Upon the October 12, 2018 conference call with the team, decided to not exclude based on this criteria
#Qdata_Microneut_susceptible <- Qdata %>%
  #filter(Randomization_DorIRorCR == "CR" & Microneut_VisitType == "Q baseline" & Microneutralization.Titer.to.A.Wisconsin.67.2005 < 80) %>%
  #distinct(SubjectID, .keep_all = FALSE)

# CR subjects flagged as microneutralization seroconverters
# (Microneut_Seroconvert == 1 on the "F/up" visit rows), one row per subject.
Qdata_Microneut_pos_CR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    Microneut_VisitType == "F/up",
    Microneut_Seroconvert == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_Microneut_pos_CR)

# let's get the list with seroconversion by HAI (Glasgow serology)

# First only select the subjectIDs that were serosusceptible by HAI at baseline (<=10 at baseline)
# Upon the October 12, 2018 conference call with the team, decided to not exclude based on this criteria
#Qdata_HAI_susceptible <- Qdata %>%
  #filter(Randomization_DorIRorCR == "CR" & HAI_dayminus2_recodeNDA <= 10) %>%
  #distinct(SubjectID, .keep_all = FALSE)

# CR subjects flagged as HAI seroconverters (HAI_Seroconversion == 1),
# one row per subject.
Qdata_HAI_pos_CR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    HAI_Seroconversion == 1
  ) %>%
  select(SubjectID, QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE)
print(Qdata_HAI_pos_CR)

# Combine the HAI, microneutralization and PCR lists into one list of CR
# volunteers meeting any positivity criterion. After the joins,
# QuarantineNumber.x comes from the HAI list and QuarantineNumber.y from the
# microneutralization list; each is NA when the subject is not on that list.
Qdata_infected_CR <- arrange(
  full_join(
    full_join(Qdata_HAI_pos_CR, Qdata_Microneut_pos_CR, by = "SubjectID"),
    Qdata_pcr_pos1_or_more_days_CR,
    by = "SubjectID"
  ),
  SubjectID
)
print(Qdata_infected_CR)

# Subjects PCR positive on exactly one day who appear on neither seroconversion
# list, i.e. whose infection was not confirmed by serology
Qdata_1pcrpos_nosero_CR <- Qdata_infected_CR %>%
  filter(
    NumberDaysPosPCR_CR == 1,
    is.na(QuarantineNumber.x),
    is.na(QuarantineNumber.y)
  )

# For the subjects with a single PCR-positive day: which study day was it?
# First list every PCR-positive SubjectID / StudyDay pair among the CR (same
# positivity rule as above: InfA_Ct present, below 38 and not 0) ...
Qdata_pcr_pos1_or_more_days_studydays_CR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    !is.na(InfA_Ct),
    InfA_Ct < 38,
    InfA_Ct != 0
  ) %>%
  group_by(SubjectID, StudyDay) %>%
  summarize(count = n())
# ... then attach the study day to the single-day subjects; the helper count
# column is dropped again.
Qdata_pcr_pos1_day_CR <- Qdata_infected_CR %>%
  filter(NumberDaysPosPCR_CR == 1) %>%
  left_join(Qdata_pcr_pos1_or_more_days_studydays_CR, by = "SubjectID") %>%
  select(-count)

# Protocol criteria for positivity: drop the subjects who were PCR positive on
# one day only and did not seroconvert.
Qdata_infected_CR <- anti_join(
  Qdata_infected_CR,
  Qdata_1pcrpos_nosero_CR,
  by = "SubjectID"
)

# Number of infected CR (by any criterion) in each quarantine.
# The merged list only carries quarantine numbers for the seroconverters, so a
# complete SubjectID -> QuarantineNumber lookup is attached first.
Qdata_QuarantineNumbers <- Qdata %>%
  select(SubjectID, QuarantineNumber) %>%
  filter(!is.na(SubjectID)) %>%
  distinct(SubjectID, .keep_all = TRUE)
Qdata_infected_CR <- left_join(
  Qdata_infected_CR,
  Qdata_QuarantineNumbers,
  by = "SubjectID"
)
Qdata_infected_CR_table3 <- summarize(
  group_by(Qdata_infected_CR, QuarantineNumber),
  NumberInfectedCR = n_distinct(SubjectID)
)
print(Qdata_infected_CR_table3)

#### Generation of Table3_CR for paper ####

# Summary table for the CR: number exposed, number infected, and the fraction
# infected of exposed, one row per quarantine.
# A quarantine with no infected CR has no row in Qdata_infected_CR_table3, so
# the left join yields NA for it; that NA is a true count of zero and is
# recorded as 0 so that both the count and the fraction are defined.
Qdata_table3_CR <- Exposed_CR_table3 %>% #using the df just created, above
  left_join(Qdata_infected_CR_table3, by = "QuarantineNumber") %>%
  mutate(NumberInfectedCR = ifelse(is.na(NumberInfectedCR), 0L, NumberInfectedCR),
         Fraction_Inf_over_ExpCR = NumberInfectedCR/NumberExposedCR)
print(Qdata_table3_CR)

#### Table 3: b1) IR: Number of symptomatic (and % of exposed) ####

## Implementing Version 2 of "Symptomatic afebrile" that we used in Table 1 (see above)

# “Symptomatic_V2_Afebrile”: “Evidence of at least 2 symptoms of any grade that do not necessarily need to persist for consecutive study days, nor persist for the same consecutive study days, but where each of the symptoms appeared on at least two different study days.”

# Going to implement "symptomatic" for afebrile to make a well-defined milder criteria for "symptomatic afebrile"

# First need to manipulate the dataset to prepare for the loop logic that was created to do this analysis
# Combine symptom severity measures (grades 1, 2, and 3) because grade >1 doesn't matter for this definition of symptomatic afebrile
# Symptomatic_IR_exposed_grade123 <- Exposed_IR %>%
#   select(SubjectID, QuarantineNumber) %>%
#   left_join(Qdata, by = c("SubjectID" = "SubjectID", "QuarantineNumber" = "QuarantineNumber")) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | 
#            StudyDay == 3 | StudyDay == 4 | StudyDay == 5 | StudyDay == 6 | StudyDay == 7 | StudyDay == 8 |
#            StudyDay == 9 | StudyDay == 10) %>%
#   mutate(URI = runnyNose+stuffyNose+sneezing+soreThroat+DPENasalDischarge+DPEOtits+DPESinusTenderness+DPEPharyngitis, 
#          LRI = cough+SOB, 
#          SystemicI = headache+muscleAches+malaise) %>%
#   mutate(Febrile = Tympanic.temp..degrees.C.>37.9) %>%
#   mutate(runnyNose123 = runnyNose==1 | runnyNose==2 | runnyNose==3, 
#          stuffyNose123 = stuffyNose==1 | stuffyNose==2 | stuffyNose==3, 
#          sneezing123 = sneezing==1 | sneezing==2 | sneezing==3, 
#          soreThroat123 = soreThroat==1 | soreThroat==2 | soreThroat==3,
#          DPENasalDischarge123 = DPENasalDischarge==1 | DPENasalDischarge==2 | DPENasalDischarge==3, 
#          DPEOtits123 = DPEOtits==1 | DPEOtits==2 | DPEOtits==3, 
#          DPESinusTenderness123 = DPESinusTenderness==1 | DPESinusTenderness==2 | DPESinusTenderness==3, 
#          DPEPharyngitis123 = DPEPharyngitis==1 | DPEPharyngitis==2 | DPEPharyngitis==3,
#          cough123 = cough==1 | cough==2 | cough==3, 
#          SOB123 = SOB==1 | SOB==2 | SOB==3,
#          headache123 = headache==1 | headache==2 | headache==3, 
#          muscleAches123 = muscleAches==1 | muscleAches==2 | muscleAches==3, 
#          malaise123 = malaise==1 | malaise==2 | malaise==3) %>%
#   mutate(Febrile = as.numeric(Febrile),
#          runnyNose123 = as.numeric(runnyNose123), 
#          stuffyNose123 = as.numeric(stuffyNose123), 
#          sneezing123 = as.numeric(sneezing123), 
#          soreThroat123 = as.numeric(soreThroat123),
#          DPENasalDischarge123 = as.numeric(DPENasalDischarge123), 
#          DPEOtits123 = as.numeric(DPEOtits123), 
#          DPESinusTenderness123 = as.numeric(DPESinusTenderness123), 
#          DPEPharyngitis123 = as.numeric(DPEPharyngitis123),
#          cough123 = as.numeric(cough123), 
#          SOB123 = as.numeric(SOB123),
#          headache123 = as.numeric(headache123), 
#          muscleAches123 = as.numeric(muscleAches123), 
#          malaise123 = as.numeric(malaise123)) %>%
#   group_by(SubjectID, StudyDay, QuarantineNumber) %>%
#   summarize(Febrile = max(Febrile),
#             runnyNose123 = max(runnyNose123), 
#             stuffyNose123 = max(stuffyNose123), 
#             sneezing123 = max(sneezing123), 
#             soreThroat123 = max(soreThroat123),
#             DPENasalDischarge123 = max(DPENasalDischarge123), 
#             DPEOtits123 = max(DPEOtits123), 
#             DPESinusTenderness123 = max(DPESinusTenderness123), 
#             DPEPharyngitis123 = max(DPEPharyngitis123),
#             cough123 = max(cough123), 
#             SOB123 = max(SOB123),
#             headache123 = max(headache123), 
#             muscleAches123 = max(muscleAches123), 
#             malaise123 = max(malaise123)) %>%
#   ungroup()
# # The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was...
# # ... at least one symptoms (of any grade) detection per study day
# 
# # Now can search for recipients that meet the definition of "symptomatic afebrile"
# # However we will deal with the "afebrile" component later
# # First applying the loop that will select those that meet the symptoms criteria apart from afebrile
# # But first need to create a new df that only has data from study days 1-10
# Symptomatic_IR_exposed_grade123_day1to10 <- Symptomatic_IR_exposed_grade123 %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 |
#            StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10)
# 
# sub <- unique(Symptomatic_IR_exposed_grade123_day1to10$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_IR_exposed_grade123_day1to10[Symptomatic_IR_exposed_grade123_day1to10$SubjectID == subid, ]
#   temp1<-temp[,4:17]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:16) {
#       for (l in (k+1):17){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#           c_sub <- rbind(c_sub, subid)
#           token<-1
#           break
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # This yields 9 subjectIDs
# # However, if we want to exclude symptoms from contributing to the criteria if they appeared before day 1 we do
# sub <- unique(Symptomatic_IR_exposed_grade123$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_IR_exposed_grade123[Symptomatic_IR_exposed_grade123$SubjectID == subid, ]
#   temp1<-temp[,4:17]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:16) {
#       for (l in (k+1):17){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#           sum1<-0
#           sum2<-0
#           for (m in 1:(tail(which(temp$StudyDay==-1), n=1))){
#             sum1<-sum1+temp[m,k]
#             sum2<-sum1+temp[m,l]
#           }
#           if (sum1==0 & sum2==0){
#             c_sub2 <- rbind(c_sub2, subid)
#             token<-1
#             break
#           }
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # This yields 8 subjectIDs, 1 fewer subjectID with symptoms (before applying the afebrile criteria) compared with the c_sub above
# # For now we will stick with the less stringent version and use c_sub of 9
# 
# # Now converting this vector of studyIDs to a df
# # Remember this is symptomatic version 2: a milder criteria for symptomatic, however it is symptomatic afebrile
# Symptomatic_afebrile_IR <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Now getting rid of the subjectIDs that were febrile
# # first find which ones were febrile.
# Qdata_exposed_febrile_IR <- Qdata %>%
#   filter(Randomization_DorIRorCR == "IR" & Tympanic.temp..degrees.C. >37.9) %>%
#   distinct(SubjectID, .keep_all = FALSE)
# 
# Symptomatic_afebrile_IR <- Symptomatic_afebrile_IR %>%
#   anti_join(Qdata_exposed_febrile_IR, by = c("SubjectID" = "SubjectID"))
# 
# # Now adding the QuarantineNumber on to the Symptomatic df 
# # Then we can sort by Q for the table3_IR
# Symptomatic_V2_Afebrile_IR_QuarantineNumber_table3 <- Symptomatic_afebrile_IR %>%
#   left_join(Qdata_QuarantineNumbers) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_Symptomatic_V2_Afebrile_IR = n_distinct(SubjectID))
# 
# # Add onto Table3_IR the number of symptomatic by version 2 criteria and % of infected
# # For the final version of table 3 we will use the Symptomatic V3, and thus will ignore this in the printed table
# #Qdata_table3_IR <- Qdata_table3_IR %>%
#   #left_join(Symptomatic_V2_Afebrile_IR_QuarantineNumber_table3) %>%
#   #mutate(Fraction_Symptomatic_V2_Afebrile_of_Exposed_IR = Number_Symptomatic_V2_Afebrile_IR/NumberExposedIR)
# #print(Qdata_table3_IR)

#### Table 3: b2) CR: Number of symptomatic (and % of exposed) ####

# # Symptomatic Version 2 afebrile (like what was done to the IR group above)
# # “Symptomatic_V2_Afebrile”: “Evidence of at least 2 symptoms of any grade that do not necessarily...
# # ...need to persist for consecutive study days, nor persist for the same consecutive study days, but ... 
# # ...where each of the symptoms appeared on at least two different study days.”
# # First need to manipulate the dataset to prepare for the loop logic that was created to do this analysis
# # Combine symptom severity measures (grades 1, 2, and 3) because grade >1 doesn't matter for this definition of symptomatic afebrile
# Symptomatic_CR_exposed_grade123 <- Exposed_CR %>%
#   select(SubjectID, QuarantineNumber) %>%
#   left_join(Qdata, by = c("SubjectID" = "SubjectID", "QuarantineNumber" = "QuarantineNumber")) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | 
#            StudyDay == 3 | StudyDay == 4 | StudyDay == 5 | StudyDay == 6 | StudyDay == 7 | StudyDay == 8 |
#            StudyDay == 9 | StudyDay == 10) %>%
#   mutate(URI = runnyNose+stuffyNose+sneezing+soreThroat+DPENasalDischarge+DPEOtits+DPESinusTenderness+DPEPharyngitis, 
#          LRI = cough+SOB, 
#          SystemicI = headache+muscleAches+malaise) %>%
#   mutate(Febrile = Tympanic.temp..degrees.C.>37.9) %>%
#   mutate(runnyNose123 = runnyNose==1 | runnyNose==2 | runnyNose==3, 
#          stuffyNose123 = stuffyNose==1 | stuffyNose==2 | stuffyNose==3, 
#          sneezing123 = sneezing==1 | sneezing==2 | sneezing==3, 
#          soreThroat123 = soreThroat==1 | soreThroat==2 | soreThroat==3,
#          DPENasalDischarge123 = DPENasalDischarge==1 | DPENasalDischarge==2 | DPENasalDischarge==3, 
#          DPEOtits123 = DPEOtits==1 | DPEOtits==2 | DPEOtits==3, 
#          DPESinusTenderness123 = DPESinusTenderness==1 | DPESinusTenderness==2 | DPESinusTenderness==3, 
#          DPEPharyngitis123 = DPEPharyngitis==1 | DPEPharyngitis==2 | DPEPharyngitis==3,
#          cough123 = cough==1 | cough==2 | cough==3, 
#          SOB123 = SOB==1 | SOB==2 | SOB==3,
#          headache123 = headache==1 | headache==2 | headache==3, 
#          muscleAches123 = muscleAches==1 | muscleAches==2 | muscleAches==3, 
#          malaise123 = malaise==1 | malaise==2 | malaise==3) %>%
#   mutate(Febrile = as.numeric(Febrile),
#          runnyNose123 = as.numeric(runnyNose123), 
#          stuffyNose123 = as.numeric(stuffyNose123), 
#          sneezing123 = as.numeric(sneezing123), 
#          soreThroat123 = as.numeric(soreThroat123),
#          DPENasalDischarge123 = as.numeric(DPENasalDischarge123), 
#          DPEOtits123 = as.numeric(DPEOtits123), 
#          DPESinusTenderness123 = as.numeric(DPESinusTenderness123), 
#          DPEPharyngitis123 = as.numeric(DPEPharyngitis123),
#          cough123 = as.numeric(cough123), 
#          SOB123 = as.numeric(SOB123),
#          headache123 = as.numeric(headache123), 
#          muscleAches123 = as.numeric(muscleAches123), 
#          malaise123 = as.numeric(malaise123)) %>%
#   group_by(SubjectID, StudyDay, QuarantineNumber) %>%
#   summarize(Febrile = max(Febrile),
#             runnyNose123 = max(runnyNose123), 
#             stuffyNose123 = max(stuffyNose123), 
#             sneezing123 = max(sneezing123), 
#             soreThroat123 = max(soreThroat123),
#             DPENasalDischarge123 = max(DPENasalDischarge123), 
#             DPEOtits123 = max(DPEOtits123), 
#             DPESinusTenderness123 = max(DPESinusTenderness123), 
#             DPEPharyngitis123 = max(DPEPharyngitis123),
#             cough123 = max(cough123), 
#             SOB123 = max(SOB123),
#             headache123 = max(headache123), 
#             muscleAches123 = max(muscleAches123), 
#             malaise123 = max(malaise123)) %>%
#   ungroup()
# # The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was...
# # ... at least one symptoms (of any grade) detection per study day
# 
# # Now can implement criteria search for recipients that meet the definition of "symptomatic afebrile"
# # The loop will check for those who meet the symptom definition and we will add the afebrile piece later.
# # First we need to cut a df that only has data from study days 1-10
# Symptomatic_CR_exposed_grade123_day1to10 <- Symptomatic_CR_exposed_grade123 %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 |
#            StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10)
# 
# sub <- unique(Symptomatic_CR_exposed_grade123_day1to10$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_CR_exposed_grade123_day1to10[Symptomatic_CR_exposed_grade123_day1to10$SubjectID == subid, ]
#   temp1<-temp[,4:17]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:16) {
#       for (l in (k+1):17){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#             c_sub <- rbind(c_sub, subid)
#             token<-1
#             break
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # This yields a vector c_sub of 11 subjectIDs
# # But if we wanted to apply more stringent criteria, where symptoms occurring before day 1 were excluded from contributing to the criteria
# # We could do a new loop to create vector c_sub2
# sub <- unique(Symptomatic_CR_exposed_grade123$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   subid <- sub[i]
#   temp <- Symptomatic_CR_exposed_grade123[Symptomatic_CR_exposed_grade123$SubjectID == subid, ]
#   temp1<-temp[,4:17]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:3],temp1)
#   for (j in 1:(nrow(temp))) {
#     for (k in 5:16) {
#       for (l in (k+1):17){
#         if (sum(temp[,k], na.rm = TRUE)>=2 & sum(temp[,l], na.rm = TRUE)>=2) {
#           sum1<-0
#           sum2<-0
#           for (m in 1:(tail(which(temp$StudyDay==-1), n=1))){
#             sum1<-sum1+temp[m,k]
#             sum2<-sum1+temp[m,l]
#           }
#           if (sum1==0 & sum2==0){
#             c_sub2 <- rbind(c_sub2, subid)
#             token<-1
#             break
#           }
#         }
#       }
#       if (token==1){
#         break
#       } 
#     }
#     if (token==1){
#       break
#     }
#   }
# }
# # This yields a vector c_sub2 that has 11 subjectIDs, same as the c_sub vector that used less stringent criteria
# # For now we will keep the less stringent criteria
# # Now transforming this c_sub (the less stringent version) vector of studyIDs to a df and checking for afebrile
# # Remember this is symptomatic version 2: a milder criteria for symptomatic, however it is symptomatic afebrile
# Symptomatic_V2_CR <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Now getting rid of the subjectIDs that were febrile
# # first find which ones were febrile.
# Qdata_exposed_febrile_CR <- Qdata %>%
#   filter(Randomization_DorIRorCR == "IR" & Tympanic.temp..degrees.C. >37.9) %>%
#   distinct(SubjectID, .keep_all = FALSE)
# 
# Symptomatic_V2_Afebrile_CR <- Symptomatic_V2_CR %>%
#   anti_join(Qdata_exposed_febrile_CR, by = c("SubjectID" = "SubjectID"))
# 
# # Now adding the QuarantineNumber on to the Symptomatic df 
# # Then we can sort by Q for the table3
# Symptomatic_V2_Afebrile_CR_QuarantineNumber_table3 <- Symptomatic_V2_Afebrile_CR %>%
#   left_join(Qdata_QuarantineNumbers) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_Symptomatic_V2_Afebrile_CR = n_distinct(SubjectID))
# 
# # Add onto Table3 the number of symptomatic by version 2 criteria and % of infected
# # For the final version of table 3 we will use the Symptomatic V3, and thus will ignore this in the printed table
# #Qdata_table3_CR <- Qdata_table3_CR %>%
#   #left_join(Symptomatic_V2_Afebrile_CR_QuarantineNumber_table3, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_Symptomatic_V2_Afebrile_of_ExposedCR = Number_Symptomatic_V2_Afebrile_CR/NumberExposedCR)
# #print(Qdata_table3_CR)

#### Table 3: c1) IR: Number of symptomatic, non-ILI (and % of exposed) ####

# This category will not be used for this paper.

#### Table 3: c2) CR: Number of symptomatic, non-ILI (and % of exposed) ####

# This category will not be used for this paper.

#### Table 3: IR Symptomatic version 3 (to match Killingley, 2012) ####
## Implementing a new version of "symptomatic" based on October 12, 2018 webex conference with the team
## The purpose of this version of symptomatic is so that we are consistent with the definitions from the proof-of-concept study (Killingley, 2012 JID)

# Thus, this version 3 of symptomatic for IR is:
# "Any respiratory symptom that occurs at all over 2 consecutive days, or occurs for 3/3 (am, early pm, late pm) symptom measurements on a single day, where respiratory symptoms include runny nose, stuffy nose, sneezing, sore throat, cough, and shortness of breath"

# First we are going to cut a new df that has only the 6 respiratory symptoms of interest 
# (and also to include fever, just in case of future analyses)
# Per-measurement respiratory symptom flags for IR rows of Qdata.
# Rows are restricted to study days -3 through 10 and to Microneut_VisitType == "Q baseline".
# Each *123 flag is 1 when the symptom grade is 1, 2 or 3, 0 for any other recorded grade, and NA when the grade is missing.
# Febrile is 1 when the tympanic temperature is above 37.9.
# Within each subject/study day/quarantine, only the first row per SDC_time is kept.
Symptomatic_IR_V3_day1to10 <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         StudyDay %in% (-3:10),
         Microneut_VisitType == "Q baseline") %>%
  transmute(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber,
            Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
            runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
            stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
            sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
            soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
            cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
            SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# This is great but the way the data is put together, this leaves out the day -3 through day0 data
# Therefore, as a quick fix, we will cut a new dataset that only filters in the data from day -3 through day0
# Then we will bind it back to the "Symptomatic_IR_V3" that was just created.
# Same per-measurement flags as Symptomatic_IR_V3_day1to10, but for study days -3 through 0 only.
# Starts from the SubjectIDs listed in Exposed_IR and pulls their rows from Qdata (no visit-type filter here).
Symptomatic_IR_V3_before_day1 <- Exposed_IR %>%
  select(SubjectID) %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% (-3:0)) %>%
  transmute(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber,
            Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
            runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
            stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
            sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
            soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
            cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
            SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# Now binding together and sorting
# Stack the two IR measurement tables and order the rows by subject, then study day
Symptomatic_IR_V3 <- Symptomatic_IR_V3_day1to10 %>%
  bind_rows(Symptomatic_IR_V3_before_day1) %>%
  arrange(SubjectID, StudyDay)

# We will hold onto the above work for the future, but for now use df Symptomatic_IR_V3_day1to10

# Filter those with three measurements positive in a single study day for any of the respiratory symptoms
# Collect (in c_sub) every IR subject who has, on at least one study day, a respiratory
# symptom whose flags sum to exactly 3 across that day's measurements.
sub <- unique(Symptomatic_IR_V3_day1to10$SubjectID)
c_sub <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_IR_V3_day1to10[Symptomatic_IR_V3_day1to10$SubjectID == subid, ]
  # Columns 6-12 are Febrile plus the six symptom flags; a missing flag is treated as 0
  temp <- bind_cols(temp[, 1:5], replace(temp[, 6:12], is.na(temp[, 6:12]), 0))
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    studyday <- stud[j]
    temp2 <- temp[temp$StudyDay == studyday, ]
    # Columns 7-12 are the six respiratory symptoms (Febrile, column 6, is not checked)
    if (any(colSums(temp2[, 7:12]) == 3)) {
      c_sub <- rbind(c_sub, subid)
      token <- 1
      break
    }
  }
}
# This yields a c_sub vector of 5 subjectIDs
# However if we wanted to ignore the symptoms from the criteria where there was observation before day 1, we would do a new loop
# Note that here we must use the "Symptomatic_IR_V3" df because it includes the data from before day1
# Stringent variant of the single-day rule: a symptom only qualifies a subject when its
# flags sum to exactly 3 on some study day AND the same symptom was never flagged in the
# subject's rows up to and including the last day-0 row.
sub <- unique(Symptomatic_IR_V3$SubjectID)
c_sub2 <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_IR_V3[Symptomatic_IR_V3$SubjectID == subid, ]
  # Columns 6-12 are Febrile plus the six symptom flags; a missing flag is treated as 0
  temp <- bind_cols(temp[, 1:5], replace(temp[, 6:12], is.na(temp[, 6:12]), 0))
  # Position of the subject's last day-0 row; rows 1..last_day0 are the pre-day-1 rows
  # because Symptomatic_IR_V3 is ordered by StudyDay within subject
  last_day0 <- tail(which(temp$StudyDay == 0), n = 1)
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    studyday <- stud[j]
    temp2 <- temp[temp$StudyDay == studyday, ]
    for (l in 7:12) {
      # The pre-day-1 sum is evaluated only once the 3-of-3 condition holds
      if (sum(temp2[, l]) == 3 && sum(temp[1:last_day0, l]) == 0) {
        token <- 1
        c_sub2 <- rbind(c_sub2, subid)
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub2 vector with 3 subjectIDs (2 less than the c_sub) and implements the criteria where we eliminated the Sx appearing before day1
# But for now we will use the less stringent criteria
# Now get the list of subject IDs from c_sub (as opposed to the c_sub2 version)
# Turn the one-column c_sub matrix into a data frame of unique SubjectIDs
Symptomatic_IR_singleday <- as.data.frame(c_sub)
Symptomatic_IR_singleday <- distinct(rename(Symptomatic_IR_singleday, SubjectID = "V1"), SubjectID)

# Now use a loop to classify those with at least 1 respiratory symptom on two consec days
# For this we should use the "Symptomatic_IR_exposed_grade123" df that marks with indicator of 1 when any of the 3 symptom measurements in a day showed evidence of symptoms of any grade.
# This df was created in the first version of symptomatic for IR - will recreate here because we are commenting out earlier versions
# Combine symptom severity measures (grades 1, 2, and 3) because grade >1 doesn't matter for this definition of symptomatic afebrile
# One row per subject / study day / quarantine for the subjects listed in Exposed_IR (study days -3 to 10).
# Each *123 column is the daily maximum of a 0/1 indicator that the symptom was graded 1, 2 or 3;
# Febrile is the daily maximum of the indicator "tympanic temperature above 37.9".
# max() is called without na.rm, so a missing measurement within a day makes that day's value NA.
Symptomatic_IR_exposed_grade123 <- Exposed_IR %>%
  select(SubjectID, QuarantineNumber) %>%
  left_join(Qdata, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(StudyDay %in% (-3:10)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  summarize(Febrile = max(as.numeric(Tympanic.temp..degrees.C. > 37.9)),
            runnyNose123 = max(as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3)),
            stuffyNose123 = max(as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3)),
            sneezing123 = max(as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3)),
            soreThroat123 = max(as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3)),
            DPENasalDischarge123 = max(as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3)),
            DPEOtits123 = max(as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3)),
            DPESinusTenderness123 = max(as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3)),
            DPEPharyngitis123 = max(as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3)),
            cough123 = max(as.numeric(cough == 1 | cough == 2 | cough == 3)),
            SOB123 = max(as.numeric(SOB == 1 | SOB == 2 | SOB == 3)),
            headache123 = max(as.numeric(headache == 1 | headache == 2 | headache == 3)),
            muscleAches123 = max(as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3)),
            malaise123 = max(as.numeric(malaise == 1 | malaise == 2 | malaise == 3))) %>%
  ungroup()
# The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was at least one symptoms (of any grade) detection per study day

# First cut the "resp" version of the "Symptomatic_IR_exposed_grade123" df to the variables of interest and proper scale for the loop
# Keep only the identifiers, Febrile and the six respiratory symptom indicators
Symptomatic_IR_exposed_grade123_resp <- select(
  Symptomatic_IR_exposed_grade123,
  SubjectID, StudyDay, QuarantineNumber, Febrile,
  runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123
)
# Restrict that table to study days 1-10
Symptomatic_IR_exposed_grade123_resp_day1to10 <- filter(
  Symptomatic_IR_exposed_grade123_resp,
  StudyDay %in% 1:10
)

# Collect (in c_sub) every IR subject who shows the same respiratory symptom on two
# consecutive study days within days 1-10. The input has one row per subject and study day.
sub <- unique(Symptomatic_IR_exposed_grade123_resp_day1to10$SubjectID)
c_sub <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_IR_exposed_grade123_resp_day1to10[Symptomatic_IR_exposed_grade123_resp_day1to10$SubjectID == subid, ]
  # Columns 4-10 are Febrile plus the six symptom indicators; missing values are treated as 0
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # seq_len() yields no iterations for a subject with a single row, whereas
  # 1:(nrow(temp) - 1) would count 1, 0 and fail on the non-existent second row
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Adjacent rows only count when they are exactly one study day apart
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    # Columns 5-10 are the six respiratory symptoms (Febrile, column 4, is not checked)
    for (k in 5:10) {
      if (temp[j, k] + temp[j + 1, k] == 2) {
        c_sub <- rbind(c_sub, subid)
        token <- 1
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub vector of 11 subjectIDs
# If we use the more stringent criteria we use the below loop instead and get c_sub2
# Stringent variant of the two-consecutive-day rule: a symptom only qualifies a subject
# when it is present on two consecutive study days AND it was never present on any study
# day <= 0 (i.e. before day 1).
sub <- unique(Symptomatic_IR_exposed_grade123_resp$SubjectID)
c_sub2 <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_IR_exposed_grade123_resp[Symptomatic_IR_exposed_grade123_resp$SubjectID == subid, ]
  # Columns 4-10 are Febrile plus the six symptom indicators; missing values are treated as 0
  temp1 <- temp[, 4:10]
  temp1[is.na(temp1)] <- 0
  temp <- cbind(temp[, 1:3], temp1)
  # Rows observed before day 1. A mask is used instead of "rows 1..last day-0 row" so that
  # a subject without a day-0 row does not raise an error (such a subject simply has
  # fewer, possibly zero, pre-exposure rows to check).
  pre_exposure <- temp$StudyDay <= 0
  # seq_len() yields no iterations for a subject with a single row
  for (j in seq_len(max(nrow(temp) - 1, 0))) {
    # Adjacent rows only count when they are exactly one study day apart
    if (temp$StudyDay[j + 1] != temp$StudyDay[j] + 1) {
      next
    }
    # Columns 5-10 are the six respiratory symptoms (Febrile, column 4, is not checked)
    for (k in 5:10) {
      # The subject is added ONLY when the symptom was absent before day 1; the previous
      # version also appended the subject when this check failed, which made c_sub2
      # identical to the less stringent c_sub.
      if (temp[j, k] + temp[j + 1, k] == 2 && sum(temp[pre_exposure, k]) == 0) {
        token <- 1
        c_sub2 <- rbind(c_sub2, subid)
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# NOTE(review): an earlier run reported 11 subjectIDs in c_sub2 (same as c_sub), but that count
# came from a version of this loop that added subjects even when the pre-day-1 check failed.
# Re-run to obtain the count under the stringent criteria.
# However for now we will stick with the original (less stringent) definition and go with c_sub
# (reported as 11 subjectIDs above; this note previously said 13 - confirm which is correct)
# Rename "V1" as SubjectID using the less stringent c_sub
Symptomatic_IR_twodays <- rename(as.data.frame(c_sub), SubjectID = "V1")

# Combine the Symptomatic_IR_twodays df and the Symptomatic_IR_singleday df
# (full join on their shared column, so a subject meeting either rule is kept)
Symptomatic_IR_V3_combined <- arrange(
  full_join(Symptomatic_IR_twodays, Symptomatic_IR_singleday),
  SubjectID
)

# But the above definition of symptomatic doesn't make any mention of febrile illness
# Let's check to see if the febrile are already accounted for among the group of symptomatic version 3
# Subject/day rows flagged febrile whose SubjectID is NOT already in the version 3 symptomatic set
Symptomatic_by_fever_IR <- Symptomatic_IR_exposed_grade123 %>%
  filter(Febrile == 1) %>%
  select(SubjectID) %>%
  anti_join(Symptomatic_IR_V3_combined, by = "SubjectID")
# Returned 0 subject IDs, thus adding fever to the analysis doesn't add anything. However we should still be clear about definitions for the paper

# Count the version 3 symptomatic IR subjects per quarantine
Symptomatic_IR_V3_combined <- Symptomatic_IR_V3_combined %>%
  left_join(Qdata_QuarantineNumbers, by = "SubjectID") %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_IR_Symptomatic_V3 = n_distinct(SubjectID))
# Add onto Table3_IR the number of symptomatic by version 3 criteria and the fraction of exposed IR
Qdata_table3_IR <- Qdata_table3_IR %>%
  left_join(Symptomatic_IR_V3_combined, by = "QuarantineNumber") %>%
  mutate(Fraction_IR_Symptomatic_V3_of_ExposedIR = Number_IR_Symptomatic_V3 / NumberExposedIR)
print(Qdata_table3_IR)

#### Table 3: CR Symptomatic version 3 (to match Killingley, 2012) ####

## The purpose of this version of symptomatic is so that we are consistent with the definitions from the proof-of-concept study (Killingley, 2012 JID)

# Thus, this version 3 of symptomatic for CR is:
# "Any respiratory symptom that occurs at all over 2 consecutive days, or occurs for 3/3 (am, early pm, late pm) symptom measurements on a single day, where respiratory symptoms include runny nose, stuffy nose, sneezing, sore throat, cough, and shortness of breath"

# First we are going to cut a new df that has only the 6 respiratory symptoms of interest 
# (and also to include fever, just in case of future analyses)
# Per-measurement respiratory symptom flags for CR rows of Qdata.
# Rows are restricted to study days -3 through 10 and to Microneut_VisitType == "Q baseline".
# Each *123 flag is 1 when the symptom grade is 1, 2 or 3, 0 for any other recorded grade, and NA when the grade is missing.
# Febrile is 1 when the tympanic temperature is above 37.9.
# Within each subject/study day/quarantine, only the first row per SDC_time is kept.
Symptomatic_CR_V3_day1to10 <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR",
         StudyDay %in% (-3:10),
         Microneut_VisitType == "Q baseline") %>%
  transmute(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber,
            Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
            runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
            stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
            sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
            soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
            cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
            SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# This is great but the way the data is put together, this leaves out the day -3 through day0 data
# Therefore, as a quick fix, we will cut a new dataset that only filters in the data from day -3 through day0
# Then we will bind it back to the "Symptomatic_CR_V3" that was just created.
# Same per-measurement flags as Symptomatic_CR_V3_day1to10, but for study days -3 through 0 only.
# Starts from the SubjectIDs listed in Exposed_CR and pulls their rows from Qdata (no visit-type filter here).
Symptomatic_CR_before_day1 <- Exposed_CR %>%
  select(SubjectID) %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% (-3:0)) %>%
  transmute(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber,
            Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
            runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
            stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
            sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
            soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
            cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
            SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# Now binding together and sorting
# Stack the two CR measurement tables and order the rows by subject, then study day
Symptomatic_CR_V3 <- rbind(Symptomatic_CR_V3_day1to10, Symptomatic_CR_before_day1) %>%
  arrange(SubjectID, StudyDay)

# We will hold onto the above work for the future, but for now use the Symptomatic_CR_V3_day1to10 df

# Filter those with three measurements positive in a single study day for any of the respiratory symptoms
# Collect (in c_sub) every CR subject who has, on at least one study day, a respiratory
# symptom whose flags sum to exactly 3 across that day's measurements.
sub <- unique(Symptomatic_CR_V3_day1to10$SubjectID)
c_sub <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_CR_V3_day1to10[Symptomatic_CR_V3_day1to10$SubjectID == subid, ]
  # Columns 6-12 are Febrile plus the six symptom flags; a missing flag is treated as 0
  temp <- bind_cols(temp[, 1:5], replace(temp[, 6:12], is.na(temp[, 6:12]), 0))
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    studyday <- stud[j]
    temp2 <- temp[temp$StudyDay == studyday, ]
    # Columns 7-12 are the six respiratory symptoms (Febrile, column 6, is not checked)
    if (any(colSums(temp2[, 7:12]) == 3)) {
      token <- 1
      c_sub <- rbind(c_sub, subid)
      break
    }
  }
}
# This yields a c_sub vector with 8 subjectIDs, however if we want to employ the stringent criteria where Sx are removed from the classification criteria if they appear before day1, then use the next loop.
# Stringent variant of the single-day rule for CR: a symptom only qualifies a subject when
# its flags sum to exactly 3 on some study day AND the same symptom was never flagged in
# the subject's rows up to and including the last day-0 row.
sub <- unique(Symptomatic_CR_V3$SubjectID)
c_sub2 <- c()
token <- 0
for (i in seq_along(sub)) {
  token <- 0
  subid <- sub[i]
  temp <- Symptomatic_CR_V3[Symptomatic_CR_V3$SubjectID == subid, ]
  # Columns 6-12 are Febrile plus the six symptom flags; a missing flag is treated as 0
  temp <- bind_cols(temp[, 1:5], replace(temp[, 6:12], is.na(temp[, 6:12]), 0))
  # Position of the subject's last day-0 row; rows 1..last_day0 are the pre-day-1 rows
  # because Symptomatic_CR_V3 is ordered by StudyDay within subject
  last_day0 <- tail(which(temp$StudyDay == 0), n = 1)
  stud <- unique(temp$StudyDay)
  for (j in seq_along(stud)) {
    studyday <- stud[j]
    temp2 <- temp[temp$StudyDay == studyday, ]
    for (l in 7:12) {
      # The pre-day-1 sum is evaluated only once the 3-of-3 condition holds
      if (sum(temp2[, l]) == 3 && sum(temp[1:last_day0, l]) == 0) {
        token <- 1
        c_sub2 <- rbind(c_sub2, subid)
        break
      }
    }
    if (token == 1) {
      break
    }
  }
}
# This yields a c_sub2 for 6 subjectIDs (2 fewer than the c_sub)
# However, for now, we will go with the less stringent criteria and use the c_sub
# Now get the list of subject IDs from c_sub (and not the more stringent c_sub2 list of subject IDs)
# Turn the one-column c_sub matrix into a data frame of unique SubjectIDs
Symptomatic_V3_CR_singleday <- as.data.frame(c_sub)
Symptomatic_V3_CR_singleday <- distinct(rename(Symptomatic_V3_CR_singleday, SubjectID = "V1"), SubjectID)

# Now use a loop to classify those with any sort of respiratory symptom on two consecutive days
# For this we should use the "Symptomatic_CR_exposed_grade123" df that marks with indicator of 1 when any of the 3 symptom measurements in a day showed evidence of symptoms of any grade.
# This df was created in the first version of symptomatic for CR

# First rebuild the "Symptomatic_CR_exposed_grade123" df here, then cut it to the variables of interest and proper scale for the loop
# One row per subject / study day / quarantine for the subjects listed in Exposed_CR (study days -3 to 10).
# Each *123 column is the daily maximum of a 0/1 indicator that the symptom was graded 1, 2 or 3;
# Febrile is the daily maximum of the indicator "tympanic temperature above 37.9".
# max() is called without na.rm, so a missing measurement within a day makes that day's value NA.
Symptomatic_CR_exposed_grade123 <- Exposed_CR %>%
  select(SubjectID, QuarantineNumber) %>%
  left_join(Qdata, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(StudyDay %in% (-3:10)) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  summarize(Febrile = max(as.numeric(Tympanic.temp..degrees.C. > 37.9)),
            runnyNose123 = max(as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3)),
            stuffyNose123 = max(as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3)),
            sneezing123 = max(as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3)),
            soreThroat123 = max(as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3)),
            DPENasalDischarge123 = max(as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3)),
            DPEOtits123 = max(as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3)),
            DPESinusTenderness123 = max(as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3)),
            DPEPharyngitis123 = max(as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3)),
            cough123 = max(as.numeric(cough == 1 | cough == 2 | cough == 3)),
            SOB123 = max(as.numeric(SOB == 1 | SOB == 2 | SOB == 3)),
            headache123 = max(as.numeric(headache == 1 | headache == 2 | headache == 3)),
            muscleAches123 = max(as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3)),
            malaise123 = max(as.numeric(malaise == 1 | malaise == 2 | malaise == 3))) %>%
  ungroup()
# The above gets us to a dataset where symptoms with grade 1, 2, or 3 are summarized by whether there was at least one symptoms (of any grade) detection per study day

# Study days 1-10 only, keeping the identifiers, Febrile and the six respiratory symptom indicators
Symptomatic_CR_exposed_grade123_day1to10 <- Symptomatic_CR_exposed_grade123 %>%
  filter(StudyDay %in% 1:10) %>%
  select(SubjectID, StudyDay, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123)

# Flag CR subjects who reported the same respiratory symptom (any grade) on two
# consecutive study days within days 1-10.
# Symptom columns are addressed by name rather than by position so the scan
# cannot silently drift if the column layout of the input changes.
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
sub <- unique(Symptomatic_CR_exposed_grade123_day1to10$SubjectID)
two_day_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- Symptomatic_CR_exposed_grade123_day1to10[Symptomatic_CR_exposed_grade123_day1to10$SubjectID == sub[i], ]
  n_obs <- nrow(temp)
  # A subject with a single row cannot have two consecutive days
  # (the previous 1:(nrow(temp)-1) loop errored in that case)
  if (n_obs < 2) {
    next
  }
  sx <- as.matrix(temp[, resp_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  first <- seq_len(n_obs - 1)
  # Adjacent rows that are exactly one study day apart
  consecutive <- which(temp$StudyDay[first + 1] == temp$StudyDay[first] + 1)
  # TRUE where a symptom flag is 1 on both days of a consecutive pair
  both_days <- sx[consecutive, , drop = FALSE] + sx[consecutive + 1, , drop = FALSE] == 2
  two_day_hit[i] <- any(both_days)
}
# Keep the result as a one-column matrix so that as.data.frame() downstream
# still names the column "V1" (and yields a valid 0-row frame if nobody qualifies)
c_sub <- matrix(sub[two_day_hit], ncol = 1)
# The loop above yields the less stringent classification (the original run reported 12 subjectIDs in c_sub).
# If we want the more stringent criteria, where a Sx that already appeared before day 1 is removed from
# the classification criteria, we use the full df (which includes the pre-day-1 rows) and the loop below.
# FIX: the previous version indexed columns 5:10 of Symptomatic_CR_exposed_grade123, but in that
# 17-column df those positions are runny nose, stuffy nose, sneezing, sore throat, DPENasalDischarge and
# DPEOtits -- cough and SOB were never scanned. Columns are now selected by name.
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
sub <- unique(Symptomatic_CR_exposed_grade123$SubjectID)
stringent_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- Symptomatic_CR_exposed_grade123[Symptomatic_CR_exposed_grade123$SubjectID == sub[i], ]
  n_obs <- nrow(temp)
  if (n_obs < 2) {
    next
  }
  sx <- as.matrix(temp[, resp_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  first <- seq_len(n_obs - 1)
  # Adjacent rows that are exactly one study day apart
  consecutive <- which(temp$StudyDay[first + 1] == temp$StudyDay[first] + 1)
  both_days <- sx[consecutive, , drop = FALSE] + sx[consecutive + 1, , drop = FALSE] == 2
  # Symptoms reported on two consecutive days at least once
  sx_two_days <- colSums(both_days) > 0
  # Symptoms already reported on or before day 0. For rows sorted by study day this is the same set of
  # rows as "row 1 through the last StudyDay == 0 row", but it does not error when a subject has no day-0 row.
  sx_before_day1 <- colSums(sx[which(temp$StudyDay <= 0), , drop = FALSE]) > 0
  stringent_hit[i] <- any(sx_two_days & !sx_before_day1)
}
# One-column matrix, same shape as the rbind()-built result used elsewhere in this script
c_sub2 <- matrix(sub[stringent_hit], ncol = 1)
# NOTE(review): the original run reported 10 subjectIDs in c_sub2 (2 fewer than c_sub), but that count was
# obtained with the mis-indexed columns described above; re-run to confirm the count.
# Even so, for now we will use the less stringent criteria for this classification
# Turn the c_sub matrix into a data frame; its single auto-named column "V1" becomes SubjectID
Symptomatic_V3_CR_twodays <- rename(as.data.frame(c_sub), SubjectID = "V1")

# Union of the two-consecutive-day subjects and the single-day subjects, ordered by SubjectID
Symptomatic_V3_CR_combined <- arrange(
  full_join(Symptomatic_V3_CR_twodays, Symptomatic_V3_CR_singleday, by = "SubjectID"),
  SubjectID
)

# The symptomatic (version 3) definition above never looks at fever.
# Sanity check: list subjects with any febrile study day who are NOT already in the symptomatic set.
Symptomatic_by_fever_CR <- Symptomatic_CR_exposed_grade123 %>%
  filter(Febrile == 1) %>%
  select(SubjectID) %>%
  anti_join(Symptomatic_V3_CR_combined, by = "SubjectID")
# The original run returned 0 subject IDs, i.e. adding fever to the definition adds nobody.
# However we should still be clear about definitions for the paper

# Attach each symptomatic subject's quarantine and collapse to one row per quarantine:
# the number of distinct symptomatic (version 3) CR subjects.
# Note that this overwrites the subject-level df with the per-quarantine summary.
Symptomatic_V3_CR_combined <- Symptomatic_V3_CR_combined %>%
  left_join(Qdata_QuarantineNumbers, by = "SubjectID") %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_Symptomatic_V3_CR = n_distinct(SubjectID))
# Append to Table3_CR the number symptomatic by version 3 criteria and its fraction of the exposed CR
Qdata_table3_CR <- mutate(
  left_join(Qdata_table3_CR, Symptomatic_V3_CR_combined, by = "QuarantineNumber"),
  Fraction_Symptomatic_V3_CR_of_ExposedCR = Number_Symptomatic_V3_CR / NumberExposedCR
)
print(Qdata_table3_CR)

# Important note about the data here: subject 203 did not have any symptom scores reported at all in the raw data (although they should have).
# Upon investigating this discrepancy with Alex Mann and Ben Killingley, we found that 203, in fact, had symptom scores of 0 for all of the self-reported symptoms on each day of symptom surveillance.
# I could move this note to the EMIT_Quarantine_Main_work_with_clean_files where we are summarizing the data some, but actually I didn't catch this error until later on when I was checking the data before putting this table together.
# For now we will keep the note here.
# Also - important to note that the raw data will not change because of this. Rather, the null symptom scores for 203 are essentially already accounted for here by the NAs reported for 203's self-reported symptom data.
# This note serves to remind us that yes, in fact, the NAs can be interpreted as symptom scores of 0.

#### Table 3: d1a) IR: First classification of ILI (and % of exposed) ####

# # Operationally, this means evidence of fever >100F (>37.9C) & any evidence of cough or sore throat or DPE Pharyngitis
# 
# # First, cut the dataset to only the infected donors who meet the definition for fever
# # Note, none of the volunteers registered a fever on any of the study days prior to inoculation day
# ILIdata <- Exposed_IR %>%
#   select(SubjectID, QuarantineNumber) %>%
#   left_join(Qdata) %>%
#   select(SubjectID, QuarantineNumber, StudyDay, Sx_Date, SDC_time, Tympanic.temp..degrees.C., cough, soreThroat, DPEPharyngitis) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | 
#            StudyDay == 4 | StudyDay == 5 | StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10) %>%
#   group_by(SubjectID, StudyDay, SDC_time) %>%
#   distinct(SDC_time, .keep_all = TRUE) %>%
#   arrange(SubjectID, StudyDay) %>%
#   ungroup()
# 
# # Let's consolidate the sore throat and pharyngitis variables to make one cumulative variable (soreThroat or DPEPharyngitis)
# ILIdata_day1to10 <- ILIdata %>%
#   mutate(st = cough>=1 | soreThroat>=1 | DPEPharyngitis>=1, st = as.numeric(st)) %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 |
#            StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10)
#   
# sub <- unique(ILIdata_day1to10$SubjectID)
# c_sub <- c()
# token_t<-0
# token_sx<-0
# for (i in 1:length(sub)) {
#   token_t<-0
#   token_sx<-0
#   subid <- sub[i]
#   temp <- ILIdata_day1to10[ILIdata_day1to10$SubjectID == subid, ]
#   temp1<-temp[,6:10]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:5],temp1)
#   for (j in 1:(nrow(temp))) {
#     if (!is.na(temp$Tympanic.temp..degrees.C.[j])) {
#       if (temp$Tympanic.temp..degrees.C.[j] > 37.9) {
#         token_t<-1
#       }
#     }
#     if (sum(temp$cough[j], temp$st[j], na.rm = TRUE) >=1) {
#       token_sx<-1
#     }
#     if (token_t == 1 & token_sx == 1){
#       c_sub <- rbind(c_sub, subid)
#       break
#     }
#   }
# }
# # Note: the above code may not work properly and would require additional verification and potential troubleshooting.
# # It could just be that there were 0 instances that met the criteria and for that reason the c_sub comes back null
# # ... however, since the definition of ILI has changed based on the October 12, 2018 call, we will move to work on the new definition
# # Note: the above code does not check for the case that someone had fever, cough, or sore throat...
# # ... prior to inoculation day.
# # Future iterations of this code would do well to implement logic that would filter those that met ILI criteria
# 
# # Now adding this vector of studyIDs to the table3_IR
# # ILI_febrile_IR <- as.data.frame(c_sub) %>%
#   # rename(SubjectID = "V1") 
# 
# # Create vector with 0s in place (in the case that there were 0 instances that met the ILI definition for IR)
# #m <- matrix(0, ncol = 2, nrow = 3)
# #ILI_febrile_table3_IR <- as.data.frame(m) %>%
#   #rename(QuarantineNumber = V1, NumberILI_febrile_IR = V2)
# 
# # Now adding the QuarantineNumber on to the Febrile ILI df 
# # Then we can sort by Q for the table3
# # ILI_febrile_table3_IR <- ILI_febrile_IR %>%
#   # left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   # group_by(QuarantineNumber) %>%
#   # summarize(NumberILI_febrile_IR = n_distinct(SubjectID))
# 
# # Add onto Table3 the number of 1st version of ILI criteria and % of infected (this is outdated, pre-October 12, 2018 definition)
# #Qdata_table3_IR <- Qdata_table3_IR %>%
#   #left_join(ILI_febrile_table3_IR, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_febrile_of_ExpIR = NumberILI_febrile_IR/NumberExposedIR)
# #print(Qdata_table3_IR)

#### Table 3: d1b) IR: Second classification of ILI (afebrile ILI) (and % of exposed) ####

# # This definition of afebrile ILI is: symptom of grade >=1 for cough or soreThroat (or DPEPharyngitis)
# # This code eliminates those who were symptomatic for cough or soreThroat before Day0
# # note: using the "ILIdata" df, which was created in the ILI_Version 1 for IR code above
# # The below code is draft and does not work, so I will comment it out
# # Note that the attempt below uses the more stringent criteria (excluding Sx that occurred before day1 from criteria)
# # We will use the new definition of ILI, which comes from the Oct 12, 2018 EMIT team webex conference
# #sub <- unique(ILIdata$SubjectID)
# #c_sub <- c()
# #token_c<-0
# #token_st<-0
# #for (i in 1:length(sub)) {
#   #token_c<-0
#   #token_st<-0
#   #subid <- sub[i]
#   #temp <- ILIdata[ILIdata$SubjectID == subid, ]
#   #temp1<-temp[,6:9]
#   #temp1[is.na(temp1)]<-0
#   #temp<-cbind(temp[,1:5],temp1)
#   #for (j in 1:(nrow(temp))) {
#     #if (temp$cough[j] >=1) {
#       #sum1<-0
#       #for (k in 1:(tail(which(temp$StudyDay==-1), n=1))) {
#         #sum1 <- sum1+temp$cough[k]
#         #if (sum1 == 0) {
#           #token_c<-1
#         #}
#       #}
#     #}
#     #if (temp$st[j] >= 1) {
#       #sum2<-0
#       #for (l in 1:(tail(which(temp$StudyDay==-1), n=1))) {
#         #sum2 <- sum2+temp$st[l]
#         #if (sum2 == 0) {
#           #token_st<-1
#         #}
#       #}
#     #}
#   #}
#   #if (token_c + token_st >=1) {
#     #c_sub <- rbind(c_sub, subid)
#   #}
# #}
# 
# # Now adding this vector of studyIDs to the table3_IR
# #ILI_afebrile_IR <- as.data.frame(c_sub) %>%
#   #rename(SubjectID = "V1") 
# 
# # Now adding the QuarantineNumber on to the ILI afebrile df 
# # Then we can sort by Q for the table3
# #ILI_afebrile_table3_IR <- ILI_afebrile_IR %>%
#   #left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   #group_by(QuarantineNumber) %>%
#   #summarize(NumberILI_afebrile_IR = n_distinct(SubjectID))
# 
# # Add onto Table3 the number of ILI afebrile and % of infected
# #Qdata_table3_IR <- Qdata_table3_IR %>%
#   #left_join(ILI_afebrile_table3_IR, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_afebrile_of_ExpIR = NumberILI_afebrile_IR/NumberExposedIR)
# #print(Qdata_table3_IR)

#### Table 3: d1c) IR: Third version of classification of ILI (to match Killingley et al., 2012) (and % of infected) ####

# This definition of ILI is: "an illness lasting >=24 hours with either (1) fever >37.9°C plus at least 1 respiratory symptom or (2) >=2 symptoms, at least 1 of which must be respiratory."
# Where "respiratory symptom" means evidence of any grade of runny nose, stuffy nose, sneeze, sore throat, cough, shortness of breath
# Where "lasting >=24 hours" means evidence of the symptom over all three instances of symptom measurements for a single day, or evidence of the symptom over two days at any frequency (1-3/3 instances of symptom recordings)

# First, let's program the first criteria (fever >37.9C plus at least 1 respiratory symptom)
# To do this, we can:
# a) create the set of subject IDs that meet the fever criteria, and then check them for
# b) evidence of three instances during a single day, or
# c) evidence of any frequency of instances >=1 for 2 consecutive days
# Then, we can deal with the second criteria for ILI (>=2 symptoms one of which being a respiratory)

# Find the SubjectIDs among the IR that had fever (tympanic temperature > 37.9 C).
# Start with the pre-exposure window (study days -3 to 0) to see whether anyone was febrile before day 1.
Qdata_IR_febrile_pre_day1 <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         StudyDay %in% -3:0,
         Tympanic.temp..degrees.C. > 37.9) %>%
  distinct(SubjectID)
# As it turns out, none of the IR had fever before day 1 
# Same check for the post-exposure window: IR subjects with at least one
# tympanic temperature > 37.9 C on study days 1-10.
Qdata_IR_febrile_day1to10 <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         StudyDay %in% 1:10,
         Tympanic.temp..degrees.C. > 37.9) %>%
  distinct(SubjectID)
# As it turned out in the original run, none of the IR ever had fever during study days 1-10

# ILI criteria 1 (fever plus a respiratory symptom) can only be met by febrile subjects,
# so the febrile set is carried forward as the criteria-1 result
ILI_V3_IR_criteria1 <- Qdata_IR_febrile_day1to10

# Now we can move to the second criteria for ILI for the IR

## Plan for implementing the second criteria for ILI (>= 2 symptoms for >=24 hours, 1 of which is respiratory) and merging with the first criteria for ILI
# To do this, first we will filter those subject IDs without fever.
# Then we will see who among those without fever had a respiratory symptom on a single day plus at least one other symptom on the same single day
# Then we will see who among those without fever had a respiratory symptom at frequency >=1 over 2 days plus at least one other symptom at freq >=1 for the same 2 days
# Then we will add those subject IDs together to form ILI_IR_criteria2
# Since no subjects met the first criteria (ILI_IR_criteria1 is empty), ILI_IR_criteria1 and ILI_IR_criteria2 together make up ILI_IR

# First we will filter those subject IDs without fever:
# take every exposed IR subject and drop anyone who was febrile at least once over study days 1-10.
# The join key is stated explicitly so the anti-join cannot silently pick up
# additional shared columns (and no longer emits the "Joining, by = ..." message).
Qdata_IR_afebrile_day1to10 <- Exposed_IR %>%
  select(SubjectID) %>%
  anti_join(Qdata_IR_febrile_day1to10, by = "SubjectID")

# Now we will see who among those without fever had respiratory symptom on a single day plus at least one other symptom on the same single day

# Build the observation-level df for the afebrile IR subjects: one 0/1 flag per symptom
# (1 = reported at grade 1, 2 or 3; NA stays NA) plus a 0/1 fever flag.
# Only rows with Microneut_VisitType == "Q baseline" are kept, and at most one row per
# subject / study day / quarantine / SDC_time.
ILI_V3_IR_afebrile_day1to10 <- Qdata_IR_afebrile_day1to10 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% -3:10,
         Microneut_VisitType == "Q baseline") %>%
  # Composite scores; these are not carried through the select() below
  mutate(URI = runnyNose + stuffyNose + sneezing + soreThroat + DPENasalDischarge + DPEOtits + DPESinusTenderness + DPEPharyngitis,
         LRI = cough + SOB,
         SystemicI = headache + muscleAches + malaise) %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
         stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
         sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
         soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
         DPENasalDischarge123 = as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3),
         DPEOtits123 = as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3),
         DPESinusTenderness123 = as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3),
         DPEPharyngitis123 = as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3),
         cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
         SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3),
         headache123 = as.numeric(headache == 1 | headache == 2 | headache == 3),
         muscleAches123 = as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3),
         malaise123 = as.numeric(malaise == 1 | malaise == 2 | malaise == 3)) %>%
  # Keep identifiers, fever, the 6 respiratory flags and the 3 non-respiratory flags (DPE flags are dropped)
  select(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
         headache123, muscleAches123, malaise123) %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  distinct(SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, StudyDay) %>%
  ungroup()
# Per the original author's note, the way the data is put together means the df built above
# ends up without the day -3 through day 0 data.
# As a quick fix, cut a second df that holds only the day -3 through day 0 rows (same flag
# construction, but without the visit-type filter and without de-duplication),
# to be bound back onto "ILI_V3_IR_afebrile_day1to10" afterwards.
ILI_V3_IR_afebrile_before_day1 <- Qdata_IR_afebrile_day1to10 %>%
  left_join(Qdata) %>%
  filter(StudyDay %in% -3:0) %>%
  # Composite scores; these are not carried through the select() below
  mutate(URI = runnyNose + stuffyNose + sneezing + soreThroat + DPENasalDischarge + DPEOtits + DPESinusTenderness + DPEPharyngitis,
         LRI = cough + SOB,
         SystemicI = headache + muscleAches + malaise) %>%
  mutate(Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
         runnyNose123 = as.numeric(runnyNose == 1 | runnyNose == 2 | runnyNose == 3),
         stuffyNose123 = as.numeric(stuffyNose == 1 | stuffyNose == 2 | stuffyNose == 3),
         sneezing123 = as.numeric(sneezing == 1 | sneezing == 2 | sneezing == 3),
         soreThroat123 = as.numeric(soreThroat == 1 | soreThroat == 2 | soreThroat == 3),
         DPENasalDischarge123 = as.numeric(DPENasalDischarge == 1 | DPENasalDischarge == 2 | DPENasalDischarge == 3),
         DPEOtits123 = as.numeric(DPEOtits == 1 | DPEOtits == 2 | DPEOtits == 3),
         DPESinusTenderness123 = as.numeric(DPESinusTenderness == 1 | DPESinusTenderness == 2 | DPESinusTenderness == 3),
         DPEPharyngitis123 = as.numeric(DPEPharyngitis == 1 | DPEPharyngitis == 2 | DPEPharyngitis == 3),
         cough123 = as.numeric(cough == 1 | cough == 2 | cough == 3),
         SOB123 = as.numeric(SOB == 1 | SOB == 2 | SOB == 3),
         headache123 = as.numeric(headache == 1 | headache == 2 | headache == 3),
         muscleAches123 = as.numeric(muscleAches == 1 | muscleAches == 2 | muscleAches == 3),
         malaise123 = as.numeric(malaise == 1 | malaise == 2 | malaise == 3)) %>%
  select(SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
         headache123, muscleAches123, malaise123)
# Stack the two afebrile dfs and order the rows by subject and study day
ILI_V3_IR_afebrile <- ILI_V3_IR_afebrile_day1to10 %>%
  bind_rows(ILI_V3_IR_afebrile_before_day1) %>%
  arrange(SubjectID, StudyDay)

# But, the current definition is just for post day0 so we will filter just day1-10 "ILI_V3_IR_afebrile_day1to10" df

# Get subjectIDs where there were 2 symptoms (one of which respiratory), each observed 3 times on the same day.
# Operationally: on some study day, the number of positive recordings of a respiratory symptom plus the
# number of positive recordings of a second symptom equals 6.
# NOTE(review): "== 6" identifies "3 + 3" only if there are at most three recordings per day -- confirm.
# Changes from the previous version (the result is the same): the inner loop over rows of the day
# (index k was never used) is gone, columns are addressed by name, and the result is no longer grown with rbind().
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
other_sx <- c("headache123", "muscleAches123", "malaise123")
ili_sx <- c(resp_sx, other_sx)
# All symptom pairs (a, b) with a < b whose first member is respiratory
sx_pairs <- combn(length(ili_sx), 2)
sx_pairs <- sx_pairs[, sx_pairs[1, ] <= length(resp_sx), drop = FALSE]

sub <- unique(ILI_V3_IR_afebrile_day1to10$SubjectID)
single_day_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- ILI_V3_IR_afebrile_day1to10[ILI_V3_IR_afebrile_day1to10$SubjectID == sub[i], ]
  sx <- as.matrix(temp[, ili_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  # Rows = study days, columns = symptoms, cells = number of positive recordings that day
  per_day <- rowsum(sx, group = temp$StudyDay)
  pair_total <- per_day[, sx_pairs[1, ], drop = FALSE] + per_day[, sx_pairs[2, ], drop = FALSE]
  single_day_hit[i] <- any(pair_total == 6)
}
# One-column matrix so that as.data.frame() downstream still names the column "V1"
# (and yields a valid 0-row frame if nobody qualifies)
c_sub <- matrix(sub[single_day_hit], ncol = 1)
# The original run reported 2 subjectIDs in c_sub. If we wanted to exclude symptoms that already
# appeared before day 1, we use the df that also holds the pre-day-1 rows and additionally require
# that neither symptom of the qualifying pair was reported on or before day 0.
# Changes from the previous version: the redundant loop over rows of the day (unused index k) is gone,
# columns are addressed by name, and a subject with no StudyDay == 0 row no longer causes an error
# (baseline rows are taken as StudyDay <= 0, which for rows sorted by study day is the same
# set as "row 1 through the last day-0 row").
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
other_sx <- c("headache123", "muscleAches123", "malaise123")
ili_sx <- c(resp_sx, other_sx)
# All symptom pairs (a, b) with a < b whose first member is respiratory
sx_pairs <- combn(length(ili_sx), 2)
sx_pairs <- sx_pairs[, sx_pairs[1, ] <= length(resp_sx), drop = FALSE]

sub <- unique(ILI_V3_IR_afebrile$SubjectID)
single_day_stringent_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- ILI_V3_IR_afebrile[ILI_V3_IR_afebrile$SubjectID == sub[i], ]
  sx <- as.matrix(temp[, ili_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  # Rows = study days, columns = symptoms, cells = number of positive recordings that day
  per_day <- rowsum(sx, group = temp$StudyDay)
  pair_total <- per_day[, sx_pairs[1, ], drop = FALSE] + per_day[, sx_pairs[2, ], drop = FALSE]
  # Symptoms never reported on or before day 0
  baseline_free <- colSums(sx[which(temp$StudyDay <= 0), , drop = FALSE]) == 0
  pair_allowed <- baseline_free[sx_pairs[1, ]] & baseline_free[sx_pairs[2, ]]
  single_day_stringent_hit[i] <- any(pair_total[, pair_allowed, drop = FALSE] == 6)
}
# One-column matrix, same shape as the rbind()-built result used elsewhere in this script
c_sub2 <- matrix(sub[single_day_stringent_hit], ncol = 1)
# In the original run c_sub2 held 2 subjectIDs, just like the less stringent c_sub.
# For now the less stringent result (c_sub) is the one carried forward:
# convert it to a df whose auto-named column "V1" becomes SubjectID
ILI_V3_IR_criteria2_singleday <- rename(as.data.frame(c_sub), SubjectID = "V1")
# This is the output for the first part (single day) of the second ILI criteria

# Now implement the second part of the second ILI criteria: >=2 Sx (with >=1 resp) at any frequency over the same 2 consecutive study days

# Create an "IR_grade123_afebrile" df by collapsing the (up to three) recordings per study day into 1 row,
# keeping only the 9 symptoms that are used in this analysis (the DPE aren't used here)
# The 9 are: runny nose, stuffy nose, sneeze, sore throat, cough, SOB, headache, muscleache, malaise
# Also remember to select the afebrile group (those who were never febrile, n = 40, which is actually all the IR)
# Each flag becomes the maximum over the day's recordings, i.e. 1 if the symptom was reported at least once that day.
# NOTE(review): max() is called without na.rm, so a day with any missing recording collapses to NA
# (which the downstream loops then treat as 0) even if another recording that day was positive -- confirm this is intended.
IR_grade123_afebrile_day1to10 <- ILI_V3_IR_afebrile_day1to10 %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  summarize(Febrile = max(Febrile),
            runnyNose123 = max(runnyNose123),
            stuffyNose123 = max(stuffyNose123),
            sneezing123 = max(sneezing123),
            soreThroat123 = max(soreThroat123),
            cough123 = max(cough123),
            SOB123 = max(SOB123),
            headache123 = max(headache123),
            muscleAches123 = max(muscleAches123),
            malaise123 = max(malaise123)) %>%
  # summarize() only peels off the last grouping level; drop the remaining grouping so that
  # later binds, subsets and loops operate on a plain (ungrouped) df
  ungroup()
# Stack the collapsed day 1-10 rows on top of the (uncollapsed) pre-day-1 rows to get the complete dataset.
# The pre-day-1 rows are deliberately left at one row per recording: they are not scanned for the
# ILI criteria, only checked for whether a symptom was already present before day 1
# (used by the more stringent classification).
ILI_V3_IR_grade123_afebrile <- IR_grade123_afebrile_day1to10 %>%
  bind_rows(ILI_V3_IR_afebrile_before_day1) %>%
  select(SubjectID, StudyDay, QuarantineNumber, Febrile,
         runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
         headache123, muscleAches123, malaise123) %>%
  arrange(SubjectID, StudyDay)

# Two-consecutive-day part of the second ILI criteria: 2 or more symptoms, at least one of which is
# respiratory, each reported (at any frequency >= 1) on the same 2 consecutive study days.
# The six respiratory and three other symptom columns are addressed by name (formerly columns 5-10 and 11-13).
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
other_sx <- c("headache123", "muscleAches123", "malaise123")
ili_sx <- c(resp_sx, other_sx)
# All symptom pairs (a, b) with a < b whose first member is respiratory
sx_pairs <- combn(length(ili_sx), 2)
sx_pairs <- sx_pairs[, sx_pairs[1, ] <= length(resp_sx), drop = FALSE]

sub <- unique(IR_grade123_afebrile_day1to10$SubjectID)
two_day_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- IR_grade123_afebrile_day1to10[IR_grade123_afebrile_day1to10$SubjectID == sub[i], ]
  n_obs <- nrow(temp)
  # A subject with a single row cannot have two consecutive days
  # (the previous 1:(nrow(temp)-1) loop errored in that case)
  if (n_obs < 2) {
    next
  }
  sx <- as.matrix(temp[, ili_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  first <- seq_len(n_obs - 1)
  # Adjacent rows that are exactly one study day apart
  consecutive <- which(temp$StudyDay[first + 1] == temp$StudyDay[first] + 1)
  # TRUE where a symptom flag is 1 on both days of a consecutive pair
  both_days <- sx[consecutive, , drop = FALSE] + sx[consecutive + 1, , drop = FALSE] == 2
  # A pair qualifies when both of its symptoms are present on both days of the same consecutive pair
  pair_hit <- both_days[, sx_pairs[1, ], drop = FALSE] & both_days[, sx_pairs[2, ], drop = FALSE]
  two_day_hit[i] <- any(pair_hit)
}
# One-column matrix so that as.data.frame() downstream still names the column "V1"
# (and yields a valid 0-row frame if nobody qualifies)
c_sub <- matrix(sub[two_day_hit], ncol = 1)
# The original run reported 5 subjectIDs in c_sub. If we want to exclude Sx that occurred before day 0,
# we use the df that also holds the pre-day-1 rows and additionally require that neither symptom of the
# qualifying pair was reported on any row with StudyDay < 0.
# FIX: the previous version accumulated the second symptom's baseline as `sum2 <- sum1 + temp[m, l]`,
# so sum2 never summed the second symptom's own history (it only reflected the last baseline row).
# Both symptoms' baselines are now summed independently.
# Also, a subject with no StudyDay == -1 row no longer causes an error.
# NOTE(review): the baseline cut-off here is "before day 0" (StudyDay < 0), whereas the other stringent
# loops in this script use "before day 1" (through StudyDay == 0) -- confirm which is intended.
resp_sx <- c("runnyNose123", "stuffyNose123", "sneezing123",
             "soreThroat123", "cough123", "SOB123")
other_sx <- c("headache123", "muscleAches123", "malaise123")
ili_sx <- c(resp_sx, other_sx)
# All symptom pairs (a, b) with a < b whose first member is respiratory
sx_pairs <- combn(length(ili_sx), 2)
sx_pairs <- sx_pairs[, sx_pairs[1, ] <= length(resp_sx), drop = FALSE]

sub <- unique(ILI_V3_IR_grade123_afebrile$SubjectID)
two_day_stringent_hit <- logical(length(sub))
for (i in seq_along(sub)) {
  temp <- ILI_V3_IR_grade123_afebrile[ILI_V3_IR_grade123_afebrile$SubjectID == sub[i], ]
  n_obs <- nrow(temp)
  if (n_obs < 2) {
    next
  }
  sx <- as.matrix(temp[, ili_sx])
  sx[is.na(sx)] <- 0 # missing symptom flags are treated as "not reported"
  first <- seq_len(n_obs - 1)
  # Adjacent rows that are exactly one study day apart
  consecutive <- which(temp$StudyDay[first + 1] == temp$StudyDay[first] + 1)
  both_days <- sx[consecutive, , drop = FALSE] + sx[consecutive + 1, , drop = FALSE] == 2
  pair_hit <- both_days[, sx_pairs[1, ], drop = FALSE] & both_days[, sx_pairs[2, ], drop = FALSE]
  # Symptoms never reported before day 0
  baseline_free <- colSums(sx[which(temp$StudyDay < 0), , drop = FALSE]) == 0
  pair_allowed <- baseline_free[sx_pairs[1, ]] & baseline_free[sx_pairs[2, ]]
  two_day_stringent_hit[i] <- any(pair_hit[, pair_allowed, drop = FALSE])
}
# One-column matrix, same shape as the rbind()-built result used elsewhere in this script
c_sub2 <- matrix(sub[two_day_stringent_hit], ncol = 1)
# NOTE(review): the original run reported the same 5 subjectIDs as c_sub, but that was obtained with
# the sum2 defect described above; re-run to confirm.
# For now we will use the less stringent criteria (c_sub) and thus convert c_sub into a df for future manipulation.
# FIX: the second ILI criteria has two parts ("lasting >=24 hours" = all recordings of a single day, OR two
# consecutive days). Previously only the two-consecutive-day subjects (c_sub) were carried forward and the
# single-day subjects in ILI_V3_IR_criteria2_singleday were never merged in; they are now unioned here,
# mirroring how the CR symptomatic single-day and two-day sets are combined earlier in this script.
ILI_V3_IR_criteria2 <- as.data.frame(c_sub) %>%
  rename(SubjectID = "V1") %>%
  full_join(ILI_V3_IR_criteria2_singleday, by = "SubjectID")

# Union of the subjects meeting ILI criteria 1 (febrile) and ILI criteria 2 (afebrile, >= 2 symptoms)
ILI_V3_IR <- ILI_V3_IR_criteria1 %>%
  full_join(ILI_V3_IR_criteria2)

# Attach each ILI subject's QuarantineNumber, then count distinct ILI subjects per quarantine
# (one row per quarantine, ready to be joined onto the table)
ILI_V3_IR_table3 <- ILI_V3_IR %>%
  left_join(Qdata_QuarantineNumbers, by = "SubjectID") %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_ILI_V3_IR = n_distinct(SubjectID))

# Append to Table3_IR the number of ILI by version 3 criteria and its fraction of the exposed IR
Qdata_table3_IR <- mutate(
  left_join(Qdata_table3_IR, ILI_V3_IR_table3, by = "QuarantineNumber"),
  Fraction_ILI_V3_IR_of_ExposedIR = Number_ILI_V3_IR / NumberExposedIR
)
print(Qdata_table3_IR)

#### Table 3: d2a) CR: Number of ILI (and % of exposed) ####

# # Operationally, this means evidence of fever >100F (>37.9C) & any evidence of cough or sore throat or DPE Pharyngitis
# # Note that >100F could really be implemented as >=37.8 but the EMIT team in the UK consistently uses >37.9C so we will follow suit
# 
# # First, cut the dataset to only the infected donors who meet the definition for fever
# # Note, none of the volunteers registered a fever on any of the study days prior to inoculation day
# ILIdata_CR <- Exposed_CR %>%
#   select(SubjectID, QuarantineNumber) %>%
#   left_join(Qdata) %>%
#   select(SubjectID, QuarantineNumber, StudyDay, Sx_Date, SDC_time, Tympanic.temp..degrees.C., cough, soreThroat, DPEPharyngitis) %>%
#   filter(StudyDay == -3 | StudyDay == -2 | StudyDay == -1 | StudyDay == 0 | StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | 
#            StudyDay == 3 | StudyDay == 5 | StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10) %>%
#   group_by(SubjectID, StudyDay, SDC_time) %>%
#   distinct(SDC_time, .keep_all = TRUE) %>%
#   arrange(SubjectID, StudyDay) %>%
#   ungroup()
# 
# # Let's consolidate the sore throat and pharyngitis variables to make one cumulative variable (soreThroat or DPEPharyngitis)
# ILIdata_CR <- ILIdata_CR %>%
#   mutate(st = cough>=1 | soreThroat>=1 | DPEPharyngitis>=1, st = as.numeric(st))
# 
# # First need to create a df with study data from only study days 1-10
# ILIdata_CR_day1to10 <- ILIdata_CR %>%
#   filter(StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4 | StudyDay == 5 |
#            StudyDay == 6 | StudyDay == 7 | StudyDay == 8 | StudyDay == 9 | StudyDay == 10)
# 
# sub <- unique(ILIdata_CR_day1to10$SubjectID)
# c_sub <- c()
# token_t<-0
# token_s<-0
# token<-0
# for (i in 1:length(sub)) {
#   token<-0
#   token_t<-0
#   token_s<-0
#   subid <- sub[i]
#   temp <- ILIdata_CR_day1to10[ILIdata_CR_day1to10$SubjectID == subid, ]
#   for (j in 1:(nrow(temp))) {
#     if (!is.na(temp$Tympanic.temp..degrees.C.[j])) {
#       if (temp$Tympanic.temp..degrees.C.[j] >37.9) {
#         token_t<-1
#       }
#     }
#     if (sum(temp$cough[j], temp$st[j], na.rm = TRUE) >=1) {
#       token_s<-1
#     }
#     if (token_t==1 & token_s==1){
#       token<-1
#       c_sub <- rbind(c_sub, subid)
#       break
#     }
#   }
#   if (token==1) {
#     break
#   }
# }
# # But also note: there were 0 instances where this criteria was met.
# # Note: the above code does not check for the case that someone had fever, cough, or sore throat prior to studyday1.
# # Future iterations to include more generalized cases of this code would do well to implement logic that would filter ...
# # ... those that met ILI criteria, accounting for the case where symptoms appeared before StudyDay == 1.
# 
# # Now adding this vector of studyIDs to the table3_CR
# #ILI_febrile_CR <- as.data.frame(c_sub) %>%
#   #rename(SubjectID = "V1") 
# # 0 instances so let's fill it in as 0
# ILI_febrile_CR <- as.data.frame(c(NA,NA,NA)) %>%
#   rename(SubjectID = "c(NA, NA, NA)")
# 
# # Now adding the QuarantineNumber on to the Febrile ILI df 
# # Then we can sort by Q for the table3
# ILI_febrile_table3_CR <- ILI_febrile_CR %>%
#   left_join(Qdata_QuarantineNumbers) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_ILI_febrile_CR = n_distinct(SubjectID))
# 
# # Add onto Table3 the number of ILI by version 1 criteria and % of infected
# # For the final version of table 3 we will use the ILI V3, and thus will ignore this in the printed table
# #Qdata_table3_CR <- Qdata_table3_CR %>%
#   #left_join(ILI_febrile_table3_CR, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_febrile_of_ExpCR = Number_ILI_febrile_CR/NumberExposedCR)
# #print(Qdata_table3_CR)

#### Table 3: d2b) CR: Second classification of ILI (afebrile ILI) (and % of exposed) ####

# # This definition of afebrile ILI is: symptom of grade >=1 for cough or soreThroat (or DPEPharyngitis)
# sub <- unique(ILIdata_CR_day1to10$SubjectID)
# c_sub <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token_c<-0
#   token_st<-0
#   subid <- sub[i]
#   temp <- ILIdata_CR_day1to10[ILIdata_CR_day1to10$SubjectID == subid, ]
#   temp1<-temp[,6:9]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:5],temp1)
#   for (j in 1:(nrow(temp))) {
#     if (sum(temp$cough[j]) >=1) {
#       token_c<-1
#     }
#     if (sum(temp$st[j]) >= 1) {
#       token_st<-1
#     }
#   }
#   if (token_c + token_st >=1) {
#     c_sub <- rbind(c_sub, subid)
#   }
# }
# # This yields a c_sub vector of 5 subjectIDs, however if we use code to eliminate those who were ...
# # ...symptomatic for cough or soreThroat before Day0 then we get...
# sub <- unique(ILIdata_CR$SubjectID)
# c_sub2 <- c()
# token<-0
# for (i in 1:length(sub)) {
#   token_c<-0
#   token_st<-0
#   subid <- sub[i]
#   temp <- ILIdata_CR[ILIdata_CR$SubjectID == subid, ]
#   temp1<-temp[,6:9]
#   temp1[is.na(temp1)]<-0
#   temp<-cbind(temp[,1:5],temp1)
#   for (j in 1:(nrow(temp))) {
#     if (sum(temp$cough[j]) >=1) {
#       sum1<-0
#       for (k in 1:(tail(which(temp$StudyDay==-1), n=1))) {
#         sum1 <- sum1+temp$cough[k]
#         if (sum1 == 0) {
#           token_c<-1
#         }
#       }
#     }
#     if (sum(temp$st[j]) >= 1) {
#       sum2<-0
#       for (l in 1:(tail(which(temp$StudyDay==-1), n=1))) {
#         sum2 <- sum2+temp$st[l]
#         if (sum2 == 0) {
#           token_st<-1
#         }
#       }
#     }
#   }
#   if (token_c + token_st >=1) {
#     c_sub2 <- rbind(c_sub2, subid)
#   }
# }
# # This yielded 5 subjectIDs (same as the less stringent criteria). However, for now we will use the less stringent criteria.
# # Now adding this vector of studyIDs to the table3_CR
# ILI_afebrile_CR <- as.data.frame(c_sub) %>%
#   rename(SubjectID = "V1") 
# 
# # Taking out the febrile
# ILI_afebrile_CR <- ILI_afebrile_CR %>%
#   anti_join(ILI_febrile_CR, by = c("SubjectID" = "SubjectID"))
# 
# # Now adding the QuarantineNumber on to the ILI afebrile df 
# # Then we can sort by Q for the table3
# ILI_afebrile_table3_CR <- ILI_afebrile_CR %>%
#   left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
#   group_by(QuarantineNumber) %>%
#   summarize(Number_ILI_afebrile_CR = n_distinct(SubjectID))
# 
# # Add onto Table3 the number of ILI afebrile and % of infected
# # For the final version of table 3 we will use the Symptomatic V3, and thus will ignore this in the printed table
# #Qdata_table3_CR <- Qdata_table3_CR %>%
#   #left_join(ILI_afebrile_table3_CR, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
#   #mutate(Fraction_ILI_afebrile_of_ExpCR = Number_ILI_afebrile_CR/NumberExposedCR)
# #print(Qdata_table3_CR)

#### Table 3: d1c) CR: Third version of classification of ILI (to match Killingley et al., 2012) (and % of infected) ####

# This definition of ILI is: "an illness lasting >=24 hours with either (1) fever >37.9°C plus at least 1 respiratory symptom or (2) >=2 symptoms, at least 1 of which must be respiratory."
# Where "respiratory symptom" means evidence of any grade of runny nose, stuffy nose, sneeze, sore throat, cough, shortness of breath
# Where "lasting >=24 hours" means evidence of the symptom over all three instances of symptom measurements for a single day, or evidence of the symptom over two days at any frequency (1-3/3 instances of symptom recordings)

# First, let's program the first criteria (fever >37.9C plus at least 1 respiratory symptom)
# To do this, we can:
# a) create the set of subject IDs that meet the fever criteria, and then check them for...
# b) evidence of three instances during a single day, or
# c) evidence of any frequency of instances >=1 for 2 consecutive days
# Then, we can deal with the second criteria for ILI (>=2 symptoms one of which being a respiratory)

# Find the SubjectIDs from among the exposed CR, that had fever
# First check to see if anyone had fever before day 1. 
# Fever screen for the CR arm, using tympanic temperature > 37.9 C.
# Window 1: the days before exposure follow-up (study days -3 to 0).
Qdata_CR_febrile_pre_day1 <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    StudyDay %in% (-3:0),
    Tympanic.temp..degrees.C. > 37.9
  ) %>%
  distinct(SubjectID)
# Original author's note: no CR had fever before day 1.

# Window 2: study days 1 to 10. One row per CR subject with at least one
# febrile reading in that window.
Qdata_CR_febrile_day1to10 <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    StudyDay %in% 1:10,
    Tympanic.temp..degrees.C. > 37.9
  ) %>%
  distinct(SubjectID)
# Original author's note: no CR had fever during study days 1-10.

# The day 1-10 febrile set is the starting point for ILI criterion 1
# (fever plus a respiratory symptom).
ILI_V3_CR_criteria1 <- Qdata_CR_febrile_day1to10

# Now we can move to the second criteria for ILI for the CR

## Plan for implementing the second criteria for ILI (>= 2 symptoms for >=24 hours, 1 of which is respiratory) and merging with the first criteria for ILI
# To do this, first we will filter those subject IDs without fever.
# Then we will see who among those without fever had respiratory symptom on a single day plus at least one other symptom on the same single day
# Then we will see who among those without fever had respiratory symptom at frequency >=1 over 2 days plus at least one other resp symptom at freq >=1 for same 2 days
# Then we will add those subject IDs together to form ILI_CR_criteria2
# Since there were no subjects forming an ILI_CR_criteria1 df, we will use and ILI_CR_criteria2 together to make ILI_CR

# First, keep the subject IDs WITHOUT fever: every exposed CR subject ID that
# does not appear in the day 1-10 febrile set (Qdata_CR_febrile_day1to10).
Qdata_CR_afebrile_day1to10 <- anti_join(
  select(Exposed_CR, SubjectID),
  Qdata_CR_febrile_day1to10
)

# Now we will see who among those without fever had respiratory symptom on a single day plus at least one other respiratory symptom on the same single day

# Build the observation-level symptom table for the afebrile exposed CR (the
# subject IDs in "Qdata_CR_afebrile_day1to10"): a Febrile flag plus nine 0/1
# symptom flags (6 respiratory: runny nose, stuffy nose, sneezing, sore throat,
# cough, SOB; 3 other: headache, muscle aches, malaise).
#
# The two data sets below previously repeated the same ~45-line pipeline; the
# shared part now lives in ili_v3_flag_observations(). Derived columns that the
# final select() discarded anyway (URI/LRI/SystemicI sums and the DPE* flags)
# are no longer computed, so the resulting data frames are unchanged.

# Recode a symptom-grade vector to a 0/1 flag: 1 for grade 1, 2 or 3, 0 for any
# other non-missing grade, NA when the grade is missing.
ili_v3_grade_to_flag <- function(grade) {
  as.numeric(grade == 1 | grade == 2 | grade == 3)
}

# Turn joined Qdata rows into the symptom-flag table.
#   observations: data frame with SubjectID, StudyDay, Sx_Date, SDC_time,
#                 QuarantineNumber, Tympanic.temp..degrees.C. and the nine
#                 symptom-grade columns.
# Returns an ungrouped data frame with the first row per
# (SubjectID, StudyDay, QuarantineNumber, SDC_time), sorted by SubjectID and
# StudyDay, holding Febrile and the nine *123 flags.
ili_v3_flag_observations <- function(observations) {
  observations %>%
    mutate(
      Febrile = as.numeric(Tympanic.temp..degrees.C. > 37.9),
      runnyNose123 = ili_v3_grade_to_flag(runnyNose),
      stuffyNose123 = ili_v3_grade_to_flag(stuffyNose),
      sneezing123 = ili_v3_grade_to_flag(sneezing),
      soreThroat123 = ili_v3_grade_to_flag(soreThroat),
      cough123 = ili_v3_grade_to_flag(cough),
      SOB123 = ili_v3_grade_to_flag(SOB),
      headache123 = ili_v3_grade_to_flag(headache),
      muscleAches123 = ili_v3_grade_to_flag(muscleAches),
      malaise123 = ili_v3_grade_to_flag(malaise)
    ) %>%
    select(
      SubjectID, StudyDay, Sx_Date, SDC_time, QuarantineNumber, Febrile,
      runnyNose123, stuffyNose123, sneezing123, soreThroat123, cough123, SOB123,
      headache123, muscleAches123, malaise123
    ) %>%
    group_by(SubjectID, StudyDay, QuarantineNumber) %>%
    # The joined data can repeat a symptom recording; keep one row per SDC_time
    distinct(SDC_time, .keep_all = TRUE) %>%
    arrange(SubjectID, StudyDay) %>%
    ungroup()
}

# Rows for study days -3 to 10, restricted to Microneut_VisitType "Q baseline".
# Original author's note: because of how the merged data is put together, that
# visit-type filter leaves out the day -3 through day 0 rows, so in practice
# this table covers days 1-10 only.
# Qdata_CR_afebrile_day1to10 carries only SubjectID, so the join key is explicit.
ILI_V3_CR_afebrile_day1to10 <- Qdata_CR_afebrile_day1to10 %>%
  left_join(Qdata, by = "SubjectID") %>%
  filter(StudyDay %in% (-3:10)) %>%
  filter(Microneut_VisitType == "Q baseline") %>%
  ili_v3_flag_observations()

# Quick fix for the missing pre-exposure days: cut the day -3 through day 0
# rows separately (no visit-type filter) ...
ILI_V3_CR_before_day1_afebrile <- Qdata_CR_afebrile_day1to10 %>%
  left_join(Qdata, by = "SubjectID") %>%
  filter(StudyDay %in% (-3:0)) %>%
  ili_v3_flag_observations()

# ... then bind both pieces together and sort by subject and study day.
ILI_V3_CR_afebrile <- bind_rows(
  ILI_V3_CR_afebrile_day1to10,
  ILI_V3_CR_before_day1_afebrile
) %>%
  arrange(SubjectID, StudyDay)

# ILI V3, second criterion, single-day rule.
# A subject qualifies when, on one study day, there is a pair of symptoms (the
# first one respiratory) whose flag totals over that day's recordings add up to
# 6 -- intended as "both symptoms recorded at all three daily measurements".
#
# Changes versus the previous nested loops (same rule, same pair order):
#   * no 1:length(...) / 1:(tail(which(...))) ranges, so empty input and
#     subjects without a day-0 row no longer raise an error;
#   * the redundant loop over the rows of a single day is gone;
#   * the stringent variant lists each subject at most once (the old loop had
#     no break on the study-day loop and could append a subject repeatedly);
#   * the result is always a one-column matrix, even with zero hits, so the
#     as.data.frame()/rename("V1") step below cannot fail on NULL.

# Subject IDs meeting the single-day rule.
#   observations:   symptom-flag table with SubjectID, StudyDay and the nine
#                   *123 flag columns (several recordings per study day).
#   clean_baseline: when TRUE, a symptom may only take part in a qualifying
#                   pair if it was never flagged on study days <= 0.
# Returns a one-column matrix of subject IDs in order of first appearance.
ili_v3_single_day_ids <- function(observations, clean_baseline = FALSE) {
  symptom_cols <- c(
    "runnyNose123", "stuffyNose123", "sneezing123", "soreThroat123",
    "cough123", "SOB123", "headache123", "muscleAches123", "malaise123"
  )
  n_resp <- 6 # the first six symptom columns are the respiratory ones

  # TRUE when an eligible respiratory symptom and an eligible later-listed
  # symptom have day totals that sum to 6
  pair_hit <- function(totals, eligible) {
    for (r in seq_len(n_resp)) {
      if (!eligible[r]) next
      partners <- (r + 1):length(totals)
      if (any(eligible[partners] & totals[r] + totals[partners] == 6)) {
        return(TRUE)
      }
    }
    FALSE
  }

  observations <- as.data.frame(observations)
  ids <- unique(observations$SubjectID)

  qualifies <- vapply(ids, function(id) {
    rows <- observations[which(observations$SubjectID == id), , drop = FALSE]
    flags <- as.matrix(rows[, symptom_cols, drop = FALSE])
    flags[is.na(flags)] <- 0 # a missing recording counts as "not present"

    eligible <- rep(TRUE, ncol(flags))
    if (clean_baseline) {
      baseline <- flags[which(rows$StudyDay <= 0), , drop = FALSE]
      eligible <- colSums(baseline) == 0
    }

    days <- unique(rows$StudyDay[!is.na(rows$StudyDay)])
    for (day in days) {
      day_totals <- colSums(flags[which(rows$StudyDay == day), , drop = FALSE])
      if (pair_hit(day_totals, eligible)) {
        return(TRUE)
      }
    }
    FALSE
  }, logical(1))

  matrix(ids[qualifies], ncol = 1)
}

# The current definition only looks at post-day-0 data, so use the
# "ILI_V3_CR_afebrile_day1to10" df.
c_sub <- ili_v3_single_day_ids(ILI_V3_CR_afebrile_day1to10)
# Original author's note: this yielded a c_sub of 3 subjectIDs.

# More stringent variant: symptoms already present before day 1 cannot count
# towards the classification (uses the table that includes days -3 to 0).
c_sub2 <- ili_v3_single_day_ids(ILI_V3_CR_afebrile, clean_baseline = TRUE)
# Original author's note: this also yielded 3 subjectIDs, the same as the less
# stringent criteria. For now we use the less stringent criteria.

# Data frame of subject IDs from the less stringent c_sub.
# This is the output for the first part (single day) of the second ILI criteria.
ILI_V3_infected_donors_criteria2_singleday <- as.data.frame(c_sub) %>%
  rename(SubjectID = "V1")

# Now implement the second part of the second ILI criteria: >=2 Sx (with >=1 resp) at any frequency over the same 2 consecutive study days

# Create an "CR_grade123_afebrile" df by collapsing the three study day values into 1, and select only the 9 symptoms that will be used as part of this analysis (the DPE aren't used here)
# The 9 are: runny nose, stuffy nose, sneeze, sore throat, cough, SOB, headache, muscleache, malaise
# Also remember to select the afebrile group (those who were never febrile, n = 35, which is actually all the CR)

# Collapse the within-day recordings to one row per subject and study day by
# taking the maximum of each 0/1 flag.
# NOTE(review): max() is called without na.rm, so a day with any missing
# recording yields NA for that flag (the loops further down turn NA into 0).
# Confirm that this is the intended handling of partially missing days.
CR_grade123_afebrile <- ILI_V3_CR_afebrile %>%
  group_by(SubjectID, StudyDay, QuarantineNumber) %>%
  summarise(
    Febrile = max(Febrile),
    runnyNose123 = max(runnyNose123),
    stuffyNose123 = max(stuffyNose123),
    sneezing123 = max(sneezing123),
    soreThroat123 = max(soreThroat123),
    cough123 = max(cough123),
    SOB123 = max(SOB123),
    headache123 = max(headache123),
    muscleAches123 = max(muscleAches123),
    malaise123 = max(malaise123)
  )

# Resulting layout: columns 1-3 are SubjectID, StudyDay, QuarantineNumber,
# column 4 is Febrile, columns 5-10 are the respiratory symptoms and columns
# 11-13 are the other symptoms that matter for this definition.
# The two-day rule (2 or more symptoms, at least one respiratory) is applied
# first without regard to symptoms that were positive before day 1.

# Restrict the daily table to study days 1-10
CR_grade123_afebrile_day1to10 <- CR_grade123_afebrile %>%
  filter(StudyDay %in% 1:10)

# ILI V3, second criterion, two-day rule.
# A subject qualifies when a pair of symptoms (the first one respiratory) is
# flagged on both of two consecutive study days, i.e. the four daily flags
# involved add up to 4.
#
# Fixes versus the previous nested loops (same rule, same pair order):
#   * the stringent variant accumulated the second symptom's baseline total as
#     `sum2 <- sum1 + ...`, so only the LAST baseline day of the second symptom
#     was actually checked; all baseline days are checked now;
#   * 1:(nrow(temp) - 1) ran over c(1, 0) for a subject with a single row, and
#     1:(tail(which(StudyDay == -1), n = 1)) errored for a subject without a
#     day -1 row; both cases are handled;
#   * the result is always a one-column matrix, even with zero hits.

# Subject IDs meeting the two-day rule.
#   daily:             one row per subject and study day with SubjectID,
#                      StudyDay and the nine *123 flag columns.
#   baseline_last_day: NULL for the plain rule. Otherwise a symptom may only
#                      take part in a qualifying pair if it was never flagged
#                      on study days <= baseline_last_day.
# Returns a one-column matrix of subject IDs in order of first appearance.
ili_v3_two_day_ids <- function(daily, baseline_last_day = NULL) {
  symptom_cols <- c(
    "runnyNose123", "stuffyNose123", "sneezing123", "soreThroat123",
    "cough123", "SOB123", "headache123", "muscleAches123", "malaise123"
  )
  n_resp <- 6 # the first six symptom columns are the respiratory ones

  # TRUE when an eligible respiratory symptom and an eligible later-listed
  # symptom have two-day totals that sum to 4
  pair_hit <- function(totals, eligible) {
    for (r in seq_len(n_resp)) {
      if (!eligible[r]) next
      partners <- (r + 1):length(totals)
      if (any(eligible[partners] & totals[r] + totals[partners] == 4)) {
        return(TRUE)
      }
    }
    FALSE
  }

  daily <- as.data.frame(daily)
  ids <- unique(daily$SubjectID)

  qualifies <- vapply(ids, function(id) {
    rows <- daily[which(daily$SubjectID == id), , drop = FALSE]
    rows <- rows[order(rows$StudyDay), , drop = FALSE]
    flags <- as.matrix(rows[, symptom_cols, drop = FALSE])
    flags[is.na(flags)] <- 0 # a missing daily flag counts as "not present"

    eligible <- rep(TRUE, ncol(flags))
    if (!is.null(baseline_last_day)) {
      baseline <- flags[which(rows$StudyDay <= baseline_last_day), , drop = FALSE]
      eligible <- colSums(baseline) == 0
    }

    for (j in seq_len(nrow(rows) - 1)) {
      # Only adjacent rows that are truly consecutive study days count
      if (!isTRUE(rows$StudyDay[j + 1] == rows$StudyDay[j] + 1)) next
      if (pair_hit(flags[j, ] + flags[j + 1, ], eligible)) {
        return(TRUE)
      }
    }
    FALSE
  }, logical(1))

  matrix(ids[qualifies], ncol = 1)
}

# Subject IDs that meet the ILI criteria over 2 consecutive study days within
# days 1-10 (with at least 1 symptom being respiratory)
c_sub <- ili_v3_two_day_ids(CR_grade123_afebrile_day1to10)
# Original author's note: this yielded a c_sub of 9 subjectIDs.

# More stringent variant: exclude symptoms that already occurred at baseline.
# NOTE(review): as in the previous code, baseline here ends at study day -1,
# whereas the single-day rule uses day 0 as the last baseline day. Confirm
# which cut-off is intended.
c_sub2 <- ili_v3_two_day_ids(CR_grade123_afebrile, baseline_last_day = -1)
# Original author's note: the previous implementation yielded 8 subjectIDs here
# (1 less than c_sub); re-check this count now that the baseline test covers
# every baseline day. We go with the less stringent criteria for now.

# Data frame for this part of the second ILI criterion, from the less
# stringent c_sub
ILI_V3_CR_criteria2_2days <- as.data.frame(c_sub) %>%
  rename(SubjectID = "V1")

# Union of the two parts of ILI criterion 2 (single-day rule and two-day rule)
ILI_V3_CR_criteria2 <- ILI_V3_infected_donors_criteria2_singleday %>%
  full_join(ILI_V3_CR_criteria2_2days)

# Union of ILI criterion 1 (febrile) and criterion 2
ILI_V3_CR <- ILI_V3_CR_criteria1 %>%
  full_join(ILI_V3_CR_criteria2)

# Attach the quarantine number to each ILI subject and count distinct subjects
# per quarantine for Table 3
ILI_CR_V3_table3 <- ILI_V3_CR %>%
  left_join(Qdata_QuarantineNumbers) %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_ILI_CR_V3 = n_distinct(SubjectID))

# Add the ILI (version 3) count and its fraction of exposed CR to Table 3 (CR)
Qdata_table3_CR <- Qdata_table3_CR %>%
  left_join(ILI_CR_V3_table3, by = "QuarantineNumber") %>%
  mutate(Fraction_ILI_V3_CR_of_ExposedCR = Number_ILI_CR_V3 / NumberExposedCR)
print(Qdata_table3_CR)

#### Table 3: e1) IR: Number of febrile (and % of exposed) ####

# All Qdata rows from the IR arm with a tympanic temperature above 37.9 C
# (no restriction on study day)
Qdata_exposed_febrile_IR <- filter(
  Qdata,
  Randomization_DorIRorCR == "IR",
  Tympanic.temp..degrees.C. > 37.9
)

# Number of distinct febrile IR subjects per quarantine
Qdata_exposed_febrile_table3_IR <- summarise(
  group_by(Qdata_exposed_febrile_IR, QuarantineNumber),
  Number_Febrile_IR = n_distinct(SubjectID)
)

# Add the febrile count and the febrile fraction of exposed IR to Table 3 (IR)
Qdata_table3_IR <- Qdata_table3_IR %>%
  left_join(Qdata_exposed_febrile_table3_IR, by = "QuarantineNumber") %>%
  mutate(Fraction_Febrile_over_ExposedIR = Number_Febrile_IR / NumberExposedIR)

#### Table 3: e2) CR: Number of febrile (and % of exposed) ####

# All Qdata rows from the CR arm with a tympanic temperature above 37.9 C
# (no restriction on study day)
Qdata_exposed_febrile_CR <- filter(
  Qdata,
  Randomization_DorIRorCR == "CR",
  Tympanic.temp..degrees.C. > 37.9
)

# Number of distinct febrile CR subjects per quarantine
Qdata_exposed_febrile_table3_CR <- summarise(
  group_by(Qdata_exposed_febrile_CR, QuarantineNumber),
  Number_Febrile_CR = n_distinct(SubjectID)
)

# Add the febrile count and the febrile fraction of exposed CR to Table 3 (CR)
Qdata_table3_CR <- Qdata_table3_CR %>%
  left_join(Qdata_exposed_febrile_table3_CR, by = "QuarantineNumber") %>%
  mutate(Fraction_Febrile_over_ExpCR = Number_Febrile_CR / NumberExposedCR)

#### Table 3: f1) IR: Number of PCR confirmed infection (and % of exposed) ####

# This was already done to get the number of infected donors for the first few columns in this Table 3
# Redo what was done earlier, but tweaking for the purpose of this column in the table 3

# List of IR SubjectIDs with the number of study days each was positive by PCR
# (a positive sample is a non-missing InfA_Ct below 38 and different from 0),
# keeping only subjects positive on 2 or more days.
# Fix: the final filter referred to `NumberDaysPosPCR`, a column that does not
# exist in this pipeline; the column created just above is NumberDaysPosPCR_IR.
Qdata_pcr_pos2_or_more_days_IR <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR") %>%
  filter(!is.na(InfA_Ct)) %>%
  filter((InfA_Ct < 38 & InfA_Ct != 0)) %>%
  group_by(SubjectID, StudyDay) %>%
  # One row per subject and positive study day; the result stays grouped by
  # SubjectID, so the next summarize counts days per subject
  summarize(count = n()) %>%
  summarize(NumberDaysPosPCR_IR = n_distinct(StudyDay)) %>%
  filter(NumberDaysPosPCR_IR >= 2)
print(Qdata_pcr_pos2_or_more_days_IR)

# Add the Q numbers to the list of SubjectIDs and summarize by Q
# Original author's note: there was no data on which to add Q numbers here;
# the code is run anyway to handle the general case
Qdata_pcr_pos2_or_more_days_table3_IR <- Qdata_pcr_pos2_or_more_days_IR %>%
  left_join(Qdata_QuarantineNumbers, by = c("SubjectID" = "SubjectID")) %>%
  group_by(QuarantineNumber) %>%
  summarize(Number_PCR_Infected_IR = n_distinct(SubjectID))
print(Qdata_pcr_pos2_or_more_days_table3_IR)

# Add to table3_IR
Qdata_table3_IR <- Qdata_table3_IR %>%
  left_join(Qdata_pcr_pos2_or_more_days_table3_IR, by = c("QuarantineNumber" = "QuarantineNumber")) %>%
  mutate(Fraction_PCR_Infected_over_ExposedIR = Number_PCR_Infected_IR / NumberExposedIR)
print(Qdata_table3_IR)

#### Table 3: f2) CR: Number of PCR confirmed infection (and % of exposed) ####

# This was already done to get the number of infected donors for the first few columns in this Table 3
# Redo what was done earlier, but tweaking for the purpose of this column in the table 3

# CR subjects with the number of distinct study days on which they had a
# positive PCR sample (non-missing InfA_Ct below 38 and different from 0),
# keeping only those positive on 2 or more days
Qdata_pcr_pos2_or_more_days_CR <- Qdata %>%
  filter(
    Randomization_DorIRorCR == "CR",
    !is.na(InfA_Ct),
    InfA_Ct < 38,
    InfA_Ct != 0
  ) %>%
  distinct(SubjectID, StudyDay) %>%
  group_by(SubjectID) %>%
  summarise(NumberDaysPosPCR_CR = n_distinct(StudyDay)) %>%
  filter(NumberDaysPosPCR_CR >= 2)
print(Qdata_pcr_pos2_or_more_days_CR)

# Attach the quarantine number to each subject and count subjects per quarantine
Qdata_pcr_pos2_or_more_days_table3_CR <- Qdata_pcr_pos2_or_more_days_CR %>%
  left_join(Qdata_QuarantineNumbers, by = "SubjectID") %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_PCR_Infected_CR = n_distinct(SubjectID))
print(Qdata_pcr_pos2_or_more_days_table3_CR)

# Add the count and its fraction of exposed CR to table3_CR
Qdata_table3_CR <- Qdata_table3_CR %>%
  left_join(Qdata_pcr_pos2_or_more_days_table3_CR, by = "QuarantineNumber") %>%
  mutate(Fraction_PCR_Inf_over_ExpCR = Number_PCR_Infected_CR / NumberExposedCR)
print(Qdata_table3_CR)

#### * Detailed report on the CRs who were PCR positive ####
## No CR met the PCR-confirmed infection definition used above (PCR positive on >=2 study days), but some CRs were PCR positive on a single day
## Let's look at the serology data and respiratory symptom data for these individuals

# First confirm which CR had any PCR-positive sample (non-missing InfA_Ct below
# 38 and different from 0), and pull their serology columns.
# Fix: SubjectID is now part of the first distinct() key. Without it, two
# subjects sharing the same draw date, visit type, symptom date and SDC_time
# were collapsed into a single row, which could drop one subject's record.
PCR_Pos_CR_Serology_Report <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR") %>%
  filter(!is.na(InfA_Ct)) %>%
  filter((InfA_Ct < 38 & InfA_Ct != 0)) %>%
  distinct(SubjectID, Microneut_DrawDate, Microneut_VisitType, Sx_Date, SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, Microneut_DrawDate) %>%
  select(SubjectID, QuarantineNumber:Microneutralization.Titer.to.A.Wisconsin.67.2005, HAI_dayminus2:HAI_day28_recodeNDA, Randomization_DorIRorCR, StudyDay, Sx_Date, InfA_Ct) %>%
  filter(!is.na(Microneut_DrawDate)) %>%
  # One row per subject and microneutralization visit
  distinct(SubjectID, Microneut_VisitType, .keep_all = TRUE)
# This gives us the data about serology and pcr for the days that were pcr posiive for the 2 CR with positive pcr
# Now, let's get the data for MN and then the data for HAI and prepare it for tabular representation

# Microneutralization (MN) serology table for the PCR-positive CR subjects:
# rename the report columns to display labels, then keep the columns from the
# subject ID through the MN titer
PCR_Pos_CR_Serology_Report_MN <- PCR_Pos_CR_Serology_Report %>%
  rename(
    `Subject ID` = SubjectID,
    `Quarantine Number` = QuarantineNumber,
    `Draw Date` = Microneut_DrawDate,
    `MN Visit` = Microneut_VisitType,
    `MN Titer to A/WI/67/2005` = Microneutralization.Titer.to.A.Wisconsin.67.2005
  ) %>%
  select(`Subject ID`:`MN Titer to A/WI/67/2005`)

# Save the table for later RMarkdown reporting
write.csv(
  x = PCR_Pos_CR_Serology_Report_MN,
  file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Pos_CR_Serology_Report_MN.csv"
)
# HAI serology table for the PCR-positive CR subjects: one row per subject
# (the first row in the report), with display labels assigned while selecting
PCR_Pos_CR_Serology_Report_HAI <- PCR_Pos_CR_Serology_Report %>%
  select(
    `Subject ID` = SubjectID,
    `Quarantine Number` = QuarantineNumber,
    `HAI 2 Days Before Entry to Q` = HAI_dayminus2,
    `HAI 2 Days Before Entry to Q (Recoded Nondetect)` = HAI_dayminus2_recodeNDA,
    `HAI Day 28` = HAI_day28
  ) %>%
  distinct(`Subject ID`, .keep_all = TRUE)

# Save the table for later RMarkdown reporting
write.csv(
  x = PCR_Pos_CR_Serology_Report_HAI,
  file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Pos_CR_Serology_Report_HAI.csv"
)

# Table of PCR Ct values (InfA target) for the PCR-positive CR subjects:
# one row per subject (the first row in the report), with display labels
# assigned while selecting
PCR_Pos_CR_pcr_Report <- PCR_Pos_CR_Serology_Report %>%
  select(
    `Subject ID` = SubjectID,
    `Randomized Group` = Randomization_DorIRorCR,
    `Quarantine Number` = QuarantineNumber,
    `Study Day` = StudyDay,
    `Date` = Sx_Date,
    `Ct Value` = InfA_Ct
  ) %>%
  distinct(`Subject ID`, .keep_all = TRUE)

# Save the table for later RMarkdown reporting
write.csv(
  x = PCR_Pos_CR_pcr_Report,
  file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/PCR_Pos_CR_pcr_Report.csv"
)

# Symptom profiles for the two CR subjects with a positive PCR (236 and 242).
# Fixes:
#   * `SubjectID == c(236, 242)` recycled the two IDs along the rows (odd rows
#     compared with 236, even rows with 242) and so silently dropped records;
#     `%in%` keeps every row of both subjects.
#   * SubjectID is now part of the distinct() key; otherwise recordings taken
#     by both subjects at the same Sx_Date and SDC_time collapsed into one row.
Positive_PCR_CR_Symptoms <- Qdata %>%
  filter(SubjectID %in% c(236, 242)) %>%
  select(SubjectID:QuarantineNumber, StudyDay:Tympanic.temp..degrees.C.) %>%
  filter(!is.na(SDC_time)) %>%
  distinct(SubjectID, Sx_Date, SDC_time, .keep_all = TRUE) %>%
  arrange(SubjectID, Sx_Date, SDC_time) %>%
  # The grouping does not affect the row-wise score below; it is retained so
  # the object stays grouped by StudyDay as before
  group_by(StudyDay) %>%
  mutate(Total_Respiratory_Score = runnyNose + stuffyNose + sneezing + soreThroat + cough + SOB)
# Original author's note: no symptom scores (respiratory or other) were above
# 0, so a plot was not considered worthwhile. That observation was made with
# the previous row selection and should be re-checked against this table.

#### Table 3: g1) IR: Number of PCR confirmed infection and seroconversion (and % of exposed) ####

# Infected IR that are PCR positive on 2 or more days and carry a quarantine
# number in at least one of the two joined quarantine-number columns.
# Qdata_infected_IR is built earlier in the script.
Inf_PCR_and_Sero_IR <- filter(
  Qdata_infected_IR,
  NumberDaysPosPCR_IR >= 2,
  !(is.na(QuarantineNumber.x) & is.na(QuarantineNumber.y))
)

# Number of distinct subjects meeting this criterion, per quarantine
# NOTE(review): this groups by QuarantineNumber while the filter above uses
# QuarantineNumber.x / .y -- confirm Qdata_infected_IR has all three columns
Inf_PCR_and_Sero_table_IR <- summarise(
  group_by(Inf_PCR_and_Sero_IR, QuarantineNumber),
  Number_PCR_and_Sero_Positive_IR = n_distinct(SubjectID)
)

# Add the count and its fraction of exposed IR to the cumulative Table 3 (IR)
Qdata_table3_IR <- Qdata_table3_IR %>%
  left_join(Inf_PCR_and_Sero_table_IR, by = "QuarantineNumber") %>%
  mutate(Fraction_Inf_PCR_and_Sero_Positive_of_ExposedIR = Number_PCR_and_Sero_Positive_IR / NumberExposedIR)

#### Table 3: g2) CR: Number of PCR confirmed infection and seroconversion (and % of exposed) ####

# Infected CR that are PCR positive on 2 or more days and carry a quarantine
# number in at least one of the two joined quarantine-number columns.
# Qdata_infected_CR is built earlier in the script.
Inf_PCR_and_Sero_CR <- filter(
  Qdata_infected_CR,
  NumberDaysPosPCR_CR >= 2,
  !(is.na(QuarantineNumber.x) & is.na(QuarantineNumber.y))
)

# Number of distinct subjects meeting this criterion, per quarantine
# NOTE(review): this groups by QuarantineNumber while the filter above uses
# QuarantineNumber.x / .y -- confirm Qdata_infected_CR has all three columns
Inf_PCR_and_Sero_table3_CR <- summarise(
  group_by(Inf_PCR_and_Sero_CR, QuarantineNumber),
  Number_PCR_and_Sero_Positive_CR = n_distinct(SubjectID)
)

# Add the count and its fraction of exposed CR to the cumulative Table 3 (CR)
Qdata_table3_CR <- Qdata_table3_CR %>%
  left_join(Inf_PCR_and_Sero_table3_CR, by = "QuarantineNumber") %>%
  mutate(Fraction_Inf_PCR_and_Sero_Positive_CR = Number_PCR_and_Sero_Positive_CR / NumberExposedCR)

#### Table 3: h1) IR: Number of seroconversion by HAI: MN: Either (and % of exposed) ####

# The seroconversion lists used here were already generated for the first few columns of this table.
# They are re-summarized below to produce the Table 3 columns in question.

## HAI

# Qdata_HAI_pos_IR holds the IR with seroconversion by HAI (Glasgow serology)
# Count distinct SubjectIDs within each quarantine
Qdata_HAI_pos_table3_IR <- Qdata_HAI_pos_IR %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_HAI_Positive_IR = n_distinct(SubjectID))

# Add the HAI count to the cumulative table.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_HAI_Positive_of_ExposedIR = Number_HAI_Positive_IR/NumberExposedIR)
Qdata_table3_IR <- left_join(Qdata_table3_IR, Qdata_HAI_pos_table3_IR, by = "QuarantineNumber")

## Microneuts

# Qdata_Microneut_pos_IR holds the IR with seroconversion by Microneuts (CDC serology)
# Count distinct SubjectIDs within each quarantine
Qdata_Microneut_pos_table3_IR <- Qdata_Microneut_pos_IR %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_Microneut_Positive_IR = n_distinct(SubjectID))

# Add the Microneut count to the cumulative table.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_MN_Positive_of_ExposedIR = Number_Microneut_Positive_IR/NumberExposedIR)
Qdata_table3_IR <- left_join(Qdata_table3_IR, Qdata_Microneut_pos_table3_IR, by = "QuarantineNumber")

## Either HAI or MN

# Derived from Qdata_infected_IR: rows with a non-missing quarantine number on either side,
# counted as distinct SubjectIDs per quarantine
Pos_Either_HAI_or_MN_table3_IR <- Qdata_infected_IR %>%
  filter(!is.na(QuarantineNumber.x) | !is.na(QuarantineNumber.y)) %>%
  group_by(QuarantineNumber) %>%
  summarise(Pos_Either_HAI_or_MN_IR = n_distinct(SubjectID))

# Add the "either" count to the cumulative Qdata_table3_IR.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_Pos_Either_HAI_or_MN_of_ExpIR = Pos_Either_HAI_or_MN_IR/NumberExposedIR)
Qdata_table3_IR <- left_join(Qdata_table3_IR, Pos_Either_HAI_or_MN_table3_IR, by = "QuarantineNumber")

#### Table 3: h2) CR: Number of seroconversion by HAI: MN: Either (and % of exposed) ####

# The seroconversion lists used here were already generated for the first few columns of this table.
# They are re-summarized below to produce the Table 3 columns in question.

## HAI

# Qdata_HAI_pos_CR holds the CR with seroconversion by HAI (Glasgow serology)
# Count distinct SubjectIDs within each quarantine
Qdata_HAI_pos_table3_CR <- Qdata_HAI_pos_CR %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_HAI_Positive_CR = n_distinct(SubjectID))
print(Qdata_HAI_pos_table3_CR)

# Add the HAI count to the cumulative table.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_HAI_Positive_of_ExposedCR = Number_HAI_Positive_CR/NumberExposedCR)
Qdata_table3_CR <- left_join(Qdata_table3_CR, Qdata_HAI_pos_table3_CR, by = "QuarantineNumber")
print(Qdata_table3_CR)

## Microneuts

# Qdata_Microneut_pos_CR holds the CR with seroconversion by Microneuts (CDC serology)
# Count distinct SubjectIDs within each quarantine
Qdata_Microneut_pos_table3_CR <- Qdata_Microneut_pos_CR %>%
  group_by(QuarantineNumber) %>%
  summarise(Number_Microneut_Positive_CR = n_distinct(SubjectID))
print(Qdata_Microneut_pos_table3_CR)

# Add the Microneut count to the cumulative table.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_MN_Positive_of_ExposedCR = Number_Microneut_Positive_CR/NumberExposedCR)
Qdata_table3_CR <- left_join(Qdata_table3_CR, Qdata_Microneut_pos_table3_CR, by = "QuarantineNumber")
print(Qdata_table3_CR)

## Either HAI or MN

# Derived from Qdata_infected_CR: rows with a non-missing quarantine number on either side,
# counted as distinct SubjectIDs per quarantine
Pos_Either_HAI_or_MN_table3_CR <- Qdata_infected_CR %>%
  filter(!is.na(QuarantineNumber.x) | !is.na(QuarantineNumber.y)) %>%
  group_by(QuarantineNumber) %>%
  summarise(Positive_Either_HAI_or_MN_CR = n_distinct(SubjectID))
print(Pos_Either_HAI_or_MN_table3_CR)

# Add the "either" count to the cumulative Qdata_table3_CR.
# The percentage is not needed for the final version of Table 3, so this step stays disabled:
#   mutate(Fraction_Positive_Either_HAI_or_MN_of_ExposedCR = Positive_Either_HAI_or_MN_CR/NumberExposedCR)
Qdata_table3_CR <- left_join(Qdata_table3_CR, Pos_Either_HAI_or_MN_table3_CR, by = "QuarantineNumber")

#### * Detailed report on the CR who seroconverted ####
## The above shows that there was a single CR who seroconverted (by both MN and HAI evidence)
## Let's do a detailed summary of the serology and symptoms data associated with this CR: #233
## This section writes four CSVs (MN serology, HAI serology, symptoms, per-day symptom line) and draws one line plot.

# First let's do the report for serology (we know the PCR is negative for #233)

# One row per MN visit type for subject 233: de-duplicate, order by MN draw date, keep the
# serology columns, drop rows without an MN draw date, then keep the first row of each visit type.
seroconversion_CR_serology_report <- Qdata %>%
  filter(SubjectID == 233) %>%
  distinct(Microneut_DrawDate, Microneut_VisitType, Sx_Date, SDC_time, .keep_all = TRUE) %>%
  arrange(Microneut_DrawDate) %>%
  select(SubjectID, QuarantineNumber: Microneutralization.Titer.to.A.Wisconsin.67.2005, HAI_dayminus2: HAI_day28_recodeNDA) %>%
  filter(!is.na(Microneut_DrawDate)) %>%
  distinct(Microneut_VisitType, .keep_all = TRUE)
# Need to make one table for the MN data and one table for the HAI data
# MN Serology table for #233 (columns renamed to display-ready headers)
seroconversion_CR_serology_report_MN <- seroconversion_CR_serology_report %>%
  select(SubjectID:Microneutralization.Titer.to.A.Wisconsin.67.2005) %>%
  rename(`Subject ID` = SubjectID,
         `Quarantine Number` = QuarantineNumber,
         `Draw Date` = Microneut_DrawDate,
         `MN Visit` = Microneut_VisitType,
         `MN Titer to A/WI/67/2005` = Microneutralization.Titer.to.A.Wisconsin.67.2005)
# Write out this df for future RMarkdown reporting
write.csv(seroconversion_CR_serology_report_MN, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Seroconverted_CR_MN_Serology.csv")
# HAI Serology table for #233
# The HAI columns are selected once per subject, so collapse to a single row with distinct(SubjectID)
seroconversion_CR_serology_report_HAI <- seroconversion_CR_serology_report %>%
  select(SubjectID:QuarantineNumber, HAI_dayminus2, HAI_dayminus2_recodeNDA, HAI_day28) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  rename(`Subject ID` = SubjectID,
         `Quarantine Number` = QuarantineNumber,
         `HAI 2 Days Before Entry to Q` = HAI_dayminus2,
         `HAI Day 28` = HAI_day28,
         `HAI 2 Days Before Entry to Q (Recoded Nondetect)` = HAI_dayminus2_recodeNDA)
# Write out this df for future RMarkdown reporting
write.csv(seroconversion_CR_serology_report_HAI, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Seroconverted_CR_HAI_Serology.csv")

# Now that we have tables for MN and HAI serology, let's do the report for symptoms for #233

# Symptom reports for subject 233 through study day 10, one row per (Sx_Date, SDC_time).
# The result stays grouped by SubjectID and StudyDay.
Seroconversion_CR_Symptoms <- Qdata %>%
  filter(SubjectID == 233) %>%
  select(SubjectID:QuarantineNumber, StudyDay:Tympanic.temp..degrees.C.) %>%
  distinct(Sx_Date, SDC_time, .keep_all = TRUE) %>%
  filter(StudyDay <= 10) %>%
  group_by(SubjectID, StudyDay) %>%
  mutate(Respiratory_Total = mean(runnyNose + stuffyNose + sneezing + soreThroat + cough + SOB)) 
# Respiratory_Total is the summed respiratory item score of each report, averaged over all reports
# of that study day; the same daily value is repeated on every row of the day.
# NOTE(review): an earlier note described this as dividing the total by 3. That only matches the code
# when there are exactly 3 reports per study day - confirm against the data.
# Write out this df for future RMarkdown reporting
write.csv(Seroconversion_CR_Symptoms, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Seroconverted_CR_Symptoms.csv")

# Collapse to one row per study day for plotting (Respiratory_Total is constant within a day)
Seroconversion_CR_Symptoms_Line <- Seroconversion_CR_Symptoms %>%
  group_by(StudyDay) %>%
  summarise("Mean Total Respiratory Score" = mean(Respiratory_Total))
# Write out this df for future RMarkdown reporting
write.csv(Seroconversion_CR_Symptoms_Line, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Seroconverted_CR_Symptoms_Line.csv")

# We will not report on temperature because there was no fever for any of the Recipients in the study

# plot (line) the symptoms for #233 who seroconverted by MN and HAI
# group=1 joins all study days into a single line
p <- ggplot(Seroconversion_CR_Symptoms_Line, aes(x = StudyDay, y = `Mean Total Respiratory Score`, group=1)) +
  geom_line() +
  geom_point(shape=21, size=3, fill="white") +
  theme_bw() +
  xlab("Study Day") +
  ylab("Mean Total Respiratory Score")
p


#### Table 3: i1) IR: Number with greater than antic. immunity prior to quarantine by HAI: MN: Both (and % of exposed) ####

# The serosusceptibility definition used here (to be included in the footnote of table 1 and table 3)
# comes from Alex Mann, email correspondence of September 28, 2018:
# "An HI titre of ≤10 and/or an MN titre of <80 at baseline was retrospectively taken to indicate susceptibility to infection"
# We apply it to the intervention recipients (IR) at baseline (entry to quarantine).
# Nobody is filtered out; instead we note who among those above the thresholds (HAI > 10, MN >= 80)
# seroconverted, since seroconversion is less likely above the thresholds.
# Per the team teleconference of October 12, 2018 the term used is "greater than anticipated immunity upon admission to Q".

# Per-quarantine count of IR with baseline HAI above 10
HI_low_susceptibility_table3_footnote_IR <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR", HAI_dayminus2_recodeNDA > 10) %>%
  group_by(QuarantineNumber) %>%
  summarise(low_HI_susceptibility_at_baseline_IR = n_distinct(SubjectID))

# Per-quarantine count of IR with screening MN titer of 80 or more
MN_low_susceptibility_table3_footnote_IR <- Qdata %>%
  filter(Microneut_VisitType == "Screening",
         Randomization_DorIRorCR == "IR",
         Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80) %>%
  group_by(QuarantineNumber) %>%
  summarise(low_MN_susceptibility_at_baseline_IR = n_distinct(SubjectID))

# Per-quarantine count of IR flagged as MN seroconverted on the "Q baseline" visit
MN_seroconvert_between_screening_baseline_table3_footnote_IR <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         Microneut_VisitType == "Q baseline",
         Microneut_Seroconvert == 1) %>%
  group_by(QuarantineNumber) %>%
  summarise(MN_seroconvert_between_screening_baseline_IR = n_distinct(SubjectID))

# One row per quarantine with the three counts side by side
table3_footnote_IR <- HI_low_susceptibility_table3_footnote_IR %>%
  full_join(MN_low_susceptibility_table3_footnote_IR, by = "QuarantineNumber") %>%
  left_join(MN_seroconvert_between_screening_baseline_table3_footnote_IR, by = "QuarantineNumber")

# Looking into more detail on who exactly might not be serosusceptible or may have seroconverted
# Need to check over the below to ensure that it matches the proper criteria for seroconversion, serosusceptible, serosuitable, and seropositive, etc.
# Alex Mann has some good comments about this.

# Which SubjectIDs had greater than anticipated immunity prior to Q (by HAI, retrospectively)?

HI_low_susceptibility_table3_footnote_SubIDs_IR <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR", HAI_dayminus2_recodeNDA > 10) %>%
  group_by(QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, Randomization_DorIRorCR,
         HAI_dayminus2, HAI_day28,
         HAI_dayminus2_recodeNDA, HAI_day28_recodeNDA, HAI_dayminus2_recodeNDA_x4,
         HAI_Seroconversion, Preliminary.HAI.Classification)

# Which SubjectIDs had greater than anticipated immunity prior to Q (by MN, retrospectively)?

MN_low_susceptibility_table3_footnote_SubIDs_IR <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         Microneut_VisitType == "Screening",
         Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80) %>%
  group_by(QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, Randomization_DorIRorCR,
         Microneut_DrawDate, Microneut_VisitType,
         Microneutralization.Titer.to.A.Wisconsin.67.2005, Microneut_Seroconvert) %>%
  rename(Microneut_Seroconvert_Screening = Microneut_Seroconvert)

# Attach the follow-up ("F/up") MN seroconversion flag to the IR listed above
MN_low_susceptibility_table3_footnote_SubIDs_IR_seroconversion <- Qdata %>%
  filter(Randomization_DorIRorCR == "IR",
         Microneut_VisitType == "F/up",
         Microneut_Seroconvert == 1) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, Microneut_Seroconvert) %>%
  rename(Microneut_Seroconvert_Followup = Microneut_Seroconvert) %>%
  right_join(MN_low_susceptibility_table3_footnote_SubIDs_IR)
# There were no IR subjectIDs that seroconverted by MN, and there were no IR subjectIDs that seroconverted after having lower than expected immunity

# Bind the HAI-based and MN-based subject lists together in a single table (table 3 footnote)
Qdata_table3_footnote_subjectIDs_IR <- HI_low_susceptibility_table3_footnote_SubIDs_IR %>%
  full_join(MN_low_susceptibility_table3_footnote_SubIDs_IR_seroconversion)
# Placeholder indicator columns; these NA columns are part of the "Full_Data" CSV written below
Qdata_table3_footnote_subjectIDs_IR$HAIandMNprior <- NA
Qdata_table3_footnote_subjectIDs_IR$HAIonlyprior <- NA
Qdata_table3_footnote_subjectIDs_IR$MNonlyprior <- NA
Qdata_table3_footnote_subjectIDs_IR$Low_Suscept_Converted_Anyways_HAIorMN <- NA
# Shows that there were a total of 5 IR subjectIDs (1 by HAI and 4 by MN) and none of them had evidence of seroconversion

# Fill in the 1/0 indicators per subject. Each indicator is computed from the titer and
# seroconversion columns only, so they can be set in a single mutate().
Table3_footnote_SubjectID_summary_IR <- Qdata_table3_footnote_subjectIDs_IR %>%
  mutate(
    HAIonlyprior =
      if_else(HAI_dayminus2_recodeNDA > 10 &
                (Microneutralization.Titer.to.A.Wisconsin.67.2005 < 80 | is.na(Microneutralization.Titer.to.A.Wisconsin.67.2005)), 1, 0),
    MNonlyprior =
      if_else(Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80 &
                (HAI_dayminus2_recodeNDA <= 10 | is.na(HAI_dayminus2_recodeNDA)), 1, 0),
    HAIandMNprior =
      if_else(HAI_dayminus2_recodeNDA > 10 &
                Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80, 1, 0),
    Low_Suscept_Converted_Anyways_HAIorMN =
      if_else((HAI_dayminus2_recodeNDA > 10 & (HAI_Seroconversion == 1 | Microneut_Seroconvert_Followup == 1)) |
                (Microneut_Seroconvert_Screening == 1 & (HAI_Seroconversion == 1 | Microneut_Seroconvert_Followup == 1)), 1, 0)
  ) %>%
  select(QuarantineNumber, SubjectID, HAIonlyprior, MNonlyprior, HAIandMNprior, Low_Suscept_Converted_Anyways_HAIorMN)
# Indicators that could not be evaluated (NA) count as 0
Table3_footnote_SubjectID_summary_IR[is.na(Table3_footnote_SubjectID_summary_IR)] <- 0

# Per-quarantine totals of the indicators, with display-ready column names
Table3_IR_Footnote_Reportable_Summary <- Table3_footnote_SubjectID_summary_IR %>%
  group_by(QuarantineNumber) %>%
  summarise(`Greater than Anticipated HAI` = sum(HAIonlyprior),
            `Greater than Anticipated MN` = sum(MNonlyprior),
            `Greater than Anticipated HAI and MN` = sum(HAIandMNprior),
            Seroconverted = sum(Low_Suscept_Converted_Anyways_HAIorMN)) %>%
  rename(`Quarantine #` = QuarantineNumber)

#### Writing out Table 3_IR Footnote to box sync directory ####

# IR footnote information
write.csv(Qdata_table3_footnote_subjectIDs_IR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_IR_Footnote_Full_Data.csv")
write.csv(table3_footnote_IR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_IR_Footnote_Summary.csv")
write.csv(Table3_footnote_SubjectID_summary_IR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_IR_Footnote_Reportable_Summary_SubjectIDs.csv")
write.csv(Table3_IR_Footnote_Reportable_Summary, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_IR_Footnote_Reportable_Summary.csv")

#### Table 3: i2) CR: Number with greater than antic. immunity prior to quarantine by HAI: MN: Both (and % of exposed) ####

# Definition of serosusceptible for this analysis, which will be included in the footnote of table 1 (and table 3) is from Alex Mann from email correspondence on September 28, 2018. He states:
# "An HI titre of ≤10 and/or an MN titre of <80 at baseline was retrospectively taken to indicate susceptibility to infection"
# Thus we will use this criteria to tell who among the control recipients (CR) was serosusceptible at baseline (entry to quarantine)
# Count the number who were not serosusceptible at baseline

# Per-quarantine count of CR with baseline HAI above 10
HI_susceptibility_table3_footnote_CR <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR" & HAI_dayminus2_recodeNDA >10) %>%
  group_by(QuarantineNumber) %>%
  summarize(not_HI_susceptibile_at_baseline_CR = n_distinct(SubjectID))

# Per-quarantine count of CR with screening MN titer of 80 or more
MN_susceptibility_table3_footnote_CR <- Qdata %>%
  filter(Microneut_VisitType == "Screening") %>%
  filter(Randomization_DorIRorCR == "CR" & Microneutralization.Titer.to.A.Wisconsin.67.2005 >=80) %>%
  group_by(QuarantineNumber) %>%
  summarize(not_MN_susceptible_at_baseline_CR = n_distinct(SubjectID))

# Per-quarantine count of CR flagged as MN seroconverted on the "Q baseline" visit
MN_seroconvert_between_screening_baseline_table3_footnote_CR <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR" & Microneut_VisitType == "Q baseline") %>%
  filter(Microneut_Seroconvert == 1) %>%
  group_by(QuarantineNumber)%>%
  summarize(MN_seroconvert_between_screening_baseline_CR = n_distinct(SubjectID))

# full_join (as in the parallel IR footnote) so that a quarantine with MN findings but no
# HAI findings is still reported; a left_join from the HAI counts would silently drop it.
table3_footnote_CR <- full_join(HI_susceptibility_table3_footnote_CR, 
                                MN_susceptibility_table3_footnote_CR, 
                                by = "QuarantineNumber") %>%
  left_join(MN_seroconvert_between_screening_baseline_table3_footnote_CR, 
            by = "QuarantineNumber")

# Looking into more detail on who exactly might not be serosusceptible or may have seroconverted
# Need to check over the below to ensure that it matches the proper criteria for seroconversion, serosusceptible, serosuitable, and seropositive, etc. 
# Alex Mann has some good comments about this. 

# Which SubjectID's were not serosusceptible prior to Q (by HAI, retrospectively)?

HI_susceptibility_table3_footnote_SubIDs_CR <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR" & HAI_dayminus2_recodeNDA >10) %>%
  group_by(QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, Randomization_DorIRorCR, HAI_dayminus2, HAI_day28, HAI_dayminus2_recodeNDA, 
         HAI_day28_recodeNDA, HAI_dayminus2_recodeNDA_x4, HAI_Seroconversion, Preliminary.HAI.Classification)

# Which SubjectID's were not serosusceptible prior to Q (by MN, retrospectively)?

MN_susceptibility_table3_footnote_SubIDs_CR <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR" & Microneut_VisitType == "Screening") %>%
  filter(Microneutralization.Titer.to.A.Wisconsin.67.2005 >=80) %>%
  group_by(QuarantineNumber) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, Randomization_DorIRorCR, Microneut_DrawDate, Microneut_VisitType, 
         Microneutralization.Titer.to.A.Wisconsin.67.2005, Microneut_Seroconvert) %>%
  rename(Microneut_Seroconvert_Screening = Microneut_Seroconvert)

# Attach the follow-up ("F/up") MN seroconversion flag to the CR listed above.
# The join target must be the CR subject list; joining onto the IR list
# (MN_low_susceptibility_table3_footnote_SubIDs_IR) would fill the CR footnote with IR subjects.
MN_low_susceptibility_table3_footnote_SubIDs_CR_seroconversion <- Qdata %>%
  filter(Randomization_DorIRorCR == "CR" & Microneut_VisitType == "F/up") %>%
  filter(Microneut_Seroconvert == 1) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, Microneut_Seroconvert) %>%
  rename(Microneut_Seroconvert_Followup = Microneut_Seroconvert) %>%
  right_join(MN_susceptibility_table3_footnote_SubIDs_CR, by = "SubjectID")
# Earlier note: there was 1 CR subjectID that seroconverted by MN, and no CR subjectIDs that seroconverted after having lower than expected immunity
# NOTE(review): that note was written while the join above pointed at the IR list; re-verify it against the corrected CR output.

# Bind the HAI-based and MN-based subject lists together in a single table (table 3 footnote)
Qdata_table3_footnote_subjectIDs_CR <- HI_susceptibility_table3_footnote_SubIDs_CR %>%
  full_join(MN_low_susceptibility_table3_footnote_SubIDs_CR_seroconversion)
# Placeholder indicator columns; these NA columns are part of the "Full_Data" CSV written below
Qdata_table3_footnote_subjectIDs_CR$HAIandMNprior <- NA
Qdata_table3_footnote_subjectIDs_CR$HAIonlyprior <- NA
Qdata_table3_footnote_subjectIDs_CR$MNonlyprior <- NA
Qdata_table3_footnote_subjectIDs_CR$LowSuscept_Seroconverted_Anyways <- NA

# Fill in the 1/0 indicators per subject
Table3_footnote_SubjectID_summary_CR <- Qdata_table3_footnote_subjectIDs_CR %>%
  mutate(HAIandMNprior = 
           if_else(HAI_dayminus2_recodeNDA > 10 & Microneutralization.Titer.to.A.Wisconsin.67.2005 >= 80, 1, 0)) %>%
  mutate(HAIonlyprior = 
           if_else(HAI_dayminus2_recodeNDA > 10 & 
                     (Microneutralization.Titer.to.A.Wisconsin.67.2005 < 80 | is.na(Microneutralization.Titer.to.A.Wisconsin.67.2005)), 1, 0)) %>%
  mutate(MNonlyprior = 
           if_else(Microneutralization.Titer.to.A.Wisconsin.67.2005 >=80 & 
                     (HAI_dayminus2_recodeNDA <= 10 | is.na(HAI_dayminus2_recodeNDA)), 1, 0)) %>%
  mutate(LowSuscept_Seroconverted_Anyways = 
           if_else((HAI_dayminus2_recodeNDA > 10 & (HAI_Seroconversion == 1 | Microneut_Seroconvert_Followup == 1)) |
                     (Microneut_Seroconvert_Screening == 1 & (HAI_Seroconversion == 1 | Microneut_Seroconvert_Followup == 1)), 1, 0)) %>%
  select(QuarantineNumber, SubjectID, HAIonlyprior, MNonlyprior, HAIandMNprior, LowSuscept_Seroconverted_Anyways)
# Convert all the NA to 0
Table3_footnote_SubjectID_summary_CR[is.na(Table3_footnote_SubjectID_summary_CR)] <- 0

# Per-quarantine totals of the indicators, with display-ready column names
Table3_CR_Footnote_Reportable_Summary <- Table3_footnote_SubjectID_summary_CR %>%
  group_by(QuarantineNumber) %>%
  summarise(`Greater than Anticipated HAI` = sum(HAIonlyprior),
            `Greater than Anticipated MN` = sum(MNonlyprior),
            `Greater than Anticipated HAI and MN` = sum(HAIandMNprior),
            Seroconverted = sum(LowSuscept_Seroconverted_Anyways)) %>%
  rename(`Quarantine #` = QuarantineNumber)

#### Writing out Table 3_CR Footnote to box sync directory ####
# CR footnote information
write.csv(Qdata_table3_footnote_subjectIDs_CR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_CR_Footnote_Full_Data.csv")
write.csv(table3_footnote_CR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_CR_Footnote_Summary.csv")
write.csv(Table3_footnote_SubjectID_summary_CR, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_CR_Footnote_Reportable_Summary_SubjectID.csv")
write.csv(Table3_CR_Footnote_Reportable_Summary, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_CR_Footnote_Reportable_Summary.csv")

#### Column means, sums and rounding for Table 3_IR ####

## IR
# First convert all the NAs to 0.
Qdata_table3_IR[is.na(Qdata_table3_IR)] <- 0 

Table3_IR_manuscript <- Qdata_table3_IR # use this as a backup because it is arduous to recreate Qdata_table3_IR

# Column totals across quarantines.
# The function is passed directly: funs() has been deprecated since dplyr 0.8.0.
Table3_IR_manuscript_sums <-  Table3_IR_manuscript %>%
  summarise_all(sum)
# Append the totals as the last row. bind_rows() always appends, whereas a full_join() on
# every shared column would merge the totals into an existing row if the two were identical.
Table3_IR_manuscript_sums <- Table3_IR_manuscript %>%
  bind_rows(Table3_IR_manuscript_sums)
# This gets all of the columns to be summed
# Now let's work on changing the sums in all of the "percent" columns to the proper fraction 
# (also perform the *100 transformation to percent)
# (also round everything to the nearest whole number)
Table3_IR_manuscript_sums_fractions <- Table3_IR_manuscript_sums %>%
  mutate(Fraction_Inf_over_ExpIR = (NumberInfectedIR/NumberExposedIR)*100,
         Fraction_IR_Symptomatic_V3_of_ExposedIR = (Number_IR_Symptomatic_V3/NumberExposedIR)*100,
         Fraction_ILI_V3_IR_of_ExposedIR = (Number_ILI_V3_IR/NumberExposedIR)*100,
         Fraction_Febrile_over_ExposedIR = (Number_Febrile_IR/NumberExposedIR)*100,
         Fraction_PCR_Infected_over_ExposedIR = (Number_PCR_Infected_IR/NumberExposedIR)*100,
         Fraction_Inf_PCR_and_Sero_Positive_of_ExposedIR = (Number_PCR_and_Sero_Positive_IR/NumberExposedIR)*100) %>%
  mutate_all(round, digits = 0)

# Now add parentheses to all of these fraction variables because they are presented in manuscript table as percents
Table3_IR_manuscript_sums_fractions$Fraction_Inf_over_ExpIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_Inf_over_ExpIR, ")")
Table3_IR_manuscript_sums_fractions$Fraction_IR_Symptomatic_V3_of_ExposedIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_IR_Symptomatic_V3_of_ExposedIR, ")")
Table3_IR_manuscript_sums_fractions$Fraction_ILI_V3_IR_of_ExposedIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_ILI_V3_IR_of_ExposedIR, ")")
Table3_IR_manuscript_sums_fractions$Fraction_Febrile_over_ExposedIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_Febrile_over_ExposedIR, ")")
Table3_IR_manuscript_sums_fractions$Fraction_PCR_Infected_over_ExposedIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_PCR_Infected_over_ExposedIR, ")")
Table3_IR_manuscript_sums_fractions$Fraction_Inf_PCR_and_Sero_Positive_of_ExposedIR <- 
  paste0("(", Table3_IR_manuscript_sums_fractions$Fraction_Inf_PCR_and_Sero_Positive_of_ExposedIR, ")")

# Now bring columns together into more publishable arrangement of data in the display of the table

Table3_IR_manuscript_unite <- Table3_IR_manuscript_sums_fractions %>%
  unite(`Infected/Exposed`, NumberInfectedIR, NumberExposedIR, sep = "/", remove = TRUE) %>%
  unite(`Infected/Exposed (%)`, `Infected/Exposed`, Fraction_Inf_over_ExpIR, sep = " ", remove = TRUE) %>%
  unite(Symptomatic, Number_IR_Symptomatic_V3, Fraction_IR_Symptomatic_V3_of_ExposedIR, sep = " ", remove = TRUE) %>%
  unite(ILI, Number_ILI_V3_IR, Fraction_ILI_V3_IR_of_ExposedIR, sep = " ", remove = TRUE) %>%
  unite(Febrile, Number_Febrile_IR, Fraction_Febrile_over_ExposedIR, sep = " ", remove = TRUE) %>%
  unite(`PCR Confirmed Infection`, Number_PCR_Infected_IR, Fraction_PCR_Infected_over_ExposedIR, sep = " ", remove = TRUE) %>%
  unite(`PCR Confirmed Infection and Seroconversion`, Number_PCR_and_Sero_Positive_IR, Fraction_Inf_PCR_and_Sero_Positive_of_ExposedIR, sep = " ", remove = TRUE) %>%
  unite(`Seroconversion by HAI : MN : Either`, Number_HAI_Positive_IR, Number_Microneut_Positive_IR, Pos_Either_HAI_or_MN_IR, sep = " : ", remove = TRUE) %>%
  rename('Quarantine #' = QuarantineNumber)
# Swap the 4th and 5th columns into the order used in the manuscript table
Table3_IR_manuscript_unite <- Table3_IR_manuscript_unite[ ,c(1:3,5,4,6:8)]

# Label the totals row (the last row) of the Quarantine # column as "Total".
# Indexing by nrow() instead of a literal 4 keeps this correct if the number of quarantines changes.
Table3_IR_manuscript_unite$`Quarantine #`[nrow(Table3_IR_manuscript_unite)] <- "Total"

# Add a column that indicates in each row the recipient classification (Intervention (IR))
# We need to do this in order to keep the data organized when we merge with the CR group to make 1 table 3
Table3_IR_manuscript_unite$`Recipient Classification` <- "Intervention (IR)"
Table3_IR_manuscript_unite <- Table3_IR_manuscript_unite[,c(1,9,2:8)]

#### Writing out Table 3_IR to box sync directory ####

write.csv(Table3_IR_manuscript_unite, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_IR_Manuscript.csv")

#### Writing out Table 3_IR to latex for direct translation of code to table image for paper

kable(Table3_IR_manuscript_unite) %>%
  kable_styling("striped") %>%
  add_header_above(c(" " = 3, "Laboratory Confirmed Infection & Illness (% of Exposed)" = 3, 
                     "Laboratory Confirmed Infection Criteria (% of Exposed)" = 3))

datatable(Table3_IR_manuscript_unite,
          filter = 'top', options = list(
            pageLength = 10, autoWidth = TRUE
          ))

print(xtable(Table3_IR_manuscript_unite),
      comment = FALSE)

#### Column means, sums and rounding for Table 3_CR ####

## CR
# First convert all the NAs to 0.
Qdata_table3_CR[is.na(Qdata_table3_CR)] <- 0 

table3_CR_manuscript <- Qdata_table3_CR # use this as a backup because it is arduous to recreate Qdata_table1

table3_CR_manuscript_sums <-  table3_CR_manuscript %>%
  summarise_all(funs(sum))
table3_CR_manuscript_sums <- table3_CR_manuscript %>%
  full_join(table3_CR_manuscript_sums)
# This gets all of the columns to be summed
# Now let's work on changing the sums in all of the "percent" columns to the proper fraction 
# (also perform the *100 transformation to percent)
# (also round everything to the nearest whole number)
table3_CR_manuscript_sums_fractions <- table3_CR_manuscript_sums %>%
  mutate(Fraction_Inf_over_ExpCR = (NumberInfectedCR/NumberExposedCR)*100,
         Fraction_Symptomatic_V3_CR_of_ExposedCR = (Number_Symptomatic_V3_CR/NumberExposedCR)*100,
         Fraction_ILI_V3_CR_of_ExposedCR = (Number_ILI_CR_V3/NumberExposedCR)*100,
         Fraction_Febrile_over_ExpCR = (Number_Febrile_CR/NumberExposedCR)*100,
         Fraction_PCR_Inf_over_ExpCR = (Number_PCR_Infected_CR/NumberExposedCR)*100,
         Fraction_Inf_PCR_and_Sero_Positive_CR = (Number_PCR_and_Sero_Positive_CR/NumberExposedCR)*100) %>%
  mutate_all(funs(round(., 0)))

# Now add parentheses to all of these fraction variables because they are presented in manuscript table as percents
table3_CR_manuscript_sums_fractions$Fraction_Inf_over_ExpCR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_Inf_over_ExpCR, ")")
table3_CR_manuscript_sums_fractions$Fraction_Symptomatic_V3_CR_of_ExposedCR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_Symptomatic_V3_CR_of_ExposedCR, ")")
table3_CR_manuscript_sums_fractions$Fraction_ILI_V3_CR_of_ExposedCR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_ILI_V3_CR_of_ExposedCR, ")")
table3_CR_manuscript_sums_fractions$Fraction_Febrile_over_ExpCR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_Febrile_over_ExpCR, ")")
table3_CR_manuscript_sums_fractions$Fraction_PCR_Inf_over_ExpCR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_PCR_Inf_over_ExpCR, ")")
table3_CR_manuscript_sums_fractions$Fraction_Inf_PCR_and_Sero_Positive_CR <- 
  paste0("(", table3_CR_manuscript_sums_fractions$Fraction_Inf_PCR_and_Sero_Positive_CR, ")")

# Collapse each count column with its parenthesised fraction into a single display column per
# measure, giving the publishable layout of the CR half of Table 3

table3_CR_manuscript_unite <- table3_CR_manuscript_sums_fractions %>%
  unite(`Infected/Exposed`, NumberInfectedCR, NumberExposedCR, sep = "/") %>%
  unite(`Infected/Exposed (%)`, `Infected/Exposed`, Fraction_Inf_over_ExpCR, sep = " ") %>%
  unite(Symptomatic, Number_Symptomatic_V3_CR, Fraction_Symptomatic_V3_CR_of_ExposedCR, sep = " ") %>%
  unite(ILI, Number_ILI_CR_V3, Fraction_ILI_V3_CR_of_ExposedCR, sep = " ") %>%
  unite(Febrile, Number_Febrile_CR, Fraction_Febrile_over_ExpCR, sep = " ") %>%
  unite(`PCR Confirmed Infection`, Number_PCR_Infected_CR, Fraction_PCR_Inf_over_ExpCR, sep = " ") %>%
  unite(`PCR Confirmed Infection and Seroconversion`,
        Number_PCR_and_Sero_Positive_CR, Fraction_Inf_PCR_and_Sero_Positive_CR, sep = " ") %>%
  unite(`Seroconversion by HAI : MN : Either`,
        Number_HAI_Positive_CR, Number_Microneut_Positive_CR, Positive_Either_HAI_or_MN_CR, sep = " : ") %>%
  rename(`Quarantine #` = QuarantineNumber)

# Swap the 4th and 5th columns; all other columns keep their position
table3_CR_manuscript_unite <- table3_CR_manuscript_unite[, c(1, 2, 3, 5, 4, 6, 7, 8)]

# Label the 4th row of the Quarantine # column as the "Total" row
table3_CR_manuscript_unite[["Quarantine #"]][4] <- "Total"

# Tag every row with its recipient classification (Control (CR)) so the rows stay identifiable
# once this table is merged with the IR group table to make a single Table 3,
# then move that new (9th) column to the second position
table3_CR_manuscript_unite[["Recipient Classification"]] <- "Control (CR)"
table3_CR_manuscript_unite <- table3_CR_manuscript_unite[, c(1, 9, 2, 3, 4, 5, 6, 7, 8)]

#### Writing out Table 3_CR to box sync directory ####

write.csv(
  table3_CR_manuscript_unite,
  file = "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_CR_Manuscript.csv"
)

#### Writing out Table 3_CR to latex for direct translation of code to table image for paper

# Styled table with two spanning headers over the last six columns
kable(table3_CR_manuscript_unite) %>%
  kable_styling(bootstrap_options = "striped") %>%
  add_header_above(
    c(" " = 3,
      "Laboratory Confirmed Infection & Illness (% of Exposed)" = 3,
      "Laboratory Confirmed Infection Criteria (% of Exposed)" = 3)
  )

# Interactive, filterable view of the same table
datatable(
  table3_CR_manuscript_unite,
  filter = "top",
  options = list(pageLength = 10, autoWidth = TRUE)
)

# LaTeX source for the table (without the xtable timestamp comment)
print(xtable(table3_CR_manuscript_unite), comment = FALSE)

#### Building a cumulative Table 3 and writing out to box sync directory and for translation to manuscript ####

# Stack the IR and CR tables. No `by` is given, so the join uses every column the two tables share;
# rows are then ordered by quarantine and recipient classification.
table3 <- full_join(Table3_IR_manuscript_unite, table3_CR_manuscript_unite) %>%
  arrange(`Quarantine #`, `Recipient Classification`)

# Styled table: spanning headers, bold first column, repeated quarantine labels collapsed.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
kable(table3) %>%
  kable_styling("striped") %>%
  add_header_above(c(" " = 3, "Laboratory Confirmed Infection & Illness (% of Exposed)" = 3,
                     "Laboratory Confirmed Infection Criteria (% of Exposed)" = 3)) %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  collapse_rows(columns = 1, valign = "top")

# Interactive, filterable view of the same table
datatable(table3,
          filter = 'top', options = list(
            pageLength = 10, autoWidth = TRUE
          ))

# LaTeX source for the table
print(xtable(table3),
      comment = FALSE)

# Table3
write.csv(table3, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Table3_Manuscript.csv")

#### * FIGURE 3 -----------------------------####
#### Overview of Figure 3 ####
## Bar chart: Percent positive by NPS, fine, coarse, both fine and coarse, either fine or coarse for study days 1-4
## And line chart: NPS ct means (with sd bars) for study days 1-4
#### Implementing the bar chart piece of Figure 3 ####

## Find the number of subjects with NPS, Coarse, Fine samples for each study day 1-4
## NOTE: the study-day restriction is written as `StudyDay %in% 1:4`. In R `&` binds more tightly
## than `|`, so `A & StudyDay == 1 | StudyDay == 2 | ...` would apply the other conditions to
## study day 1 only and let every row from days 2-4 through.

# Number of subjects with NPS for each study day 1-4
InfDonors_NPSnumber <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         !is.na(InfA_Ct),
         Microneut_VisitType == "F/up", # because the 3 Microneut visit types are repeated for each PCR day of data, reduce to one set to avoid repeats
         StudyDay %in% 1:4) %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>% # one row per subject within each study day
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>% # restrict to the infected donors
  summarize(NPSsubjects = n_distinct(SubjectID))

# Number of subjects with coarse pcr results (could be positive or negative) for each study day 1-4
InfDonors_CoarseNumber <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "C") %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  summarize(CoarseSubjects = n_distinct(SubjectID))

# Number of subjects with fine for each study day 1-4
InfDonors_FineNumber <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "F") %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  summarize(FineSubjects = n_distinct(SubjectID))

## Merge together the NPS, coarse, and fine df's from above (StudyDay is the only shared column)

NumberSubjectsSampled_per_day <- InfDonors_NPSnumber %>%
  left_join(InfDonors_CoarseNumber, by = "StudyDay") %>%
  left_join(InfDonors_FineNumber, by = "StudyDay")

## Now need to know how many from these NPS, coarse, and fine totals are positive
## NOTE: the study-day restriction is written as `StudyDay %in% 1:4`. In R `&` binds more tightly
## than `|`, so `A & StudyDay == 1 | StudyDay == 2 | ...` would apply the other conditions to
## study day 1 only.

# Number of subjects with positive NPS for each study day 1-4
InfDonors_NPSnumberPos <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         !is.na(InfA_Ct),
         Microneut_VisitType == "F/up", # because the 3 Microneut visit types are repeated for each PCR day of data, reduce to one set to avoid repeats
         StudyDay %in% 1:4) %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(InfA_Ct > 0) %>% # a Ct value above 0 is what this script counts as a positive swab
  summarize(NPSsubjectsPos = n_distinct(SubjectID))

# Number of subjects with positive coarse for each study day 1-4
InfDonors_CoarseNumberPos <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "C") %>%
  group_by(StudyDay) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(!is.na(final.copies.replicate)) %>% # a non-missing replicate copy number is counted as positive
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  distinct(SubjectID, StudyDay, G2.pcr.copies.sample.type, .keep_all = TRUE) %>%
  summarize(CoarseSubjectsPos = n_distinct(SubjectID))

# Number of subjects with positive fine for each study day 1-4
InfDonors_FineNumberPos <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "F") %>%
  group_by(StudyDay) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(!is.na(final.copies.replicate)) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  distinct(SubjectID, StudyDay, G2.pcr.copies.sample.type, .keep_all = TRUE) %>%
  summarize(FineSubjectsPos = n_distinct(SubjectID))

## Combine the per-day sample counts with the per-day positive counts for NPS, coarse, and fine,
## and derive the fraction positive for each sample type on each sample day

PosBySampDay <- NumberSubjectsSampled_per_day %>%
  left_join(InfDonors_NPSnumberPos, by = "StudyDay") %>%
  left_join(InfDonors_CoarseNumberPos, by = "StudyDay") %>%
  left_join(InfDonors_FineNumberPos, by = "StudyDay") %>%
  mutate(
    NPS_fractionpos = NPSsubjectsPos / NPSsubjects,
    Coarse_fractionpos = CoarseSubjectsPos / CoarseSubjects,
    Fine_fractionpos = FineSubjectsPos / FineSubjects
  )

## Every cell that is still NA (in any column) is set to 0
PosBySampDay[is.na(PosBySampDay)] <- 0

## Need to create and add the number of "any aerosol samples" and "fraction positive for any aerosol" (coarse or Fine) to the PosBySampDay df to be plotted

# We know that there was a fine and coarse aerosol sample collected at each sampling instance so the number of any and both aerosol samples is the same as the number of subjects with samples on each study day for coarse and fine

# # Any and both aerosol samples (number)
# InfDonors_AnyAerosolPos <- Qdata %>%
#   filter(Randomization_DorIRorCR == "D" & 
#            StudyDay == 1 | StudyDay == 2 | StudyDay == 3 | StudyDay == 4) %>%
#   filter(G2.pcr.copies.sample.type == "C" | G2.pcr.copies.sample.type == "F") %>%
#   group_by(StudyDay) %>%
#   select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
#   right_join(Qdata_infected_donors, by = c("SubjectID" = "SubjectID", "QuarantineNumber" = "QuarantineNumber")) %>%
#   summarize(AnyAndBothAerosolSamples = n_distinct(SubjectID))

# Any aerosol positive: number of subjects per study day with at least one positive
# coarse or fine replicate.
# NOTE: the study-day restriction is written as `StudyDay %in% 1:4`. In R `&` binds more tightly
# than `|`, so `A & StudyDay == 1 | StudyDay == 2 | ...` would apply the donor condition to
# study day 1 only.
InfDonors_AnyAerosolPos <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type %in% c("C", "F")) %>%
  group_by(StudyDay) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type, final.copies.replicate) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>% # restrict to the infected donors
  filter(!is.na(final.copies.replicate)) %>% # a non-missing replicate copy number is counted as positive
  summarize(AnyAerosolPos = n_distinct(SubjectID))

## Both aerosol positive
## NOTE: the study-day restriction is written as `StudyDay %in% 1:4`. In R `&` binds more tightly
## than `|`, so `A & StudyDay == 1 | StudyDay == 2 | ...` would apply the donor condition to
## study day 1 only.

# First create df with only the Coarse: one row per subject and study day with a positive coarse sample
CoarseInfected <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "C") %>%
  group_by(StudyDay) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(!is.na(final.copies.replicate)) %>% # a non-missing replicate copy number is counted as positive
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type) %>%
  distinct(SubjectID, StudyDay, G2.pcr.copies.sample.type, .keep_all = TRUE)
CoarseInfected$CoarsePositive <- 1 # flag used to detect matches after the join with the fine df

# Next create df with only the Fine: one row per subject and study day with a positive fine sample
FineInfected <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4,
         G2.pcr.copies.sample.type == "F") %>%
  group_by(StudyDay) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>%
  filter(!is.na(final.copies.replicate)) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type) %>%
  distinct(SubjectID, StudyDay, G2.pcr.copies.sample.type, .keep_all = TRUE)
FineInfected$FinePositive <- 1 # flag used to detect matches after the join with the coarse df

# Pair the coarse-positive and fine-positive records on subject, quarantine, study day and Ct,
# and keep only the pairs where both positivity flags are present
BothFine_CoarsePositive <- full_join(
  CoarseInfected,
  FineInfected,
  by = c("SubjectID", "QuarantineNumber", "StudyDay", "InfA_Ct")
) %>%
  filter(!is.na(CoarsePositive), !is.na(FinePositive))

# Count, per study day, the subjects positive in both fine and coarse aerosol
BothFine_CoarsePositive_table <- BothFine_CoarsePositive %>%
  group_by(StudyDay) %>%
  summarize(BothAerosolPos = n_distinct(SubjectID))

# Attach the "any aerosol positive" and "both aerosol positive" counts to PosBySampDay
PosBySampDay <- PosBySampDay %>%
  left_join(InfDonors_AnyAerosolPos, by = "StudyDay") %>%
  left_join(BothFine_CoarsePositive_table, by = "StudyDay")
# Study days without a match get NA from the joins; set every NA in the table to 0
PosBySampDay[is.na(PosBySampDay)] <- 0

## The any-positive and both-positive results are to be shown as extra bars in the barplot ##

# Convert the any-positive and both-positive counts to proportions.
# The coarse subject count is the denominator for both columns.
PosBySampDay <- PosBySampDay %>%
  mutate(
    AnyAerosolPosFraction = AnyAerosolPos / CoarseSubjects,
    BothAerosolPosFraction = BothAerosolPos / CoarseSubjects
  )
  
## Getting to the Plot. First need to prepare the dataset for the ggplot

# Keep the study day plus the five fraction-positive columns. The columns are picked by name
# (rather than by position) so the selection stays correct if columns are added or reordered upstream.
BarPosBySampDay <- PosBySampDay[, c("StudyDay",
                                    "NPS_fractionpos", "Coarse_fractionpos", "Fine_fractionpos",
                                    "AnyAerosolPosFraction", "BothAerosolPosFraction")]
# Need to reshape the data into the long format in order to easily use the ggplot functions
BarPosBySampDay <- gather(BarPosBySampDay, "Sample.Type", "Fraction.Positive", 2:6) # to long format
# Replace the column-derived sample type codes with the labels shown in the figure legend;
# recode() leaves any value not listed here unchanged
BarPosBySampDay$Sample.Type <- recode(
  BarPosBySampDay$Sample.Type,
  NPS_fractionpos = "Nasopharyngeal Swab",
  Coarse_fractionpos = "Coarse Aerosol",
  Fine_fractionpos = "Fine Aerosol",
  AnyAerosolPosFraction = "Either Coarse or Fine Aerosol",
  BothAerosolPosFraction = "Both Coarse and Fine Aerosol"
)
# Make the variable names nicer to read
setnames(BarPosBySampDay,
         old = c("StudyDay", "Sample.Type", "Fraction.Positive"),
         new = c("Study Day", "Sample Type", "Fraction Positive"))

## Write out BarPosBySampDay to facilitate RMarkdown document to produce the figure
write.csv(BarPosBySampDay, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/BarPosBySampDay.csv")

## Make the barplot

# Dodged bars of fraction positive by study day, one bar per sample type.
# theme_bw() is a complete theme and replaces any theme settings added before it, so the
# legend position must be set AFTER theme_bw() for it to take effect.
p <- ggplot(BarPosBySampDay, aes(fill = `Sample Type`, y = `Fraction Positive`, x = `Study Day`)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_bw() +
  theme(legend.position = c(0.2, 0.8))
p

#### Implementing the line chart piece of Figure 3 ####
## This will be placed on the same chart space as the barchart -- we can manipulate this using Graphic or other program

# Need to calculate the mean ct value and sd for the np swabs

# One NPS Ct row per infected donor per study day (days 1-4), dropping rows without a Ct value.
# NOTE: the study-day restriction is written as `StudyDay %in% 1:4`. In R `&` binds more tightly
# than `|`, so `A & StudyDay == 1 | StudyDay == 2 | ...` would apply the donor condition to
# study day 1 only.
InfectedNPSdata <- Qdata %>%
  filter(Randomization_DorIRorCR == "D",
         StudyDay %in% 1:4) %>%
  group_by(StudyDay) %>%
  distinct(SubjectID, .keep_all = TRUE) %>% # one row per subject within each study day
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct, G2.pcr.copies.sample.type) %>%
  right_join(Qdata_infected_donors, by = c("SubjectID", "QuarantineNumber")) %>% # restrict to the infected donors
  filter(!is.na(InfA_Ct)) %>%
  select(SubjectID, QuarantineNumber, StudyDay, InfA_Ct)

# Sanity check: number of distinct subjects with NPS data on each of the study days 1-4
InfectedNPSdataDenominatorTest <- summarize(
  group_by(InfectedNPSdata, StudyDay),
  NumberWithNPSdata = n_distinct(SubjectID)
)

# Sanity check: number of distinct subjects with a positive NPS (Ct above 0) on each of the study days 1-4
# (the result column carries the same name as in the denominator check)
InfectedNPSdataPosNPSTest <- summarize(
  group_by(filter(InfectedNPSdata, InfA_Ct > 0), StudyDay),
  NumberWithNPSdata = n_distinct(SubjectID)
)

# Per-study-day mean and SD of the NPS Ct values for the line chart.
# Only Ct values above 0 (and not missing) enter the calculation.
NPctLineData <- InfectedNPSdata %>%
  filter(InfA_Ct > 0, !is.na(InfA_Ct)) %>%
  group_by(StudyDay) %>%
  summarize(
    NPS_ct_mean = mean(InfA_Ct, na.rm = TRUE),
    NPS_ct_sd = sd(InfA_Ct, na.rm = TRUE)
  )
# Give the columns reader-friendly names
setnames(
  NPctLineData,
  old = c("StudyDay", "NPS_ct_mean", "NPS_ct_sd"),
  new = c("Study Day", "Nasopharyngeal Swab CT Value Mean", "Nasopharyngeal Swab CT Value SD")
)

# Save the summary so the RMarkdown document can rebuild the figure
write.csv(
  NPctLineData,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/NPctLineData.csv"
)

# Line chart of the mean NPS Ct value by study day, with error bars of +/- 1 SD around the mean
p2 <- ggplot(
  NPctLineData,
  aes(x = `Study Day`, y = `Nasopharyngeal Swab CT Value Mean`, group = 1)
) +
  geom_line() +
  geom_errorbar(
    aes(
      ymin = `Nasopharyngeal Swab CT Value Mean` - `Nasopharyngeal Swab CT Value SD`,
      ymax = `Nasopharyngeal Swab CT Value Mean` + `Nasopharyngeal Swab CT Value SD`
    ),
    width = .1
  ) +
  geom_point(shape = 21, size = 3, fill = "white") +
  ylim(20, 38) +
  theme_bw()
p2

#### * Group Comparisons with Respect to SAR Outcomes ####

## Making some comparisons between the Proof of Concept study (PoC) and the Main Q study
## Each call runs Fisher's exact test on a 2x2 table. matrix() fills column-wise, so the first
## column holds (events, non-events) for the first group named in the comment and the second
## column holds (events, non-events) for the second group. Counts are hard-coded, written as
## "events, denominator - events".

# Comparing PoC with PoC criteria versus Main Q with Main Q criteria:
fisher.test(matrix(c(3, 12-3, 1, 75-1), ncol = 2))

# Comparing PoC with PoC criteria versus Main Q CR with Main Q criteria:
fisher.test(matrix(c(3, 12-3, 1, 35-1), ncol = 2))

# But the correct comparison (applying the more stringent Main Q criteria to both studies) between Main Q overall and PoC would be:
fisher.test(matrix(c(1, 12-1, 1, 75-1), ncol = 2))

# Correct comparison (stringent Main Q criteria applied to both studies) between Main Q CR and PoC:
fisher.test(matrix(c(1, 12-1, 1, 35-1), ncol = 2))

# Correct comparison (stringent Main Q criteria applied to both studies) between Main Q IR and PoC:
fisher.test(matrix(c(1, 12-1, 0, 40-0), ncol = 2))

# Correct comparison (stringent Main Q criteria) of Main Q CR to Main Q IR:
fisher.test(matrix(c(1, 35-1, 0, 40-0), ncol = 2))

# Correct comparison (with less stringent PoC criteria) of Main Q CR to Main Q IR
fisher.test(matrix(c(3, 35-3, 0, 40-0), ncol = 2))

## Additional computations using PoC criteria to compare between both studies:

# Comparing PoC with IR
fisher.test(matrix(c(3, 12-3, 0, 40-0), ncol = 2))
# Comparing PoC with CR
fisher.test(matrix(c(3, 12-3, 3, 35-3), ncol = 2))
# Comparing PoC with overall Main Q
fisher.test(matrix(c(3, 12-3, 3, 75-3), ncol = 2))

## Exact binomial tests of the observed secondary attack rate against a theoretical expected rate.
## NOTE: the variable `p` assigned below overwrites the bar-chart ggplot object `p` created in the
## Figure 3 section of this script; re-run that section if the plot object is needed afterwards.

## Using the Main Q criteria for infection status in both studies: 
## We got 1/75 (overall, and 1/35 for CR group, and 0/40 for IR group) but we were expecting at least .16 (double the infecteds and recipients, double the time)
# Comparing the theoretical ~16% expected versus the 1/75 actual (overall study)
n_successes <- 1
n <- 75
p <- 0.16
binom.test(n_successes, n, p)$p.value
# Comparing the theoretical ~16% expected versus the 1/35 actual (CR group only)
n_successes <- 1
n <- 35
p <- 0.16
binom.test(n_successes, n, p)$p.value

## Using the POC criteria for infection status in both studies:
## We got 3/75 (overall, and 3/35 for the CR group, and 0/40 for IR group) and we were expecting at least .5 (double the infecteds and recipients, double the time)
# Comparing the theoretical ~50% expected versus the 3/75 actual (overall study)
n_successes <- 3
n <- 75
p <- 0.50
binom.test(n_successes, n, p)$p.value
# Comparing the theoretical ~50% expected versus the 3/35 actual (CR group only)
n_successes <- 3
n <- 35
p <- 0.50
binom.test(n_successes, n, p)$p.value

#### * Table of Baseline Characteristics ####
# The Lancet ID requests that this be done.

# Candidate baseline characteristics: MN baseline titre, HAI baseline titre, age, and gender
# (no ethnicity data are available)

# One row per subject and randomization arm, taken from the "Q baseline" microneutralization visit
Baseline_vars <- Qdata %>%
  filter(Microneut_VisitType == "Q baseline") %>%
  select(SubjectID, Randomization_DorIRorCR, Randomization_DorR, QuarantineNumber, Gender, Age,
         Microneut_VisitType, Microneutralization.Titer.to.A.Wisconsin.67.2005,
         HAI_dayminus2, HAI_dayminus2_recodeNDA) %>%
  distinct(SubjectID, Randomization_DorIRorCR, .keep_all = TRUE)

# Number of distinct subjects by quarantine and gender.
# Baseline_vars is already unique per subject and randomization arm, so no further de-duplication is needed.
Baseline_table_byQ <- Baseline_vars %>%
  group_by(QuarantineNumber, Gender) %>%
  summarize(total_population = n_distinct(SubjectID))

## Getting a sense for these variables and their values
# Exploratory only: each expression below prints its result and nothing (other than `mci`) is stored.
# Cross-tabulations of gender and of age against randomization arm
with(Baseline_vars, table(Gender, Randomization_DorIRorCR))
with(Baseline_vars, table(Age, Randomization_DorIRorCR))
# NOTE(review): this compares the Gender column with the randomization column, which hold
# different variables -- confirm whether this check is intended
with(Baseline_vars, all.equal(Gender, Randomization_DorIRorCR))

# Mean and SD (default and parenthesised formatting) for age, baseline MN titre, and baseline HAI
mean_sd(Baseline_vars$Age)
mean_sd(Baseline_vars$Age, denote_sd = "paren")

mean_sd(Baseline_vars$Microneutralization.Titer.to.A.Wisconsin.67.2005)
mean_sd(Baseline_vars$Microneutralization.Titer.to.A.Wisconsin.67.2005, denote_sd = "paren")

mean_sd(Baseline_vars$HAI_dayminus2_recodeNDA)
mean_sd(Baseline_vars$HAI_dayminus2_recodeNDA, denote_sd = "paren")

# Mean with confidence interval for the same three variables; `mci` is reused for each in turn
mci <- mean_ci(Baseline_vars$Age)
mci
print(mci, show_level = TRUE)

mci <- mean_ci(Baseline_vars$Microneutralization.Titer.to.A.Wisconsin.67.2005)
mci
print(mci, show_level = TRUE)

mci <- mean_ci(Baseline_vars$HAI_dayminus2_recodeNDA)
mci
print(mci, show_level = TRUE)

# Median and IQR for the same three variables
median_iqr(Baseline_vars$Age)
median_iqr(Baseline_vars$Microneutralization.Titer.to.A.Wisconsin.67.2005)
median_iqr(Baseline_vars$HAI_dayminus2_recodeNDA)

# Count and percent of rows with Gender coded "f"
n_perc(Baseline_vars$Gender == "f")
n_perc0(Baseline_vars$Gender == "f")

## Using the 'qwraps2' package

# summary_1 <-
#   list("Sex" =
#          list("male" = ~ n_perc(Baseline_vars$Gender == "f"),
#               "female" = ~ n_perc(Baseline_vars$Gender == "m")),
#        "Age" = ~ median_iqr(Baseline_vars$Age)
#   )
# 
# whole <- summary_table(Baseline_vars, summary_1)
# summary(whole)

## Using the 'arsenal' package
# Summarize gender and age by randomization arm with tableby's default settings,
# printing the summary in both the default and the plain-text form
tab1 <- tableby(Randomization_DorIRorCR ~ Gender + Age, data = Baseline_vars)
summary(tab1)
summary(tab1, text = TRUE)

# Data-frame version of the same table
tab1_df <- as.data.frame(tab1)
# Could potentially get this into nice markdown layout using kable to print the df

# Modifying the summary
# Same table, but numeric variables are reported as median (Q1, Q3) with 0 digits,
# and the total column and the between-group tests are switched off
tab2 <- tableby(Randomization_DorIRorCR ~ Gender + Age, data = Baseline_vars,
                control = tableby.control(numeric.stats = "medianq1q3", digits = 0L, total = FALSE, test = FALSE))
summary(tab2)
summary(tab2, text = TRUE)

#### Old version of manipulating variables for summary table ####
# # Age for Donors
# Baseline_table_age_median_D <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "D") %>%
#   summarize(age_D_combined_Q = median(Age))
# 
# Baseline_table_age_25percentile_D <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "D") %>%
#   summarize(age_D_IQR25 = quantile(Age, .25))
# 
# Baseline_table_age_75percentile_D <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "D") %>%
#   summarize(age_D_IQR75 = quantile(Age, .75))
# 
# Baseline_table_age_D <- bind_cols(Baseline_table_age_median_D, 
#                                   Baseline_table_age_25percentile_D,
#                                   Baseline_table_age_75percentile_D)
# 
# # Age for IR
# Baseline_table_age_median_IR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "IR") %>%
#   summarize(age_IR_combined_Q = median(Age))
# 
# Baseline_table_age_25percentile_IR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "IR") %>%
#   summarize(age_IR_IQR25 = quantile(Age, .25))
# 
# Baseline_table_age_75percentile_IR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "IR") %>%
#   summarize(age_IR_IQR75 = quantile(Age, .75))
# 
# Baseline_table_age_IR <- bind_cols(Baseline_table_age_median_IR, 
#                                   Baseline_table_age_25percentile_IR,
#                                   Baseline_table_age_75percentile_IR)
# 
# # Age for CR
# Baseline_table_age_median_CR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "CR") %>%
#   summarize(age_CR_combined_Q = median(Age))
# 
# Baseline_table_age_25percentile_CR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "CR") %>%
#   summarize(age_CR_IQR25 = quantile(Age, .25))
# 
# Baseline_table_age_75percentile_CR <- Baseline_vars %>%
#   distinct(SubjectID, Randomization_DorIRorCR, QuarantineNumber, .keep_all = TRUE) %>%
#   filter(Randomization_DorIRorCR == "CR") %>%
#   summarize(age_CR_IQR75 = quantile(Age, .75))
# 
# Baseline_table_age_CR <- bind_cols(Baseline_table_age_median_CR, 
#                                    Baseline_table_age_25percentile_CR,
#                                    Baseline_table_age_75percentile_CR)
# 
# # Putting together age
# Baseline_table_age_combined <- bind_cols(Baseline_table_age_D, 
#                                 Baseline_table_age_IR,
#                                 Baseline_table_age_CR)
# 
# age_IQR <- Baseline_table_age_combined %>%
#   unite(Donors_IQR, age_D_IQR25, age_D_IQR75, sep = "-", remove = TRUE) %>%
#   unite(IR_IQR, age_IR_IQR25, age_IR_IQR75, sep = "-", remove = TRUE) %>%
#   unite(CR_IQR, age_CR_IQR25, age_CR_IQR75, sep = "-", remove = TRUE)
# 
# age_IQR$Donors_IQR <- 
#   paste0("(", age_IQR$Donors_IQR, ")")
# age_IQR$IR_IQR <- 
#   paste0("(", age_IQR$IR_IQR, ")")
# age_IQR$CR_IQR <- 
#   paste0("(", age_IQR$CR_IQR, ")")
# 
# age_unite <- age_IQR %>%
#   unite(Donors, age_D_combined_Q, Donors_IQR, sep = " ", remove = TRUE) %>%
#   unite(IR, age_IR_combined_Q, IR_IQR, sep = " ", remove = TRUE) %>%
#   unite(CR, age_CR_combined_Q, CR_IQR, sep = " ", remove = TRUE)
# 
# age_unite <- age_unite %>%
#   mutate(Variable = "Age (years)") %>%
#   select(Variable, everything())

#### * Redoing the GM and GSD for the EMIT_UMD pop because we have updated the conversion factor from RNA copies to virus particles ####

UMD_PNAS_full <- read.csv(
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_UMD_Natural_Infection/Curated Data/Analytical Datasets/PNAS_data_full.csv"
)

## Reduce to the analysis columns and to the two aerosol sample types
## ("GII condensate NO mask" and "Impactor 5 um NO mask").
## vax_bothyear is recoded to 1 when it equals 1 and to 0 otherwise (missing values become 0).
UMD_PNAS_full_aerosol <- UMD_PNAS_full %>%
  select(subject.id, date.visit, sample.id, sample.type, g2.run, visit.num, type.inf, type,
         final.copies, dpo, cough_number, sex, age, BMI, vax_bothyear) %>%
  filter(sample.type %in% c("GII condensate NO mask", "Impactor 5 um NO mask")) %>%
  mutate(vax_bothyear = if_else(!is.na(vax_bothyear) & vax_bothyear == 1, 1, 0))

# Need to find the subject ID + study day instances where there was one positive replicate and one negative replicate (the negative replicate will be treated as non-detect and we will apply the LOQ*1/sqrt(2) to impute a value to be used for measures of center and spread; but when we get to modelling, this will not be done, and rather we will use tobit regression or some other method to account for censored data).

# Step 1: count the positive (non-missing) replicates per subject / run / sample type / virus type
#         and keep the combinations with exactly one positive replicate.
# Step 2: join back to the aerosol data to recover ALL replicate rows (positive and negative) for
#         those combinations. The join keys are spelled out so that the match can never silently
#         widen to other columns the two tables might share.
Aerosol_1_replicate_positive_raw_data_all_pos <- UMD_PNAS_full_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, type) %>%
  count() %>%
  filter(n == 1) %>%
  left_join(UMD_PNAS_full_aerosol, by = c("subject.id", "g2.run", "sample.type", "type"))

# Negative replicates that do NOT belong to a combination with exactly one positive replicate
aerosol_neg <- UMD_PNAS_full_aerosol %>%
  filter(is.na(final.copies)) %>%
  anti_join(Aerosol_1_replicate_positive_raw_data_all_pos, by = c("subject.id", "g2.run", "sample.type", "type"))


# Split the one-positive-replicate set by virus type
Aerosol_1_replicate_positive_count_fluA <- filter(Aerosol_1_replicate_positive_raw_data_all_pos, type == "A")

Aerosol_1_replicate_positive_count_fluB <- filter(Aerosol_1_replicate_positive_raw_data_all_pos, type == "B")

# Give each negative (missing) replicate the imputed value LOQ / sqrt(2),
# using 2000 for flu A and 9000 for flu B.
# NOTE(review): the later re-analysis of Jing's data in this script uses 10275 for flu B -- confirm which value is intended
Aerosol_1_replicate_negative_imputed_A <- mutate(
  filter(Aerosol_1_replicate_positive_count_fluA, is.na(final.copies)),
  final.copies = 2000 / sqrt(2)
)

Aerosol_1_replicate_negative_imputed_B <- mutate(
  filter(Aerosol_1_replicate_positive_count_fluB, is.na(final.copies)),
  final.copies = 9000 / sqrt(2)
)

# Stack the imputed rows for A and B
Aerosol_1_replicate_negative_imputed_A_and_B <- bind_rows(
  Aerosol_1_replicate_negative_imputed_A,
  Aerosol_1_replicate_negative_imputed_B
)

# Bind the Aerosol_1_replicate_negative_imputed df into the Aerosol_1_replicate_positive df:
# the positive replicates of the one-positive-replicate set plus the imputed negative replicates
Aerosols_both_replicates <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  filter(!is.na(final.copies)) %>%
  bind_rows(Aerosol_1_replicate_negative_imputed_A_and_B) %>%
  arrange(subject.id, date.visit, sample.type, type, final.copies)

# Bind this with the set of data where all replicates were positive. 
# First get the df for the combinations with two or more positive replicates.
# The join back to the aerosol data uses explicit keys so that the match can never silently
# widen to other columns the two tables might share.
Aerosol_all_replicates_positive_raw_data_all_pos <- UMD_PNAS_full_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, type) %>%
  count() %>%
  filter(n >= 2) %>% # the n here refers to number of positive replicates that are used to take the sample mean below.
  left_join(UMD_PNAS_full_aerosol, by = c("subject.id", "g2.run", "sample.type", "type"))


# Full replicate-level set: imputed-where-needed rows plus the rows from the >= 2 positive replicate set
full_Set_aerosols_both_replicates_imputed_where_needed <- Aerosols_both_replicates %>%
  bind_rows(Aerosol_all_replicates_positive_raw_data_all_pos) %>%
  arrange(subject.id, date.visit, sample.type, type, final.copies)


# Sample-level average: the mean of final.copies over the replicates of each
# subject / run / sample type / virus type (imputed values included), reduced to one row per sample.
# The replicate-level columns are dropped once the average has been attached.
sample_avg <- full_Set_aerosols_both_replicates_imputed_where_needed %>%
  group_by(subject.id, g2.run, sample.type, type) %>%
  mutate(final.copies.pos.sample.avg = mean(final.copies)) %>%
  distinct(subject.id, date.visit, type, .keep_all = TRUE) %>%
  select(-c(final.copies, sample.id, visit.num)) %>%
  ungroup()


#### * Geometric Mean, GSD, and maximum for coarse aerosol samples ####

# Summaries are computed on the natural-log scale and exponentiated back:
# geometric mean = exp(mean(log x)), GSD = exp(sd(log x)), maximum = exp(max(log x))
UMD_Coarse_Pos_Samples_Geom_Mean <- sample_avg %>%
  filter(sample.type == "Impactor 5 um NO mask") %>%
  summarize(
    Coarse_Positive_Samples_Geom_Mean = exp(mean(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Coarse_Positive_Samples_GSD = exp(sd(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Coarse_Positive_Samples_Max = exp(max(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Coarse_Positive_Samples_n = n()
  )
# Store the geometric mean and the maximum as scientific-notation text
UMD_Coarse_Pos_Samples_Geom_Mean[["Coarse_Positive_Samples_Geom_Mean"]] <- format(
  UMD_Coarse_Pos_Samples_Geom_Mean[["Coarse_Positive_Samples_Geom_Mean"]], scientific = TRUE
)
UMD_Coarse_Pos_Samples_Geom_Mean[["Coarse_Positive_Samples_Max"]] <- format(
  UMD_Coarse_Pos_Samples_Geom_Mean[["Coarse_Positive_Samples_Max"]], scientific = TRUE
)

# Save the coarse aerosol GM, GSD, max summary
write.csv(
  UMD_Coarse_Pos_Samples_Geom_Mean,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/UMD_Coarse_Pos_Samples_Geom_Mean.csv"
)

#### * Geometric Mean, GSD, and maximum for fine aerosol samples ####

# Summaries are computed on the natural-log scale and exponentiated back:
# geometric mean = exp(mean(log x)), GSD = exp(sd(log x)), maximum = exp(max(log x))
UMD_Fine_Pos_Samples_Geom_Mean <- sample_avg %>%
  filter(sample.type == "GII condensate NO mask") %>%
  summarize(
    Fine_Positive_Samples_Geom_Mean = exp(mean(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Fine_Positive_Samples_GSD = exp(sd(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Fine_Positive_Samples_Max = exp(max(log(final.copies.pos.sample.avg), na.rm = TRUE)),
    Fine_Positive_Samples_n = n()
  )
# Store the geometric mean and the maximum as scientific-notation text
UMD_Fine_Pos_Samples_Geom_Mean[["Fine_Positive_Samples_Geom_Mean"]] <- format(
  UMD_Fine_Pos_Samples_Geom_Mean[["Fine_Positive_Samples_Geom_Mean"]], scientific = TRUE
)
UMD_Fine_Pos_Samples_Geom_Mean[["Fine_Positive_Samples_Max"]] <- format(
  UMD_Fine_Pos_Samples_Geom_Mean[["Fine_Positive_Samples_Max"]], scientific = TRUE
)

# Save the fine aerosol GM, GSD, max summary
write.csv(
  UMD_Fine_Pos_Samples_Geom_Mean,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/UMD_Fine_Pos_Samples_Geom_Mean.csv"
)


# Why am I not getting out the same GM (GSD) results as Jing in the PNAS publication? ####
# Let me apply my process to her data and see if this replicates my findings from above.

Jing_data <- read.csv("/Users/jbueno/Desktop/EMIT UMD/SAS_Input/emitfulldatarepeatform_age.csv")

# Convert final.copies to numeric, treating the "." entries as missing.
# Going through character first makes the conversion work whether read.csv delivered the column as
# factor, character, or numeric, and independent of how the installed dplyr's na_if() handles
# a character marker against a non-character column.
Jing_data$final.copies <- as.character(Jing_data$final.copies)
Jing_data$final.copies[Jing_data$final.copies %in% "."] <- NA
Jing_data$final.copies <- as.numeric(Jing_data$final.copies)

# Jing_data has no virus type (A or B) column, so build a lookup of the unique
# subject.id / sample.type / g2.run / type combinations from the UMD PNAS data
# (these are the columns needed to merge the type into Jing_data)
UMD_PNAS_type <- distinct(UMD_PNAS_full, subject.id, sample.type, g2.run, type)

#Jing_data <- Jing_data %>%
#  left_join(UMD_PNAS_type)

# Now apply the process that was done to the UMD_PNAS data (my reproduction) above to Jing_data
# Keep only the fine (GII condensate) and coarse (5 um impactor) no-mask
# aerosol samples, with the identifying columns and the copy number.
Jing_data_aerosol <- Jing_data %>%
  select(
    subject.id, date.visit, sample.id, sample.type,
    g2.run, type.inf, typeAB, final.copies
  ) %>%
  filter(sample.type %in% c("GII condensate NO mask", "Impactor 5 um NO mask")) %>%
  arrange(subject.id, date.visit, sample.type, typeAB, final.copies)

# Need to find the subject ID + study day instances where there was one positive replicate and one negative replicate (the negative replicate will be treated as non-detect and we will apply the LOQ*1/sqrt(2) to impute a value to be used for measures of center and spread; but when we get to modelling, this will not be done, and rather we will use tobit regression or some other method to account for censored data).

# Samples (subject x G2 run x sample type x virus type) with exactly ONE
# positive replicate. After counting the positive replicates, the full aerosol
# data are joined back so that the negative (NA) replicate rows of those
# samples are retained alongside the positive one.
# The join keys are spelled out (they are the only columns the two tables
# share) so the join matches the anti_join below and does not depend on an
# implicit natural join.
Aerosol_1_replicate_positive_raw_data_all_pos <- Jing_data_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, typeAB) %>%
  count() %>%
  filter(n == 1) %>%
  #  distinct(subject.id, g2.run, sample.type, type, .keep_all = TRUE) %>%
  #  distinct(subject.id, g2.run, type) %>%
  left_join(Jing_data_aerosol,
            by = c("subject.id", "g2.run", "sample.type", "typeAB"))

# Negative replicates that do NOT belong to a single-positive sample.
aerosol_neg <- Jing_data_aerosol %>%
  filter(is.na(final.copies)) %>%
  anti_join(Aerosol_1_replicate_positive_raw_data_all_pos,
            by = c("subject.id", "g2.run", "sample.type", "typeAB"))


# Split the single-positive samples by virus type.
Aerosol_1_replicate_positive_count_fluA <-
  filter(Aerosol_1_replicate_positive_raw_data_all_pos, typeAB == "A")

Aerosol_1_replicate_positive_count_fluB <-
  filter(Aerosol_1_replicate_positive_raw_data_all_pos, typeAB == "B")

# Impute the non-detected replicate of each single-positive sample with a
# type-specific constant divided by sqrt(2): 2000 for flu A and 10275 for
# flu B (presumably the respective LOQs -- confirm).
Aerosol_1_replicate_negative_imputed_A <- Aerosol_1_replicate_positive_count_fluA %>%
  filter(is.na(final.copies)) %>%
  mutate(final.copies = 2000 / sqrt(2))

Aerosol_1_replicate_negative_imputed_B <- Aerosol_1_replicate_positive_count_fluB %>%
  filter(is.na(final.copies)) %>%
  mutate(final.copies = 10275 / sqrt(2))

# Stack the imputed flu A and flu B replicates
Aerosol_1_replicate_negative_imputed_A_and_B <- bind_rows(
  Aerosol_1_replicate_negative_imputed_A,
  Aerosol_1_replicate_negative_imputed_B
)

# Observed positive replicates followed by their imputed partner replicates
Aerosols_both_replicates <- bind_rows(
  filter(Aerosol_1_replicate_positive_raw_data_all_pos, !is.na(final.copies)),
  Aerosol_1_replicate_negative_imputed_A_and_B
) %>%
  arrange(subject.id, date.visit, sample.type, typeAB, final.copies)

# Bind this with the set of data where both all were positive. 
# First get the df where all replicates were positive
# Samples (subject x G2 run x sample type x virus type) with two or more
# positive replicates; the full aerosol rows are joined back on the grouping
# keys. The keys are given explicitly (they are the only columns the two
# tables share) rather than relying on an implicit natural join.
Aerosol_all_replicates_positive_raw_data_all_pos <- Jing_data_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, typeAB) %>%
  count() %>%
  filter(n >= 2) %>%
  #  distinct(subject.id, g2.run, sample.type, type, .keep_all = TRUE) %>%
  #  distinct(subject.id, g2.run, type) %>%
  left_join(Jing_data_aerosol,
            by = c("subject.id", "g2.run", "sample.type", "typeAB"))


# Complete set: single-positive samples (with the negative replicate imputed)
# followed by the samples where all replicates were positive.
full_Set_aerosols_both_replicates_imputed_where_needed <- Aerosols_both_replicates %>%
  bind_rows(Aerosol_all_replicates_positive_raw_data_all_pos)

#### * Jing Data: Geometric Mean, GSD, and maximum for coarse aerosol samples ####

# GM, GSD, maximum and row count of the coarse-fraction copy numbers in the
# imputed replicate set. The statistics are taken over individual replicate
# rows; the per-sample averaging alternative is kept commented out below.
# NOTE: this re-uses (overwrites) the name UMD_Coarse_Pos_Samples_Geom_Mean.
UMD_Coarse_Pos_Samples_Geom_Mean <- full_Set_aerosols_both_replicates_imputed_where_needed %>%
  ungroup() %>%
  filter(sample.type == "Impactor 5 um NO mask") %>%
#  group_by(subject.id, g2.run) %>%
#  mutate(average_number = mean(final.copies)) %>%
#  distinct(sample.type, subject.id, g2.run, average_number) %>%
#  mutate(ln.final.copies = log(average_number)) %>%
  summarize(
    Coarse_Positive_Samples_Geom_Mean = exp(mean(log(final.copies), na.rm = TRUE)),
    Coarse_Positive_Samples_GSD = exp(sd(log(final.copies), na.rm = TRUE)),
    Coarse_Positive_Samples_Max = exp(max(log(final.copies), na.rm = TRUE)),
    Coarse_Positive_Samples_n = n()
  ) %>%
  # GM and maximum are reported in scientific notation (as character)
  mutate(
    Coarse_Positive_Samples_Geom_Mean = format(Coarse_Positive_Samples_Geom_Mean, scientific = TRUE),
    Coarse_Positive_Samples_Max = format(Coarse_Positive_Samples_Max, scientific = TRUE)
  )

# Write out the coarse aerosol GM, GSD, max info (currently disabled)
#write.csv(UMD_Coarse_Pos_Samples_Geom_Mean, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Jing_data_UMD_Coarse_Pos_Samples_Geom_Mean.csv")

#### * Jing Data: Geometric Mean, GSD, and maximum for fine aerosol samples ####

# GM, GSD, maximum and row count of the fine-fraction copy numbers in the
# imputed replicate set. The statistics are taken over individual replicate
# rows; the per-sample averaging alternative is kept commented out below.
# NOTE: this re-uses (overwrites) the name UMD_Fine_Pos_Samples_Geom_Mean.
UMD_Fine_Pos_Samples_Geom_Mean <- full_Set_aerosols_both_replicates_imputed_where_needed %>%
  ungroup() %>%
  filter(sample.type == "GII condensate NO mask") %>%
#  group_by(subject.id, g2.run) %>%
#  mutate(average_number = mean(final.copies)) %>%
#  distinct(sample.type, subject.id, g2.run, average_number) %>%
#  mutate(ln.final.copies = log(average_number)) %>%
  summarize(
    Fine_Positive_Samples_Geom_Mean = exp(mean(log(final.copies), na.rm = TRUE)),
    Fine_Positive_Samples_GSD = exp(sd(log(final.copies), na.rm = TRUE)),
    Fine_Positive_Samples_Max = exp(max(log(final.copies), na.rm = TRUE)),
    Fine_Positive_Samples_n = n()
  ) %>%
  # GM and maximum are reported in scientific notation (as character)
  mutate(
    Fine_Positive_Samples_Geom_Mean = format(Fine_Positive_Samples_Geom_Mean, scientific = TRUE),
    Fine_Positive_Samples_Max = format(Fine_Positive_Samples_Max, scientific = TRUE)
  )

# Save the fine aerosol GM, GSD, max and n table for this dataset
write.csv(
  UMD_Fine_Pos_Samples_Geom_Mean,
  "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Analysis Results/Jing_data_UMD_Fine_Pos_Samples_Geom_Mean.csv"
)

### Note - we used data from 167 subjects here, even though the PNAS table 2 (Yan et al., 2018) showed 166. This disparity has arisen because subject 223 was positive for both flu A and flu B in the fine aerosol fraction on the same day (this was the only subject to have a dual infection where both A and B were detected in the aerosol during the same sampling instance). It looks like in the PNAS report, only the A (or B) data was taken and used in the GM computation. However, here we have included both. It is actually probably even better practice to ignore this subject's aerosol data on this day because it is an outlier in the sense of the dual infection in the aerosol on a particular sampling instance. 

## GM, GSD, max, n info using dataset that Jing just sent, which is what she used as input for the GraphPad Prism program that was used to compute the final results in PNAS paper table 2 ####

# Input sheet with one column of coarse values (Coarse) and one of fine
# values (Fine), as used for the GraphPad Prism computation.
graphpad_data <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_UMD_Natural_Infection/Curated Data/Jing_PNAS_Data_Table2_Figures/NP coarse fine pcr.csv")

# GM, GSD, maximum and n for the coarse column.
# n counts only the non-missing values, so it describes the same observations
# as the na.rm = TRUE statistics (the Coarse and Fine columns may not have the
# same number of filled rows, in which case n() would over-count).
coarse <- graphpad_data %>%
  mutate(ln.final.copies = log(Coarse)) %>%
  summarize(Coarse_Positive_Samples_Geom_Mean = exp(mean(ln.final.copies, na.rm = TRUE)),
            Coarse_Positive_Samples_GSD = exp(sd(ln.final.copies, na.rm = TRUE)),
            Coarse_Positive_Samples_Max = exp(max(ln.final.copies, na.rm = TRUE)),
            Coarse_Positive_Samples_n = sum(!is.na(ln.final.copies))) %>%
  # GM and maximum are reported in scientific notation (as character)
  mutate(Coarse_Positive_Samples_Geom_Mean = format(Coarse_Positive_Samples_Geom_Mean, scientific = TRUE),
         Coarse_Positive_Samples_Max = format(Coarse_Positive_Samples_Max, scientific = TRUE))

# Same summary for the fine column.
fine <- graphpad_data %>%
  mutate(ln.final.copies = log(Fine)) %>%
  summarize(Fine_Positive_Samples_Geom_Mean = exp(mean(ln.final.copies, na.rm = TRUE)),
            Fine_Positive_Samples_GSD = exp(sd(ln.final.copies, na.rm = TRUE)),
            Fine_Positive_Samples_Max = exp(max(ln.final.copies, na.rm = TRUE)),
            Fine_Positive_Samples_n = sum(!is.na(ln.final.copies))) %>%
  mutate(Fine_Positive_Samples_Geom_Mean = format(Fine_Positive_Samples_Geom_Mean, scientific = TRUE),
         Fine_Positive_Samples_Max = format(Fine_Positive_Samples_Max, scientific = TRUE))

# This shows that when I use Jing's input data, I get the same results as she got (she used GraphPad Prism) when I used my bit of R code above. 
# This suggests that there is a difference in the way that the data was prepared.
# Although we have discussed that the method used to prepare the data was to take all of the "positive" samples (i.e., the samples where there was at least 1 replicate that was positive for flu virus) and impute any non-detected replicates with 1/sqrt(2) * LOQ. Then all of the replicate values were averaged for each sample. However, perhaps these processing steps led to some differences between the input data that Jing used in GraphPad and the input data I was using in R. 


# To assess this, I will make some comparisons between my input and Jing's. 

## Comparisons for Fine aerosol

# I will cut a df of just Jing's fine data to compare with my fine data
# Fine values from the GraphPad input: first 166 rows, sorted ascending.
fine_jing <- graphpad_data %>%
  select(Fine) %>%
  slice(1:166) %>%
  arrange(Fine) %>%
  mutate(Fine = as.numeric(Fine))

# Fine sample averages from this script, sorted ascending, for a rank-by-rank
# comparison against the GraphPad input.
fine_jake <- sample_avg %>%
  filter(sample.type == "GII condensate NO mask") %>%
  # Row 121 is dropped by position: the flu B value for subject 223 (dual
  # infection), which Jing's data does not include.
  slice(-121) %>%
  arrange(final.copies.pos.sample.avg)

# Pair the two sorted columns side by side; keep only the lowest 50 pairs
fine_jake_jing <- bind_cols(fine_jake, fine_jing) %>%
  slice(1:50)

# Scatter of the paired values with a smoother and the identity line
fine_jake_jing_plot <- ggplot(fine_jake_jing, aes(x = final.copies.pos.sample.avg, y = Fine)) +
  geom_point(shape = 1) +
  geom_smooth() +
  geom_abline(intercept = 0, slope = 1) +
  theme_bw()
fine_jake_jing_plot


## Coarse

# I will cut a df of just Jing's coarse data to compare with my coarse data
# Coarse values from the GraphPad input: first 88 rows, sorted ascending.
coarse_jing <- graphpad_data %>%
  select(Coarse) %>%
  slice(1:88) %>%
  arrange(Coarse) %>%
  mutate(Coarse = as.numeric(Coarse))

# Coarse sample averages from this script, sorted ascending, for a
# rank-by-rank comparison against the GraphPad input.
coarse_jake <- sample_avg %>%
  filter(sample.type == "Impactor 5 um NO mask") %>%
  arrange(final.copies.pos.sample.avg)

# Pair the two sorted columns side by side
coarse_jake_jing <- bind_cols(coarse_jake, coarse_jing)

# Scatter of all paired values
coarse_jake_jing_plot <- ggplot(coarse_jake_jing, aes(x = final.copies.pos.sample.avg, y = Coarse)) +
  geom_point(shape = 1) +
#  geom_smooth() +
  theme_bw()
coarse_jake_jing_plot


# Re-plot using only the lowest 50 pairs (drops the very high values), this
# time with a smoother and the identity line
coarse_jake_jing_drop_high <- slice(coarse_jake_jing, 1:50)

coarse_jake_jing_drop_high_plot <- ggplot(coarse_jake_jing_drop_high, aes(x = final.copies.pos.sample.avg, y = Coarse)) +
  geom_point(shape = 1) +
  geom_smooth() +
  geom_abline(intercept = 0, slope = 1) +
  theme_bw()
coarse_jake_jing_drop_high_plot

## It looks like Table 2 was computed by using Tobit regression to impute for the missing values (per Yan PNAS paper methods section). I will replicate this now ####

# Variables needed are subject.id, study.day, final.copies, cough, sex for the subset of samples where there was at least one positive replicate. Tobit model will fill in where there is one or more negative replicates. 

# NOTE: both names below were assigned earlier in this script from Jing_data;
# here they are rebuilt from UMD_PNAS_full_aerosol (keyed on `type`).

# Samples with exactly one positive replicate; joining the full aerosol data
# back brings in every replicate row of those samples.
Aerosol_1_replicate_positive_raw_data_all_pos <- UMD_PNAS_full_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, type) %>%
  count() %>%
  filter(n == 1) %>%
  ungroup() %>%
  left_join(UMD_PNAS_full_aerosol)

# Samples with two or more positive replicates (n = number of positive
# replicates, which are later averaged into the sample mean).
Aerosol_all_replicates_positive_raw_data_all_pos <- UMD_PNAS_full_aerosol %>%
  filter(!is.na(final.copies)) %>%
  group_by(subject.id, g2.run, sample.type, type) %>%
  count() %>%
  filter(n >= 2) %>%
  ungroup() %>%
  left_join(UMD_PNAS_full_aerosol)

# Build the input for the SAS tobit model: every replicate row of every sample
# that had at least one positive replicate, plus 0/1 indicator columns for the
# fine and coarse fractions.
#
# Dual infection handling: Don says that where there was dual infection (flu A
# and B) and we have aerosol data for both flu A and B for a fine or coarse
# sample for a particular subject on a particular study day, we should ignore
# it, because keeping both results would be like adding an extra
# subject-study.day observation. This only happens once (subject 223 on G2
# visit number 2). I believe Jing kept the flu A result and eliminated the
# flu B result, so that is what is done here, even though eliminating the
# whole instance may be the better practice.
#
# FIX: the condition previously read `subject.id = 223`, which R evaluates as
# an assignment inside the parentheses; the filter therefore removed EVERY
# flu B row rather than only subject 223's. `%in%` is used so that rows with a
# missing subject.id or type are kept instead of being dropped.
# NOTE(review): the SAS tobit outputs read back in below were presumably
# generated from the old file and should be regenerated -- confirm.
pos_samples <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  bind_rows(Aerosol_all_replicates_positive_raw_data_all_pos) %>%
  # Sorting by final.copies helps SAS recognize the variable as numeric;
  # otherwise it gets automatically recognized as a character variable.
  arrange(desc(final.copies)) %>%
  filter(!(subject.id %in% 223 & type %in% "B")) %>%
  mutate(Fine = if_else(sample.type == "GII condensate NO mask", 1, 0)) %>%
  mutate(Coarse = if_else(sample.type == "Impactor 5 um NO mask", 1, 0))

# write out this df for use in SAS for tobit model with NLMIXED
write.csv(pos_samples, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/pos_samples_tobit_table2.csv")

# Run tobit model in sas and get the output - read in the output here:
# SAS tobit output for the fine fraction
table2_tobit_fine <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/OUTPUT_MU_FINE_UMD_POSITIVES.csv")

# Model predictions per subject and dpo. Pred is raised as a power of 10 to
# obtain copies and stored as final.copies; all other model-output columns
# are dropped.
pred_values_for_neg_replicates <- table2_tobit_fine %>%
  select(subject.id, dpo, Pred) %>%
  mutate(final.copies = 10^(Pred)) %>%
  select(subject.id, dpo, final.copies) %>%
  arrange(subject.id, dpo)

# For fine samples with a single positive replicate, take the predicted values
# for both replicates (even though one was observed positive and one negative).
imputed_fine <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  filter(sample.type == "GII condensate NO mask") %>%
  distinct(subject.id, dpo, type) %>%
  left_join(pred_values_for_neg_replicates) %>%
  select(subject.id, dpo, final.copies)

# Observed values for fine samples where all replicates were positive
Aerosol_all_replicates_positive_raw_data_all_pos_fine <- Aerosol_all_replicates_positive_raw_data_all_pos %>%
  filter(sample.type == "GII condensate NO mask") %>%
  select(subject.id, dpo, final.copies)

# Imputed rows first, then the fully observed rows
observed_both_pos_plus_imputed_fine <- bind_rows(
  imputed_fine,
  Aerosol_all_replicates_positive_raw_data_all_pos_fine
)

# fine_pos_samples_minus_nondetects <- pos_samples %>%
#   ungroup() %>%
#   filter(sample.type == "GII condensate NO mask") %>%
#   select(subject.id, dpo, final.copies) %>%
#   arrange(subject.id, dpo)
# 
# pos_samples_neg_replicates_imp_by_tobit <- fine_pos_samples_minus_nondetects %>%
#   bind_cols(pred_values_for_neg_replicates) %>%
#   mutate(final.copies.combined = if_else(is.na(final.copies), final.copies1, final.copies))

# Average the replicate values within each subject x dpo, keep one row per
# subject x dpo, then compute GM, GSD, maximum and n of those averages on the
# natural-log scale (back-transformed).
fine_sample_avg_table2 <- observed_both_pos_plus_imputed_fine %>%
  group_by(subject.id, dpo) %>%
  mutate(avg_copies = mean(final.copies)) %>%
  distinct(subject.id, dpo, .keep_all = TRUE) %>%
  ungroup() %>%
  summarize(
    Fine_Positive_Samples_Geom_Mean = exp(mean(log(avg_copies), na.rm = TRUE)),
    Fine_Positive_Samples_GSD = exp(sd(log(avg_copies), na.rm = TRUE)),
    Fine_Positive_Samples_Max = exp(max(log(avg_copies), na.rm = TRUE)),
    Fine_Positive_Samples_n = n()
  ) %>%
  # GM and maximum are reported in scientific notation (as character)
  mutate(
    Fine_Positive_Samples_Geom_Mean = format(Fine_Positive_Samples_Geom_Mean, scientific = TRUE),
    Fine_Positive_Samples_Max = format(Fine_Positive_Samples_Max, scientific = TRUE)
  )


# Run coarse aerosol tobit model in sas and get the output - read in the coarse output here:
# SAS tobit output for the coarse fraction
table2_tobit_coarse <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/OUTPUT_MU_COARSE_UMD_POSITIVES.csv")

# Model predictions per subject and dpo. Pred is raised as a power of 10 to
# obtain copies and stored as final.copies; all other model-output columns
# are dropped.
# NOTE: this re-uses (overwrites) the name pred_values_for_neg_replicates.
pred_values_for_neg_replicates <- table2_tobit_coarse %>%
  select(subject.id, dpo, Pred) %>%
  mutate(final.copies = 10^(Pred)) %>%
  select(subject.id, dpo, final.copies) %>%
  arrange(subject.id, dpo)

# For coarse samples with a single positive replicate, take the predicted
# values for both replicates (even though one was observed positive and one
# negative).
imputed_coarse <- Aerosol_1_replicate_positive_raw_data_all_pos %>%
  filter(sample.type == "Impactor 5 um NO mask") %>%
  distinct(subject.id, dpo, type) %>%
  left_join(pred_values_for_neg_replicates) %>%
  select(subject.id, dpo, final.copies)

# Observed values for coarse samples where all replicates were positive
Aerosol_all_replicates_positive_raw_data_all_pos_coarse <- Aerosol_all_replicates_positive_raw_data_all_pos %>%
  filter(sample.type == "Impactor 5 um NO mask") %>%
  select(subject.id, dpo, final.copies)

# Imputed rows first, then the fully observed rows
observed_both_pos_plus_imputed_coarse <- bind_rows(
  imputed_coarse,
  Aerosol_all_replicates_positive_raw_data_all_pos_coarse
)


# coarse_pos_samples_minus_nondetects <- pos_samples %>%
#   ungroup() %>%
#   filter(sample.type == "Impactor 5 um NO mask") %>%
#   select(subject.id, dpo, final.copies) %>%
#   arrange(subject.id, dpo)
# 
# pos_samples_neg_replicates_imp_by_tobit <- coarse_pos_samples_minus_nondetects %>%
#   bind_cols(pred_values_for_neg_replicates) %>%
#   mutate(final.copies.combined = if_else(is.na(final.copies), final.copies1, final.copies))

# Average the replicate values within each subject x dpo, keep one row per
# subject x dpo, then compute GM, GSD, maximum and n of those averages on the
# natural-log scale (back-transformed). Output columns use a lower-case
# "coarse_" prefix.
coarse_sample_avg_table2 <- observed_both_pos_plus_imputed_coarse %>%
  group_by(subject.id, dpo) %>%
  mutate(avg_copies = mean(final.copies)) %>%
  distinct(subject.id, dpo, .keep_all = TRUE) %>%
  ungroup() %>%
  summarize(
    coarse_Positive_Samples_Geom_Mean = exp(mean(log(avg_copies), na.rm = TRUE)),
    coarse_Positive_Samples_GSD = exp(sd(log(avg_copies), na.rm = TRUE)),
    coarse_Positive_Samples_Max = exp(max(log(avg_copies), na.rm = TRUE)),
    coarse_Positive_Samples_n = n()
  ) %>%
  # GM and maximum are reported in scientific notation (as character)
  mutate(
    coarse_Positive_Samples_Geom_Mean = format(coarse_Positive_Samples_Geom_Mean, scientific = TRUE),
    coarse_Positive_Samples_Max = format(coarse_Positive_Samples_Max, scientific = TRUE)
  )


## GM and GSD for UK Quarantine using Tobit to impute for samples with negative replicates ####

# First need to get the right datasets for fine and coarse that have the positive samples. We consider positive samples to be samples that have at least 1 replicate positive and we impute the sample values where there is at least one non-detected replicate with the predicted values from tobit regression.

# Let's start with a convenient dataset that we have already cleaned up in the Natural_vs_Artificial_Infection project.

# Load the combined long-format dataset and drop the X column (presumably the
# row-name column written by an earlier write.csv).
Combined_Q_UMD_long_analysis <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/Natural_vs_Artificial_Infection/Analytical Datasets/Combined_Q_UMD_long_analysis.csv") %>%
  select(-X)

# Infected donors on study days 1-4, restricted to the variables used for the
# tobit analysis and sorted by descending copy number.
Q_tobit_data <- Combined_Q_UMD_long_analysis %>%
  filter(
    study.day %in% c(1, 2, 3, 4),
    Group == "Infected donors"
  ) %>%
  select(
    subject.id, age, sex, study.day,
    cough, upper_resp, lower_resp, systemic_sx, total_sx,
    sample.type, final.copies,
    cough_number, cough_categorical,
    Ct, NPswab, Coarse, Fine
  ) %>%
  arrange(desc(final.copies))


# Fine ("Condensate") dataset for the SAS tobit model.
# Step 1: find the subject x study-day combinations with at least one positive
#         (non-missing) Condensate replicate.
# Step 2: join back to Q_tobit_data on those keys and keep every Condensate
#         row for them, so negative replicates on those subject-days are
#         included alongside the positive ones.
# The original ran the distinct/left_join/filter pass twice; the second pass
# reproduced the same keys and rows, so it is done once here, with the join
# keys given explicitly.
Q_tobit_data_fine <- Q_tobit_data %>%
  filter(sample.type == "Condensate", !is.na(final.copies)) %>%
  distinct(subject.id, study.day) %>%
  left_join(Q_tobit_data, by = c("subject.id", "study.day")) %>%
  filter(sample.type == "Condensate") %>%
  arrange(desc(final.copies)) # SAS is strange - this statement makes SAS understand that the data is numeric and not character.


# Coarse ("Impactor") dataset for the SAS tobit model, built the same way:
# all Impactor rows for subject x study-days with at least one positive
# Impactor replicate.
Q_tobit_data_coarse <- Q_tobit_data %>%
  filter(sample.type == "Impactor", !is.na(final.copies)) %>%
  distinct(subject.id, study.day) %>%
  left_join(Q_tobit_data, by = c("subject.id", "study.day")) %>%
  filter(sample.type == "Impactor") %>%
  arrange(desc(final.copies)) # SAS is strange - this statement makes SAS understand that the data is numeric and not character.

# Write out these datasets (Q_tobit_data_fine and Q_tobit_data_coarse) for use in SAS tobit regression.
write.csv(Q_tobit_data_fine, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/Q_positive_fine_samples.csv")
write.csv(Q_tobit_data_coarse, "/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/Q_positive_coarse_samples.csv")

# Run tobit model in sas and get the output - read in the output here:
# However, because the tobit model for coarse doesn't work, we will not do the below so I am commenting it out here. If we wanted to compute UK GM and GSD for fine, the below would be what we would use. 

# EMIT_Q_tobit_fine <- read.csv("/Users/jbueno/Box Sync/EMIT/EMIT_Data_Analysis_Jake/EMIT_Quarantine/Curated Data/Analytical Datasets/OUTPUT_MU_Q_FINE.csv")
# 
# pos_fine_subject_days_plus_neg_days_imputed_from_model <- EMIT_Q_tobit_fine %>%
#   mutate(predicted = 10^(Pred)) %>%
#   mutate(final.copies.obs.imp = if_else(is.na(final.copies), predicted,
#                                         if_else(!is.na(final.copies), final.copies, NA_real_)))
# # The above pos_fine_subject_days_plus_neg_days_imputed_from_model df uses the tobit pred value to impute for the replicates that are assumed to be below the LOQ. 
# # The above version uses the tobit pred value for each replicate where there was a non-detect. However Don believes that it might be more valid to impute for both replicates where there was one replicate positive and one replicate negative. I'll try that here. 
# pos_fine_samples_neg_replicates_imputed_from_model <- EMIT_Q_tobit_fine %>%
#   filter(is.na(final.copies)) %>%
#   group_by(subject.id, study.day) %>%
#   count() %>%
#   filter(n >= 1) %>%
#   left_join(EMIT_Q_tobit_fine)
# # This gives data for all 28 observations where there was either 1 or 0 replicates positive -- for these instances we will take the tobit imputed data for all of these replicates. 
# 
# pos_fine_samples_both_rep_pos <- EMIT_Q_tobit_fine %>%
#   anti_join(pos_fine_samples_neg_replicates_imputed_from_model) 
# 
# pos_fine_subject_days_plus_neg_days_imputed_from_model <- pos_fine_samples_both_rep_pos %>%
#   bind_rows(pos_fine_samples_neg_replicates_imputed_from_model) %>%
#   mutate(predicted = 10^(Pred)) %>%
#   mutate(final.copies.obs.imp = if_else(is.na(final.copies), predicted,
#                                         if_else(!is.na(final.copies), final.copies, NA_real_)))
# # Compute the GM and GSD from sample averages (average replicates for subject.id-study.days and then take GM and GSD)


# In the end, we have decided to leave the GM and GSD values as they are. I will note that the UMD GM and GSD for fine and coarse don't match exactly that of the PNAS table 2, however, this is ok because we are computing it differently -- here we use LOQ*1/sqrt(2) to impute for replicates that were assumed to be non-detections (i.e., where a sample had at least one replicate positive but other replicate(s) were negative). These results are quite similar to the PNAS result. We cannot compute the fancy tobit regression for the UK data (although we tried) because the coarse dataset of positive samples is just too small to compute this, and so we are forced to leave the data comparisons here using the less than ideal LOQ*1/sqrt(2) method. However, given the small number of observations, and the overall amount of error in these computations, using this imputation method, as inadequate as it may be, should not bias the results too far. I will note here that we have included 167 fine samples in the computation of GM and GSD, however the PNAS paper used 166. This disparity has arisen because subject 223 was positive for both flu A and flu B in the fine aerosol fraction on the same day (this was the only subject to have a dual infection where both A and B were detected in the aerosol during the same sampling instance). It looks like in the PNAS report, only the A (or B) data was taken and used in the GM computation. However, here we have included both. It is actually probably even better practice to ignore this subject's aerosol data on this day because it is an outlier in the sense of the dual infection in the aerosol on a particular sampling instance.