reformatting code to select needed df for analysis

2020-11-20 11:43:03 +00:00 · 2020-11-20 11:43:03 +00:00 · b72c4df796
commit b72c4df796
parent a6cbaab40a
7 changed files with 243 additions and 102 deletions
--- a/colnames_clinical_meds.R
+++ b/colnames_clinical_meds.R
@ -25,7 +25,7 @@ clinical_cols = c("mosaic"
                    , "onsfindeath"
                    , "onset_2_initial"
                    , "o2_sat_admis"
-                    , "o2_sat_suppl"
+                    #, "o2_sat_suppl"
                    , "ethnicity"
                    , "smoking"
                    , "ia_cxr"
--- a/data_extraction_formatting.R
+++ b/data_extraction_formatting.R
@ -13,18 +13,25 @@ getwd()
 #====================
 source("read_data.R")

+#============================
+# Data to use: Important step
+#============================
+# select df to use
+my_data = fp_adults
+
 # clear unnecessary variables
-rm(all_df)
+rm(all_df, adult_df, fp_adults_na)
+
 ########################################################################

 #=========
 # sam
 #=========
 sam_regex = regex(".*_sam[1-3]{1}$", ignore_case = T)
-sam_cols_i = str_extract(colnames(adult_df), sam_regex)   # not boolean  
-#sam_cols_b = colnames(adult_df)%in%sam_cols_i # boolean
+sam_cols_i = str_extract(colnames(my_data), sam_regex)   # not boolean  
+#sam_cols_b = colnames(my_data)%in%sam_cols_i # boolean

-sam_cols = colnames(adult_df)[colnames(adult_df)%in%sam_cols_i]
+sam_cols = colnames(my_data)[colnames(my_data)%in%sam_cols_i]

 # this contains log columns +  daysamp_samXX: omitting these
 sam_regex_log_days = regex("log|day.*_sam[1-3]{1}$", ignore_case = T, perl = T)
@ -48,7 +55,7 @@ cat("Extracting SAM cols + metadata_cols")

 if ( length(sam_cols_to_extract) == length(meta_data_cols) + length(sam_cols_clean) ){
  cat("Extracing", length(sam_cols_to_extract),  "columns for sam")
-  sam_df = adult_df[, sam_cols_to_extract]
+  sam_df = my_data[, sam_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(sam_cols_clean), "columns"
@ -61,10 +68,10 @@ colnames_sam_df = colnames(sam_df); colnames_sam_df
 # serum
 #=========
 serum_regex = regex(".*_serum[1-3]{1}$", ignore_case = T)
-serum_cols_i = str_extract(colnames(adult_df), serum_regex)   # not boolean  
-#serum_cols_b = colnames(adult_df)%in%serum_cols_i # boolean
+serum_cols_i = str_extract(colnames(my_data), serum_regex)   # not boolean  
+#serum_cols_b = colnames(my_data)%in%serum_cols_i # boolean

-serum_cols = colnames(adult_df)[colnames(adult_df)%in%serum_cols_i]
+serum_cols = colnames(my_data)[colnames(my_data)%in%serum_cols_i]

 # this contains log columns +  dayserump_serumXX: omitting these
 serum_regex_log_days = regex("log|day.*_serum[1-3]{1}$", ignore_case = T, perl = T)
@ -88,7 +95,7 @@ cat("Extracting SERUM cols + metadata_cols")

 if ( length(serum_cols_to_extract) == length(meta_data_cols) + length(serum_cols_clean) ){
  cat("Extracing", length(serum_cols_to_extract),  "columns for serum")
-  serum_df = adult_df[, serum_cols_to_extract]
+  serum_df = my_data[, serum_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(serum_cols_clean), "columns"
@ -101,10 +108,10 @@ colnames_serum_df = colnames(serum_df); colnames_serum_df
 # npa
 #=========
 npa_regex = regex(".*_npa[1-3]{1}$", ignore_case = T)
-npa_cols_i = str_extract(colnames(adult_df), npa_regex)   # not boolean  
-#npa_cols_b = colnames(adult_df)%in%npa_cols_i # boolean
+npa_cols_i = str_extract(colnames(my_data), npa_regex)   # not boolean  
+#npa_cols_b = colnames(my_data)%in%npa_cols_i # boolean

-npa_cols = colnames(adult_df)[colnames(adult_df)%in%npa_cols_i]
+npa_cols = colnames(my_data)[colnames(my_data)%in%npa_cols_i]

 # this contains log columns +  daynpap_npaXX: omitting these
 npa_regex_log_days = regex("log|day|vl_samptime|ct.*_npa[1-3]{1}$", ignore_case = T, perl = T)
@ -128,7 +135,7 @@ cat("Extracting NPA cols + metadata_cols")

 if ( length(npa_cols_to_extract) == length(meta_data_cols) + length(npa_cols_clean) ){
  cat("Extracing", length(npa_cols_to_extract),  "columns for npa")
-  npa_df = adult_df[, npa_cols_to_extract]
+  npa_df = my_data[, npa_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(npa_cols_clean), "columns"
@ -166,21 +173,21 @@ for (i in extra_cols){
  }
 }
 tail(colnames_check_f)
-# write file?
-quick_check = as.data.frame(cbind(metadata_all$mosaic
-                                  , metadata_all$adult
-                                  , metadata_all$age
-                                  , metadata_all$obesity
-                                  , metadata_all$obese2
-                                  ))
-colnames(quick_check) = c("mosaic", "adult", "age", "obesity", "obese2")

 ##########################################################################
 #                               LF data
 ##########################################################################
-cols_to_omit = c("adult", "obese2"
-                 , "height", "height_unit", "weight"
-                 ,  "weight_unit", "visual_est_bmi", "bmi_rating")
+cols_to_omit = c("adult"
+                 #, "obese2"
+                 #, "height", "height_unit", "weight"
+                 #,  "weight_unit", "visual_est_bmi", "bmi_rating"
+                 )
+
+pivot_cols = meta_data_cols
+# subselect pivot_cols
+pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
+ncols_omitted = table(meta_data_cols%in%cols_to_omit)[[2]]
+ncols_omitted

 #==============
 # lf data: sam
@ -198,11 +205,11 @@ pivot_cols = meta_data_cols
 # subselect pivot_cols
 pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 

-if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
+if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){
  cat("PASS: pivot cols successfully extracted")
 }else{
  cat("FAIL: length mismatch! pivot cols could not be extracted"
-      , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit)
+      , "\nExpected length:", length(meta_data_cols) - ncols_omitted
      , "\nGot:",length(pivot_cols)  )
  quit()
 }
@ -249,11 +256,11 @@ serum_wf = serum_df_adults[wf_cols]
 pivot_cols = meta_data_cols
 pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 

-if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
+if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){
  cat("PASS: pivot cols successfully extracted")
 }else{
  cat("FAIL: length mismatch! pivot cols could not be extracted"
-      , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit)
+      , "\nExpected length:", length(meta_data_cols) - ncols_omitted
      , "\nGot:",length(pivot_cols)  )
  quit()
 }
@ -296,11 +303,11 @@ npa_wf = npa_df_adults[wf_cols]
 pivot_cols = meta_data_cols
 pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 

-if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
+if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){
  cat("PASS: pivot cols successfully extracted")
 }else{
  cat("FAIL: length mismatch! pivot cols could not be extracted"
-      , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit)
+      , "\nExpected length:", length(meta_data_cols) - ncols_omitted
      , "\nGot:",length(pivot_cols)  )
  quit()
 }
@ -333,7 +340,7 @@ if (
 rm(sam_regex, sam_regex_log_days, sam_cols, sam_cols_clean, sam_cols_i, sam_cols_to_extract, sam_cols_to_omit)
 rm(serum_regex, serum_regex_log_days, serum_cols, serum_cols_clean, serum_cols_i, serum_cols_to_extract, serum_cols_to_omit)
 rm(npa_regex, npa_regex_log_days, npa_cols, npa_cols_clean, npa_cols_i, npa_cols_to_extract, npa_cols_to_omit)
-rm(adult_df)
+rm(my_data)
 rm(colnames_check)
 rm(i, j
   #, expected_cols
@ -344,3 +351,4 @@ rm(sam_df_adults, serum_df_adults, npa_df_adults)

 # rm df
 rm(sam_df, serum_df, npa_df)
+rm(colnames_check_f, fp_adults)
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@ -53,31 +53,36 @@ if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(tab

 table(clinical_df$ia_exac_copd)

+str(clinical_df)
+#clinical_df$o2_sat_suppl
+
 ########################################################################
 #==================================
-# asthma and copd status correction
-# for conflicting field!
+# Check asthma and copd conflict
 #=================================
-
+if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){
+  cat("PASS: asthma and copd do not conflict")
+}else{
+  cat("Conflict detected in asthm and copd filed, attempting to resolve...")
  # Reassign the copd and asthma status and do some checks 
-table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
-
-clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0
-clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0
-
-table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
+  table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
+  fp_adults$ia_exac_copd[fp_adults$ia_exac_copd< 1]<- 0
+  fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0
+  table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
  
  # check copd and asthma status
-table(clinical_df$ia_exac_copd, clinical_df$asthma)
-check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3 
+  table(fp_adults$ia_exac_copd, fp_adults$asthma)
+  check_copd_and_asthma_1<- subset(fp_adults, ia_exac_copd ==1 & asthma == 1) # check this is 3 
  
  # reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
-clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0 
-table(clinical_df$ia_exac_copd, clinical_df$asthma)
-
-foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0
-
+  fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1]= 0 
+  table(fp_adults$ia_exac_copd, fp_adults$asthma)
+  foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0
  rm(check_copd_and_asthma_1, foo)
+  cat("Check status again...")
+
+}
+
 #=====================================================================
 #=================================
 # resp scores: In, max and t1 & t2
@ -125,29 +130,30 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
 # age
 #========
 # Create categories of variables
-clinical_df$age = round(clinical_df$age, digits = 0)
-table(clinical_df$age)
-table(clinical_df$asthma, clinical_df$age)
-min(clinical_df$age); max(clinical_df$age)
+clinical_df$age_int = round(clinical_df$age, digits = 0)
+table(clinical_df$age_int)
+table(clinical_df$asthma, clinical_df$age_int)
+min(clinical_df$age_int); max(clinical_df$age_int)

-max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling)  
+max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling)  
 max_age_interval 
-min_age = min(clinical_df$age); min_age #19
+min_age = min(clinical_df$age_int); min_age #19
 min_age_interval = min_age - 1; min_age_interval

-#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90))
-age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
+#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90))
+age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
 clinical_df$age_bins = age_bins
-dim(clinical_df) # 133 27
+dim(clinical_df) # 133 28

 # age_bins (to keep consistent with the results table)
 class(clinical_df$age_bins)
 levels(clinical_df$age_bins)
 #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
+
 table(clinical_df$asthma, clinical_df$age_bins)
 #     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
 #0      25      17      25      14      11      1
-#1      11       8      12       5       3      2
+#1      11       8      12       5       2      2

 if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
  cat("\nPASS: age_bins assigned successfully")
@ -156,7 +162,7 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
  quit()
 }

-# reassign
+# reassign levels
 class(clinical_df$age_bins)
 levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
 table(clinical_df$asthma, clinical_df$age_bins)
@ -170,11 +176,25 @@ sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)
 #===========================
 # O2 saturation binning
 #===========================
+clinical_df$o2_sat_admis
+n1 = sum(is.na(clinical_df$o2_sat_admis))
+
 clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0)
 table(clinical_df$o2_sat_admis)
 tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]]
 tot_o2

+n_text_code =  table(clinical_df$o2_sat_admis)[["-1"]]
+
+clinical_df$o2_sat_admis[clinical_df$o2_sat_admis <0] <- NA
+n2 = sum(is.na(clinical_df$o2_sat_admis))
+
+if (n2 == n1 + n_text_code) {
+  cat ("PASS: -1 code converted to NA")
+} else{
+  cat("FAIL: something went wrong!")
+}
+
 o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100))
 clinical_df$o2_sat_bin = o2_sat_bin
 table(clinical_df$o2_sat_bin)
@ -184,6 +204,8 @@ sum(table(clinical_df$o2_sat_bin)) == tot_o2
 #===========================
 # Onset to initial binning
 #===========================
+clinical_df$onset_2_initial
+
 max_in = max(clinical_df$onset_2_initial); max_in #23
 min_in = min(clinical_df$onset_2_initial) - 1 ; min_in  # -6

@ -198,14 +220,15 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
 #=======================
 # seasonal flu: sfluv
 #=======================
-# should be a factor
 if (! is.factor(clinical_df$sfluv)){
  clinical_df$sfluv  = as.factor(clinical_df$sfluv)
 }
-class(clinical_df$sfluv) #[1] "factor"
-
+class(clinical_df$sfluv)
 levels(clinical_df$sfluv)
+
+table(clinical_df$sfluv)
 table(clinical_df$asthma, clinical_df$sfluv)
+
 # reassign
 levels(clinical_df$sfluv) <- c("0", "0", "1")
 table(clinical_df$asthma, clinical_df$sfluv)
@ -213,14 +236,16 @@ table(clinical_df$asthma, clinical_df$sfluv)
 #=======================
 # h1n1v
 #=======================
-# should be a factor
 if (! is.factor(clinical_df$h1n1v)){
  clinical_df$h1n1v  = as.factor(clinical_df$h1n1v)
 }
-class(clinical_df$h1n1v) #[1] "factor"

+class(clinical_df$h1n1v)
 levels(clinical_df$h1n1v)
+
+table(clinical_df$h1n1v)
 table(clinical_df$asthma, clinical_df$h1n1v)
+
 # reassign
 levels(clinical_df$h1n1v) <- c("0", "0", "1")
 table(clinical_df$asthma, clinical_df$h1n1v)
@ -229,18 +254,21 @@ table(clinical_df$asthma, clinical_df$h1n1v)
 # ethnicity
 #=======================
 class(clinical_df$ethnicity) # integer
+table(clinical_df$ethnicity)
 table(clinical_df$asthma, clinical_df$ethnicity)

 clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2
+table(clinical_df$ethnicity)
 table(clinical_df$asthma, clinical_df$ethnicity)

 #=======================
 # pneumonia
 #=======================
+table(clinical_df$ia_cxr)
 class(clinical_df$ia_cxr) # integer
 # ia_cxr 2 ---> yes pneumonia (1)
 # 1 ---> no (0)
-# ! 1 or 2 -- > "unkown"
+# ! 1 or 2 -- > "unknown"

 # reassign the pneumonia codes
 #0: not performed 
@ -251,7 +279,6 @@ class(clinical_df$ia_cxr) # integer
 #-2: n/a specified by the clinician # not in the data...
 #-3: unknown specified by clinician

-
 table(clinical_df$ia_cxr)
 #-3 -1  0  1  2  3 
 #5 48 13 47 17  3 
@ -262,6 +289,8 @@ table(clinical_df$ia_cxr)
 #    1  2 
 #69 47 17

+sum(is.na(clinical_df$ia_cxr))
+
 clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0
 clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1
 table(clinical_df$ia_cxr)
@ -306,7 +335,7 @@ clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0
 clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1
 clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA

-table(clinical_df$smoking)
+table(clinical_df$smoking); sum(is.na(clinical_df$smoking))
 #    0  1 
 #30 49 54  

@ -316,17 +345,13 @@ table(clinical_df$asthma, clinical_df$smoking)
 #      0  1
 #0 24 32 37
 #1  6 17 17
-
-#      0  1
-#0 23 32 35
-#1  7 17 19
-
 ################################################################

 #=========================
 # Merge: clinical_df and infile ics
 #=========================
 merging_cols = intersect( names(clinical_df), names(clinical_ics) )
+merging_cols

 clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics 

@ -351,6 +376,15 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
      , "\nExpected nrows:", nrow(fp_adults))
 }

+# change the factor vars to integers
+str(clinical_df_ics)
+factor_vars = lapply(clinical_df_ics, class) == "factor"
+table(factor_vars)
+clinical_df_ics[, factor_vars] <- lapply(clinical_df_ics[, factor_vars], as.integer)
+table(factor_vars)
+
+str(clinical_df_ics)
+
 #======================
 # writing output file
 #======================
@ -359,9 +393,8 @@ outfile_reg = paste0(outdir, outfile_name_reg)

 cat("\nWriting clinical file for regression:", outfile_reg)

-write.csv(clinical_df_ics, file = outfile_reg)
+#write.csv(clinical_df_ics, file = outfile_reg)
 ################################################################
-
 rm(age_bins, max_age_interval, max_in, min_in
   , o2_sat_bin, onset_initial_bin, tot_o2
   , tot_onset2ini, meta_data_cols
--- a/flu_stats_contingency.R
+++ b/flu_stats_contingency.R
@ -214,4 +214,36 @@ comb_stats_categ_df_f =  comb_stats_categ_df[order(comb_stats_categ_df$p_signif
 # write output file
 #******************
 cat("Chisq and fishers test results in:", outfile_clin_categ)
-write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
+#write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
+
+#==================
+#0 date not recorded
+#-1 not recorded
+#-2 n/a specified by clinician
+#-3 unknown specified by
+
+
+chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$smoking))
+
+chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$max_resp_score))
+
+chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$T1_resp_score))
+chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$t1_resp_recoded))
+
+
+obese_df = clinical_df_ics[clinical_df_ics$obesity == 1,]
+not_ob_df = clinical_df_ics[clinical_df_ics$obesity == 0,]
+
+wilcox.test(obese_df$age, not_ob_df$age, paired = F)
+wilcox.test(obese_df$los, not_ob_df$los, paired = F)
+wilcox.test(obese_df$o2_sat_admis, not_ob_df$o2_sat_admis, paired = F)
+wilcox.test(obese_df$onset_2_initial, not_ob_df$onset_2_initial, paired = F)
+wilcox.test(obese_df$onset2final, not_ob_df$onset2final, paired = F)
+wilcox.test(obese_df$onsfindeath, not_ob_df$onsfindeath, paired = F)
+
+clinical_df_ics$age
+clinical_df_ics$los
+clinical_df_ics$o2_sat_admis #***** (already bin)
+clinical_df_ics$onset_2_initial # ***** (already bin)
+clinical_df_ics$onset2final
+clinical_df_ics$onsfindeath
--- a/flu_stats_unpaired_clinical.R
+++ b/flu_stats_unpaired_clinical.R
@ -25,27 +25,28 @@ outfile_clinical_unpaired
 # Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis 
 # No correction required
 ########################################################################
-
+str(clinical_df_ics)
 numerical_cols = c("age"
-                   #, "vl_pfu_ul_npa1"
+                   , "vl_pfu_ul_npa1"
                   , "los"
                   , "onset2final"
                   , "onsfindeath"
-                   , "onset_2_initial"
-                   , "o2_sat_admis")
+                   #, "onset_2_initial" # already bin
+                   #, "o2_sat_admis"# already bin
+)

 metadata_cols = c("mosaic", "obesity")

-clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)]
+clinical_df_numerical = clinical_df_ics[, c(metadata_cols, numerical_cols)]

 pivot_cols = metadata_cols
 #pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
 expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf

-
+# lf data colnames
 keycol <- "clinical_params"
 valuecol <- "value"
-gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis")
+gathercols <- numerical_cols 

 clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols)

@ -70,12 +71,15 @@ stats_un_clinical = compare_means(value~obesity
                            #, data = clinical_lf_comp
                            , paired = FALSE)

+head(stats_un_clinical)

+# rstatix
 stat_df <- clinical_lf %>%
  group_by(clinical_params) %>%
  wilcox_test(value ~ obesity, paired = F) %>%
  add_significance("p")
 stat_df$p_format = round(stat_df$p, digits = 3)
+stat_df

 #----------------------------------------
 # calculate n_obs for each clinical param: Overall
@ -101,31 +105,39 @@ n_all_gp  = merge(n_all, n_gp
 #----------------------------------------
 # calculate n_obs for each clinical param: complete cases
 #----------------------------------------
-n_comp = data.frame(table(clinical_lf_comp$clinical_params))
+n_comp = data.frame(table(clinical_lf$clinical_params))
 colnames(n_comp) = c("clinical_params", "n_complete")
 n_comp$clinical_params = as.character(n_comp$clinical_params)
 n_comp

-n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf
+n_gp_comp_lf = data.frame(table(clinical_lf$clinical_params
+                                , clinical_lf$obesity)); n_gp_comp_lf
 n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp
 colnames(n_gp_comp)
 colnames(n_gp_comp) = c("clinical_params"
                   , paste0("n_complete_gp", colnames(n_gp_comp)[2])
                   , paste0("n_complete_gp", colnames(n_gp_comp)[3]))

-
+#---------
+# merge 1
+#---------
 n_comp_gp  = merge(n_comp, n_gp_comp
                  , by = intersect( names(n_comp), names(n_gp_comp))
                  , all = T)
+n_comp_gp

+#---------
+# merge 2
+#---------
 merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols

 n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df

-#==================================
-# Merge: merge stats + n_obs df
-#===================================
+#----------------------------------
+# Merge 3: merge stats + n_obs df
+#----------------------------------
 merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols
+
 if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) {
  cat("PASS: merging stats and n_obs on column/s:", merging_cols)
  stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T)
@ -188,6 +200,7 @@ if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%co
  quit()
 }  
 # assign nice column names like replace "." with "_"
+# same ordering as my_col_order2, just minor formatting
 colnames(stats_clinical_df_f) = c("clinical_params"
                                   , "method"
                                   , "group1"
@ -208,4 +221,4 @@ colnames(stats_clinical_df_f)
 # write output file
 #******************
 cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
-#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)
+write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)
--- a/flu_stats_unpaired_npa.R
+++ b/flu_stats_unpaired_npa.R
@ -365,6 +365,14 @@ colnames(combined_unpaired_stats_f) = c("mediator"
                                        , "p_bon_signif")

 colnames(combined_unpaired_stats_f)
+
+# count how many meds are significant
+n_sig  = length(combined_unpaired_stats_f$mediator[combined_unpaired_stats_f$p_signif<0.05])
+cat("\nTotal no. of statistically significant mediators in", toupper(my_sample_type)
+    , "are:", n_sig)
+
+sig_meds = combined_unpaired_stats_f[combined_unpaired_stats_f$p_signif<0.05,]
+
 ########################################################################
 #******************
 # write output file
--- a/read_data.R
+++ b/read_data.R
@ -30,11 +30,11 @@ meta_data_cols = c("mosaic", "gender", "age"
                   , "adult"
                   , "flustat", "type"
                   , "obesity"
-                   , "obese2"
-                   , "height", "height_unit"
-                   , "weight", "weight_unit"
-                   , "ia_height_ftin", "ia_height_m", "ia_weight"
-                   , "visual_est_bmi", "bmi_rating"
+                   #, "obese2"
+                   #, "height", "height_unit"
+                   #, "weight", "weight_unit"
+                   #, "ia_height_ftin", "ia_height_m", "ia_weight"
+                   #, "visual_est_bmi", "bmi_rating"
                   )

 # check if these columns to select are present in the data
@ -55,9 +55,9 @@ if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){
  cat ("\nFAIL: adult df number mismatch!")
 }

-#==============
+#=================================
 # FLU positive: adult patients
-#==============
+#=================================
 # extract the flu positive population
 fp_adults = adult_df[adult_df$flustat == 1,]

@ -67,6 +67,53 @@ if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){
  cat ("\nFAIL: adult df number mismatch!")
 }

+#=============================================
+# FLU positive adult patients: without asthma
+#=============================================
+#-----------------------------------
+# asthma and copd status correction
+# for conflicting field!
+#------------------------------------
+# Reassign the copd and asthma status and do some checks 
+table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
+
+fp_adults$ia_exac_copd[fp_adults$ia_exac_copd< 1]<- 0
+fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0
+
+table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
+
+# check copd and asthma status
+table(fp_adults$ia_exac_copd, fp_adults$asthma)
+check_copd_and_asthma_1<- subset(fp_adults, ia_exac_copd ==1 & asthma == 1) # check this is 3 
+
+# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
+fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1]= 0 
+table(fp_adults$ia_exac_copd, fp_adults$asthma)
+
+foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0
+
+rm(check_copd_and_asthma_1, foo)
+
+if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0 ){
+  cat("\nPASS: asthma and copd do not conflict")
+} else{
+  cat ("\nFAIL: asthma and copd conflict not resolved!")
+  quit()
+}
+
+cat("\nExtracting flu positive without asthma")
+table(fp_adults$asthma)
+cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]]
+    , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]])
+str(fp_adults$asthma)
+
+table(fp_adults$obesity)
+table(fp_adults$obesity, fp_adults$asthma)
+
+fp_adults_na = fp_adults[fp_adults$asthma == 0,]
+table(fp_adults_na$obesity)
+table(fp_adults_na$obesity, fp_adults_na$asthma)
+
 #============
 # hc
 #============