diff --git a/colnames_clinical_meds.R b/colnames_clinical_meds.R index 546b729..b59d8ed 100644 --- a/colnames_clinical_meds.R +++ b/colnames_clinical_meds.R @@ -25,7 +25,7 @@ clinical_cols = c("mosaic" , "onsfindeath" , "onset_2_initial" , "o2_sat_admis" - , "o2_sat_suppl" + #, "o2_sat_suppl" , "ethnicity" , "smoking" , "ia_cxr" diff --git a/data_extraction_formatting.R b/data_extraction_formatting.R index 7b99913..cdc65a7 100755 --- a/data_extraction_formatting.R +++ b/data_extraction_formatting.R @@ -13,18 +13,25 @@ getwd() #==================== source("read_data.R") +#============================ +# Data to use: Important step +#============================ +# select df to use +my_data = fp_adults + # clear unnecessary variables -rm(all_df) +rm(all_df, adult_df, fp_adults_na) + ######################################################################## #========= # sam #========= sam_regex = regex(".*_sam[1-3]{1}$", ignore_case = T) -sam_cols_i = str_extract(colnames(adult_df), sam_regex) # not boolean -#sam_cols_b = colnames(adult_df)%in%sam_cols_i # boolean +sam_cols_i = str_extract(colnames(my_data), sam_regex) # not boolean +#sam_cols_b = colnames(my_data)%in%sam_cols_i # boolean -sam_cols = colnames(adult_df)[colnames(adult_df)%in%sam_cols_i] +sam_cols = colnames(my_data)[colnames(my_data)%in%sam_cols_i] # this contains log columns + daysamp_samXX: omitting these sam_regex_log_days = regex("log|day.*_sam[1-3]{1}$", ignore_case = T, perl = T) @@ -48,7 +55,7 @@ cat("Extracting SAM cols + metadata_cols") if ( length(sam_cols_to_extract) == length(meta_data_cols) + length(sam_cols_clean) ){ cat("Extracing", length(sam_cols_to_extract), "columns for sam") - sam_df = adult_df[, sam_cols_to_extract] + sam_df = my_data[, sam_cols_to_extract] }else{ cat("FAIL: length mismatch" , "Expeceted to extract:", length(meta_data_cols) + length(sam_cols_clean), "columns" @@ -61,10 +68,10 @@ colnames_sam_df = colnames(sam_df); colnames_sam_df # serum #========= serum_regex = regex(".*_serum[1-3]{1}$", ignore_case = T) -serum_cols_i = str_extract(colnames(adult_df), serum_regex) # not boolean -#serum_cols_b = colnames(adult_df)%in%serum_cols_i # boolean +serum_cols_i = str_extract(colnames(my_data), serum_regex) # not boolean +#serum_cols_b = colnames(my_data)%in%serum_cols_i # boolean -serum_cols = colnames(adult_df)[colnames(adult_df)%in%serum_cols_i] +serum_cols = colnames(my_data)[colnames(my_data)%in%serum_cols_i] # this contains log columns + dayserump_serumXX: omitting these serum_regex_log_days = regex("log|day.*_serum[1-3]{1}$", ignore_case = T, perl = T) @@ -88,7 +95,7 @@ cat("Extracting SERUM cols + metadata_cols") if ( length(serum_cols_to_extract) == length(meta_data_cols) + length(serum_cols_clean) ){ cat("Extracing", length(serum_cols_to_extract), "columns for serum") - serum_df = adult_df[, serum_cols_to_extract] + serum_df = my_data[, serum_cols_to_extract] }else{ cat("FAIL: length mismatch" , "Expeceted to extract:", length(meta_data_cols) + length(serum_cols_clean), "columns" @@ -101,10 +108,10 @@ colnames_serum_df = colnames(serum_df); colnames_serum_df # npa #========= npa_regex = regex(".*_npa[1-3]{1}$", ignore_case = T) -npa_cols_i = str_extract(colnames(adult_df), npa_regex) # not boolean -#npa_cols_b = colnames(adult_df)%in%npa_cols_i # boolean +npa_cols_i = str_extract(colnames(my_data), npa_regex) # not boolean +#npa_cols_b = colnames(my_data)%in%npa_cols_i # boolean -npa_cols = colnames(adult_df)[colnames(adult_df)%in%npa_cols_i] +npa_cols = colnames(my_data)[colnames(my_data)%in%npa_cols_i] # this contains log columns + daynpap_npaXX: omitting these npa_regex_log_days = regex("log|day|vl_samptime|ct.*_npa[1-3]{1}$", ignore_case = T, perl = T) @@ -128,7 +135,7 @@ cat("Extracting NPA cols + metadata_cols") if ( length(npa_cols_to_extract) == length(meta_data_cols) + length(npa_cols_clean) ){ cat("Extracing", length(npa_cols_to_extract), "columns for npa") - npa_df = adult_df[, npa_cols_to_extract] + npa_df = my_data[, npa_cols_to_extract] }else{ cat("FAIL: length mismatch" , "Expeceted to extract:", length(meta_data_cols) + length(npa_cols_clean), "columns" @@ -166,21 +173,21 @@ for (i in extra_cols){ } } tail(colnames_check_f) -# write file? -quick_check = as.data.frame(cbind(metadata_all$mosaic - , metadata_all$adult - , metadata_all$age - , metadata_all$obesity - , metadata_all$obese2 - )) -colnames(quick_check) = c("mosaic", "adult", "age", "obesity", "obese2") ########################################################################## -# LF data +# LF data ########################################################################## -cols_to_omit = c("adult", "obese2" - , "height", "height_unit", "weight" - , "weight_unit", "visual_est_bmi", "bmi_rating") +cols_to_omit = c("adult" + #, "obese2" + #, "height", "height_unit", "weight" + #, "weight_unit", "visual_est_bmi", "bmi_rating" + ) + +pivot_cols = meta_data_cols +# subselect pivot_cols +pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols +ncols_omitted = table(meta_data_cols%in%cols_to_omit)[[2]] +ncols_omitted #============== # lf data: sam @@ -198,11 +205,11 @@ pivot_cols = meta_data_cols # subselect pivot_cols pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols -if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ cat("PASS: pivot cols successfully extracted") }else{ cat("FAIL: length mismatch! pivot cols could not be extracted" - , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit) + , "\nExpected length:", length(meta_data_cols) - ncols_omitted , "\nGot:",length(pivot_cols) ) quit() } @@ -249,11 +256,11 @@ serum_wf = serum_df_adults[wf_cols] pivot_cols = meta_data_cols pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols -if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ cat("PASS: pivot cols successfully extracted") }else{ cat("FAIL: length mismatch! pivot cols could not be extracted" - , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit) + , "\nExpected length:", length(meta_data_cols) - ncols_omitted , "\nGot:",length(pivot_cols) ) quit() } @@ -296,11 +303,11 @@ npa_wf = npa_df_adults[wf_cols] pivot_cols = meta_data_cols pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols -if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ cat("PASS: pivot cols successfully extracted") }else{ cat("FAIL: length mismatch! pivot cols could not be extracted" - , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit) + , "\nExpected length:", length(meta_data_cols) - ncols_omitted , "\nGot:",length(pivot_cols) ) quit() } @@ -333,7 +340,7 @@ if ( rm(sam_regex, sam_regex_log_days, sam_cols, sam_cols_clean, sam_cols_i, sam_cols_to_extract, sam_cols_to_omit) rm(serum_regex, serum_regex_log_days, serum_cols, serum_cols_clean, serum_cols_i, serum_cols_to_extract, serum_cols_to_omit) rm(npa_regex, npa_regex_log_days, npa_cols, npa_cols_clean, npa_cols_i, npa_cols_to_extract, npa_cols_to_omit) -rm(adult_df) +rm(my_data) rm(colnames_check) rm(i, j #, expected_cols @@ -344,3 +351,4 @@ rm(sam_df_adults, serum_df_adults, npa_df_adults) # rm df rm(sam_df, serum_df, npa_df) +rm(colnames_check_f, fp_adults) diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R index 5bb10bf..acd5ce1 100644 --- a/data_extraction_formatting_clinical.R +++ b/data_extraction_formatting_clinical.R @@ -53,31 +53,36 @@ if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(tab table(clinical_df$ia_exac_copd) +str(clinical_df) +#clinical_df$o2_sat_suppl + ######################################################################## #================================== -# asthma and copd status correction -# for conflicting field! +# Check asthma and copd conflict #================================= +if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){ + cat("PASS: asthma and copd do not conflict") +}else{ + cat("Conflict detected in asthm and copd filed, attempting to resolve...") + # Reassign the copd and asthma status and do some checks + table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd)) + fp_adults$ia_exac_copd[fp_adults$ia_exac_copd< 1]<- 0 + fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0 + table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd)) + + # check copd and asthma status + table(fp_adults$ia_exac_copd, fp_adults$asthma) + check_copd_and_asthma_1<- subset(fp_adults, ia_exac_copd ==1 & asthma == 1) # check this is 3 + + # reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma + fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1]= 0 + table(fp_adults$ia_exac_copd, fp_adults$asthma) + foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0 + rm(check_copd_and_asthma_1, foo) + cat("Check status again...") -# Reassign the copd and asthma status and do some checks -table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd)) +} -clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0 -clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0 - -table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd)) - -# check copd and asthma status -table(clinical_df$ia_exac_copd, clinical_df$asthma) -check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3 - -# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma -clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0 -table(clinical_df$ia_exac_copd, clinical_df$asthma) - -foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0 - -rm(check_copd_and_asthma_1, foo) #===================================================================== #================================= # resp scores: In, max and t1 & t2 @@ -97,7 +102,7 @@ Inresp_sev<- table(clinical_df$inresp_sev) Inresp_sev # Reassign the resp score so all 4 are replace by 3 -clinical_df$max_resp_score[clinical_df$max_resp_score ==4 ] <- 3 +clinical_df$max_resp_score[clinical_df$max_resp_score == 4 ] <- 3 revised_resp_score_table<- table(clinical_df$max_resp_score) revised_resp_score_table @@ -125,29 +130,30 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev # age #======== # Create categories of variables -clinical_df$age = round(clinical_df$age, digits = 0) -table(clinical_df$age) -table(clinical_df$asthma, clinical_df$age) -min(clinical_df$age); max(clinical_df$age) +clinical_df$age_int = round(clinical_df$age, digits = 0) +table(clinical_df$age_int) +table(clinical_df$asthma, clinical_df$age_int) +min(clinical_df$age_int); max(clinical_df$age_int) -max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling) +max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling) max_age_interval -min_age = min(clinical_df$age); min_age #19 +min_age = min(clinical_df$age_int); min_age #19 min_age_interval = min_age - 1; min_age_interval -#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90)) -age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) +#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90)) +age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) clinical_df$age_bins = age_bins -dim(clinical_df) # 133 27 +dim(clinical_df) # 133 28 # age_bins (to keep consistent with the results table) class(clinical_df$age_bins) levels(clinical_df$age_bins) #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" + table(clinical_df$asthma, clinical_df$age_bins) # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] #0 25 17 25 14 11 1 -#1 11 8 12 5 3 2 +#1 11 8 12 5 2 2 if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ cat("\nPASS: age_bins assigned successfully") @@ -156,7 +162,7 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ quit() } -# reassign +# reassign levels class(clinical_df$age_bins) levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") table(clinical_df$asthma, clinical_df$age_bins) @@ -170,11 +176,25 @@ sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) #=========================== # O2 saturation binning #=========================== +clinical_df$o2_sat_admis +n1 = sum(is.na(clinical_df$o2_sat_admis)) + clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0) table(clinical_df$o2_sat_admis) tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]] tot_o2 +n_text_code = table(clinical_df$o2_sat_admis)[["-1"]] + +clinical_df$o2_sat_admis[clinical_df$o2_sat_admis <0] <- NA +n2 = sum(is.na(clinical_df$o2_sat_admis)) + +if (n2 == n1 + n_text_code) { + cat ("PASS: -1 code converted to NA") +} else{ + cat("FAIL: something went wrong!") +} + o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100)) clinical_df$o2_sat_bin = o2_sat_bin table(clinical_df$o2_sat_bin) @@ -184,6 +204,8 @@ sum(table(clinical_df$o2_sat_bin)) == tot_o2 #=========================== # Onset to initial binning #=========================== +clinical_df$onset_2_initial + max_in = max(clinical_df$onset_2_initial); max_in #23 min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6 @@ -198,14 +220,15 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= -# should be a factor if (! is.factor(clinical_df$sfluv)){ clinical_df$sfluv = as.factor(clinical_df$sfluv) } -class(clinical_df$sfluv) #[1] "factor" - +class(clinical_df$sfluv) levels(clinical_df$sfluv) + +table(clinical_df$sfluv) table(clinical_df$asthma, clinical_df$sfluv) + # reassign levels(clinical_df$sfluv) <- c("0", "0", "1") table(clinical_df$asthma, clinical_df$sfluv) @@ -213,14 +236,16 @@ table(clinical_df$asthma, clinical_df$sfluv) #======================= # h1n1v #======================= -# should be a factor if (! is.factor(clinical_df$h1n1v)){ clinical_df$h1n1v = as.factor(clinical_df$h1n1v) } -class(clinical_df$h1n1v) #[1] "factor" +class(clinical_df$h1n1v) levels(clinical_df$h1n1v) + +table(clinical_df$h1n1v) table(clinical_df$asthma, clinical_df$h1n1v) + # reassign levels(clinical_df$h1n1v) <- c("0", "0", "1") table(clinical_df$asthma, clinical_df$h1n1v) @@ -229,18 +254,21 @@ table(clinical_df$asthma, clinical_df$h1n1v) # ethnicity #======================= class(clinical_df$ethnicity) # integer +table(clinical_df$ethnicity) table(clinical_df$asthma, clinical_df$ethnicity) clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2 +table(clinical_df$ethnicity) table(clinical_df$asthma, clinical_df$ethnicity) #======================= # pneumonia #======================= +table(clinical_df$ia_cxr) class(clinical_df$ia_cxr) # integer # ia_cxr 2 ---> yes pneumonia (1) # 1 ---> no (0) -# ! 1 or 2 -- > "unkown" +# ! 1 or 2 -- > "unknown" # reassign the pneumonia codes #0: not performed @@ -251,7 +279,6 @@ class(clinical_df$ia_cxr) # integer #-2: n/a specified by the clinician # not in the data... #-3: unknown specified by clinician - table(clinical_df$ia_cxr) #-3 -1 0 1 2 3 #5 48 13 47 17 3 @@ -262,6 +289,8 @@ table(clinical_df$ia_cxr) # 1 2 #69 47 17 +sum(is.na(clinical_df$ia_cxr)) + clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0 clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1 table(clinical_df$ia_cxr) @@ -306,7 +335,7 @@ clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0 clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1 clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA -table(clinical_df$smoking) +table(clinical_df$smoking); sum(is.na(clinical_df$smoking)) # 0 1 #30 49 54 @@ -316,17 +345,13 @@ table(clinical_df$asthma, clinical_df$smoking) # 0 1 #0 24 32 37 #1 6 17 17 - -# 0 1 -#0 23 32 35 -#1 7 17 19 - ################################################################ #========================= # Merge: clinical_df and infile ics #========================= merging_cols = intersect( names(clinical_df), names(clinical_ics) ) +merging_cols clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics @@ -351,6 +376,15 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){ , "\nExpected nrows:", nrow(fp_adults)) } +# change the factor vars to integers +str(clinical_df_ics) +factor_vars = lapply(clinical_df_ics, class) == "factor" +table(factor_vars) +clinical_df_ics[, factor_vars] <- lapply(clinical_df_ics[, factor_vars], as.integer) +table(factor_vars) + +str(clinical_df_ics) + #====================== # writing output file #====================== @@ -359,9 +393,8 @@ outfile_reg = paste0(outdir, outfile_name_reg) cat("\nWriting clinical file for regression:", outfile_reg) -write.csv(clinical_df_ics, file = outfile_reg) +#write.csv(clinical_df_ics, file = outfile_reg) ################################################################ - rm(age_bins, max_age_interval, max_in, min_in , o2_sat_bin, onset_initial_bin, tot_o2 , tot_onset2ini, meta_data_cols diff --git a/flu_stats_contingency.R b/flu_stats_contingency.R index 8f43b88..a191912 100755 --- a/flu_stats_contingency.R +++ b/flu_stats_contingency.R @@ -214,4 +214,36 @@ comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif # write output file #****************** cat("Chisq and fishers test results in:", outfile_clin_categ) -write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) +#write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) + +#================== +#0 date not recorded +#-1 not recorded +#-2 n/a specified by clinician +#-3 unknown specified by + + +chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$smoking)) + +chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$max_resp_score)) + +chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$T1_resp_score)) +chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$t1_resp_recoded)) + + +obese_df = clinical_df_ics[clinical_df_ics$obesity == 1,] +not_ob_df = clinical_df_ics[clinical_df_ics$obesity == 0,] + +wilcox.test(obese_df$age, not_ob_df$age, paired = F) +wilcox.test(obese_df$los, not_ob_df$los, paired = F) +wilcox.test(obese_df$o2_sat_admis, not_ob_df$o2_sat_admis, paired = F) +wilcox.test(obese_df$onset_2_initial, not_ob_df$onset_2_initial, paired = F) +wilcox.test(obese_df$onset2final, not_ob_df$onset2final, paired = F) +wilcox.test(obese_df$onsfindeath, not_ob_df$onsfindeath, paired = F) + +clinical_df_ics$age +clinical_df_ics$los +clinical_df_ics$o2_sat_admis #***** (already bin) +clinical_df_ics$onset_2_initial # ***** (already bin) +clinical_df_ics$onset2final +clinical_df_ics$onsfindeath diff --git a/flu_stats_unpaired_clinical.R b/flu_stats_unpaired_clinical.R index 67ef423..8e119b4 100755 --- a/flu_stats_unpaired_clinical.R +++ b/flu_stats_unpaired_clinical.R @@ -25,27 +25,28 @@ outfile_clinical_unpaired # Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis # No correction required ######################################################################## - +str(clinical_df_ics) numerical_cols = c("age" - #, "vl_pfu_ul_npa1" + , "vl_pfu_ul_npa1" , "los" , "onset2final" , "onsfindeath" - , "onset_2_initial" - , "o2_sat_admis") + #, "onset_2_initial" # already bin + #, "o2_sat_admis"# already bin +) metadata_cols = c("mosaic", "obesity") -clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)] +clinical_df_numerical = clinical_df_ics[, c(metadata_cols, numerical_cols)] pivot_cols = metadata_cols #pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf - +# lf data colnames keycol <- "clinical_params" valuecol <- "value" -gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis") +gathercols <- numerical_cols clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols) @@ -70,12 +71,15 @@ stats_un_clinical = compare_means(value~obesity #, data = clinical_lf_comp , paired = FALSE) +head(stats_un_clinical) +# rstatix stat_df <- clinical_lf %>% group_by(clinical_params) %>% wilcox_test(value ~ obesity, paired = F) %>% add_significance("p") stat_df$p_format = round(stat_df$p, digits = 3) +stat_df #---------------------------------------- # calculate n_obs for each clinical param: Overall @@ -101,31 +105,39 @@ n_all_gp = merge(n_all, n_gp #---------------------------------------- # calculate n_obs for each clinical param: complete cases #---------------------------------------- -n_comp = data.frame(table(clinical_lf_comp$clinical_params)) +n_comp = data.frame(table(clinical_lf$clinical_params)) colnames(n_comp) = c("clinical_params", "n_complete") n_comp$clinical_params = as.character(n_comp$clinical_params) n_comp -n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf +n_gp_comp_lf = data.frame(table(clinical_lf$clinical_params + , clinical_lf$obesity)); n_gp_comp_lf n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp colnames(n_gp_comp) colnames(n_gp_comp) = c("clinical_params" , paste0("n_complete_gp", colnames(n_gp_comp)[2]) , paste0("n_complete_gp", colnames(n_gp_comp)[3])) - +#--------- +# merge 1 +#--------- n_comp_gp = merge(n_comp, n_gp_comp , by = intersect( names(n_comp), names(n_gp_comp)) , all = T) +n_comp_gp +#--------- +# merge 2 +#--------- merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df -#================================== -# Merge: merge stats + n_obs df -#=================================== +#---------------------------------- +# Merge 3: merge stats + n_obs df +#---------------------------------- merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols + if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) { cat("PASS: merging stats and n_obs on column/s:", merging_cols) stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T) @@ -188,6 +200,7 @@ if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%co quit() } # assign nice column names like replace "." with "_" +# same ordering as my_col_order2, just minor formatting colnames(stats_clinical_df_f) = c("clinical_params" , "method" , "group1" @@ -208,4 +221,4 @@ colnames(stats_clinical_df_f) # write output file #****************** cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired) -#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE) +write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE) diff --git a/flu_stats_unpaired_npa.R b/flu_stats_unpaired_npa.R index ebd8647..f302f8f 100755 --- a/flu_stats_unpaired_npa.R +++ b/flu_stats_unpaired_npa.R @@ -365,6 +365,14 @@ colnames(combined_unpaired_stats_f) = c("mediator" , "p_bon_signif") colnames(combined_unpaired_stats_f) + +# count how many meds are significant +n_sig = length(combined_unpaired_stats_f$mediator[combined_unpaired_stats_f$p_signif<0.05]) +cat("\nTotal no. of statistically significant mediators in", toupper(my_sample_type) + , "are:", n_sig) + +sig_meds = combined_unpaired_stats_f[combined_unpaired_stats_f$p_signif<0.05,] + ######################################################################## #****************** # write output file diff --git a/read_data.R b/read_data.R index 614ed87..e8686e3 100755 --- a/read_data.R +++ b/read_data.R @@ -30,11 +30,11 @@ meta_data_cols = c("mosaic", "gender", "age" , "adult" , "flustat", "type" , "obesity" - , "obese2" - , "height", "height_unit" - , "weight", "weight_unit" - , "ia_height_ftin", "ia_height_m", "ia_weight" - , "visual_est_bmi", "bmi_rating" + #, "obese2" + #, "height", "height_unit" + #, "weight", "weight_unit" + #, "ia_height_ftin", "ia_height_m", "ia_weight" + #, "visual_est_bmi", "bmi_rating" ) # check if these columns to select are present in the data @@ -55,9 +55,9 @@ if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){ cat ("\nFAIL: adult df number mismatch!") } -#============== +#================================= # FLU positive: adult patients -#============== +#================================= # extract the flu positive population fp_adults = adult_df[adult_df$flustat == 1,] @@ -67,6 +67,53 @@ if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){ cat ("\nFAIL: adult df number mismatch!") } +#============================================= +# FLU positive adult patients: without asthma +#============================================= +#----------------------------------- +# asthma and copd status correction +# for conflicting field! +#------------------------------------ +# Reassign the copd and asthma status and do some checks +table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd)) + +fp_adults$ia_exac_copd[fp_adults$ia_exac_copd< 1]<- 0 +fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0 + +table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd)) + +# check copd and asthma status +table(fp_adults$ia_exac_copd, fp_adults$asthma) +check_copd_and_asthma_1<- subset(fp_adults, ia_exac_copd ==1 & asthma == 1) # check this is 3 + +# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma +fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1]= 0 +table(fp_adults$ia_exac_copd, fp_adults$asthma) + +foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0 + +rm(check_copd_and_asthma_1, foo) + +if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0 ){ + cat("\nPASS: asthma and copd do not conflict") +} else{ + cat ("\nFAIL: asthma and copd conflict not resolved!") + quit() +} + +cat("\nExtracting flu positive without asthma") +table(fp_adults$asthma) +cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]] + , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]]) +str(fp_adults$asthma) + +table(fp_adults$obesity) +table(fp_adults$obesity, fp_adults$asthma) + +fp_adults_na = fp_adults[fp_adults$asthma == 0,] +table(fp_adults_na$obesity) +table(fp_adults_na$obesity, fp_adults_na$asthma) + #============ # hc #============