From 269918d696b79d07ba1271e053e92e4197d786bb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 19 Nov 2020 17:48:08 +0000 Subject: [PATCH] separated fisher test only and renamed logistic_outcome col --- data_extraction_formatting_clinical.R | 13 +- flu_stats_contingency.R | 215 +++++++++++--------------- 2 files changed, 98 insertions(+), 130 deletions(-) diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R index 95ad909..5bb10bf 100644 --- a/data_extraction_formatting_clinical.R +++ b/data_extraction_formatting_clinical.R @@ -330,6 +330,12 @@ merging_cols = intersect( names(clinical_df), names(clinical_ics) ) clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics +colnames(clinical_df_ics) + +# change colname of logistic_outcome +c1 = which(colnames(clinical_df_ics) == "logistic_outcome") +colnames(clinical_df_ics)[c1] <- "t1_resp_recoded" + if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){ cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics) , "\nChecking ncols...") @@ -353,7 +359,10 @@ outfile_reg = paste0(outdir, outfile_name_reg) cat("\nWriting clinical file for regression:", outfile_reg) -#write.csv(clinical_df_ics, file = outfile_reg) +write.csv(clinical_df_ics, file = outfile_reg) ################################################################ -rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols) +rm(age_bins, max_age_interval, max_in, min_in + , o2_sat_bin, onset_initial_bin, tot_o2 + , tot_onset2ini, meta_data_cols + , clinical_df) diff --git a/flu_stats_contingency.R b/flu_stats_contingency.R index bf22982..a64ad95 100755 --- a/flu_stats_contingency.R +++ b/flu_stats_contingency.R @@ -3,20 +3,22 @@ getwd() setwd("~/git/mosaic_2020/") getwd() ############################################################ -# TASK: unpaired (time) analysis of clinical data +# TASK: Contingency table analysis i.e chisq and fishers # data: clincial data of flu positive adult patients # group: obesity + +# Chisq test +#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA ############################################################ -#my_sample_type = "npa" #============= # Input #============= source("data_extraction_formatting_clinical.R") -str(clinical_df) +str(clinical_df_ics) -clinical_df$sfluv = as.integer(clinical_df$sfluv) -clinical_df$h1n1v = as.integer(clinical_df$h1n1v) +clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv) +clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v) #============= # Output @@ -44,58 +46,28 @@ categorical_cols = c( "death" , "com_noasthma" , "T2_resp_score" , "inresp_sev" - , "steroid") - + , "steroid" + , "age_bins" + , "o2_sat_bin" + , "onset_initial_bin" + , "t1_resp_recoded" + , "steroid_ics") metadata_cols = c("mosaic", "obesity") -categ_df = clinical_df[, c(metadata_cols, categorical_cols)] +categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)] # quick test tab = table(categ_df$obesity, categ_df$death) tab chisq.test(tab) -pivot_cols = metadata_cols -#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols -expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf - -colnames(categ_df) - -keycol <- "clinical_categ_params" -valuecol <- "value" -gathercols <- c("death" - #, "flustat" - , "sfluv", "h1n1v" - , "gender", "asthma" - #, "o2_sat_suppl" - , "ethnicity" - , "smoking", "ia_cxr", "max_resp_score", "T1_resp_score" - , "com_noasthma", "T2_resp_score", "inresp_sev", "steroid") - -categ_lf = gather_(categ_df, keycol, valuecol, gathercols) -#categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F) - -if( nrow(categ_lf) == expected_rows_categ_lf){ - cat("PASS: long format data created successfully" - , "\nnrow:", nrow(categ_lf) - , "\nncol:", ncol(categ_lf)) -} - -#============================================= -# Chisq test: clinical categorical vars -#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA -#============================================= - - - -#============================================= -# wf data -#============================================= +#======================================================== +# Data: for chisq and fisher +#======================================================== stats_df = subset(categ_df, select = -c(mosaic)) -drop_cols = c("mosaic", "obesity") -my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols] +my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols] if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){ cat("PASS: variables for chisq test successfully extracted") @@ -103,16 +75,19 @@ if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){ cat("FAIL: length mismatch when extracting variables for chisq") quit() } - -# quick test -tab = table(stats_df$obesity, stats_df$smoking) -tab -test1 = chisq.test(tab) - +######################################################################## +#-------------------- +# chisq test +#-------------------- chisq_df = data.frame() for (i in my_categ_cols){ #print(i) - df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA) + df = data.frame(clinical_categ_params = NA + , n_obs = NA + , method = NA + #, test_statistic = NA + , p = NA) + my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) @@ -123,20 +98,21 @@ for (i in my_categ_cols){ my_param_name = i my_n_obs = sum(my_chi_test$observed) my_method = my_chi_test$method - my_test_statistic = my_chi_test$statistic[[1]] + #my_test_statistic = my_chi_test$statistic[[1]] my_pval = my_chi_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method - df$test_statistic = my_test_statistic + #df$test_statistic = my_test_statistic df$p = my_pval print(df) chisq_df = rbind(chisq_df, df) } +# formatting chisq_df$p_format = format.pval(chisq_df$p) chisq_df$p_signif = chisq_df$p @@ -147,34 +123,29 @@ chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "." , p_signif <0.05 ~ '*' , p_signif == 0.1 ~ '.' , TRUE ~ 'ns')) -#============================================= -# Fishers test -#============================================= + # quick test -tab2 = table(stats_df$obesity, stats_df$smoking) -tab2 -#test2 = fisher_test(tab2); test2 -test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3 +tab = table(stats_df$obesity, stats_df$smoking); tab +test1 = chisq.test(tab); test1 +test2 = chisq_test(tab); test2 #rstatix +sum(tab) -stats_df_copy = stats_df -int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric") -int_vars -stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor) -str(stats_df_copy) +#================================================================================ +#-------------------- +# Fishers test: without any OR +# see standalone script for this! +#-------------------- -two_lev = lapply(stats_df_copy, nlevels) == 2 -fisher_cols_df1 = names(two_lev)[two_lev == TRUE] -fisher_cols_df1 +fisher_df = data.frame() -drop_cols = c("mosaic", "obesity") -fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols] -fisher_cols_df1 - -fisher_df1 = data.frame() - -for (i in fisher_cols_df1){ +for (i in my_categ_cols){ cat(i, "\n===============\n") - df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA) + df = data.frame(clinical_categ_params = NA + , n_obs = NA + , method = NA + #, test_statistic = NA + , p = NA) + my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) @@ -185,74 +156,62 @@ for (i in fisher_cols_df1){ my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method - my_test_statistic = my_fisher_test$statistic[[1]] # FIXME| + #my_test_statistic = my_fisher_test$estimate[[1]] my_pval = my_fisher_test$p.value - my_ci_low = my_fisher_test$conf.int[[1]] - my_ci_hi = my_fisher_test$conf.int[[2]] - + # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method - df$test_statistic = my_test_statistic + #df$test_statistic = my_test_statistic df$p = my_pval - df$ci_low = my_ci_low - df$ci_high = my_ci_hi - + print(df) - fisher_df1 = rbind(fisher_df1, df) + fisher_df = rbind(fisher_df, df) } -#============= +# formatting +fisher_df$p_format = format.pval(fisher_df$p) +fisher_df$p_signif = fisher_df$p -three_lev = lapply(stats_df_copy, nlevels) == 3 -fisher_cols_df2 = names(three_lev)[three_lev == TRUE] -fisher_cols_df2 +fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "." + , p_signif <=0.0001 ~ '****' + , p_signif <=0.001 ~ '***' + , p_signif <=0.01 ~ '**' + , p_signif <0.05 ~ '*' + , p_signif == 0.1 ~ '.' + , TRUE ~ 'ns')) -#drop_cols = c("mosaic", "obesity") -#fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols] -#fisher_cols_df2 +# quick test +tab_f = table(stats_df$obesity, stats_df$smoking); tab_f +test1_f = fisher.test(tab); test1_f +test2_f = fisher_test(tab); test2_f #rstatix +sum(tab_f) +test1_f$estimate -fisher_df2 = data.frame() - -for (i in fisher_cols_df2){ - cat(i, "\n===============\n") - df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA) - my_param_val = (get(i, stats_df)) - tab = table(stats_df$obesity, my_param_val) - print(tab) - my_fisher_test = fisher.test(tab) - print(my_fisher_test) +#================================================================================ +#-------------------------------- +# Combining chisq and fishers df +#--------------------------------- +if ( all(colnames(chisq_df) == colnames(fisher_df)) && all(dim(chisq_df) == dim(fisher_df)) ){ + cat("Colnames AND dim match for both chisq and fisher df" + , "\nNo. of rows:", nrow(chisq_df) + , "\nNo. of cols:", ncol(chisq_df) + , "\nProceeding to rbind") - # extracting results - my_param_name = i - my_n_obs = sum(tab) - my_method = my_fisher_test$method - my_test_statistic = ">2 categories" - my_pval = my_fisher_test$p.value + comb_stats_categ_df = rbind(chisq_df, fisher_df) - # assiging to loop df - df$clinical_categ_params = my_param_name - df$n_obs = my_n_obs - df$method = my_method - df$test_statistic = my_test_statistic - df$p = my_pval - - print(df) - fisher_df2 = rbind(fisher_df2, df) + cat("\nCombined df dim:") + print(dim(comb_stats_categ_df)) + } - - - - - - - - +# sort df according to p_signif AND clinical_categ_params +comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif + , comb_stats_categ_df$clinical_categ_params),] ######################################################################## #****************** # write output file #****************** cat("Chisq and fishers test results in:", outfile_clin_categ) -#write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE) +write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)