#!/usr/bin/Rscript getwd() setwd("~/git/mosaic_2020/") getwd() ############################################################ # TASK: unpaired (time) analysis of clinical data # data: clincial data of flu positive adult patients # group: obesity ############################################################ #my_sample_type = "npa" #============= # Input #============= source("data_extraction_formatting_clinical.R") str(clinical_df) clinical_df$sfluv = as.integer(clinical_df$sfluv) clinical_df$h1n1v = as.integer(clinical_df$h1n1v) #============= # Output #============= outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv") outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ) outfile_clin_categ ######################################################################## # Chisq or fisher's test for clinical data b/w obseity groups ######################################################################## categorical_cols = c( "death" #, "obesity" #, "flustat" , "sfluv" , "h1n1v" , "gender" , "asthma" #, "o2_sat_suppl" ---> not recoded!? , "ethnicity" , "smoking" , "ia_cxr" , "max_resp_score" , "T1_resp_score" , "com_noasthma" , "T2_resp_score" , "inresp_sev" , "steroid") metadata_cols = c("mosaic", "obesity") categ_df = clinical_df[, c(metadata_cols, categorical_cols)] # quick test tab = table(categ_df$obesity, categ_df$death) tab chisq.test(tab) pivot_cols = metadata_cols #pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf colnames(categ_df) keycol <- "clinical_categ_params" valuecol <- "value" gathercols <- c("death" #, "flustat" , "sfluv", "h1n1v" , "gender", "asthma" #, "o2_sat_suppl" , "ethnicity" , "smoking", "ia_cxr", "max_resp_score", "T1_resp_score" , "com_noasthma", "T2_resp_score", "inresp_sev", "steroid") categ_lf = gather_(categ_df, keycol, valuecol, gathercols) #categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F) if( nrow(categ_lf) == expected_rows_categ_lf){ cat("PASS: long format data created successfully" , "\nnrow:", nrow(categ_lf) , "\nncol:", ncol(categ_lf)) } #============================================= # Chisq test: clinical categorical vars #https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA #============================================= #============================================= # wf data #============================================= stats_df = subset(categ_df, select = -c(mosaic)) drop_cols = c("mosaic", "obesity") my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols] if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){ cat("PASS: variables for chisq test successfully extracted") }else{ cat("FAIL: length mismatch when extracting variables for chisq") quit() } # quick test tab = table(stats_df$obesity, stats_df$smoking) tab test1 = chisq.test(tab) chisq_df = data.frame() for (i in my_categ_cols){ #print(i) df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_chi_test = chisq.test(tab) print(my_chi_test) # extracting results my_param_name = i my_n_obs = sum(my_chi_test$observed) my_method = my_chi_test$method my_test_statistic = my_chi_test$statistic[[1]] my_pval = my_chi_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method df$test_statistic = my_test_statistic df$p = my_pval print(df) chisq_df = rbind(chisq_df, df) } chisq_df$p_format = format.pval(chisq_df$p) chisq_df$p_signif = chisq_df$p chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "." , p_signif <=0.0001 ~ '****' , p_signif <=0.001 ~ '***' , p_signif <=0.01 ~ '**' , p_signif <0.05 ~ '*' , p_signif == 0.1 ~ '.' , TRUE ~ 'ns')) #============================================= # Fishers test #============================================= # quick test tab2 = table(stats_df$obesity, stats_df$smoking) tab2 #test2 = fisher_test(tab2); test2 test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3 stats_df_copy = stats_df int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric") int_vars stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor) str(stats_df_copy) two_lev = lapply(stats_df_copy, nlevels) == 2 fisher_cols_df1 = names(two_lev)[two_lev == TRUE] fisher_cols_df1 drop_cols = c("mosaic", "obesity") fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols] fisher_cols_df1 fisher_df1 = data.frame() for (i in fisher_cols_df1){ cat(i, "\n===============\n") df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_fisher_test = fisher.test(tab) print(my_fisher_test) # extracting results my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method my_test_statistic = my_fisher_test$statistic[[1]] # FIXME| my_pval = my_fisher_test$p.value my_ci_low = my_fisher_test$conf.int[[1]] my_ci_hi = my_fisher_test$conf.int[[2]] # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method df$test_statistic = my_test_statistic df$p = my_pval df$ci_low = my_ci_low df$ci_high = my_ci_hi print(df) fisher_df1 = rbind(fisher_df1, df) } #============= three_lev = lapply(stats_df_copy, nlevels) == 3 fisher_cols_df2 = names(three_lev)[three_lev == TRUE] fisher_cols_df2 #drop_cols = c("mosaic", "obesity") #fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols] #fisher_cols_df2 fisher_df2 = data.frame() for (i in fisher_cols_df2){ cat(i, "\n===============\n") df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_fisher_test = fisher.test(tab) print(my_fisher_test) # extracting results my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method my_test_statistic = ">2 categories" my_pval = my_fisher_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method df$test_statistic = my_test_statistic df$p = my_pval print(df) fisher_df2 = rbind(fisher_df2, df) } ######################################################################## #****************** # write output file #****************** cat("Chisq and fishers test results in:", outfile_clin_categ) #write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE)