diff --git a/flu_stats_contingency.R b/flu_stats_contingency.R new file mode 100755 index 0000000..bf22982 --- /dev/null +++ b/flu_stats_contingency.R @@ -0,0 +1,258 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: unpaired (time) analysis of clinical data +# data: clincial data of flu positive adult patients +# group: obesity +############################################################ +#my_sample_type = "npa" + +#============= +# Input +#============= +source("data_extraction_formatting_clinical.R") +str(clinical_df) + +clinical_df$sfluv = as.integer(clinical_df$sfluv) +clinical_df$h1n1v = as.integer(clinical_df$h1n1v) + +#============= +# Output +#============= +outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv") +outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ) +outfile_clin_categ + +######################################################################## +# Chisq or fisher's test for clinical data b/w obseity groups +######################################################################## +categorical_cols = c( "death" + #, "obesity" + #, "flustat" + , "sfluv" + , "h1n1v" + , "gender" + , "asthma" + #, "o2_sat_suppl" ---> not recoded!? + , "ethnicity" + , "smoking" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "T2_resp_score" + , "inresp_sev" + , "steroid") + + +metadata_cols = c("mosaic", "obesity") + +categ_df = clinical_df[, c(metadata_cols, categorical_cols)] + +# quick test +tab = table(categ_df$obesity, categ_df$death) +tab +chisq.test(tab) + +pivot_cols = metadata_cols +#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols +expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf + +colnames(categ_df) + +keycol <- "clinical_categ_params" +valuecol <- "value" +gathercols <- c("death" + #, "flustat" + , "sfluv", "h1n1v" + , "gender", "asthma" + #, "o2_sat_suppl" + , "ethnicity" + , "smoking", "ia_cxr", "max_resp_score", "T1_resp_score" + , "com_noasthma", "T2_resp_score", "inresp_sev", "steroid") + +categ_lf = gather_(categ_df, keycol, valuecol, gathercols) +#categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F) + +if( nrow(categ_lf) == expected_rows_categ_lf){ + cat("PASS: long format data created successfully" + , "\nnrow:", nrow(categ_lf) + , "\nncol:", ncol(categ_lf)) +} + +#============================================= +# Chisq test: clinical categorical vars +#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA +#============================================= + + + +#============================================= +# wf data +#============================================= +stats_df = subset(categ_df, select = -c(mosaic)) +drop_cols = c("mosaic", "obesity") + +my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols] + +if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){ + cat("PASS: variables for chisq test successfully extracted") +}else{ + cat("FAIL: length mismatch when extracting variables for chisq") + quit() +} + +# quick test +tab = table(stats_df$obesity, stats_df$smoking) +tab +test1 = chisq.test(tab) + +chisq_df = data.frame() +for (i in my_categ_cols){ + #print(i) + df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA) + my_param_val = (get(i, stats_df)) + tab = table(stats_df$obesity, my_param_val) + print(tab) + my_chi_test = chisq.test(tab) + print(my_chi_test) + + # extracting results + my_param_name = i + my_n_obs = sum(my_chi_test$observed) + my_method = my_chi_test$method + my_test_statistic = my_chi_test$statistic[[1]] + my_pval = my_chi_test$p.value + + # assiging to loop df + df$clinical_categ_params = my_param_name + df$n_obs = my_n_obs + df$method = my_method + df$test_statistic = my_test_statistic + df$p = my_pval + + print(df) + chisq_df = rbind(chisq_df, df) +} + +chisq_df$p_format = format.pval(chisq_df$p) +chisq_df$p_signif = chisq_df$p + +chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "." + , p_signif <=0.0001 ~ '****' + , p_signif <=0.001 ~ '***' + , p_signif <=0.01 ~ '**' + , p_signif <0.05 ~ '*' + , p_signif == 0.1 ~ '.' + , TRUE ~ 'ns')) +#============================================= +# Fishers test +#============================================= +# quick test +tab2 = table(stats_df$obesity, stats_df$smoking) +tab2 +#test2 = fisher_test(tab2); test2 +test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3 + +stats_df_copy = stats_df +int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric") +int_vars +stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor) +str(stats_df_copy) + +two_lev = lapply(stats_df_copy, nlevels) == 2 +fisher_cols_df1 = names(two_lev)[two_lev == TRUE] +fisher_cols_df1 + +drop_cols = c("mosaic", "obesity") +fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols] +fisher_cols_df1 + +fisher_df1 = data.frame() + +for (i in fisher_cols_df1){ + cat(i, "\n===============\n") + df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA) + my_param_val = (get(i, stats_df)) + tab = table(stats_df$obesity, my_param_val) + print(tab) + my_fisher_test = fisher.test(tab) + print(my_fisher_test) + + # extracting results + my_param_name = i + my_n_obs = sum(tab) + my_method = my_fisher_test$method + my_test_statistic = my_fisher_test$statistic[[1]] # FIXME| + my_pval = my_fisher_test$p.value + my_ci_low = my_fisher_test$conf.int[[1]] + my_ci_hi = my_fisher_test$conf.int[[2]] + + # assiging to loop df + df$clinical_categ_params = my_param_name + df$n_obs = my_n_obs + df$method = my_method + df$test_statistic = my_test_statistic + df$p = my_pval + df$ci_low = my_ci_low + df$ci_high = my_ci_hi + + print(df) + fisher_df1 = rbind(fisher_df1, df) +} + +#============= + +three_lev = lapply(stats_df_copy, nlevels) == 3 +fisher_cols_df2 = names(three_lev)[three_lev == TRUE] +fisher_cols_df2 + +#drop_cols = c("mosaic", "obesity") +#fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols] +#fisher_cols_df2 + +fisher_df2 = data.frame() + +for (i in fisher_cols_df2){ + cat(i, "\n===============\n") + df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA) + my_param_val = (get(i, stats_df)) + tab = table(stats_df$obesity, my_param_val) + print(tab) + my_fisher_test = fisher.test(tab) + print(my_fisher_test) + + # extracting results + my_param_name = i + my_n_obs = sum(tab) + my_method = my_fisher_test$method + my_test_statistic = ">2 categories" + my_pval = my_fisher_test$p.value + + # assiging to loop df + df$clinical_categ_params = my_param_name + df$n_obs = my_n_obs + df$method = my_method + df$test_statistic = my_test_statistic + df$p = my_pval + + print(df) + fisher_df2 = rbind(fisher_df2, df) +} + + + + + + + + + +######################################################################## +#****************** +# write output file +#****************** +cat("Chisq and fishers test results in:", outfile_clin_categ) +#write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE)