separated fisher test only and renamed logistic_outcome col
This commit is contained in:
parent
6e999898df
commit
269918d696
2 changed files with 98 additions and 130 deletions
|
@ -330,6 +330,12 @@ merging_cols = intersect( names(clinical_df), names(clinical_ics) )
|
||||||
|
|
||||||
clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics
|
clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics
|
||||||
|
|
||||||
|
colnames(clinical_df_ics)
|
||||||
|
|
||||||
|
# change colname of logistic_outcome
|
||||||
|
c1 = which(colnames(clinical_df_ics) == "logistic_outcome")
|
||||||
|
colnames(clinical_df_ics)[c1] <- "t1_resp_recoded"
|
||||||
|
|
||||||
if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
|
if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
|
||||||
cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
|
cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
|
||||||
, "\nChecking ncols...")
|
, "\nChecking ncols...")
|
||||||
|
@ -353,7 +359,10 @@ outfile_reg = paste0(outdir, outfile_name_reg)
|
||||||
|
|
||||||
cat("\nWriting clinical file for regression:", outfile_reg)
|
cat("\nWriting clinical file for regression:", outfile_reg)
|
||||||
|
|
||||||
#write.csv(clinical_df_ics, file = outfile_reg)
|
write.csv(clinical_df_ics, file = outfile_reg)
|
||||||
################################################################
|
################################################################
|
||||||
|
|
||||||
rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
|
rm(age_bins, max_age_interval, max_in, min_in
|
||||||
|
, o2_sat_bin, onset_initial_bin, tot_o2
|
||||||
|
, tot_onset2ini, meta_data_cols
|
||||||
|
, clinical_df)
|
||||||
|
|
|
@ -3,20 +3,22 @@ getwd()
|
||||||
setwd("~/git/mosaic_2020/")
|
setwd("~/git/mosaic_2020/")
|
||||||
getwd()
|
getwd()
|
||||||
############################################################
|
############################################################
|
||||||
# TASK: unpaired (time) analysis of clinical data
|
# TASK: Contingency table analysis i.e chisq and fishers
|
||||||
# data: clincial data of flu positive adult patients
|
# data: clincial data of flu positive adult patients
|
||||||
# group: obesity
|
# group: obesity
|
||||||
|
|
||||||
|
# Chisq test
|
||||||
|
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
|
||||||
############################################################
|
############################################################
|
||||||
#my_sample_type = "npa"
|
|
||||||
|
|
||||||
#=============
|
#=============
|
||||||
# Input
|
# Input
|
||||||
#=============
|
#=============
|
||||||
source("data_extraction_formatting_clinical.R")
|
source("data_extraction_formatting_clinical.R")
|
||||||
str(clinical_df)
|
str(clinical_df_ics)
|
||||||
|
|
||||||
clinical_df$sfluv = as.integer(clinical_df$sfluv)
|
clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
|
||||||
clinical_df$h1n1v = as.integer(clinical_df$h1n1v)
|
clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v)
|
||||||
|
|
||||||
#=============
|
#=============
|
||||||
# Output
|
# Output
|
||||||
|
@ -44,58 +46,28 @@ categorical_cols = c( "death"
|
||||||
, "com_noasthma"
|
, "com_noasthma"
|
||||||
, "T2_resp_score"
|
, "T2_resp_score"
|
||||||
, "inresp_sev"
|
, "inresp_sev"
|
||||||
, "steroid")
|
, "steroid"
|
||||||
|
, "age_bins"
|
||||||
|
, "o2_sat_bin"
|
||||||
|
, "onset_initial_bin"
|
||||||
|
, "t1_resp_recoded"
|
||||||
|
, "steroid_ics")
|
||||||
|
|
||||||
metadata_cols = c("mosaic", "obesity")
|
metadata_cols = c("mosaic", "obesity")
|
||||||
|
|
||||||
categ_df = clinical_df[, c(metadata_cols, categorical_cols)]
|
categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]
|
||||||
|
|
||||||
# quick test
|
# quick test
|
||||||
tab = table(categ_df$obesity, categ_df$death)
|
tab = table(categ_df$obesity, categ_df$death)
|
||||||
tab
|
tab
|
||||||
chisq.test(tab)
|
chisq.test(tab)
|
||||||
|
|
||||||
pivot_cols = metadata_cols
|
#========================================================
|
||||||
#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols
|
# Data: for chisq and fisher
|
||||||
expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf
|
#========================================================
|
||||||
|
|
||||||
colnames(categ_df)
|
|
||||||
|
|
||||||
keycol <- "clinical_categ_params"
|
|
||||||
valuecol <- "value"
|
|
||||||
gathercols <- c("death"
|
|
||||||
#, "flustat"
|
|
||||||
, "sfluv", "h1n1v"
|
|
||||||
, "gender", "asthma"
|
|
||||||
#, "o2_sat_suppl"
|
|
||||||
, "ethnicity"
|
|
||||||
, "smoking", "ia_cxr", "max_resp_score", "T1_resp_score"
|
|
||||||
, "com_noasthma", "T2_resp_score", "inresp_sev", "steroid")
|
|
||||||
|
|
||||||
categ_lf = gather_(categ_df, keycol, valuecol, gathercols)
|
|
||||||
#categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F)
|
|
||||||
|
|
||||||
if( nrow(categ_lf) == expected_rows_categ_lf){
|
|
||||||
cat("PASS: long format data created successfully"
|
|
||||||
, "\nnrow:", nrow(categ_lf)
|
|
||||||
, "\nncol:", ncol(categ_lf))
|
|
||||||
}
|
|
||||||
|
|
||||||
#=============================================
|
|
||||||
# Chisq test: clinical categorical vars
|
|
||||||
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
|
|
||||||
#=============================================
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================
|
|
||||||
# wf data
|
|
||||||
#=============================================
|
|
||||||
stats_df = subset(categ_df, select = -c(mosaic))
|
stats_df = subset(categ_df, select = -c(mosaic))
|
||||||
drop_cols = c("mosaic", "obesity")
|
|
||||||
|
|
||||||
my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols]
|
my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]
|
||||||
|
|
||||||
if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
|
if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
|
||||||
cat("PASS: variables for chisq test successfully extracted")
|
cat("PASS: variables for chisq test successfully extracted")
|
||||||
|
@ -103,16 +75,19 @@ if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
|
||||||
cat("FAIL: length mismatch when extracting variables for chisq")
|
cat("FAIL: length mismatch when extracting variables for chisq")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
########################################################################
|
||||||
# quick test
|
#--------------------
|
||||||
tab = table(stats_df$obesity, stats_df$smoking)
|
# chisq test
|
||||||
tab
|
#--------------------
|
||||||
test1 = chisq.test(tab)
|
|
||||||
|
|
||||||
chisq_df = data.frame()
|
chisq_df = data.frame()
|
||||||
for (i in my_categ_cols){
|
for (i in my_categ_cols){
|
||||||
#print(i)
|
#print(i)
|
||||||
df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA)
|
df = data.frame(clinical_categ_params = NA
|
||||||
|
, n_obs = NA
|
||||||
|
, method = NA
|
||||||
|
#, test_statistic = NA
|
||||||
|
, p = NA)
|
||||||
|
|
||||||
my_param_val = (get(i, stats_df))
|
my_param_val = (get(i, stats_df))
|
||||||
tab = table(stats_df$obesity, my_param_val)
|
tab = table(stats_df$obesity, my_param_val)
|
||||||
print(tab)
|
print(tab)
|
||||||
|
@ -123,20 +98,21 @@ for (i in my_categ_cols){
|
||||||
my_param_name = i
|
my_param_name = i
|
||||||
my_n_obs = sum(my_chi_test$observed)
|
my_n_obs = sum(my_chi_test$observed)
|
||||||
my_method = my_chi_test$method
|
my_method = my_chi_test$method
|
||||||
my_test_statistic = my_chi_test$statistic[[1]]
|
#my_test_statistic = my_chi_test$statistic[[1]]
|
||||||
my_pval = my_chi_test$p.value
|
my_pval = my_chi_test$p.value
|
||||||
|
|
||||||
# assiging to loop df
|
# assiging to loop df
|
||||||
df$clinical_categ_params = my_param_name
|
df$clinical_categ_params = my_param_name
|
||||||
df$n_obs = my_n_obs
|
df$n_obs = my_n_obs
|
||||||
df$method = my_method
|
df$method = my_method
|
||||||
df$test_statistic = my_test_statistic
|
#df$test_statistic = my_test_statistic
|
||||||
df$p = my_pval
|
df$p = my_pval
|
||||||
|
|
||||||
print(df)
|
print(df)
|
||||||
chisq_df = rbind(chisq_df, df)
|
chisq_df = rbind(chisq_df, df)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# formatting
|
||||||
chisq_df$p_format = format.pval(chisq_df$p)
|
chisq_df$p_format = format.pval(chisq_df$p)
|
||||||
chisq_df$p_signif = chisq_df$p
|
chisq_df$p_signif = chisq_df$p
|
||||||
|
|
||||||
|
@ -147,34 +123,29 @@ chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "."
|
||||||
, p_signif <0.05 ~ '*'
|
, p_signif <0.05 ~ '*'
|
||||||
, p_signif == 0.1 ~ '.'
|
, p_signif == 0.1 ~ '.'
|
||||||
, TRUE ~ 'ns'))
|
, TRUE ~ 'ns'))
|
||||||
#=============================================
|
|
||||||
# Fishers test
|
|
||||||
#=============================================
|
|
||||||
# quick test
|
# quick test
|
||||||
tab2 = table(stats_df$obesity, stats_df$smoking)
|
tab = table(stats_df$obesity, stats_df$smoking); tab
|
||||||
tab2
|
test1 = chisq.test(tab); test1
|
||||||
#test2 = fisher_test(tab2); test2
|
test2 = chisq_test(tab); test2 #rstatix
|
||||||
test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3
|
sum(tab)
|
||||||
|
|
||||||
stats_df_copy = stats_df
|
#================================================================================
|
||||||
int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric")
|
#--------------------
|
||||||
int_vars
|
# Fishers test: without any OR
|
||||||
stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor)
|
# see standalone script for this!
|
||||||
str(stats_df_copy)
|
#--------------------
|
||||||
|
|
||||||
two_lev = lapply(stats_df_copy, nlevels) == 2
|
fisher_df = data.frame()
|
||||||
fisher_cols_df1 = names(two_lev)[two_lev == TRUE]
|
|
||||||
fisher_cols_df1
|
|
||||||
|
|
||||||
drop_cols = c("mosaic", "obesity")
|
for (i in my_categ_cols){
|
||||||
fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols]
|
|
||||||
fisher_cols_df1
|
|
||||||
|
|
||||||
fisher_df1 = data.frame()
|
|
||||||
|
|
||||||
for (i in fisher_cols_df1){
|
|
||||||
cat(i, "\n===============\n")
|
cat(i, "\n===============\n")
|
||||||
df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA)
|
df = data.frame(clinical_categ_params = NA
|
||||||
|
, n_obs = NA
|
||||||
|
, method = NA
|
||||||
|
#, test_statistic = NA
|
||||||
|
, p = NA)
|
||||||
|
|
||||||
my_param_val = (get(i, stats_df))
|
my_param_val = (get(i, stats_df))
|
||||||
tab = table(stats_df$obesity, my_param_val)
|
tab = table(stats_df$obesity, my_param_val)
|
||||||
print(tab)
|
print(tab)
|
||||||
|
@ -185,74 +156,62 @@ for (i in fisher_cols_df1){
|
||||||
my_param_name = i
|
my_param_name = i
|
||||||
my_n_obs = sum(tab)
|
my_n_obs = sum(tab)
|
||||||
my_method = my_fisher_test$method
|
my_method = my_fisher_test$method
|
||||||
my_test_statistic = my_fisher_test$statistic[[1]] # FIXME|
|
#my_test_statistic = my_fisher_test$estimate[[1]]
|
||||||
my_pval = my_fisher_test$p.value
|
my_pval = my_fisher_test$p.value
|
||||||
my_ci_low = my_fisher_test$conf.int[[1]]
|
|
||||||
my_ci_hi = my_fisher_test$conf.int[[2]]
|
|
||||||
|
|
||||||
# assiging to loop df
|
# assiging to loop df
|
||||||
df$clinical_categ_params = my_param_name
|
df$clinical_categ_params = my_param_name
|
||||||
df$n_obs = my_n_obs
|
df$n_obs = my_n_obs
|
||||||
df$method = my_method
|
df$method = my_method
|
||||||
df$test_statistic = my_test_statistic
|
#df$test_statistic = my_test_statistic
|
||||||
df$p = my_pval
|
df$p = my_pval
|
||||||
df$ci_low = my_ci_low
|
|
||||||
df$ci_high = my_ci_hi
|
|
||||||
|
|
||||||
print(df)
|
print(df)
|
||||||
fisher_df1 = rbind(fisher_df1, df)
|
fisher_df = rbind(fisher_df, df)
|
||||||
}
|
}
|
||||||
|
|
||||||
#=============
|
# formatting
|
||||||
|
fisher_df$p_format = format.pval(fisher_df$p)
|
||||||
|
fisher_df$p_signif = fisher_df$p
|
||||||
|
|
||||||
three_lev = lapply(stats_df_copy, nlevels) == 3
|
fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "."
|
||||||
fisher_cols_df2 = names(three_lev)[three_lev == TRUE]
|
, p_signif <=0.0001 ~ '****'
|
||||||
fisher_cols_df2
|
, p_signif <=0.001 ~ '***'
|
||||||
|
, p_signif <=0.01 ~ '**'
|
||||||
|
, p_signif <0.05 ~ '*'
|
||||||
|
, p_signif == 0.1 ~ '.'
|
||||||
|
, TRUE ~ 'ns'))
|
||||||
|
|
||||||
#drop_cols = c("mosaic", "obesity")
|
# quick test
|
||||||
#fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols]
|
tab_f = table(stats_df$obesity, stats_df$smoking); tab_f
|
||||||
#fisher_cols_df2
|
test1_f = fisher.test(tab); test1_f
|
||||||
|
test2_f = fisher_test(tab); test2_f #rstatix
|
||||||
|
sum(tab_f)
|
||||||
|
test1_f$estimate
|
||||||
|
|
||||||
fisher_df2 = data.frame()
|
#================================================================================
|
||||||
|
#--------------------------------
|
||||||
for (i in fisher_cols_df2){
|
# Combining chisq and fishers df
|
||||||
cat(i, "\n===============\n")
|
#---------------------------------
|
||||||
df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA)
|
if ( all(colnames(chisq_df) == colnames(fisher_df)) && all(dim(chisq_df) == dim(fisher_df)) ){
|
||||||
my_param_val = (get(i, stats_df))
|
cat("Colnames AND dim match for both chisq and fisher df"
|
||||||
tab = table(stats_df$obesity, my_param_val)
|
, "\nNo. of rows:", nrow(chisq_df)
|
||||||
print(tab)
|
, "\nNo. of cols:", ncol(chisq_df)
|
||||||
my_fisher_test = fisher.test(tab)
|
, "\nProceeding to rbind")
|
||||||
print(my_fisher_test)
|
|
||||||
|
|
||||||
# extracting results
|
comb_stats_categ_df = rbind(chisq_df, fisher_df)
|
||||||
my_param_name = i
|
|
||||||
my_n_obs = sum(tab)
|
|
||||||
my_method = my_fisher_test$method
|
|
||||||
my_test_statistic = ">2 categories"
|
|
||||||
my_pval = my_fisher_test$p.value
|
|
||||||
|
|
||||||
# assiging to loop df
|
cat("\nCombined df dim:")
|
||||||
df$clinical_categ_params = my_param_name
|
print(dim(comb_stats_categ_df))
|
||||||
df$n_obs = my_n_obs
|
|
||||||
df$method = my_method
|
|
||||||
df$test_statistic = my_test_statistic
|
|
||||||
df$p = my_pval
|
|
||||||
|
|
||||||
print(df)
|
|
||||||
fisher_df2 = rbind(fisher_df2, df)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# sort df according to p_signif AND clinical_categ_params
|
||||||
|
comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif
|
||||||
|
, comb_stats_categ_df$clinical_categ_params),]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
########################################################################
|
########################################################################
|
||||||
#******************
|
#******************
|
||||||
# write output file
|
# write output file
|
||||||
#******************
|
#******************
|
||||||
cat("Chisq and fishers test results in:", outfile_clin_categ)
|
cat("Chisq and fishers test results in:", outfile_clin_categ)
|
||||||
#write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
|
write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue