diff --git a/data_extraction_formatting_regression.R b/data_extraction_formatting_regression.R new file mode 100644 index 0000000..676d671 --- /dev/null +++ b/data_extraction_formatting_regression.R @@ -0,0 +1,329 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Extract relevant columns from mosaic adults data +# npa +######################################################################## +#==================== +# Input: source data for clinical +#==================== +source("data_extraction_formatting_clinical.R") +#source("reg_cols_extraction.R") +#======================================= +# Data for mediator to include in regression +#======================================= +cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults") + +med_df = fp_adults[, c("mosaic", sig_npa_cols)] + +# sanity checks +if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ + cat("PASS: binary data obs are complete, n =", nrow(clinical_df)) +}else{ + cat("FAIL: Incomplete data for binary outcomes. Please check and decide!") + quit() +} + +table(clinical_df$ia_exac_copd) + +######################################################################## +# Data extraction for regression +######################################################################## + +common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)] + +cat("\nMerging clinical and mediator data for regression" + ,"\nMerging on column:", common_cols) + +reg_data = merge(clinical_df, med_df + , by = common_cols) + +if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){ + cat("\nNo. of rows match, nrow =", nrow(clinical_df) + , "\nChecking ncols...") + if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df) - length(common_cols) ){ + cat("\nNo. of cols match, ncol =", ncol(reg_data)) +} else { + cat("FAIL: ncols mismatch" + , "Expected ncols:", ncol(clinical_df) + ncol(med_df) - length(common_cols) + , "\nGot:", ncol(reg_data)) + } +} else { + cat("FAIL: nrows mismatch" + , "\nExpected nrows:", nrow(fp_adults)) +} + +######################################################################## +# Reassign the copd and asthma status and do some checks +table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) + +reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0 +reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0 + +table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) + +# check copd and asthma status +table(reg_data$ia_exac_copd, reg_data$asthma) +check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 + +# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma +reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 +table(reg_data$ia_exac_copd, reg_data$asthma) + +foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0 + +rm(check_copd_and_asthma_1, foo) +#===================================================================== +# count the resp scores +max_resp_score_table<- table(reg_data$max_resp_score) +max_resp_score_table + +T1_resp_score_table<- table(reg_data$T1_resp_score) +T1_resp_score_table + +T2_resp_score_table<- table(reg_data$T2_resp_score) +T2_resp_score_table + +Inresp_sev<- table(reg_data$inresp_sev) +Inresp_sev + +# Reassign the resp score so all 4 are replace by 3 +reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3 +revised_resp_score_table<- table(reg_data$max_resp_score) +revised_resp_score_table + +reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3 +revised_T1_resp_score_table<- table(reg_data$T1_resp_score) +revised_T1_resp_score_table + +reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3 +revised_T2_resp_score_table<- table(reg_data$T2_resp_score) +revised_T2_resp_score_table + +reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3 +revised_Inresp_sev<- table(reg_data$inresp_sev) +revised_Inresp_sev +#===================================================================== +# Remove these after checking +rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev + , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) +#===================================================================== +# Binning +# "(": not inclusive +# "]": inclusive + +#======== +# age +#======== +# Create categories of variables +reg_data$age = round(reg_data$age, digits = 0) +table(reg_data$age) +table(reg_data$asthma, reg_data$age) +min(reg_data$age); max(reg_data$age) + +max_age_interval = round_any(max(reg_data$age), 10, f = ceiling) +max_age_interval +min_age = min(reg_data$age); min_age #19 +min_age_interval = min_age - 1; min_age_interval + +#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90)) +age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) +reg_data$age_bins = age_bins +dim(reg_data) # 133 27 + +# age_bins (to keep consistent with the results table) +class(reg_data$age_bins) +levels(reg_data$age_bins) +#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" +table(reg_data$asthma, reg_data$age_bins) +# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] +#0 25 17 25 14 11 1 +#1 11 8 12 5 3 2 + +if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){ + cat("PASS: age_bins assigned successfully") +}else{ + cat("FAIL: no. mismatch when assigning age_bins") + quit() +} + +# reassign +class(reg_data$age_bins) +levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") +table(reg_data$asthma, reg_data$age_bins) +table(reg_data$asthma, reg_data$age_bins) +# (18,30] (30,40] (40,50] (50,80] +#0 25 17 25 26 +#1 11 8 12 9 + +sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) + +#=========================== +# O2 saturation binning +#=========================== +reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0) +table(reg_data$o2_sat_admis) +tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]] +tot_o2 + +o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100)) +reg_data$o2_sat_bin = o2_sat_bin +table(reg_data$o2_sat_bin) + +sum(table(reg_data$o2_sat_bin)) == tot_o2 + +#=========================== +# Onset to initial binning +#=========================== +max_in = max(reg_data$onset_2_initial); max_in #23 +min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6 + +tot_onset2ini = sum(table(reg_data$onset_2_initial)) +tot_onset2ini + +onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in)) +reg_data$onset_initial_bin = onset_initial_bin + +sum(table(reg_data$onset_initial_bin)) == tot_onset2ini + +#======================= +# seasonal flu: sfluv +#======================= +# should be a factor +if (! is.factor(reg_data$sfluv)){ + reg_data$sfluv = as.factor(reg_data$sfluv) +} +class(reg_data$sfluv) #[1] "factor" + +levels(reg_data$sfluv) +table(reg_data$asthma, reg_data$sfluv) +# reassign +levels(reg_data$sfluv) <- c("0", "0", "1") +table(reg_data$asthma, reg_data$sfluv) + +#======================= +# h1n1v +#======================= +# should be a factor +if (! is.factor(reg_data$h1n1v)){ + reg_data$h1n1v = as.factor(reg_data$h1n1v) +} +class(reg_data$h1n1v) #[1] "factor" + +levels(reg_data$h1n1v) +table(reg_data$asthma, reg_data$h1n1v) +# reassign +levels(reg_data$h1n1v) <- c("0", "0", "1") +table(reg_data$asthma, reg_data$h1n1v) + +#======================= +# ethnicity +#======================= +class(reg_data$ethnicity) # integer +table(reg_data$asthma, reg_data$ethnicity) + +reg_data$ethnicity[reg_data$ethnicity == 4] <- 2 +table(reg_data$asthma, reg_data$ethnicity) + +#======================= +# pneumonia +#======================= +class(reg_data$ia_cxr) # integer +# ia_cxr 2 ---> yes pneumonia (1) +# 1 ---> no (0) +# ! 1 or 2 -- > "unkown" + +# reassign the pneumonia codes +#0: not performed +#1: normal +#2: findings consistent with pneumonia +#3: abnormal +#-1: not recorded +#-2: n/a specified by the clinician # not in the data... +#-3: unknown specified by clinician + + +table(reg_data$ia_cxr) +#-3 -1 0 1 2 3 +#5 48 13 47 17 3 + +# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist +reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- "" +table(reg_data$ia_cxr) +# 1 2 +#69 47 17 + +reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0 +reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1 +table(reg_data$ia_cxr) +# 0 1 +#69 47 17 + +#======================= +# smoking [tricky one] +#======================= +class(reg_data$smoking) # integer +table(reg_data$asthma, reg_data$smoking) + +# orig +# -3 -1 1 2 3 4 +#0 15 9 22 2 15 30 +#1 4 2 13 0 4 17 + +# -3 -1 1 2 3 4 +#0 14 9 20 2 15 30 +#1 5 2 15 0 4 17 + +# never smoking, 4 and 2 -- > no (0) +#1 and 3 ---> yes (1) +#!-3 and -1 ---- > NA + +################# smoking + +#1: current daily ===> categ smoker(1) +#2: occasional =====> categ no smoker(0) +#3: ex-smoker ===> categ smoker(1) +#4: never =====> categ no smoker(0) +#-1: not recorded =====> categ blank (NA) +#-2: n/a specified by the clinician =====> categ blank (NA) +#-3: unknown specified by clinician=====> categ blank (NA) + +table(reg_data$smoking) +#-3 -1 1 2 3 4 +#19 11 35 2 19 47 + +# reassign the smoking codes +reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0 +reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1 +reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- "" + +table(reg_data$smoking) +# 0 1 +#30 49 54 + +table(reg_data$asthma, reg_data$smoking) + +# orig +# 0 1 +#0 24 32 37 +#1 6 17 17 + +# 0 1 +#0 23 32 35 +#1 7 17 19 + +################################################################ +#================== +# writing output file +#================== +outfile_name_reg = "reg_data_recoded_with_NA.csv" +outfile_reg = paste0(outdir, outfile_name_reg) + +cat("Writing clinical file for regression:", outfile_reg) + +#write.csv(reg_data, file = outfile_reg) +################################################################ + +rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols) diff --git a/flu_stats_unpaired_clinical.R b/flu_stats_unpaired_clinical.R new file mode 100755 index 0000000..989cc01 --- /dev/null +++ b/flu_stats_unpaired_clinical.R @@ -0,0 +1,229 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: unpaired (time) analysis of clinical data +# data: clincial data of flu positive adult patients +# group: obesity +############################################################ +#my_sample_type = "npa" + +#============= +# Input +#============= +source("data_extraction_formatting_clinical.R") + +#============= +# Output +#============= +outfile_name_clinical = paste0("flu_stats_clinical_unpaired.csv") +outfile_clinical_unpaired = paste0(outdir_stats, outfile_name_clinical) +outfile_clinical_unpaired +#=============================== +# data assignment for stats +#================================ +wf = npa_wf[npa_wf$flustat == 1,] +lf = npa_lf[npa_lf$flustat == 1,] +lf$timepoint = paste0("t", lf$timepoint) +lf = lf[!lf$mediator == "vitd",] +######################################################################## +# clear variables +rm(sam_lf, sam_wf +, serum_lf, serum_wf) +rm(colnames_sam_df, expected_rows_sam_lf +, colnames_serum_df, expected_rows_serum_lf) + +rm(pivot_cols) + +# sanity checks +table(lf$timepoint) + +#if (table(lf$flustat) == table(npa_lf$flustat)[[2]]){ +# cat("Analysing Flu positive patients for:", my_sample_type) +#}else{ +# cat("FAIL: problem with subsetting data for:", my_sample_type) +# quit() +#} +######################################################################## +# Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis +# No correction required +######################################################################## + +numerical_cols = c("age" + #, "vl_pfu_ul_npa1" + , "los" + , "onset2final" + , "onsfindeath" + , "onset_2_initial" + , "o2_sat_admis") + +metadata_cols = c("mosaic", "obesity") + +clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)] + +pivot_cols = metadata_cols +#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols +expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf + + +keycol <- "clinical_params" +valuecol <- "value" +gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis") + +clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols) + +#============== +# unpaired: t1 +#============== +sum(is.na(clinical_lf$value)) + +foo = clinical_lf[which(is.na(clinical_lf$value)),] + +clinical_lf_comp = clinical_lf[-which(is.na(clinical_lf$value)),] + +stats_un_clinical = compare_means(value~obesity + , group.by = "clinical_params" + , data = clinical_lf + #, data = clinical_lf_comp + , paired = FALSE) + + +stat_df <- clinical_lf %>% + group_by(clinical_params) %>% + wilcox_test(value ~ obesity, paired = F) %>% + add_significance("p") +stat_df$p_format = round(stat_df$p, digits = 3) + +#---------------------------------------- +# calculate n_obs for each clinical param: Overall +#---------------------------------------- +#n_t1 = data.frame(table(lf_t1_comp$mediator)) +n_all = data.frame(table(clinical_lf$clinical_params)) +colnames(n_all) = c("clinical_params", "n") +n_all$clinical_params = as.character(n_all$clinical_params) + +n_gp_lf = data.frame(table(clinical_lf$clinical_params, clinical_lf$obesity)) +n_gp = spread(n_gp_lf, "Var2", "Freq"); n_gp +colnames(n_gp) +colnames(n_gp) = c("clinical_params" + , paste0("n_gp", colnames(n_gp)[2]) + , paste0("n_gp", colnames(n_gp)[3])) + +n_gp$clinical_params = as.character(n_gp$clinical_params) + +n_all_gp = merge(n_all, n_gp + , by = intersect( names(n_all), names(n_gp) ) + , all = T) + +#---------------------------------------- +# calculate n_obs for each clinical param: complete cases +#---------------------------------------- +n_comp = data.frame(table(clinical_lf_comp$clinical_params)) +colnames(n_comp) = c("clinical_params", "n_complete") +n_comp$clinical_params = as.character(n_comp$clinical_params) +n_comp + +n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf +n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp +colnames(n_gp_comp) +colnames(n_gp_comp) = c("clinical_params" + , paste0("n_complete_gp", colnames(n_gp_comp)[2]) + , paste0("n_complete_gp", colnames(n_gp_comp)[3])) + + +n_comp_gp = merge(n_comp, n_gp_comp + , by = intersect( names(n_comp), names(n_gp_comp)) + , all = T) + +merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols + +n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df + +#================================== +# Merge: merge stats + n_obs df +#=================================== +merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols +if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_clinical) + , "\nncol:", ncol(stats_un_clinical)) +}else{ + nf = n_df$clinical_params[!n_df$clinical_params%in%stats_un_clinical$clinical_params] + stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "\nnot found in stats possibly due to all obs being missing" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_clinical) + , "\nncol:", ncol(stats_un_clinical)) +} + +####################################################################### +#================= +# formatting df +#================= +# delete: unnecessary column +stats_clinical_df = subset(stats_un_clinical, select = -c(.y.,p.adj)) + +# add: reflect stats method correctly i.e paired or unpaired +# incase there are NA due to LLODs, the gsub won't work! +#stats_clinical_df$method = gsub("Wilcoxon", "Wilcoxon_unpaired", stats_clinical_df$method) +stats_clinical_df$method = "wilcoxon unpaired" +stats_clinical_df$method + +# reorder columns +print("preparing to reorder columns...") +colnames(stats_clinical_df) +my_col_order2 = c("clinical_params" + , "method" + , "group1" + , "group2" + , "n" + , "n_gp0" + , "n_gp1" + , "n_complete" + , "n_complete_gp0" + , "n_complete_gp1" + , "p" + , "p.format" + , "p.signif") + +if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%colnames(stats_clinical_df))) ){ + print("PASS: Reordering columns...") + stats_clinical_df_f = stats_clinical_df[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'stats_clinical_df_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(stats_clinical_df_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length or names mismatch" + , "\nExpected column order for: ", ncol(stats_clinical_df) # FIXME: can handle better! + , "\nGot:", length(my_col_order2) + , "\nElse check colnames to see if they exist in both")) + quit() +} +# assign nice column names like replace "." with "_" +colnames(stats_clinical_df_f) = c("clinical_params" + , "method" + , "group1" + , "group2" + , "n" + , "n_gp0" + , "n_gp1" + , "n_complete" + , "n_complete_gp0" + , "n_complete_gp1" + , "p" + , "p_format" + , "p_signif") + +colnames(stats_clinical_df_f) +######################################################################## +#****************** +# write output file +#****************** +cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired) +#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE) diff --git a/reg_cols_extraction.R b/reg_cols_extraction.R new file mode 100644 index 0000000..546b729 --- /dev/null +++ b/reg_cols_extraction.R @@ -0,0 +1,89 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Extract relevant columns from mosaic fp adults data for regression +# clinical and sig meds +######################################################################## + +######################################################################## +clinical_cols = c("mosaic" + , "ia_exac_copd" + , "death" + #, "obese2" #inc peaeds, but once you subset data for adults, its the same! + , "obesity" + , "flustat" + , "sfluv" + , "h1n1v" + , "age" + , "gender" + , "asthma" + , "vl_pfu_ul_npa1" + , "los" + , "onset2final" + , "onsfindeath" + , "onset_2_initial" + , "o2_sat_admis" + , "o2_sat_suppl" + , "ethnicity" + , "smoking" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "T2_resp_score" + , "inresp_sev" + , "steroid") + +sig_npa_cols = c("eotaxin_npa1" + , "eotaxin3_npa1" + , "eotaxin3_npa2" + , "gmcsf_npa2" + , "ifnb_npa1" + , "ifnb_npa2" + , "il1_npa1" + , "il1_npa2" + , "il10_npa1" + , "il12p70_npa1" + , "il12p70_npa2" + , "il13_npa1" + , "il2_npa3" + , "il4_npa1" + , "il5_npa1" + , "il5_npa2" + , "il6_npa1" + , "il6_npa2" + , "il8_npa1" + , "il8_npa2" + , "il8_2_npa1" + , "il8_2_npa2" + , "ip10_npa1" + , "mcp4_npa1" + , "mcp4_npa2" + , "mdc_npa1" + , "mip17_npa1" + , "mip17_npa2" + , "neopterin_npa1" + , "rantes_npa1" + , "tarc_npa2" + , "tnf_npa2" + , "tnfr1_npa1" + , "tnfr1_npa2" + , "tnfr1_npa3" + , "tnfr2_npa1" + , "tnfr2_npa2" + , "tnfr2_npa3") + +cols_to_extract = c(clinical_cols, sig_npa_cols) + +if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){ + cat("PASS: length match" + , "\nTotal no. of cols to extract for regression:", length(cols_to_extract) + , "\nNo. of clinical cols:", length(clinical_cols) + , "\nNo. of sig npa cols: ", length(sig_npa_cols)) +}else{ + cat("FAIL: length mismatch" + , "\nAborting!") + quit() +}