added data_extraction_formatting_regression.R flu_stats_unpaired_clinical.R and reg_cols_extraction.R
This commit is contained in:
parent
bb6e92fa0f
commit
76dc3fbb6b
3 changed files with 647 additions and 0 deletions
329
data_extraction_formatting_regression.R
Normal file
329
data_extraction_formatting_regression.R
Normal file
|
@ -0,0 +1,329 @@
|
|||
#!/usr/bin/Rscript
|
||||
getwd()
|
||||
setwd('~/git/mosaic_2020/')
|
||||
getwd()
|
||||
########################################################################
|
||||
# TASK: Extract relevant columns from mosaic adults data
|
||||
# npa
|
||||
########################################################################
|
||||
#====================
|
||||
# Input: source data for clinical
|
||||
#====================
|
||||
source("data_extraction_formatting_clinical.R")
|
||||
#source("reg_cols_extraction.R")
|
||||
#=======================================
|
||||
# Data for mediator to include in regression
|
||||
#=======================================
|
||||
cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults")
|
||||
|
||||
med_df = fp_adults[, c("mosaic", sig_npa_cols)]
|
||||
|
||||
# sanity checks
|
||||
if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){
|
||||
cat("PASS: binary data obs are complete, n =", nrow(clinical_df))
|
||||
}else{
|
||||
cat("FAIL: Incomplete data for binary outcomes. Please check and decide!")
|
||||
quit()
|
||||
}
|
||||
|
||||
table(clinical_df$ia_exac_copd)
|
||||
|
||||
########################################################################
|
||||
# Data extraction for regression
|
||||
########################################################################
|
||||
|
||||
common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)]
|
||||
|
||||
cat("\nMerging clinical and mediator data for regression"
|
||||
,"\nMerging on column:", common_cols)
|
||||
|
||||
reg_data = merge(clinical_df, med_df
|
||||
, by = common_cols)
|
||||
|
||||
if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){
|
||||
cat("\nNo. of rows match, nrow =", nrow(clinical_df)
|
||||
, "\nChecking ncols...")
|
||||
if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df) - length(common_cols) ){
|
||||
cat("\nNo. of cols match, ncol =", ncol(reg_data))
|
||||
} else {
|
||||
cat("FAIL: ncols mismatch"
|
||||
, "Expected ncols:", ncol(clinical_df) + ncol(med_df) - length(common_cols)
|
||||
, "\nGot:", ncol(reg_data))
|
||||
}
|
||||
} else {
|
||||
cat("FAIL: nrows mismatch"
|
||||
, "\nExpected nrows:", nrow(fp_adults))
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# Reassign the copd and asthma status and do some checks
|
||||
table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
|
||||
|
||||
reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
|
||||
reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
|
||||
|
||||
table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
|
||||
|
||||
# check copd and asthma status
|
||||
table(reg_data$ia_exac_copd, reg_data$asthma)
|
||||
check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3
|
||||
|
||||
# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
|
||||
reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0
|
||||
table(reg_data$ia_exac_copd, reg_data$asthma)
|
||||
|
||||
foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
|
||||
|
||||
rm(check_copd_and_asthma_1, foo)
|
||||
#=====================================================================
|
||||
# count the resp scores
|
||||
max_resp_score_table<- table(reg_data$max_resp_score)
|
||||
max_resp_score_table
|
||||
|
||||
T1_resp_score_table<- table(reg_data$T1_resp_score)
|
||||
T1_resp_score_table
|
||||
|
||||
T2_resp_score_table<- table(reg_data$T2_resp_score)
|
||||
T2_resp_score_table
|
||||
|
||||
Inresp_sev<- table(reg_data$inresp_sev)
|
||||
Inresp_sev
|
||||
|
||||
# Reassign the resp score so all 4 are replace by 3
|
||||
reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
|
||||
revised_resp_score_table<- table(reg_data$max_resp_score)
|
||||
revised_resp_score_table
|
||||
|
||||
reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
|
||||
revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
|
||||
revised_T1_resp_score_table
|
||||
|
||||
reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
|
||||
revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
|
||||
revised_T2_resp_score_table
|
||||
|
||||
reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
|
||||
revised_Inresp_sev<- table(reg_data$inresp_sev)
|
||||
revised_Inresp_sev
|
||||
#=====================================================================
|
||||
# Remove these after checking
|
||||
rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
|
||||
, revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
|
||||
#=====================================================================
|
||||
# Binning
|
||||
# "(": not inclusive
|
||||
# "]": inclusive
|
||||
|
||||
#========
|
||||
# age
|
||||
#========
|
||||
# Create categories of variables
|
||||
reg_data$age = round(reg_data$age, digits = 0)
|
||||
table(reg_data$age)
|
||||
table(reg_data$asthma, reg_data$age)
|
||||
min(reg_data$age); max(reg_data$age)
|
||||
|
||||
max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)
|
||||
max_age_interval
|
||||
min_age = min(reg_data$age); min_age #19
|
||||
min_age_interval = min_age - 1; min_age_interval
|
||||
|
||||
#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
|
||||
age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
|
||||
reg_data$age_bins = age_bins
|
||||
dim(reg_data) # 133 27
|
||||
|
||||
# age_bins (to keep consistent with the results table)
|
||||
class(reg_data$age_bins)
|
||||
levels(reg_data$age_bins)
|
||||
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
|
||||
#0 25 17 25 14 11 1
|
||||
#1 11 8 12 5 3 2
|
||||
|
||||
if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){
|
||||
cat("PASS: age_bins assigned successfully")
|
||||
}else{
|
||||
cat("FAIL: no. mismatch when assigning age_bins")
|
||||
quit()
|
||||
}
|
||||
|
||||
# reassign
|
||||
class(reg_data$age_bins)
|
||||
levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
# (18,30] (30,40] (40,50] (50,80]
|
||||
#0 25 17 25 26
|
||||
#1 11 8 12 9
|
||||
|
||||
sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
|
||||
|
||||
#===========================
|
||||
# O2 saturation binning
|
||||
#===========================
|
||||
reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
|
||||
table(reg_data$o2_sat_admis)
|
||||
tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
|
||||
tot_o2
|
||||
|
||||
o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
|
||||
reg_data$o2_sat_bin = o2_sat_bin
|
||||
table(reg_data$o2_sat_bin)
|
||||
|
||||
sum(table(reg_data$o2_sat_bin)) == tot_o2
|
||||
|
||||
#===========================
|
||||
# Onset to initial binning
|
||||
#===========================
|
||||
max_in = max(reg_data$onset_2_initial); max_in #23
|
||||
min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6
|
||||
|
||||
tot_onset2ini = sum(table(reg_data$onset_2_initial))
|
||||
tot_onset2ini
|
||||
|
||||
onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
|
||||
reg_data$onset_initial_bin = onset_initial_bin
|
||||
|
||||
sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
|
||||
|
||||
#=======================
|
||||
# seasonal flu: sfluv
|
||||
#=======================
|
||||
# should be a factor
|
||||
if (! is.factor(reg_data$sfluv)){
|
||||
reg_data$sfluv = as.factor(reg_data$sfluv)
|
||||
}
|
||||
class(reg_data$sfluv) #[1] "factor"
|
||||
|
||||
levels(reg_data$sfluv)
|
||||
table(reg_data$asthma, reg_data$sfluv)
|
||||
# reassign
|
||||
levels(reg_data$sfluv) <- c("0", "0", "1")
|
||||
table(reg_data$asthma, reg_data$sfluv)
|
||||
|
||||
#=======================
|
||||
# h1n1v
|
||||
#=======================
|
||||
# should be a factor
|
||||
if (! is.factor(reg_data$h1n1v)){
|
||||
reg_data$h1n1v = as.factor(reg_data$h1n1v)
|
||||
}
|
||||
class(reg_data$h1n1v) #[1] "factor"
|
||||
|
||||
levels(reg_data$h1n1v)
|
||||
table(reg_data$asthma, reg_data$h1n1v)
|
||||
# reassign
|
||||
levels(reg_data$h1n1v) <- c("0", "0", "1")
|
||||
table(reg_data$asthma, reg_data$h1n1v)
|
||||
|
||||
#=======================
|
||||
# ethnicity
|
||||
#=======================
|
||||
class(reg_data$ethnicity) # integer
|
||||
table(reg_data$asthma, reg_data$ethnicity)
|
||||
|
||||
reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
|
||||
table(reg_data$asthma, reg_data$ethnicity)
|
||||
|
||||
#=======================
|
||||
# pneumonia
|
||||
#=======================
|
||||
class(reg_data$ia_cxr) # integer
|
||||
# ia_cxr 2 ---> yes pneumonia (1)
|
||||
# 1 ---> no (0)
|
||||
# ! 1 or 2 -- > "unkown"
|
||||
|
||||
# reassign the pneumonia codes
|
||||
#0: not performed
|
||||
#1: normal
|
||||
#2: findings consistent with pneumonia
|
||||
#3: abnormal
|
||||
#-1: not recorded
|
||||
#-2: n/a specified by the clinician # not in the data...
|
||||
#-3: unknown specified by clinician
|
||||
|
||||
|
||||
table(reg_data$ia_cxr)
|
||||
#-3 -1 0 1 2 3
|
||||
#5 48 13 47 17 3
|
||||
|
||||
# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
|
||||
table(reg_data$ia_cxr)
|
||||
# 1 2
|
||||
#69 47 17
|
||||
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
|
||||
table(reg_data$ia_cxr)
|
||||
# 0 1
|
||||
#69 47 17
|
||||
|
||||
#=======================
|
||||
# smoking [tricky one]
|
||||
#=======================
|
||||
class(reg_data$smoking) # integer
|
||||
table(reg_data$asthma, reg_data$smoking)
|
||||
|
||||
# orig
|
||||
# -3 -1 1 2 3 4
|
||||
#0 15 9 22 2 15 30
|
||||
#1 4 2 13 0 4 17
|
||||
|
||||
# -3 -1 1 2 3 4
|
||||
#0 14 9 20 2 15 30
|
||||
#1 5 2 15 0 4 17
|
||||
|
||||
# never smoking, 4 and 2 -- > no (0)
|
||||
#1 and 3 ---> yes (1)
|
||||
#!-3 and -1 ---- > NA
|
||||
|
||||
################# smoking
|
||||
|
||||
#1: current daily ===> categ smoker(1)
|
||||
#2: occasional =====> categ no smoker(0)
|
||||
#3: ex-smoker ===> categ smoker(1)
|
||||
#4: never =====> categ no smoker(0)
|
||||
#-1: not recorded =====> categ blank (NA)
|
||||
#-2: n/a specified by the clinician =====> categ blank (NA)
|
||||
#-3: unknown specified by clinician=====> categ blank (NA)
|
||||
|
||||
table(reg_data$smoking)
|
||||
#-3 -1 1 2 3 4
|
||||
#19 11 35 2 19 47
|
||||
|
||||
# reassign the smoking codes
|
||||
reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
|
||||
reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
|
||||
reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
|
||||
|
||||
table(reg_data$smoking)
|
||||
# 0 1
|
||||
#30 49 54
|
||||
|
||||
table(reg_data$asthma, reg_data$smoking)
|
||||
|
||||
# orig
|
||||
# 0 1
|
||||
#0 24 32 37
|
||||
#1 6 17 17
|
||||
|
||||
# 0 1
|
||||
#0 23 32 35
|
||||
#1 7 17 19
|
||||
|
||||
################################################################
|
||||
#==================
|
||||
# writing output file
|
||||
#==================
|
||||
outfile_name_reg = "reg_data_recoded_with_NA.csv"
|
||||
outfile_reg = paste0(outdir, outfile_name_reg)
|
||||
|
||||
cat("Writing clinical file for regression:", outfile_reg)
|
||||
|
||||
#write.csv(reg_data, file = outfile_reg)
|
||||
################################################################
|
||||
|
||||
rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
|
229
flu_stats_unpaired_clinical.R
Executable file
229
flu_stats_unpaired_clinical.R
Executable file
|
@ -0,0 +1,229 @@
|
|||
#!/usr/bin/Rscript
|
||||
getwd()
|
||||
setwd("~/git/mosaic_2020/")
|
||||
getwd()
|
||||
############################################################
|
||||
# TASK: unpaired (time) analysis of clinical data
|
||||
# data: clincial data of flu positive adult patients
|
||||
# group: obesity
|
||||
############################################################
|
||||
#my_sample_type = "npa"
|
||||
|
||||
#=============
|
||||
# Input
|
||||
#=============
|
||||
source("data_extraction_formatting_clinical.R")
|
||||
|
||||
#=============
|
||||
# Output
|
||||
#=============
|
||||
outfile_name_clinical = paste0("flu_stats_clinical_unpaired.csv")
|
||||
outfile_clinical_unpaired = paste0(outdir_stats, outfile_name_clinical)
|
||||
outfile_clinical_unpaired
|
||||
#===============================
|
||||
# data assignment for stats
|
||||
#================================
|
||||
wf = npa_wf[npa_wf$flustat == 1,]
|
||||
lf = npa_lf[npa_lf$flustat == 1,]
|
||||
lf$timepoint = paste0("t", lf$timepoint)
|
||||
lf = lf[!lf$mediator == "vitd",]
|
||||
########################################################################
|
||||
# clear variables
|
||||
rm(sam_lf, sam_wf
|
||||
, serum_lf, serum_wf)
|
||||
rm(colnames_sam_df, expected_rows_sam_lf
|
||||
, colnames_serum_df, expected_rows_serum_lf)
|
||||
|
||||
rm(pivot_cols)
|
||||
|
||||
# sanity checks
|
||||
table(lf$timepoint)
|
||||
|
||||
#if (table(lf$flustat) == table(npa_lf$flustat)[[2]]){
|
||||
# cat("Analysing Flu positive patients for:", my_sample_type)
|
||||
#}else{
|
||||
# cat("FAIL: problem with subsetting data for:", my_sample_type)
|
||||
# quit()
|
||||
#}
|
||||
########################################################################
|
||||
# Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis
|
||||
# No correction required
|
||||
########################################################################
|
||||
|
||||
numerical_cols = c("age"
|
||||
#, "vl_pfu_ul_npa1"
|
||||
, "los"
|
||||
, "onset2final"
|
||||
, "onsfindeath"
|
||||
, "onset_2_initial"
|
||||
, "o2_sat_admis")
|
||||
|
||||
metadata_cols = c("mosaic", "obesity")
|
||||
|
||||
clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)]
|
||||
|
||||
pivot_cols = metadata_cols
|
||||
#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols
|
||||
expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf
|
||||
|
||||
|
||||
keycol <- "clinical_params"
|
||||
valuecol <- "value"
|
||||
gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis")
|
||||
|
||||
clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols)
|
||||
|
||||
#==============
|
||||
# unpaired: t1
|
||||
#==============
|
||||
sum(is.na(clinical_lf$value))
|
||||
|
||||
foo = clinical_lf[which(is.na(clinical_lf$value)),]
|
||||
|
||||
clinical_lf_comp = clinical_lf[-which(is.na(clinical_lf$value)),]
|
||||
|
||||
stats_un_clinical = compare_means(value~obesity
|
||||
, group.by = "clinical_params"
|
||||
, data = clinical_lf
|
||||
#, data = clinical_lf_comp
|
||||
, paired = FALSE)
|
||||
|
||||
|
||||
stat_df <- clinical_lf %>%
|
||||
group_by(clinical_params) %>%
|
||||
wilcox_test(value ~ obesity, paired = F) %>%
|
||||
add_significance("p")
|
||||
stat_df$p_format = round(stat_df$p, digits = 3)
|
||||
|
||||
#----------------------------------------
|
||||
# calculate n_obs for each clinical param: Overall
|
||||
#----------------------------------------
|
||||
#n_t1 = data.frame(table(lf_t1_comp$mediator))
|
||||
n_all = data.frame(table(clinical_lf$clinical_params))
|
||||
colnames(n_all) = c("clinical_params", "n")
|
||||
n_all$clinical_params = as.character(n_all$clinical_params)
|
||||
|
||||
n_gp_lf = data.frame(table(clinical_lf$clinical_params, clinical_lf$obesity))
|
||||
n_gp = spread(n_gp_lf, "Var2", "Freq"); n_gp
|
||||
colnames(n_gp)
|
||||
colnames(n_gp) = c("clinical_params"
|
||||
, paste0("n_gp", colnames(n_gp)[2])
|
||||
, paste0("n_gp", colnames(n_gp)[3]))
|
||||
|
||||
n_gp$clinical_params = as.character(n_gp$clinical_params)
|
||||
|
||||
n_all_gp = merge(n_all, n_gp
|
||||
, by = intersect( names(n_all), names(n_gp) )
|
||||
, all = T)
|
||||
|
||||
#----------------------------------------
|
||||
# calculate n_obs for each clinical param: complete cases
|
||||
#----------------------------------------
|
||||
n_comp = data.frame(table(clinical_lf_comp$clinical_params))
|
||||
colnames(n_comp) = c("clinical_params", "n_complete")
|
||||
n_comp$clinical_params = as.character(n_comp$clinical_params)
|
||||
n_comp
|
||||
|
||||
n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf
|
||||
n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp
|
||||
colnames(n_gp_comp)
|
||||
colnames(n_gp_comp) = c("clinical_params"
|
||||
, paste0("n_complete_gp", colnames(n_gp_comp)[2])
|
||||
, paste0("n_complete_gp", colnames(n_gp_comp)[3]))
|
||||
|
||||
|
||||
n_comp_gp = merge(n_comp, n_gp_comp
|
||||
, by = intersect( names(n_comp), names(n_gp_comp))
|
||||
, all = T)
|
||||
|
||||
merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols
|
||||
|
||||
n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df
|
||||
|
||||
#==================================
|
||||
# Merge: merge stats + n_obs df
|
||||
#===================================
|
||||
merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols
|
||||
if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) {
|
||||
cat("PASS: merging stats and n_obs on column/s:", merging_cols)
|
||||
stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T)
|
||||
cat("\nsuccessfull merge:"
|
||||
, "\nnrow:", nrow(stats_un_clinical)
|
||||
, "\nncol:", ncol(stats_un_clinical))
|
||||
}else{
|
||||
nf = n_df$clinical_params[!n_df$clinical_params%in%stats_un_clinical$clinical_params]
|
||||
stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all.y = T)
|
||||
cat("\nMerged with caution:"
|
||||
, "\nnrows mismatch:", nf
|
||||
, "\nnot found in stats possibly due to all obs being missing"
|
||||
, "\nintroduced NAs for:", nf
|
||||
, "\nnrow:", nrow(stats_un_clinical)
|
||||
, "\nncol:", ncol(stats_un_clinical))
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
#=================
|
||||
# formatting df
|
||||
#=================
|
||||
# delete: unnecessary column
|
||||
stats_clinical_df = subset(stats_un_clinical, select = -c(.y.,p.adj))
|
||||
|
||||
# add: reflect stats method correctly i.e paired or unpaired
|
||||
# incase there are NA due to LLODs, the gsub won't work!
|
||||
#stats_clinical_df$method = gsub("Wilcoxon", "Wilcoxon_unpaired", stats_clinical_df$method)
|
||||
stats_clinical_df$method = "wilcoxon unpaired"
|
||||
stats_clinical_df$method
|
||||
|
||||
# reorder columns
|
||||
print("preparing to reorder columns...")
|
||||
colnames(stats_clinical_df)
|
||||
my_col_order2 = c("clinical_params"
|
||||
, "method"
|
||||
, "group1"
|
||||
, "group2"
|
||||
, "n"
|
||||
, "n_gp0"
|
||||
, "n_gp1"
|
||||
, "n_complete"
|
||||
, "n_complete_gp0"
|
||||
, "n_complete_gp1"
|
||||
, "p"
|
||||
, "p.format"
|
||||
, "p.signif")
|
||||
|
||||
if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%colnames(stats_clinical_df))) ){
|
||||
print("PASS: Reordering columns...")
|
||||
stats_clinical_df_f = stats_clinical_df[, my_col_order2]
|
||||
print("Successful: column reordering")
|
||||
print("formatted df called:'stats_clinical_df_f'")
|
||||
cat('\nformatted df has the following dimensions\n')
|
||||
print(dim(stats_clinical_df_f ))
|
||||
} else{
|
||||
cat(paste0("FAIL:Cannot reorder columns, length or names mismatch"
|
||||
, "\nExpected column order for: ", ncol(stats_clinical_df) # FIXME: can handle better!
|
||||
, "\nGot:", length(my_col_order2)
|
||||
, "\nElse check colnames to see if they exist in both"))
|
||||
quit()
|
||||
}
|
||||
# assign nice column names like replace "." with "_"
|
||||
colnames(stats_clinical_df_f) = c("clinical_params"
|
||||
, "method"
|
||||
, "group1"
|
||||
, "group2"
|
||||
, "n"
|
||||
, "n_gp0"
|
||||
, "n_gp1"
|
||||
, "n_complete"
|
||||
, "n_complete_gp0"
|
||||
, "n_complete_gp1"
|
||||
, "p"
|
||||
, "p_format"
|
||||
, "p_signif")
|
||||
|
||||
colnames(stats_clinical_df_f)
|
||||
########################################################################
|
||||
#******************
|
||||
# write output file
|
||||
#******************
|
||||
cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
|
||||
#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)
|
89
reg_cols_extraction.R
Normal file
89
reg_cols_extraction.R
Normal file
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/Rscript
|
||||
getwd()
|
||||
setwd('~/git/mosaic_2020/')
|
||||
getwd()
|
||||
########################################################################
|
||||
# TASK: Extract relevant columns from mosaic fp adults data for regression
|
||||
# clinical and sig meds
|
||||
########################################################################
|
||||
|
||||
########################################################################
|
||||
clinical_cols = c("mosaic"
|
||||
, "ia_exac_copd"
|
||||
, "death"
|
||||
#, "obese2" #inc peaeds, but once you subset data for adults, its the same!
|
||||
, "obesity"
|
||||
, "flustat"
|
||||
, "sfluv"
|
||||
, "h1n1v"
|
||||
, "age"
|
||||
, "gender"
|
||||
, "asthma"
|
||||
, "vl_pfu_ul_npa1"
|
||||
, "los"
|
||||
, "onset2final"
|
||||
, "onsfindeath"
|
||||
, "onset_2_initial"
|
||||
, "o2_sat_admis"
|
||||
, "o2_sat_suppl"
|
||||
, "ethnicity"
|
||||
, "smoking"
|
||||
, "ia_cxr"
|
||||
, "max_resp_score"
|
||||
, "T1_resp_score"
|
||||
, "com_noasthma"
|
||||
, "T2_resp_score"
|
||||
, "inresp_sev"
|
||||
, "steroid")
|
||||
|
||||
sig_npa_cols = c("eotaxin_npa1"
|
||||
, "eotaxin3_npa1"
|
||||
, "eotaxin3_npa2"
|
||||
, "gmcsf_npa2"
|
||||
, "ifnb_npa1"
|
||||
, "ifnb_npa2"
|
||||
, "il1_npa1"
|
||||
, "il1_npa2"
|
||||
, "il10_npa1"
|
||||
, "il12p70_npa1"
|
||||
, "il12p70_npa2"
|
||||
, "il13_npa1"
|
||||
, "il2_npa3"
|
||||
, "il4_npa1"
|
||||
, "il5_npa1"
|
||||
, "il5_npa2"
|
||||
, "il6_npa1"
|
||||
, "il6_npa2"
|
||||
, "il8_npa1"
|
||||
, "il8_npa2"
|
||||
, "il8_2_npa1"
|
||||
, "il8_2_npa2"
|
||||
, "ip10_npa1"
|
||||
, "mcp4_npa1"
|
||||
, "mcp4_npa2"
|
||||
, "mdc_npa1"
|
||||
, "mip17_npa1"
|
||||
, "mip17_npa2"
|
||||
, "neopterin_npa1"
|
||||
, "rantes_npa1"
|
||||
, "tarc_npa2"
|
||||
, "tnf_npa2"
|
||||
, "tnfr1_npa1"
|
||||
, "tnfr1_npa2"
|
||||
, "tnfr1_npa3"
|
||||
, "tnfr2_npa1"
|
||||
, "tnfr2_npa2"
|
||||
, "tnfr2_npa3")
|
||||
|
||||
cols_to_extract = c(clinical_cols, sig_npa_cols)
|
||||
|
||||
if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){
|
||||
cat("PASS: length match"
|
||||
, "\nTotal no. of cols to extract for regression:", length(cols_to_extract)
|
||||
, "\nNo. of clinical cols:", length(clinical_cols)
|
||||
, "\nNo. of sig npa cols: ", length(sig_npa_cols))
|
||||
}else{
|
||||
cat("FAIL: length mismatch"
|
||||
, "\nAborting!")
|
||||
quit()
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue