added data_extraction_formatting_regression.R flu_stats_unpaired_clinical.R and reg_cols_extraction.R

2020-11-18 16:01:29 +00:00 · 2020-11-18 16:01:29 +00:00 · 76dc3fbb6b
commit 76dc3fbb6b
parent bb6e92fa0f
3 changed files with 647 additions and 0 deletions
--- a/data_extraction_formatting_regression.R
+++ b/data_extraction_formatting_regression.R
@ -0,0 +1,329 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/mosaic_2020/')
+getwd()
+########################################################################
+# TASK: Extract relevant columns from mosaic adults data
+# npa
+########################################################################
+#====================
+# Input: source data for clinical
+#====================
+source("data_extraction_formatting_clinical.R")
+#source("reg_cols_extraction.R")
+#=======================================
+# Data for mediator to include in regression
+#=======================================
+cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults")
+
+med_df = fp_adults[, c("mosaic", sig_npa_cols)]
+
+# sanity checks
+if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) ==  nrow(clinical_df) ){
+  cat("PASS: binary data obs are complete, n =", nrow(clinical_df))
+}else{
+  cat("FAIL: Incomplete data for binary outcomes. Please check and decide!")
+  quit()
+}
+
+table(clinical_df$ia_exac_copd)
+
+########################################################################
+# Data extraction for regression
+########################################################################
+
+common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)]
+
+cat("\nMerging clinical and mediator data for regression"
+    ,"\nMerging on column:", common_cols)
+
+reg_data = merge(clinical_df, med_df
+                 , by = common_cols)
+
+if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){
+  cat("\nNo. of rows match, nrow =", nrow(clinical_df)
+      , "\nChecking ncols...")
+  if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df)  - length(common_cols) ){
+    cat("\nNo. of cols match, ncol =", ncol(reg_data))
+} else {
+  cat("FAIL: ncols mismatch"
+      , "Expected ncols:", ncol(clinical_df) + ncol(med_df)  - length(common_cols)
+      , "\nGot:", ncol(reg_data))
+  }
+} else {
+  cat("FAIL: nrows mismatch"
+           , "\nExpected nrows:", nrow(fp_adults))
+}
+
+########################################################################
+# Reassign the copd and asthma status and do some checks 
+table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+
+reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
+reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
+
+table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+
+# check copd and asthma status
+table(reg_data$ia_exac_copd, reg_data$asthma)
+check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 
+
+# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
+reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 
+table(reg_data$ia_exac_copd, reg_data$asthma)
+
+foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
+
+rm(check_copd_and_asthma_1, foo)
+#=====================================================================
+# count the resp scores 
+max_resp_score_table<- table(reg_data$max_resp_score)
+max_resp_score_table
+
+T1_resp_score_table<- table(reg_data$T1_resp_score)
+T1_resp_score_table
+
+T2_resp_score_table<- table(reg_data$T2_resp_score)
+T2_resp_score_table
+
+Inresp_sev<- table(reg_data$inresp_sev)
+Inresp_sev
+
+# Reassign the resp score so all 4 are replace by 3
+reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
+revised_resp_score_table<- table(reg_data$max_resp_score)
+revised_resp_score_table
+
+reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
+revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
+revised_T1_resp_score_table
+
+reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
+revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
+revised_T2_resp_score_table
+
+reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
+revised_Inresp_sev<- table(reg_data$inresp_sev)
+revised_Inresp_sev
+#=====================================================================
+# Remove these after checking
+rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
+   , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
+#=====================================================================
+# Binning
+# "(": not inclusive
+# "]": inclusive
+
+#========
+# age
+#========
+# Create categories of variables
+reg_data$age = round(reg_data$age, digits = 0)
+table(reg_data$age)
+table(reg_data$asthma, reg_data$age)
+min(reg_data$age); max(reg_data$age)
+
+max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)  
+max_age_interval 
+min_age = min(reg_data$age); min_age #19
+min_age_interval = min_age - 1; min_age_interval
+
+#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
+age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
+reg_data$age_bins = age_bins
+dim(reg_data) # 133 27
+
+# age_bins (to keep consistent with the results table)
+class(reg_data$age_bins)
+levels(reg_data$age_bins)
+#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
+table(reg_data$asthma, reg_data$age_bins)
+#     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
+#0      25      17      25      14      11      1
+#1      11       8      12       5       3      2
+
+if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){
+  cat("PASS: age_bins assigned successfully")
+}else{
+  cat("FAIL: no. mismatch when assigning age_bins")
+  quit()
+}
+
+# reassign
+class(reg_data$age_bins)
+levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
+table(reg_data$asthma, reg_data$age_bins)
+table(reg_data$asthma, reg_data$age_bins)
+#     (18,30] (30,40] (40,50] (50,80]
+#0      25      17      25      26
+#1      11       8      12      9
+
+sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
+
+#===========================
+# O2 saturation binning
+#===========================
+reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
+table(reg_data$o2_sat_admis)
+tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
+tot_o2
+
+o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
+reg_data$o2_sat_bin = o2_sat_bin
+table(reg_data$o2_sat_bin)
+
+sum(table(reg_data$o2_sat_bin)) == tot_o2
+
+#===========================
+# Onset to initial binning
+#===========================
+max_in = max(reg_data$onset_2_initial); max_in #23
+min_in = min(reg_data$onset_2_initial) - 1 ; min_in  # -6
+
+tot_onset2ini = sum(table(reg_data$onset_2_initial))
+tot_onset2ini
+
+onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
+reg_data$onset_initial_bin = onset_initial_bin
+
+sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
+
+#=======================
+# seasonal flu: sfluv
+#=======================
+# should be a factor
+if (! is.factor(reg_data$sfluv)){
+  reg_data$sfluv  = as.factor(reg_data$sfluv)
+}
+class(reg_data$sfluv) #[1] "factor"
+
+levels(reg_data$sfluv)
+table(reg_data$asthma, reg_data$sfluv)
+# reassign
+levels(reg_data$sfluv) <- c("0", "0", "1")
+table(reg_data$asthma, reg_data$sfluv)
+
+#=======================
+# h1n1v
+#=======================
+# should be a factor
+if (! is.factor(reg_data$h1n1v)){
+  reg_data$h1n1v  = as.factor(reg_data$h1n1v)
+}
+class(reg_data$h1n1v) #[1] "factor"
+
+levels(reg_data$h1n1v)
+table(reg_data$asthma, reg_data$h1n1v)
+# reassign
+levels(reg_data$h1n1v) <- c("0", "0", "1")
+table(reg_data$asthma, reg_data$h1n1v)
+
+#=======================
+# ethnicity
+#=======================
+class(reg_data$ethnicity) # integer
+table(reg_data$asthma, reg_data$ethnicity)
+
+reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
+table(reg_data$asthma, reg_data$ethnicity)
+
+#=======================
+# pneumonia
+#=======================
+class(reg_data$ia_cxr) # integer
+# ia_cxr 2 ---> yes pneumonia (1)
+# 1 ---> no (0)
+# ! 1 or 2 -- > "unkown"
+
+# reassign the pneumonia codes
+#0: not performed 
+#1: normal
+#2: findings consistent with pneumonia
+#3: abnormal
+#-1: not recorded
+#-2: n/a specified by the clinician # not in the data...
+#-3: unknown specified by clinician
+
+
+table(reg_data$ia_cxr)
+#-3 -1  0  1  2  3 
+#5 48 13 47 17  3 
+
+# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
+reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
+table(reg_data$ia_cxr)
+#    1  2 
+#69 47 17
+
+reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
+reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
+table(reg_data$ia_cxr)
+#    0  1 
+#69 47 17 
+
+#=======================
+# smoking [tricky one]
+#=======================
+class(reg_data$smoking) # integer
+table(reg_data$asthma, reg_data$smoking)
+
+# orig
+#   -3 -1  1  2  3   4
+#0  15 9  22  2  15  30
+#1  4  2  13  0  4   17
+
+#  -3 -1  1  2  3  4
+#0 14  9 20  2 15 30
+#1  5  2 15  0  4 17
+
+# never smoking, 4 and 2 -- > no (0)
+#1 and 3 ---> yes (1)
+#!-3 and -1 ---- > NA
+
+################# smoking
+
+#1: current daily ===> categ smoker(1)
+#2: occasional =====> categ no smoker(0)
+#3: ex-smoker ===> categ smoker(1)
+#4: never =====> categ no smoker(0)
+#-1: not recorded =====> categ blank (NA)
+#-2: n/a specified by the clinician =====> categ blank (NA)
+#-3: unknown specified by clinician=====> categ blank (NA)
+
+table(reg_data$smoking)
+#-3 -1  1  2  3  4 
+#19 11 35  2 19 47 
+
+# reassign the smoking codes
+reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
+reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
+reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
+
+table(reg_data$smoking)
+#    0  1 
+#30 49 54  
+
+table(reg_data$asthma, reg_data$smoking)
+
+# orig
+#      0  1
+#0 24 32 37
+#1  6 17 17
+
+#      0  1
+#0 23 32 35
+#1  7 17 19
+
+################################################################
+#==================
+# writing output file
+#==================
+outfile_name_reg = "reg_data_recoded_with_NA.csv"
+outfile_reg = paste0(outdir, outfile_name_reg)
+
+cat("Writing clinical file for regression:", outfile_reg)
+
+#write.csv(reg_data, file = outfile_reg)
+################################################################
+
+rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
--- a/flu_stats_unpaired_clinical.R
+++ b/flu_stats_unpaired_clinical.R
@ -0,0 +1,229 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd("~/git/mosaic_2020/")
+getwd()
+############################################################
+# TASK: unpaired (time) analysis of clinical data
+# data: clincial data of flu positive adult patients
+# group: obesity
+############################################################
+#my_sample_type = "npa"
+
+#=============
+# Input
+#=============
+source("data_extraction_formatting_clinical.R")
+
+#=============
+# Output
+#=============
+outfile_name_clinical = paste0("flu_stats_clinical_unpaired.csv")
+outfile_clinical_unpaired = paste0(outdir_stats, outfile_name_clinical)
+outfile_clinical_unpaired 
+#===============================
+# data assignment for stats
+#================================
+wf = npa_wf[npa_wf$flustat == 1,]
+lf = npa_lf[npa_lf$flustat == 1,]
+lf$timepoint = paste0("t", lf$timepoint)
+lf = lf[!lf$mediator == "vitd",]
+########################################################################
+# clear variables
+rm(sam_lf, sam_wf
+, serum_lf, serum_wf) 
+rm(colnames_sam_df, expected_rows_sam_lf
+, colnames_serum_df, expected_rows_serum_lf)
+
+rm(pivot_cols)
+
+# sanity checks
+table(lf$timepoint)
+
+#if (table(lf$flustat) == table(npa_lf$flustat)[[2]]){
+#  cat("Analysing Flu positive patients for:", my_sample_type)
+#}else{
+#  cat("FAIL: problem with subsetting data for:", my_sample_type)
+#  quit()
+#}
+########################################################################
+# Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis 
+# No correction required
+########################################################################
+
+numerical_cols = c("age"
+                   #, "vl_pfu_ul_npa1"
+                   , "los"
+                   , "onset2final"
+                   , "onsfindeath"
+                   , "onset_2_initial"
+                   , "o2_sat_admis")
+
+metadata_cols = c("mosaic", "obesity")
+
+clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)]
+
+pivot_cols = metadata_cols
+#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
+expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf
+
+
+keycol <- "clinical_params"
+valuecol <- "value"
+gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis")
+
+clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols)
+
+#==============
+# unpaired: t1
+#==============
+sum(is.na(clinical_lf$value))
+
+foo = clinical_lf[which(is.na(clinical_lf$value)),]
+
+clinical_lf_comp = clinical_lf[-which(is.na(clinical_lf$value)),]
+
+stats_un_clinical = compare_means(value~obesity
+                            , group.by = "clinical_params"
+                            , data = clinical_lf 
+                            #, data = clinical_lf_comp
+                            , paired = FALSE)
+
+
+stat_df <- clinical_lf %>%
+  group_by(clinical_params) %>%
+  wilcox_test(value ~ obesity, paired = F) %>%
+  add_significance("p")
+stat_df$p_format = round(stat_df$p, digits = 3)
+
+#----------------------------------------
+# calculate n_obs for each clinical param: Overall
+#----------------------------------------
+#n_t1 = data.frame(table(lf_t1_comp$mediator))
+n_all = data.frame(table(clinical_lf$clinical_params))
+colnames(n_all) = c("clinical_params", "n")
+n_all$clinical_params = as.character(n_all$clinical_params)
+
+n_gp_lf = data.frame(table(clinical_lf$clinical_params, clinical_lf$obesity))
+n_gp = spread(n_gp_lf, "Var2", "Freq"); n_gp
+colnames(n_gp)
+colnames(n_gp) = c("clinical_params"
+                   , paste0("n_gp", colnames(n_gp)[2])
+                   , paste0("n_gp", colnames(n_gp)[3]))
+
+n_gp$clinical_params = as.character(n_gp$clinical_params)
+
+n_all_gp  = merge(n_all, n_gp
+                  , by = intersect( names(n_all), names(n_gp) )
+                  , all = T)
+
+#----------------------------------------
+# calculate n_obs for each clinical param: complete cases
+#----------------------------------------
+n_comp = data.frame(table(clinical_lf_comp$clinical_params))
+colnames(n_comp) = c("clinical_params", "n_complete")
+n_comp$clinical_params = as.character(n_comp$clinical_params)
+n_comp
+
+n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf
+n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp
+colnames(n_gp_comp)
+colnames(n_gp_comp) = c("clinical_params"
+                   , paste0("n_complete_gp", colnames(n_gp_comp)[2])
+                   , paste0("n_complete_gp", colnames(n_gp_comp)[3]))
+
+
+n_comp_gp  = merge(n_comp, n_gp_comp
+                  , by = intersect( names(n_comp), names(n_gp_comp))
+                  , all = T)
+
+merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols
+
+n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df
+
+#==================================
+# Merge: merge stats + n_obs df
+#===================================
+merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols
+if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) {
+  cat("PASS: merging stats and n_obs on column/s:", merging_cols)
+  stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T)
+  cat("\nsuccessfull merge:"
+      , "\nnrow:", nrow(stats_un_clinical)
+      , "\nncol:", ncol(stats_un_clinical))
+}else{
+  nf = n_df$clinical_params[!n_df$clinical_params%in%stats_un_clinical$clinical_params]
+  stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all.y = T)
+  cat("\nMerged with caution:"
+      , "\nnrows mismatch:", nf
+      , "\nnot found in stats possibly due to all obs being missing"
+      , "\nintroduced NAs for:", nf
+      , "\nnrow:", nrow(stats_un_clinical)
+      , "\nncol:", ncol(stats_un_clinical))
+}
+
+#######################################################################
+#=================
+# formatting df
+#=================
+# delete: unnecessary column
+stats_clinical_df = subset(stats_un_clinical, select = -c(.y.,p.adj))
+
+# add: reflect stats method correctly i.e paired or unpaired
+# incase there are NA due to LLODs, the gsub won't work!
+#stats_clinical_df$method = gsub("Wilcoxon", "Wilcoxon_unpaired", stats_clinical_df$method)
+stats_clinical_df$method = "wilcoxon unpaired"
+stats_clinical_df$method
+
+# reorder columns
+print("preparing to reorder columns...")
+colnames(stats_clinical_df)
+my_col_order2 = c("clinical_params"
+                  , "method"
+                  , "group1"
+                  , "group2"
+                  , "n"
+                  , "n_gp0"
+                  , "n_gp1"
+                  , "n_complete"
+                  , "n_complete_gp0"
+                  , "n_complete_gp1"
+                  , "p"
+                  , "p.format"
+                  , "p.signif")
+
+if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%colnames(stats_clinical_df))) ){
+  print("PASS: Reordering columns...")
+  stats_clinical_df_f = stats_clinical_df[, my_col_order2]
+  print("Successful: column reordering")
+  print("formatted df called:'stats_clinical_df_f'")
+  cat('\nformatted df has the following dimensions\n')
+  print(dim(stats_clinical_df_f ))
+} else{
+  cat(paste0("FAIL:Cannot reorder columns, length or names mismatch"
+             , "\nExpected column order for: ", ncol(stats_clinical_df) # FIXME: can handle better!
+             , "\nGot:", length(my_col_order2)
+             , "\nElse check colnames to see if they exist in both"))
+  quit()
+}  
+# assign nice column names like replace "." with "_"
+colnames(stats_clinical_df_f) = c("clinical_params"
+                                   , "method"
+                                   , "group1"
+                                   , "group2"
+                                   , "n"
+                                   , "n_gp0"
+                                   , "n_gp1"
+                                   , "n_complete"
+                                   , "n_complete_gp0"
+                                   , "n_complete_gp1"
+                                   , "p"
+                                   , "p_format"
+                                   , "p_signif")
+
+colnames(stats_clinical_df_f)
+########################################################################
+#******************
+# write output file
+#******************
+cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
+#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)
--- a/reg_cols_extraction.R
+++ b/reg_cols_extraction.R
@ -0,0 +1,89 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/mosaic_2020/')
+getwd()
+########################################################################
+# TASK: Extract relevant columns from mosaic fp adults data for regression
+# clinical and sig meds
+########################################################################
+
+########################################################################
+clinical_cols = c("mosaic"
+                    , "ia_exac_copd"
+                    , "death"
+                    #, "obese2" #inc peaeds, but once you subset data for adults, its the same!
+                    , "obesity"
+                    , "flustat"
+                    , "sfluv"
+                    , "h1n1v"
+                    , "age"
+                    , "gender"
+                    , "asthma"
+                    , "vl_pfu_ul_npa1"
+                    , "los"
+                    , "onset2final"
+                    , "onsfindeath"
+                    , "onset_2_initial"
+                    , "o2_sat_admis"
+                    , "o2_sat_suppl"
+                    , "ethnicity"
+                    , "smoking"
+                    , "ia_cxr"
+                    , "max_resp_score"
+                    , "T1_resp_score"
+                    , "com_noasthma"
+                    , "T2_resp_score"
+                    , "inresp_sev"
+                    , "steroid")
+
+sig_npa_cols = c("eotaxin_npa1"
+                 , "eotaxin3_npa1"
+                 , "eotaxin3_npa2"
+                 , "gmcsf_npa2"
+                 , "ifnb_npa1"
+                 , "ifnb_npa2"
+                 , "il1_npa1"
+                 , "il1_npa2"
+                 , "il10_npa1"
+                 , "il12p70_npa1"
+                 , "il12p70_npa2"
+                 , "il13_npa1"
+                 , "il2_npa3"
+                 , "il4_npa1"
+                 , "il5_npa1"
+                 , "il5_npa2"
+                 , "il6_npa1"
+                 , "il6_npa2"
+                 , "il8_npa1"
+                 , "il8_npa2"
+                 , "il8_2_npa1"
+                 , "il8_2_npa2"
+                 , "ip10_npa1"
+                 , "mcp4_npa1"
+                 , "mcp4_npa2"
+                 , "mdc_npa1"
+                 , "mip17_npa1"
+                 , "mip17_npa2"
+                 , "neopterin_npa1"
+                 , "rantes_npa1"
+                 , "tarc_npa2"
+                 , "tnf_npa2"
+                 , "tnfr1_npa1" 
+                 , "tnfr1_npa2"
+                 , "tnfr1_npa3" 
+                 , "tnfr2_npa1"
+                 , "tnfr2_npa2"
+                 , "tnfr2_npa3")
+
+cols_to_extract = c(clinical_cols, sig_npa_cols)
+
+if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){
+  cat("PASS: length match"
+      , "\nTotal no. of cols to extract for regression:", length(cols_to_extract)
+      , "\nNo. of clinical cols:", length(clinical_cols)
+      , "\nNo. of sig npa cols: ", length(sig_npa_cols))
+}else{
+  cat("FAIL: length mismatch"
+      , "\nAborting!")
+  quit()
+}