mosaic_2020/read_data.R

131 lines
4.3 KiB
R
Executable file

#!/usr/bin/Rscript
# Switch to the project root and print the working directory before and
# after, so the run log shows where the script actually executed.
# Header_TT.R below is sourced by a relative path, so it is resolved against
# this directory.
getwd()
setwd("~/git/mosaic_2020/")
getwd()
########################################################################
# TASK: read data
########################################################################
# load libraries, packages and local imports
source("Header_TT.R")
########################################################################
# Project root and the output folders derived from it. These names are left
# in the global environment for later use.
maindir <- "~/git/mosaic_2020/"
outdir <- paste0(maindir, "output/")
outdir_stats <- paste0(maindir, "output/stats/")
outdir_plots <- paste0(maindir, "output/plots/")

# Create any output folder that does not exist yet. A plain `if` is used
# because the condition is a single value and only the side effect matters;
# recursive = TRUE lets each folder be created even if its parent is missing.
for (out_path in c(outdir, outdir_stats, outdir_plots)) {
  if (!dir.exists(out_path)) {
    dir.create(out_path, recursive = TRUE)
  }
}
rm(out_path)
########################################################################
# static file read: csv
#==============
# all patients
#==============
# Master file, read with latin1 encoding.
all_df <- read.csv(
  "/home/backup/MOSAIC/MEDIATOR_Data/master_file/Mosaic_master_file_from_stata.csv",
  fileEncoding = "latin1"
)
# meta data columns
meta_data_cols <- c("mosaic", "gender", "age"
                    , "adult"
                    , "flustat", "type"
                    , "obesity"
                    #, "obese2"
                    #, "height", "height_unit"
                    #, "weight", "weight_unit"
                    #, "ia_height_ftin", "ia_height_m", "ia_weight"
                    #, "visual_est_bmi", "bmi_rating"
)
# check if these columns to select are present in the data, and stop with
# the names of the missing ones instead of only printing TRUE/FALSE
missing_meta_cols <- setdiff(meta_data_cols, colnames(all_df))
if (length(missing_meta_cols) > 0) {
  stop(
    "metadata columns missing from master file: ",
    paste(missing_meta_cols, collapse = ", "),
    call. = FALSE
  )
}
rm(missing_meta_cols)
metadata_all <- all_df[, meta_data_cols]
#==============
# adult patients
#==============
# Keep rows with a known age of 18 or over. which() drops rows whose age is
# NA; plain logical indexing would turn each of them into an all-NA row.
adult_df <- all_df[which(all_df$age >= 18), ]

# Cross-check the age filter against the `adult` flag column: every selected
# row should have adult == 1. The TRUE values are counted directly, because
# table(...)[[1]] reads whichever cell comes first (the FALSE count when any
# FALSE is present) and errors when the table is empty.
n_flagged_adult <- sum(adult_df$adult == 1, na.rm = TRUE)
if (n_flagged_adult == nrow(adult_df)) {
  cat("\nPASS: adult df extracted successfully")
} else {
  cat("\nFAIL: adult df number mismatch!")
}
rm(n_flagged_adult)
#=================================
# FLU positive: adult patients
#=================================
# extract the flu positive population
# which() drops rows whose flustat is NA; plain logical indexing would turn
# each of them into an all-NA row.
fp_adults <- adult_df[which(adult_df$flustat == 1), ]

# Confirm every extracted row has flustat == 1. The TRUE values are counted
# directly rather than read from the first cell of a table. The messages name
# the flu positive subset so this check can be told apart from the adult
# check in the log.
n_flu_positive <- sum(fp_adults$flustat == 1, na.rm = TRUE)
if (n_flu_positive == nrow(fp_adults)) {
  cat("\nPASS: flu positive adult df extracted successfully")
} else {
  cat("\nFAIL: flu positive adult df number mismatch!")
}
rm(n_flu_positive)
#-----------------------------------
# asthma and copd status correction
# for conflicting field!
#------------------------------------
# Reassign the copd and asthma status and do some checks.
# The copd flag is recoded so that values below 1 and missing values both
# become 0; its table and NA count are printed before and after for the log.
table(fp_adults$ia_exac_copd)
sum(is.na(fp_adults$ia_exac_copd))
fp_adults$ia_exac_copd[fp_adults$ia_exac_copd < 1] <- 0
fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0
table(fp_adults$ia_exac_copd)
sum(is.na(fp_adults$ia_exac_copd))

# check copd and asthma status
table(fp_adults$ia_exac_copd, fp_adults$asthma)
# Rows flagged with both copd and asthma (the original note expects 3 for
# this data set). which() leaves out rows where the comparison is NA.
copd_and_asthma_rows <- which(
  fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1
)
cat("\nNo. of patients with both copd and asthma:",
    length(copd_and_asthma_rows))
# reassign these so they are treated as non-asthmatics as copd with asthma is
# NOT TRUE asthma
fp_adults$asthma[copd_and_asthma_rows] <- 0
table(fp_adults$ia_exac_copd, fp_adults$asthma)

# Verify that no conflicting row is left. The rows are counted directly:
# indexing the cross-table with [[2, 2]] fails with "subscript out of bounds"
# whenever either flag has only one level present.
n_conflict <- sum(
  fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1,
  na.rm = TRUE
)
if (n_conflict == 0) {
  cat("\nPASS: asthma and copd do not conflict")
} else {
  cat("\nFAIL: asthma and copd conflict not resolved!")
  # Raise an error instead of quit(): quit() ends the whole R session and
  # reports exit status 0, so a failed run would look like a success.
  stop("asthma and copd conflict not resolved", call. = FALSE)
}
rm(copd_and_asthma_rows, n_conflict)
#=============================================
# FLU positive adult patients: without asthma
#=============================================
#cat("\nExtracting flu positive without asthma")
#table(fp_adults$asthma)
#cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]]
# , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]])
#str(fp_adults$asthma)
#table(fp_adults$obesity)
#table(fp_adults$obesity, fp_adults$asthma)
#fp_adults_na = fp_adults[fp_adults$asthma == 0,]
#table(fp_adults_na$obesity)
#table(fp_adults_na$obesity, fp_adults_na$asthma)
#============
# hc
#============
#hc_data<- read.csv("/home/backup/MOSAIC/MEDIATOR_Data/master_file/Mediators_for_HC.csv")
#str(hc_data)
#table(hc_data$Timepoint, hc_data$Sample)
########################################################################
# quick sanity checks
# Print how often the copd and asthma flags co-occur in each data frame.
# adult_df was not corrected, so the original note applies: check this is 4
with(adult_df, table(ia_exac_copd == 1 & asthma == 1))
# fp_adults had its conflicting asthma flags reset to 0 earlier in this
# script, so no TRUE count is expected here (the earlier note gives 3 as the
# count before that correction).
with(fp_adults, table(ia_exac_copd == 1 & asthma == 1))
# clear unnecessary variables
rm(metadata_all, all_df, adult_df)