230 lines
8.7 KiB
R
Executable file
230 lines
8.7 KiB
R
Executable file
#!/usr/bin/Rscript

# Show the working directory, move to the project checkout, and show it
# again so the log records where relative paths are resolved from.
getwd()
setwd('~/git/covid_analysis/')
getwd()

############################################################
# TASK: data cleaning and manipulation to allow plotting
# and analysis
############################################################

# Load the input data.
# NOTE(review): read_data.R is presumably what defines my_data, datadir,
# lf_data and wf_data, which are used or removed below -- confirm.
source("read_data.R")

# Drop the objects this script does not need.
rm(list = c("lf_data", "wf_data"))

#==================================
# output: formatted and clean data
#==================================
# Destination files: wide-format ICU data, long-format ICU data, and the
# original-vs-revised column name mapping.
outfile_icu_wf   <- paste0(datadir, "/icu_covid_wf_v3.csv")
outfile_icu_lf   <- paste0(datadir, "/icu_covid_lf_v3.csv")
outfile_colnames <- paste0(datadir, "/colnames_check_v3.csv")

#########################################################
# some numerical cols are characters, change these
# (disabled here: the conversion is applied after subsetting instead)
#my_data <- as.data.frame(sapply(my_data, function(x) as.numeric(as.character((x)))))
#str(my_data)

# Keep a one-column data frame listing every column name of the raw data.
all_cols_v3 <- as.data.frame(colnames(my_data))
#write.csv(all_cols_v3, "colnames_v3.csv")

#==================================
# select the mediators to analyse
#==================================
# Raw column names to keep. The order matters: the clean names assigned
# later are matched to these by position.
cols_to_select <- c(
  "id",
  # timepoint 1
  "sRAGEpgmLt1", "sICAM1ngmLt1", "PSELECTINngmLt1",
  "sVCAM1ngmLt1", "Angiopoietin2pgmLt1", "sESelectinngmLt1",
  # timepoint 2
  "sESelectinngmLt2", "Angiopoietin2pgmLt2", "sRAGEpgmLt2",
  "sICAM1ngmLt2", "PSELECTINngmLt2", "sVCAM1ngmLt2",
  # timepoint 3
  "sESelectinngmLt3", "Angiopoietin2pgmLt3", "sRAGEpgmLt3",
  "sICAM1ngmLt3", "PSELECTINngmLt3", "sVCAM1ngmLt3",
  # outcome, PF at each timepoint, study group
  "outcomes0death1recovery2other",
  "PF_t1", "PF_t2", "PF_t3",
  "studygroup0coorteA1coorteB2coorteC"
)

# Per-column and overall check that every requested column exists in the data.
is.element(cols_to_select, colnames(my_data))
all(is.element(cols_to_select, colnames(my_data)))

# Number of rows per study group in the raw data.
table(my_data$studygroup0coorteA1coorteB2coorteC)

# Restrict the data to the selected columns.
my_df <- my_data[, cols_to_select]
dim(my_df) #31, 24

# Some numerical columns were read in as character: coerce every selected
# column to numeric. Values that cannot be parsed become NA (with a warning).
str(my_df)

# Convert column by column with lapply() instead of sapply(). sapply()
# simplifies its result, so a one-row input collapses to a plain vector and
# as.data.frame() then produces a single column rather than one column per
# variable. Assigning into my_df[] keeps the column names and layout.
my_df <- as.data.frame(my_df)
my_df[] <- lapply(my_df, function(x) as.numeric(as.character(x)))
# Reset row names to 1..n.
rownames(my_df) <- NULL
str(my_df)

# Prefix the (now numeric) id with "S" so it reads as a subject label.
my_df$id <- paste0("S", my_df$id)

# Assign nicer column names and keep a record of the original -> revised
# mapping (colnames_check) so the renaming can be audited.
original_colnames <- colnames(my_df)
orig_cols <- as.data.frame(colnames(my_df))

# Clean names, in the same positional order as cols_to_select.
# Measurement columns follow <mediator>_<units>_<timepoint>.
my_colnames <- c(
  "id",
  # timepoint 1
  "sRAGE_pgmL_t1", "sICAM1_ngmL_t1", "PSelectin_ngmL_t1",
  "sVCAM1_ngmL_t1", "Angiopoietin2_pgmL_t1", "sESelectin_ngmL_t1",
  # timepoint 2
  "sESelectin_ngmL_t2", "Angiopoietin2_pgmL_t2", "sRAGE_pgmL_t2",
  "sICAM1_ngmL_t2", "PSelectin_ngmL_t2", "sVCAM1_ngmL_t2",
  # timepoint 3
  "sESelectin_ngmL_t3", "Angiopoietin2_pgmL_t3", "sRAGE_pgmL_t3",
  "sICAM1_ngmL_t3", "PSelectin_ngmL_t3", "sVCAM1_ngmL_t3",
  # outcome, PF at each timepoint, study group
  "outcomes",
  "PF_units_t1", "PF_units_t2", "PF_units_t3",
  "studygroup"
)

if (length(original_colnames) == length(my_colnames) &&
    all(cols_to_select %in% colnames(my_data))) {
  print("PASS: length of colnames match. Assigning clean colnames")
  colnames(my_df) <- my_colnames
  revised_colnames <- colnames(my_df)
  colnames_check <- as.data.frame(cbind(original_colnames, revised_colnames))
} else {
  # Abort with an error rather than quit(): quit() exits with status 0, so a
  # failed run would be indistinguishable from a successful one.
  stop(paste0("FAIL:length mismatch when assigning colnames"
              , "\nExpected length of colnames: ", length(original_colnames)
              , "\nGot: ", length(my_colnames))
       , call. = FALSE)
}

# Report the cleaned column names and the number of distinct patients.
print(colnames(my_df))
n_patients <- length(unique(my_df$id))
print(paste0("Total no. of patients:", n_patients))

# Cross-tabulate study group and outcome for the full data set.
table(my_df$studygroup)
table(my_df$outcomes)
table(my_df$studygroup, my_df$outcomes)

#%% subset only icu patients
# Study group 0 is treated as the ICU cohort. which() is used so that rows
# with a missing studygroup are dropped; indexing with the raw logical
# vector would insert an all-NA row for every NA and inflate the counts.
my_df_icu <- my_df[which(my_df$studygroup == 0), ]
n_icupatients <- length(unique(my_df_icu$id))

cat(paste0("Total no. of rows in original df:", nrow(my_df)
           , "\nTotal no. of unique patients:", n_patients
           , "\nTotal no. of ICU patients:", n_icupatients))

table(my_df_icu$studygroup, my_df_icu$outcomes)
tab1 <- table(my_df_icu$studygroup, my_df_icu$outcomes)

# Count outcomes by their code (0 = death, 1 = recovery, as encoded in the
# original column name outcomes0death1recovery2other) instead of by position
# in tab1: positional lookup mislabels the counts whenever one of the outcome
# codes does not occur among the ICU patients.
n_died <- sum(my_df_icu$outcomes == 0, na.rm = TRUE)
n_recovered <- sum(my_df_icu$outcomes == 1, na.rm = TRUE)
cat(paste0("no. of icu patients who"
           , "\ndied:", n_died, " ~ ", n_died / n_icupatients * 100, "%"
           , "\nrecovered:", n_recovered, " ~ ", n_recovered / n_icupatients * 100, "%"))
#, "\nother:", n_other, " ~ ", n_other/n_icupatients*100, "%"))

table(my_df$studygroup)
#=====================================================
#=========
# lf data
#=========
# Identifier columns that stay as they are; every other column is stacked.
pivot_cols <- c("id"
                , "studygroup"
                , "outcomes")

# One output row per ICU row and per stacked column.
expected_rows_lf <- nrow(my_df_icu) * (ncol(my_df_icu) - length(pivot_cols))

# using regex: split each stacked column name of the form
# <mediator>_<units>_<timepoint> into three separate columns.
df_lf <- my_df_icu %>%
  tidyr::pivot_longer(-all_of(pivot_cols)
                      , names_to = c("mediator", "units", "timepoint")
                      , names_pattern = "(.*)_(.*)_(.*)"
                      , values_to = "value")

# Count missing mediators explicitly. The previous test,
# sum(table(is.na(x))) == expected rows, always equals the row count and so
# could never detect an NA in the mediator column.
n_na_mediator <- sum(is.na(df_lf$mediator))

if (nrow(df_lf) == expected_rows_lf && n_na_mediator == 0) {
  cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:"
             , "\nNo. of rows: ", nrow(df_lf)
             , "\nNo. of cols: ", ncol(df_lf)
             , "\n"))
} else {
  # Abort with an error rather than quit(): quit() exits with status 0, so a
  # failed run would be indistinguishable from a successful one.
  stop(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator"
              , "\nExpected no. of rows: ", expected_rows_lf
              , "\nGot: ", nrow(df_lf)
              , "\nNo. of NAs in mediator: ", n_na_mediator
              , "\ncheck expected rows calculation!")
       , call. = FALSE)
}

class(df_lf) # hmmm
str(df_lf)

class(as.data.frame(df_lf))
str(as.data.frame(df_lf))

# COMMENT: slight difference in class and structure between the output of
# the pivot and the same data converted to a data frame.
# The plain data frame is used from here on.
lf_df <- as.data.frame(df_lf)
class(lf_df)
str(lf_df)

# sort by mediator and timepoint
lf_df <- lf_df[order(lf_df$mediator, lf_df$timepoint), ]

table(is.na(lf_df$mediator))

#=========
# wf data
#=========
# icu data is your wf data
# sort icu data by columnames
# Alphabetical column order; used below as the reference set of valid
# column positions.
auto_col_order <- order(names(my_df_icu))
#my_col_order = c(1,25, 24, 20, 6, 9, 15,21, 22, 23, 4, 12, 18, 7, 8, 14, 3, 11, 17, 2, 10, 16, 5, 13, 19)
# Hand-picked order: id, studygroup, outcomes first, then each measurement
# grouped by name with its three timepoints side by side.
my_col_order <- c(1, 24, 20, 6, 9, 15, 21, 22, 23, 4, 12, 18, 7, 8, 14,
                  3, 11, 17, 2, 10, 16, 5, 13, 19)

# stop() is used instead of quit() so that a failed check exits with a
# non-zero status.
if (length(auto_col_order) != length(my_col_order)) {
  stop(paste0("FAIL:length mismatch of column orders"
              , "\nExpected column order for: ", length(auto_col_order)
              , "\nGot:", length(my_col_order))
       , call. = FALSE)
}

# A matching length alone does not rule out a repeated or out-of-range
# position, which would silently duplicate or drop a column.
if (!setequal(my_col_order, auto_col_order)) {
  stop("FAIL: my_col_order must list every column position exactly once"
       , call. = FALSE)
}

print("PASS: column order successfully generated. Reordering column in wf data")
wf_df <- my_df_icu[, my_col_order]
#all.equal(my_df_icu, wf_df)
#===========================================================
#%% write icu files
# row.names = FALSE is spelled out: the shorthand F is an ordinary variable
# that can be reassigned. A trailing "\n" is appended where needed so that
# consecutive status messages no longer run together on one line.

# lf_data
write.csv(lf_df, outfile_icu_lf, row.names = FALSE)
cat(paste0("Finsihed writing lf data:"
           , "\nNo. of rows: ", nrow(lf_df)
           , "\nNo. of cols: ", ncol(lf_df)
           , "\n"))

# column names to check
write.csv(colnames_check, outfile_colnames, row.names = FALSE)
cat(paste0("Finsihed writing colnames original and revised:"
           , "\nNo. of rows: ", nrow(colnames_check)
           , "\nNo. of cols: ", ncol(colnames_check)))

# wf_data: only original
write.csv(wf_df, outfile_icu_wf, row.names = FALSE)
cat(paste0("\nFinsihed wrting wf data:"
           , "\nNo. of rows: ", nrow(wf_df)
           , "\nNo. of cols: ", ncol(wf_df)
           , "\n"))

# COMMENT: wf_data for scaled values not written out!
#=======================================================
# end of script