From 51069fdb7661da53dac42a9bb7ea7c8176310be3 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Mar 2022 10:58:14 +0000 Subject: [PATCH] output merged_df3 and merged_df2 files for all gene-targets along with active site residues annotated --- ml_data/ml_data.R | 69 +++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 44 deletions(-) diff --git a/ml_data/ml_data.R b/ml_data/ml_data.R index 7e2c883..f8637da 100644 --- a/ml_data/ml_data.R +++ b/ml_data/ml_data.R @@ -5,61 +5,42 @@ # pyrazinamide: 0 and 1, loss of data # mutation_info_labels: DM and OM, full data ################################################## -# ONLY ONCE -#source("~/git/LSHTM_analysis/config/pnca.R") -#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") -#write.csv(colnames(merged_df3), "data_colnames.csv") -#--------------------------------------------------- -colnames_order_pnca = read.csv("~/git/ML_AI_training/ml_data/colnames_order.csv" - , header = F) -# reorder columns by name -colnames_order_pnca <- colnames_order_pnca$V1 + ################################################### #config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") #config_gene = c("alr", "embb") #sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) - +#---------------------------------------------------- #source("~/git/LSHTM_analysis/config/alr.R") -# FIXME: "cycloserine" "mcsm_ppi2_affinity" "mcsm_ppi2_scaled" "mcsm_ppi2_outcome" "interface_dist" -# source("~/git/LSHTM_analysis/config/embb.R") -# source("~/git/LSHTM_analysis/config/gid.R") -# source("~/git/LSHTM_analysis/config/katg.R") -source("~/git/LSHTM_analysis/config/pnca.R") -# source("~/git/LSHTM_analysis/config/rpob.R") -################################################## +#source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/katg.R") 
+#source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") - ###################################################### +gene; drug + +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) + mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") mdf3_outName -if( (length(colnames_order) == ncol(merged_df3)) && (all(colnames_order %in%colnames(merged_df3))) ){ - cat("\nProceeding with rearranging columns in merged_df3") - merged_df3_o = merged_df3[ , colnames_order] - cat("\nWriting output file:", mdf3_outName) - write.csv(merged_df3_o, mdf3_outName, row.names = F) - cat("\nnrows:" , nrow(merged_df3_o) - , "\nncols:" , ncol(merged_df3_o)) - - }else - cat("length mismatch:" - , colnames(merged_df3)[!colnames(merged_df3)%in%(colnames_order )] - ) +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) + , "\nncols:" , ncol(merged_df3)) + +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") mdf2_outName -if( (length(colnames_order) == ncol(merged_df2)) && (all(colnames_order %in%colnames(merged_df2))) ){ - cat("\nProceeding with rearranging columns in merged_df3") - merged_df2_o = merged_df2[ , colnames_order] - cat("\nWriting output file:", mdf2_outName) - write.csv(merged_df2_o, mdf2_outName, row.names = F) - cat("\nnrows:" , nrow(merged_df2_o) - , "\nncols:" , ncol(merged_df2_o)) - -} - - - - - +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) + , "\nncols:" , ncol(merged_df2))