From 13999a477d701f8110f15b20646ee66d105b9a82 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 22 Aug 2022 14:33:06 +0100 Subject: [PATCH] fixed source to contain plotting cols and pos_count correctly --- scripts/functions/combining_dfs_plotting.R | 28 ++++++++---- scripts/functions/dm_om_data.R | 5 ++- scripts/functions/plotting_data.R | 23 +++++----- scripts/plotting/get_plotting_dfs.R | 2 +- scripts/plotting/plotting_colnames.R | 45 ++++++++++++------- .../plotting/plotting_thesis/basic_barplots.R | 2 +- 6 files changed, 66 insertions(+), 39 deletions(-) diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 3441c3a..d82a7c0 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -41,7 +41,7 @@ geneL_ppi2 = c("alr", "embb", "katg", "rpob") combining_dfs_plotting <- function( my_df_u , gene_metadata - , gene # ADDED + #, gene # ADDED , lig_dist_colname = '' , lig_dist_cutoff = ''){ @@ -686,8 +686,11 @@ combining_dfs_plotting <- function( my_df_u min( merged_df3['avg_lig_affinity_scaled']); max( merged_df3['avg_lig_affinity_scaled']) ################################################################### - # Rectify pos_count column in merged_df3 - # The one in merged_df2 is correct + #-------------------------------------------- + # merged_df3: Rectify pos_count column + # Rename existing pos_count colum to reflect + # that it is correct according to merged_df2 + #-------------------------------------------- nc_pc_CHANGE = which(colnames(merged_df3)== "pos_count"); nc_pc_CHANGE colnames(merged_df3)[nc_pc_CHANGE] = "df2_pos_count_all" @@ -707,16 +710,25 @@ combining_dfs_plotting <- function( my_df_u nc_change = which(colnames(merged_df3) == "n") colnames(merged_df3)[nc_change] <- "pos_count" class(merged_df3) + + #################################################################### + #------------------------------------------------- + # merged_df2: Rename existing pos_count + # column to df2_pos_count_all like in above df + #------------------------------------------------- + nc_pc_CHANGE_df2 = which(colnames(merged_df2)== "pos_count"); nc_pc_CHANGE_df2 + colnames(merged_df2)[nc_pc_CHANGE_df2] = "df2_pos_count_all" + head(merged_df2$pos_count) + head(merged_df2$df2_pos_count_all) + #################################################################### # ADD: distance to Nucleic acid column for na genes - - + # already done in plotting_data #################################################################### - #TODO # Choose few columns to return as plot_df - - + merged_df3 = merged_df3[, colnames(merged_df3)%in%c(plotting_cols, "pos_count", "df2_pos_count_all")] + merged_df2 = merged_df2[, colnames(merged_df2)%in%c(plotting_cols, "df2_pos_count_all")] #################################################################### return(list( merged_df2 diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R index 2217f5c..48c290b 100644 --- a/scripts/functions/dm_om_data.R +++ b/scripts/functions/dm_om_data.R @@ -121,7 +121,7 @@ dm_om_wf_lf_data <- function(df mmcsm_lig_dn2 = paste0("mmCSM-lig"); mmcsm_lig_dn2 - na_dist_dn = paste0("NA Dist(", angstroms_symbol, ")"); na_dist_dn + na_dist_dn = paste0("Dist to NA (", angstroms_symbol, ")"); na_dist_dn mcsm_na_dn = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn ppi2_dist_dn = paste0("PPI Dist(", angstroms_symbol, ")"); ppi2_dist_dn @@ -174,7 +174,8 @@ dm_om_wf_lf_data <- function(df ) display_common_colnames = c(snp_colname - , mut_colname , "dst_mode" , mut_info_label_colname + , mut_colname + , "dst_mode" , mut_info_label_colname , aa_pos_colname , "duet_stability_change" , duet_dn , "duet_outcome" diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index ea17d82..cd30dee 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -7,14 +7,10 @@ library(data.table) library(dplyr) # ADDED: New -geneL_normal = c("pnca") -geneL_na = c("gid", "rpob") -geneL_ppi2 = c("alr", "embb", "katg", "rpob") +# geneL_normal = c("pnca") +# geneL_na = c("gid", "rpob") +# geneL_ppi2 = c("alr", "embb", "katg", "rpob") -if (tolower(gene)%in%geneL_na){ - infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/" - , tolower(gene), "_nca_distances.csv") -} #======================================================== # plotting_data(): formatting data for plots # input args: @@ -31,8 +27,9 @@ if (tolower(gene)%in%geneL_na){ plotting_data <- function(df , gene # ADDED - , lig_dist_colname - , lig_dist_cutoff) { + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10 + ) { my_df = data.frame() my_df_u = data.frame() my_df_u_lig = data.frame() @@ -89,11 +86,15 @@ plotting_data <- function(df # all = T) # # } - + geneL_na=c("gid","rpob") + if (tolower(gene)%in%geneL_na){ + infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/" + , tolower(gene), "_nca_distances.csv") distcol_nca_name = read.csv(infilename_nca, header = F) - + if (tolower(gene)=='rpob'){ + print('WARNING: running special-case handler for rpoB') # create 5uhc equivalent column for mutationinformation diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index e4df5be..aae338e 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -88,7 +88,7 @@ cat("\nDim of meta data file: ", dim(gene_metadata)) all_plot_dfs = combining_dfs_plotting(my_df_u , gene_metadata - , gene = gene # ADDED + #, gene = gene # ADDED , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) diff --git a/scripts/plotting/plotting_colnames.R b/scripts/plotting/plotting_colnames.R index 282bc35..dd7524a 100644 --- a/scripts/plotting/plotting_colnames.R +++ b/scripts/plotting/plotting_colnames.R @@ -1,6 +1,4 @@ -geneL_normal = c("pnca") -geneL_na = c("gid", "rpob") -geneL_ppi2 = c("alr", "embb", "katg", "rpob") +# Initialise the required dfs based on gene name # LigDist_colname # from globals used # ppi2Dist_colname #from globals used @@ -11,7 +9,7 @@ common_cols = c("mutationinformation" , drug, "drug_name" , "mutation", "mutation_info" , "wild_type", "mutant_type", "position" - , "pos_count" + #, "pos_count", "df2_pos_count_all" , "snp_frequency" , "total_id_ucount" , "drtype", "drtype_mode", "drtype_max" @@ -63,7 +61,7 @@ common_outcome_affinity_cols = c( "ligand_outcome" #====================================================== # Plotting cols + affinity cols: conditional on gene #====================================================== -if (tolower(gene)%in%geneL_normal){ +if (tolower(gene)%in%c("pnca")){ plotting_cols = common_cols raw_affinity_cols = common_raw_affinity_cols @@ -73,35 +71,50 @@ if (tolower(gene)%in%geneL_normal){ } # ppi2 genes -if (tolower(gene)%in%geneL_ppi2){ +if (tolower(gene)%in%c("alr", "embb", "katg")){ plotting_cols = c(common_cols, ppi2Dist_colname, "mcsm_ppi2_affinity", "mcsm_ppi2_scaled", "mcsm_ppi2_outcome") - raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_ppi2_affinity") - scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_ppi2_scaled" ) - outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_ppi2_outcome") - affinity_dist_colnames = c(LigDist_colname, ppi2Dist_colname) + raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_ppi2_affinity") + scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_ppi2_scaled" ) + outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_ppi2_outcome") + affinity_dist_colnames = c(LigDist_colname, ppi2Dist_colname) } #na_genes -if (tolower(gene)%in%geneL_na){ +if (tolower(gene)%in%c("gid")){ plotting_cols = c(common_cols, naDist_colname, "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome") - raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_na_affinity") - scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_na_scaled") - outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_na_outcome") - affinity_dist_colnames = c(LigDist_colname, ppi2Dist_colname, naDist_colname) + raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_na_affinity") + scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_na_scaled") + outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_na_outcome") + affinity_dist_colnames = c(LigDist_colname, naDist_colname) } if (tolower(gene)%in%c("rpob")){ - plotting_cols = c(plotting_cols, "X5uhc_position","X5uhc_offset") + #plotting_cols = c(plotting_cols, "X5uhc_position","X5uhc_offset") + plotting_cols = c(common_cols, + ppi2Dist_colname, + "mcsm_ppi2_affinity", "mcsm_ppi2_scaled", "mcsm_ppi2_outcome", + naDist_colname, + "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome", + "X5uhc_position","X5uhc_offset") + + + raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_ppi2_affinity", "mcsm_na_affinity") + scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_ppi2_scaled" , "mcsm_na_scaled") + outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_ppi2_outcome", "mcsm_na_outcome") + outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_na_outcome") + affinity_dist_colnames = c(LigDist_colname, ppi2Dist_colname, naDist_colname) + } + #======================================= # All: affinity cols: based on above confition #======================================== diff --git a/scripts/plotting/plotting_thesis/basic_barplots.R b/scripts/plotting/plotting_thesis/basic_barplots.R index 7eb2030..61b3540 100644 --- a/scripts/plotting/plotting_thesis/basic_barplots.R +++ b/scripts/plotting/plotting_thesis/basic_barplots.R @@ -57,7 +57,7 @@ merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")] head(merged_df3$pos_count) df3 = merged_df3[, colnames(merged_df3)%in%plotting_cols] -#"nca_distance"%in%colnames(df3) +"nca_distance"%in%colnames(df3) #======= # output