From 8d6c148fff809824d8501339fe163fbe0d541ef0 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 22 Aug 2022 10:53:25 +0100 Subject: [PATCH] renamed 2 to _v2 --- scripts/functions/combining_dfs_plotting.R | 31 + scripts/functions/corr_plot_data.R | 4 + scripts/functions/plotting_data.R | 33 + scripts/plotting/get_plotting_dfs.R | 2 + scripts/plotting/plotting_colnames.R | 4 +- .../plotting/plotting_thesis/basic_barplots.R | 4 +- .../plotting_thesis/basic_barplots_v2.R | 584 ------------------ 7 files changed, 74 insertions(+), 588 deletions(-) delete mode 100644 scripts/plotting/plotting_thesis/basic_barplots_v2.R diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 7285f11..3441c3a 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -33,9 +33,15 @@ #========================================================== #lig_dist_colname = 'ligand_distance' or global var LigDist_colname #lig_dist_cutoff = 10 or global var LigDist_cutoff +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + + combining_dfs_plotting <- function( my_df_u , gene_metadata + , gene # ADDED , lig_dist_colname = '' , lig_dist_cutoff = ''){ @@ -679,6 +685,31 @@ combining_dfs_plotting <- function( my_df_u min( merged_df3['avg_lig_affinity_scaled']); max( merged_df3['avg_lig_affinity_scaled']) + ################################################################### + # Rectify pos_count column in merged_df3 + # The one in merged_df2 is correct + + nc_pc_CHANGE = which(colnames(merged_df3)== "pos_count"); nc_pc_CHANGE + colnames(merged_df3)[nc_pc_CHANGE] = "df2_pos_count_all" + head(merged_df3$pos_count) + head(merged_df3$df2_pos_count_all) + + # DROP pos_count column + # merged_df3$pos_count <-NULL + merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")] + head(merged_df3$pos_count) + + merged_df3 = merged_df3 %>% + dplyr::add_count(position) + class(merged_df3) + merged_df3 = as.data.frame(merged_df3) + class(merged_df3) + nc_change = which(colnames(merged_df3) == "n") + colnames(merged_df3)[nc_change] <- "pos_count" + class(merged_df3) + #################################################################### + # ADD: distance to Nucleic acid column for na genes + #################################################################### #TODO diff --git a/scripts/functions/corr_plot_data.R b/scripts/functions/corr_plot_data.R index cd242cc..ab479c2 100644 --- a/scripts/functions/corr_plot_data.R +++ b/scripts/functions/corr_plot_data.R @@ -7,6 +7,10 @@ # LigDist_colname #from globals: plotting_globals.R # ppi2Dist_colname #from globals: plotting_globals.R # naDist_colname #from globals: plotting_globals.R +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + corr_data_extract <- function(df , gene , drug diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index 67a3f2c..47c707d 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -5,6 +5,17 @@ # load libraries and functions library(data.table) library(dplyr) + +# ADDED: New +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + +if (tolower(gene)%in%geneL_na){ + + infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/" + , tolower(gene), "_nca_distances.csv") +} #======================================================== # plotting_data(): formatting data for plots # input args: @@ -20,6 +31,7 @@ library(dplyr) #lig_dist_cutoff = 10 or global var LigDist_cutoff plotting_data <- function(df + , gene # ADDED , lig_dist_colname , lig_dist_cutoff) { my_df = data.frame() @@ -57,7 +69,28 @@ if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){ upos = unique(my_df_u$position) cat("\nDim of clean df:"); cat(dim(my_df_u), "\n") cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n") +#=============================================== +# ADD : na distance column for genes with nucleic acid affinity +#=============================================== +#gid_na_distcol +if (tolower(gene)%in%geneL_na){ + + distcol_nca_name = read.csv(infilename_nca, header = F) + head(distcol_nca_name) + colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance") + head(distcol_nca_name) + class(distcol_nca_name) + + mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)] + mcol + head(my_df_u$mutationinformation) + head(distcol_nca_name$mutationinformation) + my_df_u = merge(my_df_u, distcol_nca_name, + by = "mutationinformation", + all = T) + +} #=============================================== # extract mutations <10 Angstroms and symbol #=============================================== diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index ec44d90..f06f5d7 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -53,6 +53,7 @@ if (!exists("infile_params") && exists("gene")){ cat("\nReading mcsm combined data file: ", infile_params) mcsm_df = read.csv(infile_params, header = T) pd_df = plotting_data(mcsm_df + , gene = gene # ADDED , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) @@ -87,6 +88,7 @@ cat("\nDim of meta data file: ", dim(gene_metadata)) all_plot_dfs = combining_dfs_plotting(my_df_u , gene_metadata + , gene = gene # ADDED , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) diff --git a/scripts/plotting/plotting_colnames.R b/scripts/plotting/plotting_colnames.R index d417de1..282bc35 100644 --- a/scripts/plotting/plotting_colnames.R +++ b/scripts/plotting/plotting_colnames.R @@ -92,8 +92,8 @@ if (tolower(gene)%in%geneL_na){ naDist_colname, "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome") - raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_na_affinity") - scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_na_scaled") + raw_affinity_cols = c(common_raw_affinity_cols , "mcsm_na_affinity") + scaled_affinity_cols = c(common_scaled_affinity_cols , "mcsm_na_scaled") outcome_affinity_cols = c(common_outcome_affinity_cols , "mcsm_na_outcome") affinity_dist_colnames = c(LigDist_colname, ppi2Dist_colname, naDist_colname) diff --git a/scripts/plotting/plotting_thesis/basic_barplots.R b/scripts/plotting/plotting_thesis/basic_barplots.R index 4d4a520..7eb2030 100644 --- a/scripts/plotting/plotting_thesis/basic_barplots.R +++ b/scripts/plotting/plotting_thesis/basic_barplots.R @@ -30,8 +30,8 @@ #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/alr.R") -source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/rpob.R") +#source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/rpob.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") #source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") sourced by above diff --git a/scripts/plotting/plotting_thesis/basic_barplots_v2.R b/scripts/plotting/plotting_thesis/basic_barplots_v2.R deleted file mode 100644 index b6fcfea..0000000 --- a/scripts/plotting/plotting_thesis/basic_barplots_v2.R +++ /dev/null @@ -1,584 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Barplots for mCSM DUET, ligand affinity, and foldX -# basic barplots with count of mutations -# basic barplots with frequency of count of mutations - -# , df_colname = "" -# , leg_title = "" -# , ats = 25 # axis text size -# , als = 22 # axis label size -# , lts = 20 # legend text size -# , ltis = 22 # label title size -# , geom_ls = 10 # geom_label size -# , yaxis_title = "Number of nsSNPs" -# , bp_plot_title = "" -# , label_categories = c("Destabilising", "Stabilising") -# , title_colour = "chocolate4" -# , subtitle_text = NULL -# , sts = 20 -# , subtitle_colour = "pink" -# #, leg_position = c(0.73,0.8) # within plot area -# , leg_position = "top" -# , bar_fill_values = c("#F8766D", "#00BFC4") -######################################################### -#============= -# Data: Input -#============== -#source("~/git/LSHTM_analysis/config/pnca.R") -#source("~/git/LSHTM_analysis/config/embb.R") -#source("~/git/LSHTM_analysis/config/gid.R") - -source("~/git/LSHTM_analysis/config/alr.R") -#source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/rpob.R") - -source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") -#source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") sourced by above -# sanity check - -cat("\nSourced plotting cols as well:", length(plotting_cols)) - -#################################################### -class(merged_df3) -merged_df3 = as.data.frame(merged_df3) - -class(merged_df3) -head(merged_df3$pos_count) - -nc_pc_CHANGE = which(colnames(merged_df3)== "pos_count"); nc_pc_CHANGE -colnames(merged_df3)[nc_pc_CHANGE] = "df2_pos_count_all" -head(merged_df3$pos_count) -head(merged_df3$df2_pos_count_all) - -# DROP pos_count column -# merged_df3$pos_count <-NULL -merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")] -head(merged_df3$pos_count) - -df3 = merged_df3[, colnames(merged_df3)%in%plotting_cols] -"nca_distance"%in%colnames(df3) - -#======= -# output -#======= -outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") -cat("plots will output to:", outdir_images) - -########################################################### -#------------------------------ -# plot default sizes -#------------------------------ -#========================= -# Affinity outcome -# check this var: outcome_cols_affinity -# get from preformatting or put in globals -#========================== -DistCutOff -LigDist_colname # = "ligand_distance" # from globals -ppi2Dist_colname -naDist_colname - -########################################################### -# get plotting data within the distance -df3_lig = df3[df3[[LigDist_colname]]