separated get_plotting_dfs_with_lig.R

2021-09-02 12:51:31 +01:00 · 2021-09-02 12:51:31 +01:00 · 55233ea474
commit 55233ea474
parent 2c65bb25d8
1 changed files with 589 additions and 0 deletions
--- a/scripts/plotting/get_plotting_dfs_with_lig.R
+++ b/scripts/plotting/get_plotting_dfs_with_lig.R
@ -0,0 +1,589 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Get formatted data for plots
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+source("Header_TT.R")
+source("../functions/my_pairs_panel.R") # with lower panel turned off
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")
+source("../functions/combining_dfs_plotting.R")
+source("../functions/bp_subcolours.R")
+
+#********************
+# cmd args passed 
+# in from other scripts
+# to call this
+#********************
+#drug = 'streptomycin'
+#gene = 'gid'
+#====================
+# variables for lig
+#====================
+
+LigDist_colname = "ligand_distance"
+LigDist_cutoff = 10
+
+#===========
+# input
+#===========
+#---------------------
+# call: import_dirs()
+#---------------------
+import_dirs(drug, gene)
+
+#---------------------------
+# call: plotting_data()
+#---------------------------
+if (!exists("infile_params") && exists("gene")){
+#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile_params = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
+}
+
+# Input 1: read <gene>_comb_afor.csv
+cat("\nReading mcsm combined data file: ", infile_params)
+mcsm_df = read.csv(infile_params, header = T)
+pd_df = plotting_data(mcsm_df
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
+my_df       = pd_df[[1]] 
+my_df_u     = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+my_df_u_lig = pd_df[[3]] 
+dup_muts    = pd_df[[4]] 
+
+#--------------------------------
+# call: combining_dfs_plotting()
+#--------------------------------
+if (!exists("infile_metadata") && exists("gene")){
+#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
+  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
+  infile_metadata = paste0(outdir, "/", in_filename_metadata)
+  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
+}
+
+# Input 2: read <gene>_meta data.csv
+cat("\nReading meta data file: ", infile_metadata)
+
+gene_metadata <- read.csv(infile_metadata
+                          , stringsAsFactors = F
+                          , header = T)
+
+all_plot_dfs = combining_dfs_plotting(my_df_u
+                                      , gene_metadata
+                                      , lig_dist_colname = LigDist_colname
+                                      , lig_dist_cutoff = LigDist_cutoff)
+
+merged_df2          = all_plot_dfs[[1]]
+merged_df3          = all_plot_dfs[[2]]
+merged_df2_comp     = all_plot_dfs[[3]]
+merged_df3_comp     = all_plot_dfs[[4]]
+merged_df2_lig      = all_plot_dfs[[5]]
+merged_df3_lig      = all_plot_dfs[[6]]
+merged_df2_comp_lig = all_plot_dfs[[7]]
+merged_df3_comp_lig = all_plot_dfs[[8]]
+
+####################################################################
+#                        Data for subcols barplot (~heatmap)
+####################################################################
+# can include: mutation, or_kin, pwald, af_kin
+cols_to_select = c("mutationinformation", "drtype"
+                   , "wild_type"
+                   , "position"
+                   , "mutant_type"
+                   , "chain", "ligand_id", "ligand_distance"
+                   , "duet_stability_change", "duet_outcome", "duet_scaled"
+                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
+                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
+                   , "asa", "rsa", "rd_values", "kd_values"
+                   , "af", "or_mychisq", "pval_fisher" 
+                   , "or_fisher", "or_logistic", "pval_logistic"
+                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
+                   , "wt_calcprop", "mut_calcprop")
+
+#=======================
+# Data for sub colours
+# barplot: PS
+#=======================
+
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_ps = merged_df3[, cols_to_select]
+
+cat("\nNo of unique positions for ps:"
+    , length(unique(subcols_df_ps$position)))
+
+# add count_pos col that counts the no. of nsSNPS at a position
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+# should be a factor
+if (is.factor(subcols_df_ps$duet_outcome)){
+  cat("\nDuet_outcome is factor")
+  table(subcols_df_ps$duet_outcome)
+}else{
+  cat("\nConverting duet_outcome to factor")
+  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
+  table(subcols_df_ps$duet_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_ps$duet_scaled)
+max(subcols_df_ps$duet_scaled)
+
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in duet scaled, no rounding:"
+    , length(unique(subcols_df_ps$duet_scaled)))
+
+# No rounding    
+my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+
+# Add rounding is to be used
+n = 3 
+subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_ps$duet_scaledR)))
+
+my_grp_r = subcols_df_ps$duet_scaledR  # rounding
+
+# Add grp cols
+subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
+subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
+subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
+
+#=======================
+# Data for sub colours
+# barplot: LIG
+#=======================
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_lig = merged_df3_lig[, cols_to_select]
+
+cat("\nNo of unique positions for LIG:"
+    , length(unique(subcols_df_lig$position)))
+
+# should be a factor
+if (is.factor(subcols_df_lig$ligand_outcome)){
+  cat("\nLigand_outcome is factor")
+  table(subcols_df_lig$ligand_outcome)
+}else{
+  cat("\nConverting ligand_outcome to factor")
+  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
+  table(subcols_df_lig$ligand_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_lig$affinity_scaled)
+max(subcols_df_lig$affinity_scaled)
+
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in affinity scaled, no rounding:"
+    , length(unique(subcols_df_lig$affinity_scaled)))
+
+# No rounding    
+my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
+
+# Add rounding is to be used
+n = 3 
+subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_lig$affinity_scaledR)))
+
+my_grp_lig_r = subcols_df_lig$affinity_scaledR  # rounding
+
+# Add grp cols
+subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
+subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
+subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
+
+####################################################################
+#                        Data for logoplots
+####################################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position) 
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#==================
+# logo data: OR
+#==================
+foo = logo_data[, c("position"
+                      , "mutant_type","duet_scaled", "or_mychisq"
+                      , "mut_prop_polarity", "mut_prop_water")] 
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                            , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
+
+wide_df_or = as.matrix(wide_df_or)
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1]
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor)
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+
+#===============================
+# logo data: multiple nsSNPs (>1)
+#=================================
+#require(data.table)
+
+# get freq count of positions so you can subset freq<1
+setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
+
+table(logo_data$position)
+table(logo_data$mut_pos_occurrence)
+
+max_mut = max(table(logo_data$position))
+
+# extract freq_pos > 1
+my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
+
+#--------------------------------------
+# matrix for_mychisq mutant type
+# frequency of mutant type by position
+#---------------------------------------
+table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
+
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+# should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
+
+#-------------------------------------
+# matrix for wild type
+# frequency of wild type by position
+#-------------------------------------
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
+tab_wt = unclass(tab_wt)
+
+# remove wt duplicates
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
+
+tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
+
+rownames(tab_wt)
+rownames(tab_wt)
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
+
+#----------------------------------
+# logo data OR: multiple nsSNPs (>1)
+#----------------------------------
+logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
+#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
+wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
+
+wide_df_or_mult = as.matrix(wide_df_or_mult)
+rownames(wide_df_or_mult) = wide_df_or_mult[,1]
+wide_df_or_mult = wide_df_or_mult[,-1]
+str(wide_df_or_mult)
+
+position_or_mult = as.numeric(colnames(wide_df_or_mult))
+
+####################################################################
+#                        Data for Corrplots
+####################################################################
+cat("\n=========================================="
+    , "\nCORR PLOTS data: PS"
+    , "\n===========================================")
+
+df_ps = merged_df2
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+##df_ps$log10_or_kin = log10(df_ps$or_kin)
+##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "duet_scaled"
+                    , "foldx_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[cols_to_select]
+
+dim(corr_data_ps)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "DUET"
+                     , "FoldX"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "AF_kin"
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+#===========================
+# Corr data for plots: PS
+# big_df ps: ~ merged_df2
+#===========================
+
+#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
+corr_ps_df2 = corr_data_ps[start:end]
+head(corr_ps_df2)
+
+#===========================
+# Corr data for plots: PS
+# short_df ps: ~merged_df3
+#===========================
+corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
+
+na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
+check1 = nrow(corr_ps_df3) - na_or
+
+##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
+##check2 = nrow(corr_ps_df3) - na_adj_or 
+
+if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
+  cat( "\nPASS: No. of rows for corr_ps_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3)
+      , "\nGot: ", nrow(corr_ps_df3)
+      , "\nExpected OR values: ", nrow(merged_df3_comp)
+      , "\nGot: ", check1)
+}
+
+#=================================
+# Data for Correlation plots: LIG
+#=================================
+cat("\n=========================================="
+    , "\nCORR PLOTS data: LIG"
+    , "\n===========================================")
+
+df_lig = merged_df2_lig
+
+table(df_lig$ligand_outcome)
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+##df_lig$log10_or_kin = log10(df_lig$or_kin)
+##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "affinity_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+dim(corr_data_lig)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "Ligand Affinity"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "MAF_kin"
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+#=============================
+# Corr data for plots: LIG
+# big_df lig: ~ merged_df2_lig
+#==============================
+#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
+corr_lig_df2 = corr_data_lig[start:end]
+head(corr_lig_df2)
+
+#=============================
+# Corr data for plots: LIG
+# short_df lig: ~ merged_df3_lig
+#==============================
+corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
+
+na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
+check1_lig = nrow(corr_lig_df3) - na_or_lig
+
+if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
+  cat( "\nPASS: No. of rows for corr_lig_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1_lig)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3_lig)
+      , "\nGot: ", nrow(corr_ps_df3_lig)
+      , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
+      , "\nGot: ", check1_lig)
+}
+
+# remove unnecessary columns
+identical(corr_data_lig, corr_lig_df2)
+identical(corr_data_ps, corr_ps_df2)
+
+#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
+
+########################################################################
+#                           End of script
+########################################################################
+rm(foo)
+
+cat("\n===================================================\n"
+      , "\nSuccessful: get_plotting_dfs.R worked!"
+      , "\n====================================================")