LSHTM_analysis/scripts/plotting/get_plotting_dfs.R

#!/usr/bin/env Rscript

#########################################################
# TASK: Get formatted data for plots
#########################################################
# working dir and loading libraries
# Report the working directory in effect before it is changed (auto-printed
# when this statement is evaluated at top level).
getwd()
# NOTE(review): hard-coded, user-specific path. The relative source() calls
# in this script resolve against this directory, and no statement visible in
# this script restores the previous working directory afterwards.
setwd("~/git/LSHTM_analysis/scripts/plotting")
# Report the working directory again so the change can be confirmed.
getwd()

# Load the shared header script. Presumably this attaches the libraries and
# defines the helpers called later (import_dirs(), plotting_data(),
# combining_dfs_plotting()) -- TODO confirm against Header_TT.R.
source("Header_TT.R")

#********************
# cmd args passed
# in from other scripts
# to call this
#********************

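#====================
# variables for ligand distance
# (not set here: LigDist_colname and LigDist_cutoff are used below and must
#  already be defined before this point; example values are shown)
#====================

#LigDist_colname = "ligand_distance"
#LigDist_cutoff = 10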

#===========
# input
#===========
#---------------------
# call: import_dirs()
#---------------------
# Set up the project directories for this drug/gene pair. `drug` and `gene`
# must already be defined by the calling script. `outdir`, used below, is
# presumably created by this call -- TODO confirm in import_dirs().
import_dirs(drug, gene)

#---------------------------
# call: plotting_data()
#---------------------------
# When the caller has not supplied `infile_params`, fall back to the
# conventional file name inside `outdir`.
if (!exists("infile_params") && exists("gene")) {
  #if (!is.character(infile_params) && exists("gene")){ # when running as cmd
  # "<gene>_all_params.csv": for pncA (and for gid finally) 10/09/21.
  # Earlier alternative: "<gene>_comb_afor.csv" (part combined for gid).
  in_filename_params <- paste0(tolower(gene), "_all_params.csv")
  infile_params <- paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}

# Input 1: read the combined mCSM parameters file (`infile_params`)
cat("\nReading mcsm combined data file: ", infile_params)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
mcsm_df <- read.csv(infile_params, header = TRUE)

# `LigDist_colname` and `LigDist_cutoff` are not assigned in this script;
# they must already exist when this point is reached.
pd_df <- plotting_data(mcsm_df,
                       lig_dist_colname = LigDist_colname,
                       lig_dist_cutoff = LigDist_cutoff)

my_df <- pd_df[[1]]
my_df_u <- pd_df[[2]] # this forms one of the inputs for combining_dfs_plotting()

# Extract the distance column as a vector with [[ ]] rather than taking
# max()/min() of a one-column data frame.
max_ang <- round(max(my_df_u[[LigDist_colname]]))
min_ang <- round(min(my_df_u[[LigDist_colname]]))

# "\u212b" is the angstrom sign.
cat("\nLigand distance cut off, colname:", LigDist_colname,
    "\nThe max distance", gene, "structure df", ":", max_ang, "\u212b",
    "\nThe min distance", gene, "structure df", ":", min_ang, "\u212b")

#--------------------------------
# call: combining_dfs_plotting()
#--------------------------------
# When the caller has not supplied `infile_metadata`, fall back to the
# conventional "<gene>_metadata.csv" file inside `outdir`.
if (!exists("infile_metadata") && exists("gene")) {
  #if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
  in_filename_metadata <- paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata <- paste0(outdir, "/", in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
}

# Input 2: read the gene metadata file (`infile_metadata`)
cat("\nReading meta data file: ", infile_metadata)

# Spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
# reassigned, which would silently change how the file is read.
gene_metadata <- read.csv(infile_metadata,
                          stringsAsFactors = FALSE,
                          header = TRUE)

# Combine the second data frame returned by plotting_data() (`my_df_u`) with
# the metadata. `LigDist_colname` and `LigDist_cutoff` are not assigned in
# this script and must already exist at this point.
all_plot_dfs <- combining_dfs_plotting(my_df_u,
                                       gene_metadata,
                                       lig_dist_colname = LigDist_colname,
                                       lig_dist_cutoff = LigDist_cutoff)

# Unpack the four returned data frames by position.
merged_df2      <- all_plot_dfs[[1]]
merged_df3      <- all_plot_dfs[[2]]
merged_df2_comp <- all_plot_dfs[[3]]
merged_df3_comp <- all_plot_dfs[[4]]
#======================================================================
#TODO: Think! MOVE TO COMBINE or singular file for deepddg

#============================
# adding deepddg scaled values
# scale data b/w -1 and 1
#============================
# n = which(colnames(merged_df3) == "deepddg"); n
#
# my_min = min(merged_df3[,n]); my_min
# my_max = max(merged_df3[,n]); my_max
#
# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
#                                    , merged_df3[,n]/abs(my_min)
#                                    , merged_df3[,n]/my_max)
# # sanity check
# my_min = min(merged_df3$deepddg_scaled); my_min
# my_max = max(merged_df3$deepddg_scaled); my_max
#
# if (my_min == -1 && my_max == 1){
#    cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
#        #, "\nProceeding with assigning deep outcome category")
#        , "\n")
# }else{
#    cat("\nFAIL: could not scale DeepDDG ddg values"
#        , "Aborting!")
# }
#

####################################################################
#                        Data for combining other dfs
####################################################################

#source("other_dfs_data.R")
# Fixed this at source i.e python script
# Moved: "other_dfs_data.R" to redundant/

####################################################################
#                        Data for subcols barplot (~heatmap)
####################################################################

#source("coloured_bp_data.R")
# Repurposed function so that params can be passed instead to generate
# data required for plotting.
# Moved "coloured_bp_data.R" to redundant/

####################################################################
#                        Data for logoplots
####################################################################

# Each section below sources one data-preparation script (resolved relative
# to the current working directory) and then records and prints a status
# marker (s1..s4). A marker is only assigned if its source() call returned
# without signalling an error.
source("logo_data.R")

s1 <- "\nSuccessfully sourced logo_data.R"
cat(s1)

####################################################################
#                        Data for DM OM Plots: Long format dfs
####################################################################

#source("other_plots_data.R")

source("dm_om_data.R")

# The message names the script that is actually sourced (dm_om_data.R); it
# previously reported the superseded other_plots_data.R.
s2 <- "\nSuccessfully sourced dm_om_data.R"
cat(s2)

####################################################################
#                  Data for Lineage barplots: WF and LF dfs
####################################################################

source("lineage_data.R")

s3 <- "\nSuccessfully sourced lineage_data.R"
cat(s3)

####################################################################
#                  Data for corr plots:
####################################################################
# make sure the above script works because merged_df2_combined is needed
source("corr_data.R")

s4 <- "\nSuccessfully sourced corr_data.R"
cat(s4)

########################################################################
#                           End of script
########################################################################
# Final status banner: succeed only when all four status markers (s1..s4,
# assigned after the corresponding source() calls) are non-empty.
# Each length is compared with 0 *before* reducing with all(). The previous
# form, all(length(s1), ...) > 0, passed numeric lengths to all(), which
# coerces them to logical with a warning, and then compared the single
# logical result with 0.
if (all(c(length(s1), length(s2), length(s3), length(s4)) > 0)) {
  cat(
    "\n##################################################",
    "\nSuccessful: get_plotting_dfs.R worked!",
    "\n###################################################\n")
} else {
  cat(
    "\n#################################################",
    "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check",
    "\n###################################################\n")
}

########################################################################
# clear excess variables
# Remove intermediate objects that are no longer needed once the plotting
# data frames have been built. `infile_params` and `infile_metadata` are
# removed too, so the !exists() fallbacks earlier in this script assign the
# default file names again on the next run in the same session.
#
# Only names that currently exist are passed to rm(): rm() warns for every
# object it cannot find (an error under options(warn = 2)), and which of
# these objects exist depends on the scripts sourced above.
rm(list = intersect(
  c(
    # counters, checks and column selections
    "c1", "c2", "c3", "c4", "check1",
    "curr_count", "curr_total",
    "cols_check",
    "cols_to_select",
    "cols_to_select_deepddg",
    "cols_to_select_duet",
    "cols_to_select_dynamut",
    "cols_to_select_dynamut2",
    "cols_to_select_encomddg",
    "cols_to_select_encomdds",
    "cols_to_select_mcsm",
    "cols_to_select_mcsm_na",
    "cols_to_select_sdm",
    "infile_metadata",
    "infile_params",
    # not removed: infilename_dynamut, infilename_dynamut2,
    #              infilename_mcsm_f_snps, infilename_mcsm_na

    # pivot column selections
    "pivot_cols",
    "pivot_cols_deepddg",
    "pivot_cols_duet",
    "pivot_cols_dynamut",
    "pivot_cols_dynamut2",
    "pivot_cols_encomddg",
    "pivot_cols_encomdds",
    "pivot_cols_foldx",
    "pivot_cols_mcsm",
    "pivot_cols_mcsm_na",
    "pivot_cols_n",
    "pivot_cols_sdm",

    # expected dimensions and factor columns
    "expected_cols",
    "expected_ncols",
    "expected_rows",
    "expected_rows_lf",
    "fact_cols"
  ),
  ls()
))