LSHTM_analysis/scripts/plotting/get_plotting_dfs.R

#!/usr/bin/env Rscript

#########################################################
# TASK: Get formatted data for plots
#########################################################
# working dir and loading libraries
getwd()
source("~/git/LSHTM_analysis/scripts/Header_TT.R")
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
# `drug` and `gene` are not set in this script: they are expected to be
# set already (e.g. from cmd args handled by the calling script) before
# this script is run.

#==========================================
# variables for affinity:
# comes from functions/plotting_globals.R
#==========================================

# Log the distance column names and cut-offs used for the affinity data.
# None of these variables is defined in this script; they must already be in
# scope when it runs (the section header above points to
# functions/plotting_globals.R).
cat("\nGlobal variables for Ligand:"
    , "\nligand distance colname:", LigDist_colname
    , "\nligand distance cut off:", LigDist_cutoff)

cat("\nGlobal variables for mCSM-PPI2 affinity:"
    , "\nPPI2 distance colname:", ppi2Dist_colname
    , "\nPPI2 cut off:", DistCutOff)

# These labels previously read "ligand distance ..." (copied from the ligand
# block above) although the values printed are naDist_colname and DistCutOff.
cat("\nGlobal variables for mCSM-NA affinity:"
    , "\nNA distance colname:", naDist_colname
    , "\nNA distance cut off:", DistCutOff)


#===========
# input
#===========
#--------------------------------------------
# call: import_dirs()
# comes from functions/plotting_globals.R
#--------------------------------------------
# NOTE(review): `outdir`, used below, is not defined in this script; it is
# presumably created by import_dirs() -- confirm.
import_dirs(drug, gene)

#---------------------------
# Input 1: mcsm combined data (later passed to plotting_data())
#---------------------------
# When the caller has not set `infile_params`, fall back to
# <outdir>/<gene>_all_params.csv (gene name lower-cased).
if (!exists("infile_params") && exists("gene")) {
  #if (!is.character(infile_params) && exists("gene")){ # when running as cmd
  in_filename_params <- paste0(tolower(gene), "_all_params.csv")
  infile_params <- file.path(outdir, in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}

cat("\nReading mcsm combined data file: ", infile_params)
mcsm_df <- read.csv(infile_params, header = TRUE)

# rpob only: drop the rows at position 1148.
# `%in%` never returns NA, so rows whose position is missing are kept as they
# are; with `!=` an NA position would yield an NA index and an all-NA row.
if (tolower(gene) %in% c("rpob")) {
  mcsm_df <- mcsm_df[!(mcsm_df$position %in% 1148), ]
}

# plotting_data() is defined outside this script. The code below only relies
# on its result being indexable by position with (at least) two elements.
pd_df <- plotting_data(mcsm_df
                       , gene = gene # ADDED
                       , lig_dist_colname = LigDist_colname
                       , lig_dist_cutoff = LigDist_cutoff)

my_df   <- pd_df[[1]]
my_df_u <- pd_df[[2]] # this forms one of the input for combining_dfs_plotting()

# Rounded range of the ligand-distance column of my_df_u.
# na.rm = TRUE: a single missing distance would otherwise turn both values
# (and the log line below) into NA.
max_ang <- round(max(my_df_u[[LigDist_colname]], na.rm = TRUE))
min_ang <- round(min(my_df_u[[LigDist_colname]], na.rm = TRUE))

cat("\nLigand distance colname:", LigDist_colname
    , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
    , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")

#--------------------------------
# call: combining_dfs_plotting()
#--------------------------------
# When the caller has not set `infile_metadata`, fall back to
# <outdir>/<gene>_metadata.csv (gene name lower-cased).
# NOTE(review): `outdir` is not defined in this script; it must already be in
# scope at this point.
if (!exists("infile_metadata") && exists("gene")) {
  #if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
  in_filename_metadata <- paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata <- file.path(outdir, in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
}

# Input 2: read <gene>_meta data.csv
cat("\nReading meta data file: ", infile_metadata)

# TRUE/FALSE spelled out: T and F are ordinary variables and can be reassigned.
gene_metadata <- read.csv(infile_metadata
                          , stringsAsFactors = FALSE
                          , header = TRUE)

cat("\nDim of meta data file: ", dim(gene_metadata))

# combining_dfs_plotting() is defined outside this script; only the first two
# elements of its result are used here.
all_plot_dfs <- combining_dfs_plotting(my_df_u
                                       , gene_metadata
                                       #, gene = gene # ADDED
                                       , lig_dist_colname = LigDist_colname
                                       , lig_dist_cutoff = LigDist_cutoff)

merged_df2      <- all_plot_dfs[[1]]
merged_df3      <- all_plot_dfs[[2]]
#merged_df2_comp = all_plot_dfs[[3]]
#merged_df3_comp = all_plot_dfs[[4]]

#======================================================================

####################################################################
#                        Data for subcols barplot (~heatmap)
####################################################################

#source("coloured_bp_data.R")
# Repurposed function so that params can be passed instead to generate
# data required for plotting.
# Moved "coloured_bp_data.R" to redundant/

####################################################################
#                        Data for logoplots
####################################################################
#
# source(paste0(plot_script_path, "logo_data_msa.R"))
# s1 = c("\nSuccessfully sourced logo_data_msa.R")
# cat(s1)
#
# ####################################################################
# #                     Data for DM OM Plots: WF and LF dfs
# #                   My function: dm_om_wf_lf_data()
# #                 location: scripts/functions/dm_om_data.R
# #source("other_plots_data.R")
# ####################################################################
#
# #source(paste0(plot_script_path, "dm_om_data.R")) # calling the function directly instead
# Gene groups (lower-case names) that decide which affinity data frames are
# pulled out further down. "rpob" appears in both the NA and the PPI2 group,
# so both conditional blocks below run for it.
geneL_normal <- c("pnca")
geneL_na     <- c("gid", "rpob")
geneL_ppi2   <- c("alr", "embb", "katg", "rpob")

# Earlier grouping, kept for reference:
# geneL_normal  = c("pnca")
# geneL_both    = c("rpob")
# geneL_ppi2    = c("alr", "embb", "katg")
# geneL_na      = c("gid")

# dm_om_wf_lf_data() is defined outside this script; its result is indexed by
# name below, one wide-format (wf_*) and one long-format (lf_*) entry per
# measure.
all_dm_om_df <- dm_om_wf_lf_data(df = merged_df3, gene = gene)

# Entries extracted for every gene
wf_duet     <- all_dm_om_df[["wf_duet"]]
lf_duet     <- all_dm_om_df[["lf_duet"]]

wf_mcsm_lig <- all_dm_om_df[["wf_mcsm_lig"]]
lf_mcsm_lig <- all_dm_om_df[["lf_mcsm_lig"]]

wf_foldx    <- all_dm_om_df[["wf_foldx"]]
lf_foldx    <- all_dm_om_df[["lf_foldx"]]

wf_deepddg  <- all_dm_om_df[["wf_deepddg"]]
lf_deepddg  <- all_dm_om_df[["lf_deepddg"]]

wf_dynamut2 <- all_dm_om_df[["wf_dynamut2"]]
lf_dynamut2 <- all_dm_om_df[["lf_dynamut2"]]

wf_consurf  <- all_dm_om_df[["wf_consurf"]]
lf_consurf  <- all_dm_om_df[["lf_consurf"]]

wf_snap2    <- all_dm_om_df[["wf_snap2"]]
lf_snap2    <- all_dm_om_df[["lf_snap2"]]

wf_provean  <- all_dm_om_df[["wf_provean"]]
lf_provean  <- all_dm_om_df[["lf_provean"]]

# NEW
wf_dist_gen <- all_dm_om_df[["wf_dist_gen"]]
lf_dist_gen <- all_dm_om_df[["lf_dist_gen"]]

# Extra entries, only for genes in the PPI2 group
if (tolower(gene) %in% geneL_ppi2) {
  wf_mcsm_ppi2 <- all_dm_om_df[["wf_mcsm_ppi2"]]
  lf_mcsm_ppi2 <- all_dm_om_df[["lf_mcsm_ppi2"]]
}

# Extra entries, only for genes in the NA group
if (tolower(gene) %in% geneL_na) {
  wf_mcsm_na <- all_dm_om_df[["wf_mcsm_na"]]
  lf_mcsm_na <- all_dm_om_df[["lf_mcsm_na"]]
}

# No separate "both ppi2 + na" branch is needed: a gene listed in both groups
# is handled by the two independent ifs above.

# The status text still names other_plots_data.R, although the data above is
# now produced by calling dm_om_wf_lf_data() directly.
s2 <- "\nSuccessfully sourced other_plots_data.R"
cat(s2)
#
# ####################################################################
# #                  Data for Lineage barplots: WF and LF dfs
# #               My function: lineage_plot_data()
# #           location: scripts/functions/lineage_plot_data.R
# ####################################################################
#
# NOTE(review): `plot_script_path` is not defined in this script, and the
# comment right below says lineage_data.R was converted to a function and
# moved to redundant/ -- confirm that this source() call is still wanted and
# that the path still resolves.
source(paste0(plot_script_path, "lineage_data.R"))
# # converted to a function. Moved lineage_data.R to redundant/

# lineage_plot_data() is defined outside this script; its result is indexed by
# name below to get the wide (lin_wf) and long (lin_lf) lineage data frames.
# FALSE spelled out: F is an ordinary variable and can be reassigned.
lineage_dfL <- lineage_plot_data(merged_df2
                                 , lineage_column_name = "lineage"
                                 , remove_empty_lineage = FALSE
                                 , lineage_label_col_name = "lineage_labels"
                                 , id_colname = "id"
                                 , snp_colname = "mutationinformation"
)

lin_wf <- lineage_dfL[['lin_wf']]
lin_lf <- lineage_dfL[['lin_lf']]

s3 <- c("\nSuccessfully sourced lineage_data.R")
cat(s3)

####################################################################
#                  Data for corr plots:
#               My function: corr_data_extract()
#          location: scripts/functions/corr_plot_data.R
####################################################################
# make sure the above script works because merged_df2_combined is needed
# Coerce merged_df3 to a plain data.frame before handing it to
# corr_data_extract() (defined outside this script).
merged_df3 <- as.data.frame(merged_df3)

# FALSE spelled out: F is an ordinary variable and can be reassigned.
corr_df_m3_f <- corr_data_extract(merged_df3
                                  , gene = gene
                                  , drug = drug
                                  , extract_scaled_cols = FALSE)
# Preview of the extracted data; only auto-printed when run at top level.
head(corr_df_m3_f)

# corr_df_m2_f = corr_data_extract(merged_df2
#                                  , gene = gene
#                                  , drug = drug
#                                  , extract_scaled_cols = F)
# head(corr_df_m2_f)

s4 <- c("\nSuccessfully sourced Corr_data.R")
cat(s4)

########################################################################
#                           End of script
########################################################################
# if (  all( length(s1), length(s2), length(s3), length(s4) ) > 0 ){
#   cat(
#     "\n##################################################"
#     , "\nSuccessful: get_plotting_dfs.R worked!"
#     , "\n###################################################\n")
# } else {
#   cat(
#     "\n#################################################"
#     , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check"
#     , "\n###################################################\n" )
# }
#
########################################################################
# clear excess variables: from the global environment

# Collect the NAMES of temporary globals to drop.
# The patterns are regular expressions, not globs: the previous "curr_*"
# matched any name merely containing "curr" (zero or more underscores), and
# "^cols_to*" any name starting with "cols_t". They are tightened here to what
# the glob-style spelling suggests was meant, since the matches are now
# actually deleted.
vars0 <- ls(envir = .GlobalEnv, pattern = "curr_")
vars1 <- ls(envir = .GlobalEnv, pattern = "^cols_to")
vars2 <- ls(envir = .GlobalEnv, pattern = "pivot_cols_")
vars3 <- ls(envir = .GlobalEnv, pattern = "expected_")

# Remove the variables whose names were collected above. Passing vars0..vars3
# as bare arguments to rm() (as before) only deleted these four helper vectors
# and left the matched variables in place; `list =` takes the names as strings.
# NOTE(review): this is a behaviour change -- confirm nothing downstream still
# reads the matched globals.
rm(list = c(vars0, vars1, vars2, vars3), envir = .GlobalEnv)

# Drop the input paths and the helper vectors themselves.
rm(infile_metadata
   , infile_params
   , vars0
   , vars1
   , vars2
   , vars3)