From f7aac580814358c67976a36e1fac347a3f701604 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 19 Aug 2021 16:25:38 +0100
Subject: [PATCH] adde format_results_dynamut2.py and ran shiny scripts for
 barplots

---
 dynamut/format_results_dynamut.py             |   9 +-
 dynamut/format_results_dynamut2.py            | 137 ++++++++++++++++++
 dynamut/run_format_results_dynamut.py         |  45 ++++--
 dynamut/run_get_results_dynamut.py            |  19 +--
 scripts/functions/plotting_data.R             |   2 +-
 scripts/functions/position_count_bp.R         |  25 +++-
 scripts/functions/stability_count_bp.R        |  25 +++-
 scripts/functions/test_bp.R                   |  30 ++--
 .../plotting/redundant/basic_barplots_PS.R    |   2 +-
 9 files changed, 235 insertions(+), 59 deletions(-)
 create mode 100644 dynamut/format_results_dynamut2.py

diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py
index 261bb54..05dbd0c 100644
--- a/dynamut/format_results_dynamut.py
+++ b/dynamut/format_results_dynamut.py
@@ -123,7 +123,7 @@ def format_dynamut_output(dynamut_output_csv):
     # reorder columns
     #############
     dynamut_data.columns
-    dynamut_dataf = dynamut_data[['mutationinformation'
+    dynamut_data_f = dynamut_data[['mutationinformation'
                                  
                                 , 'ddg_dynamut'
                                 , 'ddg_dynamut_scaled'
@@ -149,13 +149,14 @@ def format_dynamut_output(dynamut_output_csv):
                                 , 'dds_encom_scaled'
                                 , 'dds_encom_outcome']]
     
-    if len(dynamut_data.columns) == len(dynamut_dataf):
+    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
         print('\nPASS: outcome_classification, scaling  and column reordering completed')
     else:
         print('\nFAIL: Something went wrong...'
               , '\nExpected length: ', len(dynamut_data.columns)
-              , '\nGot: ', len(dynamut_dataf))
+              , '\nGot: ', len(dynamut_data_f.columns))
+        sys.exit()
 
-    return(dynamut_dataf)
+    return(dynamut_data_f)
 #%%##################################################################### 
 
diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py
new file mode 100644
index 0000000..fe806e8
--- /dev/null
+++ b/dynamut/format_results_dynamut2.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 14:33:51 2020
+
+@author: tanu
+"""
+#%% load packages
+import os,sys
+import subprocess
+import argparse
+import requests
+import re
+import time
+from bs4 import BeautifulSoup
+import pandas as pd
+import numpy as np
+from pandas.api.types import is_string_dtype
+from pandas.api.types import is_numeric_dtype
+#%%#####################################################################
+def format_dynamut2_output(dynamut_output_csv):
+    """
+    @param dynamut_output_csv: file containing dynamut2 results for all muts 
+     which is the result of combining all dynamut2_output batch results, and using
+     bash scripts to combine all the batch results into one file. 
+     Dynamut2ran manually from batches
+     Formatting df to a pandas df and output as csv.
+     @type string
+
+     @return (not true) formatted csv for dynamut output
+     @type pandas df
+
+     """
+    #############
+    # Read file
+    #############
+    dynamut_data_raw  = pd.read_csv(dynamut_output_csv, sep = ',')  
+    
+    # strip white space from both ends in all columns
+    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
+
+    dforig_shape = dynamut_data.shape
+    print('dimensions of input file:', dforig_shape) 
+
+#%%============================================================================        
+    #####################################
+    # create binary cols for ddg_dynamut2
+    # >=0: Stabilising
+    ###################################### 
+    outcome_cols = ['ddg_dynamut2']
+    
+    # col test: ddg_dynamut
+    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
+    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])
+
+    print('\nCreating classification cols for', len(outcome_cols), 'columns'
+          , '\nThese are:')
+    
+    for cols in outcome_cols:
+        print(cols)
+        
+        tot_muts = dynamut_data[cols].count()
+        print('\nTotal entries:', tot_muts)
+        
+        outcome_colname = cols + '_outcome'
+        print(cols, ':', outcome_colname)
+        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
+        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
+        if c1 == c2:
+            print('\nPASS: outcome classification column created successfully'
+                  , '\nColumn created:', outcome_colname
+                  #, '\nNo. of stabilising muts: ', c1
+                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
+                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
+            
+        else:
+            print('\nFAIL: outcome classification numbers MISmatch'
+                  , '\nexpected length:', c1
+                  , '\nGot:', c2)
+            
+#%%=====================================================================  
+    ################################
+    # scale all ddg_dynamut2 values
+    #################################
+    # Rescale values in all ddg_dynamut2 col  col b/w -1 and 1 so negative numbers
+    # stay neg and pos numbers stay positive    
+        
+    outcome_cols = ['ddg_dynamut2']    
+        
+    for cols in outcome_cols:
+        #print(cols)
+        col_max = dynamut_data[cols].max()
+        col_min = dynamut_data[cols].min()
+        print( '\n===================='
+              , '\nColname:', cols 
+              , '\n===================='
+              , '\nMax: ', col_max
+              , '\nMin: ', col_min)
+        
+        scaled_colname = cols + '_scaled'
+        print('\nCreated scaled colname for', cols, ':', scaled_colname)
+        col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
+        
+        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)
+
+        col_scaled_max = dynamut_data[scaled_colname].max()
+        col_scaled_min = dynamut_data[scaled_colname].min()
+        print( '\n===================='
+              , '\nColname:', scaled_colname
+              , '\n===================='
+              , '\nMax: ', col_scaled_max
+              , '\nMin: ', col_scaled_min)
+       
+#%%=====================================================================
+    #############
+    # reorder columns
+    #############
+    dynamut_data.columns
+    dynamut_data_f = dynamut_data[['mutationinformation'
+                                , 'chain'
+                                , 'ddg_dynamut2'
+                                , 'ddg_dynamut2_scaled'
+                                , 'ddg_dynamut2_outcome']]
+    
+    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
+        print('\nPASS: outcome_classification, scaling  and column reordering completed')
+    else:
+        print('\nFAIL: Something went wrong...'
+              , '\nExpected length: ', len(dynamut_data.columns)
+              , '\nGot: ', len(dynamut_data_f.columns))
+        sys.exit()
+
+    return(dynamut_data_f)
+#%%##################################################################### 
+
diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py
index c8922e0..02af524 100644
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
@@ -15,9 +15,9 @@ import os
 homedir = os.path.expanduser('~')
 os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
 from format_results_dynamut import *
+from format_results_dynamut2 import *
 ########################################################################
 # variables
-
 # TODO: add cmd line args
 
 gene = 'gid'
@@ -26,28 +26,47 @@ datadir = homedir + '/git/Data'
 indir = datadir + '/' + drug + '/input'
 outdir = datadir + '/' + drug + '/output'
 outdir_dynamut = outdir + '/dynamut_results/'
+outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
 
 # Input file
 infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
+infile_dynamut2 =  outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
 
 # Formatted output filename
-outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv'
+outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
+outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
 
-#==========================
-# CALL: format_results_mcsm_na() 
-# Data: gid+streptomycin
-#==========================
-print('Formatting results for:', infile_dynamut)
-dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)
+#===============================
+# CALL: format_results_dynamut
+# DYNAMUT results
+# #===============================
+# print('Formatting results for:', infile_dynamut)
+# dynamut_df_f = format_dynamut_output(infile_dynamut) 
+# # writing file
+# print('Writing formatted dynamut df to csv')
+# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
+
+# print('Finished writing file:'
+#        , '\nFile:', outfile_dynamut_f
+#        , '\nExpected no. of rows:', len(dynamut_df_f)
+#        , '\nExpected no. of cols:', len(dynamut_df_f.columns)
+#        , '\n=============================================================')
+
+#===============================
+# CALL: format_results_dynamut2
+# DYNAMUT2 results
+#===============================
+print('Formatting results for:', infile_dynamut2)
+dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
 
 # writing file
-print('Writing formatted dynamut df to csv')
-dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
+print('Writing formatted dynamut2 df to csv')
+dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
 
 print('Finished writing file:'
-       , '\nFile:', outfile_dynamut_f
-       , '\nExpected no. of rows:', len(dynamut_df_f)
-       , '\nExpected no. of cols:', len(dynamut_df_f.columns)
+       , '\nFile:', outfile_dynamut2_f
+       , '\nExpected no. of rows:', len(dynamut2_df_f)
+       , '\nExpected no. of cols:', len(dynamut2_df_f.columns)
        , '\n=============================================================')
 
 #%%#####################################################################
\ No newline at end of file
diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py
index 4ddcc2f..e9e82ef 100755
--- a/dynamut/run_get_results_dynamut.py
+++ b/dynamut/run_get_results_dynamut.py
@@ -24,20 +24,9 @@ indir = datadir + drug + '/input/'
 outdir = datadir + drug + '/output/'
 outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
 #==============================================================================
-# batch 8: 08.txt, # RETRIEVED 23 Feb 08:54
-#my_url_file =  outdir + '/dynamut_temp/dynamut_result_url_gid_b8.txt'
-#my_suffix = 'gid_b7'
-
-#b09 and b10 failed, ran by Carlos, and returned results on 12 Aug
-
-# batch 9 and 10: RETRIEVED 12 Aug 09:25
-#my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10.txt'
-#my_suffix = 'gid_b10'
-
-# batch10_21: from bissection: humour me! (don't need since b10 ran
-# from dynamut team, but still its ready to extract it!) RETRIEVED 12 Aug 17:37
-my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10_21.txt'
-my_suffix = 'gid_b10_21'
+# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
+my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
+my_suffix = 'gid_b7'
 #==============================================================================
 
 #==========================
@@ -52,4 +41,4 @@ get_results(url_file  = my_url_file
             , output_dir = outdir
             , outfile_suffix = my_suffix)
            
-########################################################################
+########################################################################
\ No newline at end of file
diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index 105abc8..d3dff8d 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -23,7 +23,7 @@ my_df_u_lig = data.frame()
 dup_muts    = data.frame()
   
 #===========================
-# Read file: struct params
+# Read file: struct params 
 #===========================
 #df = read.csv(infile_params, header = T)
 
diff --git a/scripts/functions/position_count_bp.R b/scripts/functions/position_count_bp.R
index 2e1c26c..ce0767c 100755
--- a/scripts/functions/position_count_bp.R
+++ b/scripts/functions/position_count_bp.R
@@ -30,8 +30,12 @@ site_snp_count_bp <- function (plotdf
                                , axis_text_size = 25
                                , axis_label_size = 22
                                , xaxis_title = "Number of nsSNPs"
-                               , yaxis_title = "Number of Sites"){
-  
+                               , yaxis_title = "Number of Sites"
+                               , title_colour = "chocolate4"
+                               , subtitle_text = NULL
+                               , subtitle_size = 20
+                               , subtitle_colour = "pink")
+                               {
   # dim of plotdf
   cat(paste0("\noriginal df dimensions:"
              , "\nNo. of rows:", nrow(plotdf)
@@ -83,9 +87,9 @@ site_snp_count_bp <- function (plotdf
   
   # FIXME: should really be legend title
   # but atm being using as plot title
-  my_leg_title = paste0("Total nsSNPs:", tot_muts
-                        , ", Total no. of nsSNPs sites:", tot_sites)
-  bp_plot_title = my_leg_title
+  #my_leg_title
+  bp_plot_title = paste0("Total nsSNPs: ", tot_muts
+                        , ", Total no. of nsSNPs sites: ", tot_sites)
   
   #-------------
   # start plot 2
@@ -111,11 +115,16 @@ site_snp_count_bp <- function (plotdf
           #, legend.position = c(0.73,0.8)
           #, legend.text = element_text(size = leg_text_size)
           #, legend.title = element_text(size =  axis_label_size)
-          , plot.title = element_text(size =  leg_text_size)) + 
+          , plot.title = element_text(size = leg_text_size
+                                      , colour = title_colour)
+          , plot.subtitle = element_text(size = subtitle_size
+                                         , hjust = 0.5
+                                         , colour = subtitle_colour)) + 
     
     labs(title = bp_plot_title
-      , x = xaxis_title
-      , y = yaxis_title)
+         , subtitle = subtitle_text
+         , x = xaxis_title
+         , y = yaxis_title)
 
   return(OutPlot_pos_count)
 }
diff --git a/scripts/functions/stability_count_bp.R b/scripts/functions/stability_count_bp.R
index aa8f5a4..8f7d9ed 100644
--- a/scripts/functions/stability_count_bp.R
+++ b/scripts/functions/stability_count_bp.R
@@ -22,7 +22,14 @@ stability_count_bp <- function(plotdf
          , leg_text_size = 20
          , leg_title_size = 22
          , yaxis_title = "Number of nsSNPs"
-         , bp_plot_title = ""){
+         , bp_plot_title = ""
+         , label_categories = c("Destabilising", "Stabilising")
+         , title_colour = "chocolate4"
+         , subtitle_text = NULL
+         , subtitle_size = 20
+         , subtitle_colour = "pink"
+         #, leg_position = c(0.73,0.8) # within plot area
+         , leg_position = "top"){
  
   OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + 
     geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
@@ -35,14 +42,20 @@ stability_count_bp <- function(plotdf
           , axis.title.x = element_blank()
           , axis.title.y = element_text(size =  axis_label_size)
           , axis.text.y = element_text(size = axis_text_size)
-          , legend.position = c(0.73,0.8)
+          , legend.position = leg_position
           , legend.text = element_text(size = leg_text_size)
           , legend.title = element_text(size =  leg_title_size)
-          , plot.title = element_text(size =  axis_label_size)) + 
-    labs(title = bp_plot_title
-         , y = yaxis_title) + 
+          , plot.title = element_text(size =  axis_label_size
+                                      , colour = title_colour)
+          , plot.subtitle = element_text(size = subtitle_size
+                                         , hjust = 0.5
+                                         , colour = subtitle_colour)) + 
+    labs(title      = bp_plot_title
+         , subtitle = subtitle_text
+         , y        = yaxis_title) + 
     scale_fill_discrete(name = leg_title
-                        , labels = c("Destabilising", "Stabilising"))
+                        #, labels = c("Destabilising", "Stabilising")
+                        , labels = label_categories)
   
   return(OutPlot_count)
 }
diff --git a/scripts/functions/test_bp.R b/scripts/functions/test_bp.R
index dc57452..bdb48ca 100644
--- a/scripts/functions/test_bp.R
+++ b/scripts/functions/test_bp.R
@@ -6,21 +6,24 @@ getwd()
 # load functions, data, dirs, hardocded vars
 # that will be used in testing the functions
 #===========================================
-source("plotting_data.R")
-infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
+drug = "streptomycin"
+gene = "gid"
+
+source("plotting_data.R")
+
+infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
+infile_df = read.csv(infile)
+
+lig_dist = 5
+pd_df = plotting_data(infile_df
+                      , lig_dist_colname = 'ligand_distance'
+                      , lig_dist_cutoff = lig_dist)
 
-pd_df = plotting_data(infile)
 my_df       = pd_df[[1]]
 my_df_u     = pd_df[[2]]
 my_df_u_lig = pd_df[[3]]
 dup_muts    = pd_df[[4]]
 
-source("plotting_globals.R")
-drug = "streptomycin"
-gene = "gid"
-
-import_dirs(drug, gene)
-
 #=====================
 # functions to test 
 #=====================
@@ -40,7 +43,9 @@ print(paste0("plot filename:", basic_bp_duet))
 # function only
 stability_count_bp(plotdf = my_df_u
                , df_colname = "duet_outcome"
-               , leg_title = "DUET outcome")
+               , leg_title = "DUET outcome"
+               , label_categories = c("Destabilising", "Stabilising")
+               , leg_position = "top")
 
 dev.off()
 
@@ -54,10 +59,13 @@ svg(plot_basic_bp_ligand)
 print(paste0("plot filename:", basic_bp_ligand))
 
 # function only
+lig_dist = 10
 stability_count_bp(plotdf = my_df_u_lig
                , df_colname = "ligand_outcome"
                , leg_title = "Ligand outcome"
-               , bp_plot_title = "Sites < 10 Ang of ligand")
+               , yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b")
+               #, bp_plot_title = "Sites < 10 Ang of ligand"
+               )
 
 dev.off()
 # ------------------------------
diff --git a/scripts/plotting/redundant/basic_barplots_PS.R b/scripts/plotting/redundant/basic_barplots_PS.R
index 20f8457..d6b4530 100755
--- a/scripts/plotting/redundant/basic_barplots_PS.R
+++ b/scripts/plotting/redundant/basic_barplots_PS.R
@@ -103,7 +103,7 @@ cat(paste0("Directories imported:"
  cat(paste0("\nVariables imported:"
            , "\ndrug:", drug
            , "\ngene:", gene
-           , "\n))
+           , "\n"))
            #, "\ngene_match:", gene_match
            #, "\nLength of upos:", length(upos)
            #, "\nAngstrom symbol:", angstroms_symbol))