From 1ea42097aec3fd44404385daa69db9ff6429268c Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 13 Aug 2021 13:24:22 +0100
Subject: [PATCH] added dynamut results formatting scripts, although needs to be
 rerun once b7 completes

---
 dynamut/format_results_dynamut.py     | 161 ++++++++++++++++++++++++++
 dynamut/run_format_results_dynamut.py |  53 +++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 dynamut/format_results_dynamut.py
 create mode 100644 dynamut/run_format_results_dynamut.py

diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py
new file mode 100644
index 0000000..261bb54
--- /dev/null
+++ b/dynamut/format_results_dynamut.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 14:33:51 2020
+
+@author: tanu
+"""
+#%% load packages
+import os,sys
+import subprocess
+import argparse
+import requests
+import re
+import time
+from bs4 import BeautifulSoup
+import pandas as pd
+import numpy as np
+from pandas.api.types import is_string_dtype
+from pandas.api.types import is_numeric_dtype
+#%%#####################################################################
+def format_dynamut_output(dynamut_output_csv):
+    """Classify and rescale the combined DynaMut results.
+
+    Reads the csv, strips whitespace from string columns, adds an
+    '<col>_outcome' label and a '<col>_scaled' value for each ddg/dds
+    column and returns the columns in a fixed order. Missing values stay
+    missing in the derived columns. Nothing is written to disk here.
+
+    @param dynamut_output_csv: combined dynamut results for all muts,
+     i.e. the merged batch outputs produced after run_get_results_dynamut.py
+    @type string
+    @return formatted dynamut results (pandas df)
+    """
+    #############
+    # Read file
+    #############
+    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')
+
+    # strip white space from both ends in all columns
+    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
+
+    dforig_shape = dynamut_data.shape
+    print('dimensions of input file:', dforig_shape)
+
+#%%============================================================================
+    #####################################
+    # create binary cols for each param
+    # >=0: Stabilising, <0: Destabilising
+    ######################################
+    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet']
+
+    # c1 (rows with value >= 0) is counted before labelling and compared
+    # with c2 (rows labelled 'Stabilising') afterwards as a sanity check.
+    # Missing values compare False to both >= 0 and < 0, so they fall
+    # through to np.nan instead of being labelled 'Destabilising'.
+
+    print('\nCreating classification cols for', len(outcome_cols), 'columns'
+          , '\nThese are:')
+
+    for cols in outcome_cols:
+        print(cols)
+
+        tot_muts = dynamut_data[cols].count()
+        print('\nTotal entries:', tot_muts)
+
+        outcome_colname = cols + '_outcome'
+        print(cols, ':', outcome_colname)
+        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
+        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else ('Destabilising' if x < 0 else np.nan))
+        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
+        if c1 == c2:
+            print('\nPASS: outcome classification column created successfully'
+                  , '\nColumn created:', outcome_colname
+                  #, '\nNo. of stabilising muts: ', c1
+                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
+                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
+
+        else:
+            print('\nFAIL: outcome classification numbers MISmatch'
+                  , '\nexpected length:', c1
+                  , '\nGot:', c2)
+
+    # Same classification for dds_encom, but with flexibility labels
+    # (>=0: Increased_flexibility, <0: Decreased_flexibility, missing: missing)
+    dynamut_data['dds_encom_outcome'] = dynamut_data['dds_encom'].apply(lambda x: 'Increased_flexibility' if x >= 0 else ('Decreased_flexibility' if x < 0 else np.nan))
+    print('\nCateg counts:\n', dynamut_data['dds_encom_outcome'].value_counts())
+
+#%%=====================================================================
+    ################################
+    # scale all ddg param values
+    #################################
+    # Rescale values in all ddg cols  col b/w -1 and 1 so negative numbers
+    # stay neg and pos numbers stay positive
+
+    scale_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet', 'dds_encom']
+
+    for cols in scale_cols:
+        # 0 and missing values are returned unchanged (avoids 0/0 when max is 0)
+        col_max = dynamut_data[cols].max()
+        col_min = dynamut_data[cols].min()
+        print( '\n===================='
+              , '\nColname:', cols
+              , '\n===================='
+              , '\nMax: ', col_max
+              , '\nMin: ', col_min)
+
+        scaled_colname = cols + '_scaled'
+        print('\nCreated scaled colname for', cols, ':', scaled_colname)
+        col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x > 0 else x)
+
+        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)
+
+        col_scaled_max = dynamut_data[scaled_colname].max()
+        col_scaled_min = dynamut_data[scaled_colname].min()
+        print( '\n===================='
+              , '\nColname:', scaled_colname
+              , '\n===================='
+              , '\nMax: ', col_scaled_max
+              , '\nMin: ', col_scaled_min)
+
+#%%=====================================================================
+    #############
+    # reorder columns
+    #############
+    # each param is followed by its scaled value and its outcome label
+    dynamut_dataf = dynamut_data[['mutationinformation'
+
+                                , 'ddg_dynamut'
+                                , 'ddg_dynamut_scaled'
+                                , 'ddg_dynamut_outcome'
+
+                                , 'ddg_encom'
+                                , 'ddg_encom_scaled'
+                                , 'ddg_encom_outcome'
+
+                                , 'ddg_mcsm'
+                                , 'ddg_mcsm_scaled'
+                                , 'ddg_mcsm_outcome'
+
+                                , 'ddg_sdm'
+                                , 'ddg_sdm_scaled'
+                                , 'ddg_sdm_outcome'
+
+                                , 'ddg_duet'
+                                , 'ddg_duet_scaled'
+                                , 'ddg_duet_outcome'
+
+                                , 'dds_encom'
+                                , 'dds_encom_scaled'
+                                , 'dds_encom_outcome']]
+
+    if len(dynamut_data.columns) == len(dynamut_dataf.columns):
+        print('\nPASS: outcome_classification, scaling  and column reordering completed')
+    else:
+        print('\nFAIL: Something went wrong...'
+              , '\nExpected length: ', len(dynamut_data.columns)
+              , '\nGot: ', len(dynamut_dataf.columns))
+
+    return dynamut_dataf
+#%%##################################################################### 
+
diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py
new file mode 100644
index 0000000..c8922e0
--- /dev/null
+++ b/dynamut/run_format_results_dynamut.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 12 12:15:26 2021
+
+@author: tanu
+"""
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# FIXME
+# Re-run when B07 completes, as the norm (scaled values) gets affected!
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+#%% load packages
+import os
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/dynamut')  # presumably so the local import below resolves
+from format_results_dynamut import format_dynamut_output
+########################################################################
+# variables
+
+# TODO: add cmd line args
+
+gene = 'gid'
+drug = 'streptomycin'
+datadir = homedir + '/git/Data'
+indir = datadir + '/' + drug + '/input'
+outdir = datadir + '/' + drug + '/output'
+outdir_dynamut = outdir + '/dynamut_results/'
+
+# Input file
+infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
+
+# Formatted output filename
+outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv'
+
+#==========================
+# CALL: format_dynamut_output()
+# Data: gid+streptomycin
+#==========================
+print('Formatting results for:', infile_dynamut)
+dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)
+
+# writing file
+print('Writing formatted dynamut df to csv')
+dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
+
+print('Finished writing file:'
+       , '\nFile:', outfile_dynamut_f
+       , '\nExpected no. of rows:', len(dynamut_df_f)
+       , '\nExpected no. of cols:', len(dynamut_df_f.columns)
+       , '\n=============================================================')
+
+#%%#####################################################################
\ No newline at end of file