bring in embb stuff which was in the wrong branch

2021-10-28 11:18:13 +01:00 · 2021-10-28 11:18:13 +01:00 · 3368e949e8
commit 3368e949e8
parent 0e44958585
6 changed files with 816 additions and 98 deletions
--- a/mcsm_ppi2/format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/format_results_mcsm_ppi2.py
@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 14:33:51 2020
+
+@author: tanu
+"""
+#%% load packages
+import os,sys
+homedir = os.path.expanduser('~')
+import subprocess
+import argparse
+import requests
+import re
+import time
+from bs4 import BeautifulSoup
+import pandas as pd
+import numpy as np
+from pandas.api.types import is_string_dtype
+from pandas.api.types import is_numeric_dtype
+
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
+from reference_dict import up_3letter_aa_dict
+from reference_dict import oneletter_aa_dict
+
+#%%#####################################################################
+
+def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
+    """
+    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts 
+     which is the result of combining all mcsm_ppi2 batch results, and using
+     bash scripts to combine all the batch results into one file. 
+     Formatting df to a pandas df and output as csv.
+     @type string
+
+     @return (not true) formatted csv for mcsm_ppi2 output
+     @type pandas df
+
+     """
+    #############
+    # Read file
+    #############
+    mcsm_ppi2_data_raw  = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')  
+    
+    # strip white space from both ends in all columns
+    mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
+
+    dforig_shape = mcsm_ppi2_data.shape
+    print('dimensions of input file:', dforig_shape) 
+
+    #############
+    # Map 3 letter 
+    # code to one
+    #############
+    # initialise a sub dict that is lookup dict for 
+    # 3-LETTER aa code to 1-LETTER aa code
+    lookup_dict = dict()
+    for k, v in up_3letter_aa_dict.items():
+        lookup_dict[k] = v['one_letter_code']
+        wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
+        mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)   
+        mut = mcsm_ppi2_data['mutant'].squeeze()
+        mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
+    
+    # #############
+    # # CHECK
+    # # Map 1 letter 
+    # # code to 3Upper
+    # #############
+    # # initialise a sub dict that is lookup dict for 
+    # # 3-LETTER aa code to 1-LETTER aa code
+    # lookup_dict = dict()
+    # for k, v in oneletter_aa_dict.items():
+    #     lookup_dict[k] = v['three_letter_code_upper']
+    #     wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
+    #     mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)   
+    #     mut = mcsm_ppi2_data['m_type'].squeeze()
+    #     mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
+    
+    # # check
+    # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
+    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
+#%%============================================================================    
+    #############
+    # rename cols
+    #############
+    # format colnames: all lowercase and consistent colnames
+    mcsm_ppi2_data.columns
+    print('Assigning meaningful colnames'
+            , '\n=======================================================')
+    
+    my_colnames_dict = {'chain': 'chain'
+        , 'wild-type': 'wt_upper'
+        , 'res-number': 'position'
+        , 'mutant': 'mut_upper'
+        , 'distance-to-interface': 'interface_dist'
+        , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
+        , 'affinity': 'mcsm_ppi2_outcome'
+        , 'w_type': 'wild_type' # one letter amino acid code
+        , 'm_type': 'mutant_type' # one letter amino acid code  
+} 
+
+    mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
+    mcsm_ppi2_data.columns
+
+    #############
+    # create mutationinformation column
+    #############    
+    #mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
+    mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
+
+#%%=====================================================================
+    #########################
+    # scale mcsm_ppi2 values
+    #########################
+    # Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
+    # stay neg and pos numbers stay positive
+    mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min() 
+    mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max() 
+    
+    mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
+    
+    mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
+    print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
+        , '\n---------------------------------------------------------------'
+        , '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
+    
+    c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
+    mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
+    
+    c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
+    mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
+    
+    if mcsm_ppi2_pos == mcsm_ppi2_pos2:
+        print('\nPASS: Affinity values scaled correctly')
+    else:
+        print('\nFAIL: Affinity values scaled numbers MISmatch'
+              , '\nExpected number:', mcsm_ppi2_pos
+              , '\nGot:', mcsm_ppi2_pos2
+              , '\n======================================================')
+
+#%%=====================================================================
+    #############
+    # reorder columns
+    #############
+    mcsm_ppi2_data.columns
+    mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
+                                , 'mcsm_ppi2_affinity'
+                                , 'mcsm_ppi2_scaled'
+                                , 'mcsm_ppi2_outcome'
+                                , 'interface_dist'
+                                , 'wild_type'
+                                , 'position'
+                                , 'mutant_type'
+                                , 'wt_upper'
+                                , 'mut_upper'
+                                , 'chain']]
+    return(mcsm_ppi2_dataf)
+#%%##################################################################### 
--- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 12 12:15:26 2021
+
+@author: tanu
+"""
+#%% load packages
+import sys, os
+homedir = os.path.expanduser('~')
+#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
+
+from format_results_mcsm_ppi2 import *
+########################################################################
+# TODO: add cmd line args
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug'      , help = 'drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene'      , help = 'gene name (case sensitive)', default = None)
+arg_parser.add_argument('--datadir'         , help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+#arg_parser.add_argument('--mkdir_name'      , help = 'Output dir for processed results. This will be created if it does not exist') 
+arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
+
+arg_parser.add_argument('--debug'           , action = 'store_true' , help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#%%============================================================================ 
+# variable assignment: input and output paths & filenames
+drug         = args.drug
+gene         = args.gene
+datadir      = args.datadir
+indir        = args.input_dir
+outdir       = args.output_dir
+#outdir_ppi2  = args.mkdir_name
+make_dirs    = args.make_dirs
+
+#=======
+# dirs
+#=======
+if not datadir:
+    datadir = homedir + '/git/Data/'
+    
+if not indir:
+    indir = datadir + drug + '/input/'
+    
+if not outdir:
+    outdir = datadir + drug + '/output/'
+
+#if not mkdir_name:
+#    outdir_ppi2 = outdir + 'mcsm_ppi2/'
+
+outdir_ppi2 = outdir + 'mcsm_ppi2/'
+
+# Input file
+infile_mcsm_ppi2 =  outdir_ppi2 +  gene.lower() + '_output_combined_clean.csv'
+
+# Formatted output file
+outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
+
+#==========================
+# CALL: format_results_mcsm_na() 
+# Data: gid+streptomycin
+#==========================
+print('Formatting results for:', infile_mcsm_ppi2)
+mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
+
+# writing file
+print('Writing formatted df to csv')
+mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
+
+print('Finished writing file:'
+       , '\nFile:', outfile_mcsm_ppi2_f
+       , '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
+       , '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
+       , '\n=============================================================')
+
+#%%#####################################################################