From f79aea254e4e673f83c19cdda05190fdc4cf4f58 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 18 Jun 2021 17:48:26 +0100
Subject: [PATCH] added function to add aa code for mcsm and gwas style
 mutations to a given files

---
 scripts/aa_code.py | 178 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 scripts/aa_code.py

diff --git a/scripts/aa_code.py b/scripts/aa_code.py
new file mode 100644
index 0000000..0fad2a0
--- /dev/null
+++ b/scripts/aa_code.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Mon June 14 2021
+
+@author: tanu
+'''
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK
+
+# Input:
+
+# Output: 
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+import re
+#import numpy as np
+import argparse
+DEBUG = False
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+# NOTE: the chdir below changes the working directory of any process importing this module
+# set working dir (the os.getcwd() results are discarded; presumably left over from interactive use)
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+# lookup tables read by get_aa_3lower() and get_aa_1upper(); imported after the chdir above
+from reference_dict import oneletter_aa_dict 
+from reference_dict import low_3letter_dict
+#=======================================================================
+#%%###########################################################################
+# FUNCTION: using mcsm mutation format to split mutation info into
+# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
+###############################################################################
+
+def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):
+
+    """ Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code
+
+    @df: df containing one letter aa code for wt and mutant respectively (modified in place)
+    @type: pandas df
+
+    @wt_colname: column containing one letter wild type aa
+    @type: str
+
+    @mut_colname: column containing one letter mutant type aa
+    @type: str
+
+    @col_wt: name of the column to add with 3 letter lowercase aa code for wild type aa
+    @type: str
+
+    @col_mut: name of the column to add with 3 letter lowercase aa code for mutant type aa
+    @type: str
+
+    returns df: the same df with 2 added columns; codes missing from the lookup
+                become NaN. If column names clash, the function column name
+                will override original column
+    @rtype: pandas df
+    """
+
+    # one letter code -> three letter lowercase code, built once up front
+    lookup_dict_aa_3lower = {k: v['three_letter_code_lower']
+                             for k, v in oneletter_aa_dict.items()}
+
+    if DEBUG:
+        print('DICT:', lookup_dict_aa_3lower)
+
+    # map both inputs before assigning, so an output name that clashes with
+    # an input column cannot corrupt the second lookup
+    wt_aa_3lower = df[wt_colname].map(lookup_dict_aa_3lower)
+    mut_aa_3lower = df[mut_colname].map(lookup_dict_aa_3lower)
+    df[col_wt], df[col_mut] = wt_aa_3lower, mut_aa_3lower
+
+    return df
+#%%
+#==================================
+# example: get_aa_3lower()
+#==================================
+# test_filename =  '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
+# test_df =  pd.read_csv(test_filename , sep = ',')
+
+# my_wt_colname = 'wild_type'
+# my_mut_colname = 'mutant_type'
+# my_col1 = 'wt_aa_3lower'
+# my_col2 = 'mut_aa_3lower'
+
+# get_aa_3lower(df = test_df
+#               , wt_colname = my_wt_colname
+#               , mut_colname = my_mut_colname
+#               , col_wt = my_col1
+#               , col_mut = my_col2)
+#%%###########################################################################
+# FUNCTION: using gwas mutation format to split mutation info into
+# 3 separate columns for wild type, position and mutation
+###############################################################################
+def get_aa_1upper(df
+                  , gwas_mut_colname = 'mutation'
+                  , wt_colname = 'wt_aa_1upper'
+                  , pos_colname = 'position'
+                  , mut_colname = 'mut_aa_1upper'):
+
+    """Add 1 letter uppercase aa code for wt and mutant residues specified as 3 letter aa code
+
+    Each value in gwas_mut_colname is matched against the pattern
+    '<anything>_p.<3 letters><digits><3 letters>'; the two 3 letter codes
+    are translated through low_3letter_dict and the digits are kept as the
+    position.
+
+    @df: df containing the gwas style mutations (modified in place)
+    @type: pandas df
+
+    @gwas_mut_colname: column containing the gwas style mutation strings
+    @type: str
+
+    @wt_colname: name of the column to add with one letter wild type aa
+    @type: str
+
+    @pos_colname: name of the column to add with the aa position, stored
+                  as extracted from the mutation string (text, not int)
+    @type: str
+
+    @mut_colname: name of the column to add with one letter mutant type aa
+    @type: str
+
+    returns df: the same df with 3 added columns. Rows whose mutation does
+                not match the pattern, or whose 3 letter code is missing
+                from the lookup, get NaN. If column names clash, the
+                function column name will override original column
+    @rtype: pandas df
+    """
+
+    # static regex: group 0 = wild type, group 1 = position, group 2 = mutant
+    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
+
+    # run the regex once and reuse the captured groups
+    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)
+    gwas_wt  = gwas_parts[0]
+    gwas_pos = gwas_parts[1]
+    gwas_mut = gwas_parts[2]
+
+    # three letter code -> one letter code, built once up front
+    lookup_dict_aa_1upper = {k: v['one_letter_code']
+                             for k, v in low_3letter_dict.items()}
+    # NOTE(review): the regex accepts upper or lower case letters, but the
+    # lookup is done on the text as captured - presumably low_3letter_dict
+    # keys are lowercase, so 'Ser' would map to NaN; confirm against inputs
+
+    if DEBUG:
+        print('DICT:', lookup_dict_aa_1upper)
+
+    # assign outside any loop so the columns are always added, exactly once
+
+    # wild type
+    df[wt_colname] = gwas_wt.map(lookup_dict_aa_1upper)
+
+    # position
+    df[pos_colname] = gwas_pos
+
+    # mutant type
+    df[mut_colname] = gwas_mut.map(lookup_dict_aa_1upper)
+
+    return df
+#%%
+#==================================
+# example: get_aa_1upper()
+#==================================
+# test_filename2 =  '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
+# test_df2 =  pd.read_csv(test_filename2 , sep = ',')
+
+# get_aa_1upper(df = test_df2
+#               , gwas_mut_colname = 'mutation'
+#               , wt_colname = 'wild_type'
+#               , pos_colname = 'position'
+#               , mut_colname = 'mutant_type')
\ No newline at end of file