LSHTM_analysis/scripts/aa_code.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021

@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
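# TASK: helper functions to convert amino acid (aa) codes in mutation data
#       between one letter and three letter lowercase formats

# Input: pandas df with mutation info (one letter wt/mutant aa columns, or a
#        gwas style mutation column like '<prefix>_p.<wt><position><mut>')

# Output: the same df with the converted aa code columns added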
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
import re
#import numpy as np
import argparse
DEBUG = False
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
# NOTE: os.chdir() runs at import time, so importing this module changes the
# working directory of the calling process to the hard-coded scripts dir under
# the user's home. The bare os.getcwd() calls discard their result (they only
# display the path when the cell is run interactively).
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# Project-local lookup tables, imported after the chdir above.
# NOTE(review): presumably the chdir is what makes reference_dict importable
# (cwd on sys.path) -- confirm; see the FIXME at the top about importing dirs.py.
# oneletter_aa_dict: values are indexed with 'three_letter_code_lower' below
# low_3letter_dict : values are indexed with 'one_letter_code' below
from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format (one letter wt and mutant aa columns)
# to add 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################

def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):

    """ Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code

    @df: df containing one letter aa code for wt and mutant respectively
    @type: pandas df

    @wt_colname: column containing one letter wild type aa
    @type: str

    @mut_colname: column containing one letter mutant type aa
    @type: str

    @col_wt: name of the column to add with 3 letter aa code lower for wild type aa
    @type: str

    @col_mut: name of the column to add with 3 letter aa code lower for mutant type aa
    @type: str

    returns df: the same df object (modified in place) with 2 added columns.
                Values without an entry in the lookup become NaN.
                If column names clash, the function column name will
                override original column
    @rtype: pandas df
    """

    # Build the complete one letter -> three letter lowercase lookup ONCE.
    # Mapping inside the dict-building loop (one pass per aa, against a
    # partially filled lookup) was redundant and, when an output column name
    # was the same as its input column name, re-mapped already converted
    # values on every pass, leaving the column all NaN.
    lookup_dict_aa_3lower = {
        one_letter: aa_info['three_letter_code_lower']
        for one_letter, aa_info in oneletter_aa_dict.items()
    }

    # Convert both inputs before writing either output so that a clash between
    # an output column name and an input column name cannot corrupt the
    # second conversion.
    wt_aa_3lower = df[wt_colname].map(lookup_dict_aa_3lower)
    mut_aa_3lower = df[mut_colname].map(lookup_dict_aa_3lower)

    df[col_wt] = wt_aa_3lower
    df[col_mut] = mut_aa_3lower

    return df
#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename =  '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df =  pd.read_csv(test_filename , sep = ',')

# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'

# get_aa_3lower(df = test_df
#               , wt_colname = my_wt_colname
#               , mut_colname = my_mut_colname
#               , col_wt = my_col1
#               , col_mut = my_col2)
#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df
                  , gwas_mut_colname = 'mutation'
                  , wt_colname = 'wt_aa_1upper'
                  , pos_colname = 'position'
                  , mut_colname = 'mut_aa_1upper'):

    """Add 1 letter aa code for wt and mutant residues, and the aa position, parsed from a gwas style mutation column

    The mutation string must match '<anything>_p.<wt><position><mut>' where
    <wt> and <mut> are 3 letters and <position> is one or more digits.
    Rows that do not match get NaN in all 3 added columns.

    @df: df containing the gwas style mutation column
    @type: pandas df

    @gwas_mut_colname: column containing the gwas style mutation string
    @type: str

    @wt_colname: name of the column to add with one letter wild type aa
    @type: str

    @pos_colname: name of the column to add with aa position
                  (values are kept as extracted from the string, not cast to int)
    @type: str

    @mut_colname: name of the column to add with one letter mutant type aa
    @type: str

    returns df: the same df object (modified in place) with 3 added columns.
                3 letter codes without an entry in the lookup become NaN.
                If column names clash, the function column name will
                override original column
    @rtype: pandas df
    """

    # static regex: group 0 = wt aa, group 1 = position, group 2 = mutant aa
    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'

    # Run the regex once; the result has one column per capture group (0, 1, 2)
    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)

    # Build the complete three letter -> one letter lookup ONCE, instead of
    # re-assigning the output columns on every pass of the dict-building loop
    # (redundant work, and no columns at all when the source dict was empty).
    # NOTE(review): the regex accepts upper and lower case letters but the
    # lookup is by exact key -- presumably keys are lowercase; confirm
    # against reference_dict.
    lookup_dict_aa_1upper = {
        three_letter: aa_info['one_letter_code']
        for three_letter, aa_info in low_3letter_dict.items()
    }

    # wild type
    df[wt_colname] = gwas_parts[0].map(lookup_dict_aa_1upper)

    # position
    df[pos_colname] = gwas_parts[1]

    # mutant type
    df[mut_colname] = gwas_parts[2].map(lookup_dict_aa_1upper)

    return df
#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 =  '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 =  pd.read_csv(test_filename2 , sep = ',')

# get_aa_1upper(df = test_df2
#               , gwas_mut_colname = 'mutation'
#               , wt_colname = 'wild_type'
#               , pos_colname = 'position'
#               , mut_colname = 'mutant_type')