LSHTM_analysis/scripts/aa_code.py

178 lines
No EOL
5.9 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021
@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
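# TASK: helper functions to convert amino acid (aa) codes held in a pandas df
# Input: df with 1 letter aa code columns (get_aa_3lower) or a gwas format mutation column (get_aa_1upper)
# Output: the same df with the converted aa code columns added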
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
import re
#import numpy as np
import argparse
# Debug switch. The only references to it visible in this file are the
# commented-out debug prints inside the functions below.
DEBUG = False
#=======================================================================
#%% specify input and curr dir
# Home directory of the current user; used to build the scripts path below.
homedir = os.path.expanduser('~')
# set working dir
# NOTE: this changes the working directory of the whole process at import
# time. Presumably done so that reference_dict.py (expected to live in the
# scripts dir) can be imported below -- confirm before changing the order.
# The two os.getcwd() calls discard their return value (they look like
# leftovers from interactive use).
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# Reference tables used as lookups by get_aa_3lower() and get_aa_1upper():
# oneletter_aa_dict is read for its 'three_letter_code_lower' entries,
# low_3letter_dict for its 'one_letter_code' entries.
from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format to split mutation info into
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################
def get_aa_3lower(df: pd.DataFrame,
                  wt_colname: str = 'wild_type',
                  mut_colname: str = 'mutant_type',
                  col_wt: str = 'wt_aa_3lower',
                  col_mut: str = 'mut_aa_3lower',
                  *,
                  aa_dict=None) -> pd.DataFrame:
    """Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter aa code

    The df is modified in place and also returned. Values that are not keys
    of the lookup become NaN in the added columns.

    @df: df containing one letter aa code for wt and mutant respectively
    @type: pandas df
    @wt_colname: column containing one letter wild type aa
    @type: str
    @mut_colname: column containing one letter mutant type aa
    @type: str
    @col_wt: name of the column to add with 3 letter lowercase code for wild type aa
    @type: str
    @col_mut: name of the column to add with 3 letter lowercase code for mutant type aa
    @type: str
    @aa_dict: (keyword only, optional) reference dict keyed by one letter aa
        code, each value being a dict with a 'three_letter_code_lower' entry.
        Defaults to the module level oneletter_aa_dict.
    @type: dict or None

    returns df: with 2 added columns. If column names clash, the function column
        name will override original column
    @rtype: pandas df
    """
    if aa_dict is None:
        # Fall back to the project reference table imported at module level.
        aa_dict = oneletter_aa_dict

    # Flatten the nested reference dict to {one letter: three letter lower}.
    lookup_dict_aa_3lower = {
        one_letter: entry['three_letter_code_lower']
        for one_letter, entry in aa_dict.items()
    }

    # Map both source columns BEFORE writing anything: if an output column
    # name clashes with an input column name (e.g. col_wt == mut_colname),
    # the second mapping must still read the original one letter codes and
    # not the column that was just overwritten.
    wt_3lower = df[wt_colname].map(lookup_dict_aa_3lower)
    mut_3lower = df[mut_colname].map(lookup_dict_aa_3lower)

    df[col_wt] = wt_3lower
    df[col_mut] = mut_3lower
    return df
#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df = pd.read_csv(test_filename , sep = ',')
# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'
# get_aa_3lower(df = test_df
# , wt_colname = my_wt_colname
# , mut_colname = my_mut_colname
# , col_wt = my_col1
# , col_mut = my_col2)
#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df: pd.DataFrame,
                  gwas_mut_colname: str = 'mutation',
                  wt_colname: str = 'wt_aa_1upper',
                  pos_colname: str = 'position',
                  mut_colname: str = 'mut_aa_1upper',
                  *,
                  aa_dict=None) -> pd.DataFrame:
    """Add 1 letter aa code for wt and mutant residues, plus position, from a gwas format mutation column

    Each value of the mutation column is matched against the pattern
    '<anything>_p.<3 letters><digits><3 letters>'. The two 3 letter codes are
    converted to 1 letter codes through the lookup and the digits are stored
    as the position.

    The df is modified in place and also returned. Rows that do not match
    the pattern get NaN in all three added columns; 3 letter codes that are
    not keys of the lookup become NaN. Codes are looked up exactly as
    captured (no case normalisation), and the position is stored as the
    captured string of digits (it is not converted to a number).

    @df: df containing the gwas format mutation column
    @type: pandas df
    @gwas_mut_colname: column containing the gwas format mutation string
    @type: str
    @wt_colname: name of the column to add with 1 letter code for wild type aa
    @type: str
    @pos_colname: name of the column to add with the aa position
    @type: str
    @mut_colname: name of the column to add with 1 letter code for mutant type aa
    @type: str
    @aa_dict: (keyword only, optional) reference dict keyed by 3 letter aa
        code, each value being a dict with a 'one_letter_code' entry.
        Defaults to the module level low_3letter_dict.
    @type: dict or None

    returns df: with 3 added columns. If column names clash, the function column
        name will override original column
    @rtype: pandas df
    """
    if aa_dict is None:
        # Fall back to the project reference table imported at module level.
        aa_dict = low_3letter_dict

    # static regex: capture groups are (wt 3 letter)(position)(mut 3 letter)
    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'

    # Run the regex once over the column; the result has one column per
    # capture group (0 = wt, 1 = position, 2 = mut). This is done before any
    # column is written, so name clashes with the source column are harmless.
    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)

    # Flatten the nested reference dict to {three letter: one letter}.
    lookup_dict_aa_1upper = {
        three_letter: entry['one_letter_code']
        for three_letter, entry in aa_dict.items()
    }

    # wild type
    df[wt_colname] = gwas_parts[0].map(lookup_dict_aa_1upper)
    # position
    df[pos_colname] = gwas_parts[1]
    # mutant type
    df[mut_colname] = gwas_parts[2].map(lookup_dict_aa_1upper)
    return df
#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 = pd.read_csv(test_filename2 , sep = ',')
# get_aa_1upper(df = test_df2
# , gwas_mut_colname = 'mutation'
# , wt_colname = 'wild_type'
# , pos_colname = 'position'
# , mut_colname = 'mutant_type')