178 lines
No EOL
5.9 KiB
Python
178 lines
No EOL
5.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
'''
|
|
Created on Mon June 14 2021
|
|
|
|
@author: tanu
|
|
'''
|
|
# FIXME: import dirs.py to get the basic dir paths available
|
|
#=======================================================================
|
|
# TASK
|
|
|
|
# Input:
|
|
|
|
# Output:
|
|
#=======================================================================
|
|
#%% load libraries
|
|
import os, sys
|
|
import pandas as pd
|
|
import re
|
|
#import numpy as np
|
|
import argparse
|
|
DEBUG = False
|
|
#=======================================================================
|
|
#%% specify input and curr dir
|
|
# Home directory of the current user; the scripts directory is addressed
# relative to it.
homedir = os.path.expanduser('~')

# Make the project's scripts directory the working directory.
# NOTE(review): presumably this is what makes the local reference_dict module
# importable when the file is run interactively -- confirm against how the
# script is launched. The bare os.getcwd() calls only have a visible effect
# when the cell is stepped through in an interactive session.
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# Amino-acid lookup tables provided by the project-local reference_dict module.
from reference_dict import oneletter_aa_dict, low_3letter_dict
|
|
#=======================================================================
|
|
#%%###########################################################################
|
|
# FUNCTION: using mcsm mutation format to split mutation info into
|
|
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
|
|
###############################################################################
|
|
|
|
def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):
    """Append 3 letter lowercase aa codes for the wt and mutant residues.

    Each value in the two source columns is looked up among the keys of
    oneletter_aa_dict and replaced by that entry's 'three_letter_code_lower'
    value. Values that are not keys of oneletter_aa_dict become NaN.

    @df: df holding one letter aa codes for wt and mutant residues
    @type: pandas df

    @wt_colname: name of the existing column with the one letter wild type aa
    @type: str

    @mut_colname: name of the existing column with the one letter mutant aa
    @type: str

    @col_wt: name of the column to write the 3 letter lowercase wild type
             code to
    @type: str

    @col_mut: name of the column to write the 3 letter lowercase mutant
              code to
    @type: str

    returns df: the same df object that was passed in (it is modified in
                place), with the 2 columns added. If a target column name
                already exists in df, its contents are overwritten.
    @rtype: pandas df
    """
    # one letter code -> three letter lowercase code
    one_to_three_lower = {
        aa_code: aa_info['three_letter_code_lower']
        for aa_code, aa_info in oneletter_aa_dict.items()
    }

    # (target, source) column pairs; wild type is written before mutant,
    # which matters only if the caller passes clashing column names.
    column_pairs = ((col_wt, wt_colname), (col_mut, mut_colname))
    for target_col, source_col in column_pairs:
        df[target_col] = df[source_col].map(one_to_three_lower)

    return df
|
|
#%%
|
|
#==================================
|
|
# example: get_aa_3lower()
|
|
#==================================
|
|
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
|
|
# test_df = pd.read_csv(test_filename , sep = ',')
|
|
|
|
# my_wt_colname = 'wild_type'
|
|
# my_mut_colname = 'mutant_type'
|
|
# my_col1 = 'wt_aa_3lower'
|
|
# my_col2 = 'mut_aa_3lower'
|
|
|
|
# get_aa_3lower(df = test_df
|
|
# , wt_colname = my_wt_colname
|
|
# , mut_colname = my_mut_colname
|
|
# , col_wt = my_col1
|
|
# , col_mut = my_col2)
|
|
#%%###########################################################################
|
|
# FUNCTION: using gwas mutation format to split mutation info into
|
|
# 3 separate columns for wild type, position and mutation
|
|
###############################################################################
|
|
def get_aa_1upper(df
                  , gwas_mut_colname = 'mutation'
                  , wt_colname = 'wt_aa_1upper'
                  , pos_colname = 'position'
                  , mut_colname = 'mut_aa_1upper'):
    """Split gwas-format mutations into wild type, position and mutant columns.

    Each value of df[gwas_mut_colname] is matched against the pattern
    '<anything>_p.<3 letters><digits><3 letters>'. The two 3 letter codes are
    translated through low_3letter_dict (using each entry's
    'one_letter_code' value); the digits are stored as extracted.

    Rows that do not match the pattern get NaN in all three output columns.
    A 3 letter code that is not a key of low_3letter_dict also becomes NaN.
    NOTE(review): the pattern accepts upper and lower case letters, while the
    lookup keys are presumably lowercase only -- confirm against
    reference_dict.

    @df: df containing the gwas-format mutation column
    @type: pandas df

    @gwas_mut_colname: name of the existing column holding the gwas-format
                       mutation strings
    @type: str

    @wt_colname: name of the column to write the one letter wild type aa to
    @type: str

    @pos_colname: name of the column to write the aa position to. The value
                  is the matched digit text; it is not converted to int here.
    @type: str

    @mut_colname: name of the column to write the one letter mutant aa to
    @type: str

    returns df: the same df object that was passed in (it is modified in
                place), with the 3 columns added. If a target column name
                already exists in df, its contents are overwritten.
    @rtype: pandas df
    """
    # static regex: group 1 = wild type, group 2 = position, group 3 = mutant
    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'

    # Run the regex once over the column; the result has one column per
    # capture group (0, 1, 2). This is done before any assignment so that a
    # target column name equal to gwas_mut_colname cannot affect the parsing.
    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)

    # three letter code -> one letter code
    lookup_dict_aa_1upper = {
        aa_code: aa_info['one_letter_code']
        for aa_code, aa_info in low_3letter_dict.items()
    }

    # wild type
    df[wt_colname] = gwas_parts[0].map(lookup_dict_aa_1upper)

    # position
    df[pos_colname] = gwas_parts[1]

    # mutant type
    df[mut_colname] = gwas_parts[2].map(lookup_dict_aa_1upper)

    return df
|
|
#%%
|
|
#==================================
|
|
# example: get_aa_1upper()
|
|
#==================================
|
|
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
|
|
# test_df2 = pd.read_csv(test_filename2 , sep = ',')
|
|
|
|
# get_aa_1upper(df = test_df2
|
|
# , gwas_mut_colname = 'mutation'
|
|
# , wt_colname = 'wild_type'
|
|
# , pos_colname = 'position'
|
|
# , mut_colname = 'mutant_type') |