#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021

@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: helper functions that add amino acid (aa) code columns to a df
#   get_aa_3lower(): 1 letter aa code columns -> 3 letter lowercase columns
#   get_aa_1upper(): gwas style mutation column -> wt, position, mut columns
# Input: pandas df containing the relevant source column(s)
# Output: the same df with the extra columns added
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
import re
#import numpy as np
import argparse

DEBUG = False
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
# NOTE: the working dir is changed at import time, before reference_dict
# is imported below; the order of these statements is kept as is.
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format to add
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################
def get_aa_3lower(df,
                  wt_colname = 'wild_type',
                  mut_colname = 'mutant_type',
                  col_wt = 'wt_aa_3lower',
                  col_mut = 'mut_aa_3lower'):
    """
    Add 3 letter lowercase aa code for wt and mutant residues
    specified as 1 letter aa code

    The lookup is built from oneletter_aa_dict: each key is mapped to the
    'three_letter_code_lower' entry of its value.

    @df: df containing one letter aa code for wt and mutant respectively
    @type: pandas df

    @wt_colname: name of the column containing one letter wild type aa
    @type: str

    @mut_colname: name of the column containing one letter mutant type aa
    @type: str

    @col_wt: name of the column to add with 3 letter lowercase aa code
    for wild type aa
    @type: str

    @col_mut: name of the column to add with 3 letter lowercase aa code
    for mutant type aa
    @type: str

    returns df: the same df object (modified in place) with 2 added columns.
    If column names clash, the function column name will override
    original column. Values that are not keys of oneletter_aa_dict
    become NaN (pandas Series.map behaviour).
    @rtype: pandas df
    """
    lookup_dict_aa_3lower = {k: v['three_letter_code_lower']
                             for k, v in oneletter_aa_dict.items()}

    if DEBUG:
        print('\n=====================================================\n'
              , '\nDICT:', lookup_dict_aa_3lower, '\n')

    df[col_wt] = df[wt_colname].map(lookup_dict_aa_3lower)
    df[col_mut] = df[mut_colname].map(lookup_dict_aa_3lower)

    return df

#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df = pd.read_csv(test_filename , sep = ',')

# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'

# get_aa_3lower(df = test_df
#               , wt_colname = my_wt_colname
#               , mut_colname = my_mut_colname
#               , col_wt = my_col1
#               , col_mut = my_col2)

#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df
                  , gwas_mut_colname = 'mutation'
                  , wt_colname = 'wt_aa_1upper'
                  , pos_colname = 'position'
                  , mut_colname = 'mut_aa_1upper'):
    """
    Add 1 letter aa code for wt and mutant residues, plus the aa position,
    parsed from a gwas style mutation column

    Each value of the mutation column is matched against the static regex
    '^.*_p\\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$' i.e.
    <anything>_p.<3 letter wt aa><position><3 letter mut aa>.
    The two 3 letter codes are then translated using low_3letter_dict:
    each key is mapped to the 'one_letter_code' entry of its value.

    @df: df containing the gwas style mutation column
    @type: pandas df

    @gwas_mut_colname: name of the column containing the gwas style mutation
    @type: str

    @wt_colname: name of the column to add with 1 letter wild type aa
    @type: str

    @pos_colname: name of the column to add with the aa position.
    Values are the digits captured by the regex and are NOT cast to int.
    @type: str

    @mut_colname: name of the column to add with 1 letter mutant type aa
    @type: str

    returns df: the same df object (modified in place) with 3 added columns.
    If column names clash, the function column name will override
    original column. Rows that do not match the regex get NaN in all 3
    columns; 3 letter codes that are not keys of low_3letter_dict get NaN
    in the corresponding aa column.
    @rtype: pandas df
    """
    # static regex
    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'

    # run the extraction once: the result has one column per capture group
    # (0: wt aa, 1: position, 2: mut aa)
    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)
    gwas_wt = gwas_parts[0]
    gwas_pos = gwas_parts[1]
    gwas_mut = gwas_parts[2]

    # NOTE(review): the regex accepts upper and lower case letters but the
    # captured codes are looked up as is; presumably low_3letter_dict keys
    # are lowercase, so mixed case codes would map to NaN -- TODO confirm
    lookup_dict_aa_1upper = {k: v['one_letter_code']
                             for k, v in low_3letter_dict.items()}

    if DEBUG:
        print('\n======================================================\n'
              , '\nDICT:', lookup_dict_aa_1upper, '\n')

    # wild type
    df[wt_colname] = gwas_wt.map(lookup_dict_aa_1upper)

    # position
    df[pos_colname] = gwas_pos

    # mutant type
    df[mut_colname] = gwas_mut.map(lookup_dict_aa_1upper)

    return df

#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 = pd.read_csv(test_filename2 , sep = ',')

# get_aa_1upper(df = test_df2
#               , gwas_mut_colname = 'mutation'
#               , wt_colname = 'wild_type'
#               , pos_colname = 'position'
#               , mut_colname = 'mutant_type')