LSHTM_analysis/scripts/reference_dict.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019

@author: tanushree
"""
#=======================================================================
# TASK: create an amino acid (aa) dict that maps three-letter and other
# combinations of aa codes to the one-letter aa code, together with aa
# properties.

# Input: .csv file containing the aa codes

# Output: none written to disk; this script is called by other .py scripts
# to perform the mapping.

#=======================================================================
#%% load packages
import pandas as pd
import os
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# The chdir calls below are commented out, so the working directory is
# not changed by this script; they are kept for reference only.
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#=======================================================================
#%% variable assignment: input and output
# drug/gene settings are not used by this script (kept for reference)
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

#==========
# data dir: <home>/git/Data
#==========
datadir = '/'.join([homedir, 'git/Data'])

#=======
# input: the aa code table, located inside the data dir
#=======
in_filename = 'aa_codes.csv'
infile = '/'.join([datadir, in_filename])

# Report which file will be read. The pieces are joined with single spaces,
# which is exactly what print() does with several positional arguments.
print(' '.join([
    'Input filename:',
    in_filename,
    '\nInput path:',
    datadir,
    '\n============================================================',
]))

#=======
# output: No output
#=======
#outdir = datadir + '/' + drug + '/' + 'output'
#out_filename = ''
#outfile = outdir + '/' + out_filename
#print('Output filename:', out_filename
#      , '\nOutput path:', outdir)

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
# Load the aa code table and use the 'three_letter_code_lower' column as the
# row index, so that each row can be addressed by that code.
# (author's note: 20 rows x 6 columns as read, 5 columns once indexed)
my_aa = pd.read_csv(infile).set_index('three_letter_code_lower')

#==================
# convert table
# to dict of dicts
#==================
# orient = 'index' yields {index value: {column name: cell value}}, i.e. one
# inner dict per row keyed by the index chosen above
# (orient = 'records' would give a list of dicts instead).
my_aa_dict = my_aa.to_dict(orient = 'index')
print('Printing my_aa_dict:', my_aa_dict.keys())

#================================================
# dict of aa with their corresponding properties
# This is defined twice
#================================================
# Seven categories with no overlap: every key is a tuple of one-letter aa
# codes and every value is the label shared by those residues.
qualities1 = {
    ('R', 'H', 'K'): 'Basic',
    ('D', 'E'): 'Acidic',
    ('N', 'Q'): 'Amidic',
    ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic',
    ('S', 'T'): 'Hydroxylic',
    ('F', 'W', 'Y'): 'Aromatic',
    ('C', 'M'): 'Sulphur',
}

# Nine categories, overlap allowed: a residue may appear in several key
# tuples and therefore collect several labels. Keys are tuples of one-letter
# aa codes; values are the label for that group. Insertion order is the order
# in which labels are collected for a residue.
# NOTE: the acidic label was previously misspelt 'Acidc'; it now reads
# 'Acidic', matching qualities1.
qualities2 = {
    ('R', 'H', 'K'): 'Basic',
    ('D', 'E'): 'Acidic',
    ('S', 'T', 'N', 'Q'): 'Polar',
    ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic',
    ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic',
    ('S', 'G', 'A', 'P'): 'Small',
    ('F', 'W', 'Y', 'H'): 'Aromatic',
    ('V', 'I', 'L', 'M'): 'Aliphatic',
    ('C', 'G', 'P'): 'Special',
}

# taylor classification, overlap allowed: a residue may appear in several
# key tuples and therefore collect several labels. Keys are tuples of
# one-letter aa codes; values are the label for that group.
# NOTE: the acidic label was previously misspelt 'Acidc'; it now reads
# 'Acidic'.
qualities_taylor = {
    ('R', 'H', 'K'): 'Basic',
    ('D', 'E'): 'Acidic',
    ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar',
    ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic',
    # 'Hydrophilic' group deliberately left out (author's note: C, W, Y
    # MISSING FROM POLAR!):
    #('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic',
    ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small',
    ('F', 'W', 'Y', 'H'): 'Aromatic',
    # author's note: M is not strictly in the circle
    ('V', 'I', 'L', 'M'): 'Aliphatic',
    ('C', 'G', 'P'): 'Special',
}

# Two-way split of the one-letter aa codes into hydrophilic and hydrophobic.
# The hydrophobic tuple also carries 'X' (presumably the "unknown residue"
# code -- TODO confirm).
qualities_water = {
    ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic',
    (
        'A', 'C', 'F', 'G', 'H', 'I', 'K',
        'L', 'M', 'T', 'V', 'W', 'X', 'Y',
    ): 'hydrophobic',
}

# Polarity classes with no overlap: each one-letter aa code sits in exactly
# one key tuple, so a residue gets a single polarity label.
qualities_polarity = {
    ('D', 'E'): 'acidic',
    ('H', 'K', 'R'): 'basic',
    ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral',
    ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar',
}

# Close to qualities_polarity, but labelled neg / pos / polar / non-polar;
# here C and G are grouped with the non-polar residues.
aa_calcprop = {
    ('D', 'E'): 'neg',
    ('H', 'K', 'R'): 'pos',
    ('N', 'Q', 'S', 'T', 'Y'): 'polar',
    ('C', 'G', 'A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar',
}

#==============================================================================
# Attach the property labels to every inner dict of my_aa_dict.
def _labels_for(aa_code, scheme):
    """Return, in scheme order, the labels of every group containing aa_code.

    scheme maps tuples of one-letter aa codes to a label; the result is an
    empty list when aa_code belongs to no group.
    """
    return [label for members, label in scheme.items() if aa_code in members]


for aa_record in my_aa_dict.values():
    aa_code = aa_record['one_letter_code']

    # Keys are added in this fixed order. The string-valued keys hold the
    # concatenation of all matching labels ('' when nothing matches); the
    # list-valued keys keep each matching label separately, which allows
    # for overlapping properties.
    aa_record['aa_prop1'] = ''.join(_labels_for(aa_code, qualities1))
    aa_record['aa_prop2'] = _labels_for(aa_code, qualities2)
    aa_record['aa_taylor'] = _labels_for(aa_code, qualities_taylor)
    aa_record['aa_prop_water'] = ''.join(_labels_for(aa_code, qualities_water))
    aa_record['aa_prop_polarity'] = ''.join(_labels_for(aa_code, qualities_polarity))
    aa_record['aa_calcprop'] = ''.join(_labels_for(aa_code, aa_calcprop))

# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all
# associated properties for each aa
#==============================================================================
#%% end of script