#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019

@author: tanushree
"""
#=======================================================================
# TASK: creating an aa dict to map 3 letter and other combinations of
# aa codes to one-letter aa code and also with aa properties.

# Input: .csv file containing aa_code

# Output: no file output; other .py scripts import this module and use
#         my_aa_dict (and the property tables below) for the mapping.
#=======================================================================
#%% load packages
import os

import pandas as pd
#=======================================================================
#%% specify homedir
homedir = os.path.expanduser('~')
#=======================================================================
#%% variable assignment: input and output
#==========
# data dir
#==========
datadir = os.path.join(homedir, 'git', 'Data')

#=======
# input
#=======
in_filename = 'aa_codes.csv'
infile = os.path.join(datadir, in_filename)
print('Input filename:', in_filename
      , '\nInput path:', datadir
      , '\n============================================================')

#=======
# output: No output
#=======
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
# NOTE(review): the original author's shape notes say the file is 20 rows
# by 6 columns (one row per amino acid) -- confirm against aa_codes.csv.
my_aa = pd.read_csv(infile)

# Use the lower-case three-letter code as the row index so that it becomes
# the "primary key" of the dict of dicts built below. The remaining columns
# (including 'one_letter_code') become the keys of each inner dict.
my_aa = my_aa.set_index('three_letter_code_lower')

#==================
# convert file
# to dict of dicts
#==================
# orient='index' gives {index_value: {column: value, ...}, ...}, i.e. one
# inner dict per row (orient='records' would give a list of dicts instead).
my_aa_dict = my_aa.to_dict(orient='index')
print('Printing my_aa_dict:', my_aa_dict.keys())

#================================================
# dicts of aa with their corresponding properties
# Each table maps a tuple of one-letter codes to a property label.
# NOTE(review): the original author remarked "This is defined twice" --
# presumably these tables are duplicated elsewhere; confirm before editing.
#================================================
# 7 categories: no overlap
qualities1 = {
    ('R', 'H', 'K'): 'Basic'
    , ('D', 'E'): 'Acidic'
    , ('N', 'Q'): 'Amidic'
    , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
    , ('S', 'T'): 'Hydroxylic'
    , ('F', 'W', 'Y'): 'Aromatic'
    , ('C', 'M'): 'Sulphur'
}

# 9 categories: allowing for overlap
qualities2 = {
    ('R', 'H', 'K'): 'Basic'
    , ('D', 'E'): 'Acidic'  # was misspelt 'Acidc'; now matches qualities1
    , ('S', 'T', 'N', 'Q'): 'Polar'
    , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
    , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
    , ('S', 'G', 'A', 'P'): 'Small'
    , ('F', 'W', 'Y', 'H'): 'Aromatic'
    , ('V', 'I', 'L', 'M'): 'Aliphatic'
    , ('C', 'G', 'P'): 'Special'
}

# taylor classification: allowing for overlap
# (no separate 'Hydrophilic' group here, unlike qualities2)
qualities_taylor = {
    ('R', 'H', 'K'): 'Basic'
    , ('D', 'E'): 'Acidic'  # was misspelt 'Acidc'; now matches qualities1
    , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
    , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
    , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
    , ('F', 'W', 'Y', 'H'): 'Aromatic'
    , ('V', 'I', 'L', 'M'): 'Aliphatic'  # although M is not strictly in the circle!
    , ('C', 'G', 'P'): 'Special'
}

# binary classification: hydrophilic or hydrophobic
# ('X' is listed in the hydrophobic group in addition to the 20 standard codes)
qualities_water = {
    ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
    , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
}

# polarity: no overlap
qualities_polarity = {
    ('D', 'E'): 'acidic'
    , ('H', 'K', 'R'): 'basic'
    , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
    , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
}

# almost same as the one above but as pos, neg, polar and non-polar
aa_calcprop = {
    ('D', 'E'): 'neg'
    , ('H', 'K', 'R'): 'pos'
    , ('N', 'Q', 'S', 'T', 'Y'): 'polar'
    , ('C', 'G', 'A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
}

#==============================================================================
# adding amino acid properties to my dict of dicts

# (key added to each inner dict, lookup table, allows overlapping groups?)
# The order here is the order in which the keys are added to each inner dict.
_PROPERTY_TABLES = (
    ('aa_prop1', qualities1, False)
    , ('aa_prop2', qualities2, True)
    , ('aa_taylor', qualities_taylor, True)
    , ('aa_prop_water', qualities_water, False)
    , ('aa_prop_polarity', qualities_polarity, False)
    , ('aa_calcprop', aa_calcprop, False)
)


def _labels_for(one_letter_code, groups):
    """Return the labels of every group in *groups* containing the code.

    Labels are returned in the table's definition order; the list is empty
    when the code belongs to no group.
    """
    return [label for members, label in groups.items()
            if one_letter_code in members]


def _add_properties(aa_dict):
    """Add the property keys from _PROPERTY_TABLES to each inner dict, in place.

    Overlapping classifications are stored as a list of labels. The other
    classifications are stored as a single string: the matching label, or
    an empty string when the code matches no group.
    """
    for aa_entry in aa_dict.values():
        one_letter = aa_entry['one_letter_code']
        for prop_key, groups, allows_overlap in _PROPERTY_TABLES:
            labels = _labels_for(one_letter, groups)
            aa_entry[prop_key] = labels if allows_overlap else ''.join(labels)


_add_properties(my_aa_dict)

# my_aa_dict is now a dict of dicts containing all
# associated properties for each aa
#==============================================================================
#%% end of script