tidy code and renamed kd.py to kd_df.py

2020-03-26 15:43:13 +00:00 · 2020-03-26 15:43:13 +00:00 · 0b7a938fbd
commit 0b7a938fbd
parent 4c2fa2b600
6 changed files with 156 additions and 194 deletions
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -5,58 +5,82 @@ Created on Tue Jun 18 11:32:28 2019

@author: tanushree
 """
-############################################
-# load libraries
+#=======================================================================
+# TASK: create an amino acid (aa) dict that maps three-letter and other
+# forms of aa codes to the one-letter aa code, together with aa properties.
+
+# Input: .csv file containing aa_code
+
+# Output: no file is written; this script is called by other .py scripts to perform this mapping.
+
+#=======================================================================
+#%% load packages
 import pandas as pd
 import os
-#############################################
+#=======================================================================
+#%% specify homedir and curr dir
+homedir = os.path.expanduser('~')

-#!#########################!
-# REQUIREMNETS:
-# Data/ must exist
-# containing GWAS data
-#!#########################!
-
-print(os.getcwd()) 
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-os.chdir(homedir + '/git/Data/pyrazinamide/input/original') 
-print(os.getcwd())
-
-#%%
-############# specify variables for input and output paths and filenames
+# set working dir
+#os.getcwd()
+#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+#os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output
 drug = 'pyrazinamide'
-#gene = 'pnca'
-datadir = homedir + '/git/Data'
-basedir =  datadir + '/' + drug + '/input'
+gene = 'pncA'
+gene_match = gene + '_p.'

+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
 # input
-inpath = "/original"
-in_filename  = "/aa_codes.csv"
-infile = basedir + inpath + in_filename
-print(infile)
+#=======
+indir = datadir + '/' + drug + '/' + 'input'  # separator needed between drug and 'input'; NOTE(review): old code read from input/original - confirm file location
+in_filename = 'aa_codes.csv'
+infile = indir + '/' + in_filename  # full path of the aa codes csv passed to pd.read_csv
+print('Input filename:', in_filename
+      , '\nInput path:', indir)

-#==========
-#read file
-#==========
+#=======
+# output: No output
+#=======
+
+#outdir = datadir + '/' + drug + '/' + 'output'  
+#out_filename = ''
+#outfile = outdir + '/' + out_filename
+#print('Output filename:', out_filename
+#      , '\nOutput path:', outdir)
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
 my_aa = pd.read_csv(infile) #20, 6
-#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+
+# assign the lowercase three-letter code as the row names (index) so that it
+# is easier to create a dict of dicts using the index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5

-#=========================================================
-#convert file to  dict of dicts
-#=========================================================
-#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
-#with your choice of column name that you have assigned to index as the "primary key". 
-#using 'index' creates a dict of dicts
-#using 'records' creates a list of dicts
+#==================
+# convert file 
+# to dict of dicts
+#====================
+# convert the dataframe into a dict of dicts: 20 aa entries, each holding 5 keys,
+# keyed by the column that was assigned to the index (acting as the "primary key").
+# using 'index' creates a dict of dicts
+# using 'records' creates a list of dicts
 my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys

 #================================================
-#dict of aa with their corresponding properties
-#This is defined twice
+# dict of aa with their corresponding properties
+# This is defined twice
 #================================================
-#7 categories: no overlap
+# 7 categories: no overlap
 qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('N', 'Q'): 'Amidic'
@ -66,7 +90,7 @@ qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'M'): 'Sulphur'
 }

-#9 categories: allowing for overlap
+# 9 categories: allowing for overlap
 qualities2 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidc'
             , ('S', 'T', 'N', 'Q'): 'Polar'
@ -78,6 +102,7 @@ qualities2 = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'G', 'P'): 'Special'
 }

+# taylor classification: allowing for overlap
 qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidc'
             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
@ -89,17 +114,19 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'G', 'P'): 'Special'
 }

+# binary classification: hydrophilic or hydrophobic
 qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
 }

+# polarity: no overlap
 qualities_polarity = { ('D', 'E'): 'acidic'
                      , ('H', 'K', 'R'): 'basic'
                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
 }

-# almost same as the one above
+# almost same as the one above but as pos, neg, polar and non-polar
 aa_calcprop = { ('D', 'E'): 'neg'
                      , ('H', 'K', 'R'): 'pos'
                      , ('N', 'Q', 'S', 'T', 'Y'): 'polar'
@ -107,7 +134,7 @@ aa_calcprop = { ('D', 'E'): 'neg'
 }

 #==============================================================================                
-#adding amino acid properties to my dict of dicts                      
+# adding amino acid properties to my dict of dicts                      
 for k, v in my_aa_dict.items():
    #print (k,v)
    v['aa_prop1'] = str() #initialise keys 
@ -141,7 +168,9 @@ for k, v in my_aa_dict.items():
        if v['one_letter_code'] in group:
            v['aa_calcprop']+= aa_calcprop[group] # += for str concat 
             
-#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
+# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all 
+# associated properties for each aa
 #==============================================================================
+#%% end of script