tidy code and renamed kd.py to kd_df.py
This commit is contained in:
parent
73e0029b65
commit
a074d29f6e
6 changed files with 156 additions and 194 deletions
|
@ -1,109 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
Created on Thu Feb 6 12:18:24 2020
|
|
||||||
|
|
||||||
@author: tanu
|
|
||||||
"""
|
|
||||||
#=======================================================================
|
|
||||||
# Task: Residue Depth (rd) values for amino acid sequence using the
|
|
||||||
# Depth server.
|
|
||||||
# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
|
|
||||||
# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload
|
|
||||||
# Input: PDB file (valid pdb code)
|
|
||||||
|
|
||||||
# Output:
|
|
||||||
|
|
||||||
# useful links
|
|
||||||
# http://foldxsuite.crg.eu/faq-page#
|
|
||||||
# after fold x downlaoded, extract and run it from
|
|
||||||
# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
|
|
||||||
# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
|
|
||||||
# needs biopython and msms
|
|
||||||
#=======================================================================
|
|
||||||
#%% load packages
|
|
||||||
import sys, os
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
from Bio.PDB.ResidueDepth import ResidueDepth
|
|
||||||
from Bio.PDB.PDBParser import PDBParser
|
|
||||||
from Bio.PDB.ResidueDepth import get_surface
|
|
||||||
|
|
||||||
#%% specify input and output variables
|
|
||||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
|
||||||
|
|
||||||
# set working dir
|
|
||||||
os.getcwd()
|
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
|
||||||
os.getcwd()
|
|
||||||
#=======================================================================
|
|
||||||
#%% variable assignment: input and output paths & filenames
|
|
||||||
drug = 'pyrazinamide'
|
|
||||||
gene = 'pncA'
|
|
||||||
gene_match = gene + '_p.'
|
|
||||||
|
|
||||||
#==========
|
|
||||||
# data dir
|
|
||||||
#==========
|
|
||||||
#indir = 'git/Data/pyrazinamide/input/original'
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# input
|
|
||||||
#=======
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
|
||||||
in_filename = '3pl1.pdb'
|
|
||||||
infile = indir + '/' + in_filename
|
|
||||||
print('Input filename:', in_filename
|
|
||||||
, '\nInput path:', indir)
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# output
|
|
||||||
#=======
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
# specify output file
|
|
||||||
out_filename = 'XXX'
|
|
||||||
outfile = outdir + '/' + out_filename
|
|
||||||
print('Output filename: ', out_filename
|
|
||||||
, '\nOutput path: ', outdir)
|
|
||||||
|
|
||||||
#%% end of variable assignment for input and output files
|
|
||||||
#================================================================
|
|
||||||
# Read input pdb file
|
|
||||||
parser = PDBParser()
|
|
||||||
|
|
||||||
# extract the 3 letter pdb code
|
|
||||||
pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1)
|
|
||||||
|
|
||||||
#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
|
|
||||||
structure = parser.get_structure(pdb_code, infile)
|
|
||||||
model = structure[0]
|
|
||||||
surface = get_surface(model)
|
|
||||||
|
|
||||||
rd = ResidueDepth(model)
|
|
||||||
print(rd['A',(' ', 152, ' ')])
|
|
||||||
rd.keys()
|
|
||||||
foo = rd.property_dict
|
|
||||||
rd.property_keys
|
|
||||||
baz = rd.property_list
|
|
||||||
|
|
||||||
|
|
||||||
# To calculate the residue depth (average atom depth of the atoms in a residue):
|
|
||||||
from Bio.PDB.ResidueDepth import residue_depth
|
|
||||||
chain = model['A']
|
|
||||||
res152 = chain[152]
|
|
||||||
rd2 = residue_depth(res152, surface)
|
|
||||||
|
|
||||||
# df from dict
|
|
||||||
foo1 = pd.DataFrame.from_dict(baz, orient='index', columns = ['res_depth', 'surface'])
|
|
||||||
test = pd.Series(foo, name = "test")
|
|
||||||
|
|
||||||
# df from list
|
|
||||||
foo2 = pd.DataFrame(baz, columns = ['residue', 'residue depth'])
|
|
||||||
|
|
||||||
|
|
||||||
### iterate
|
|
||||||
for i in range(185):
|
|
||||||
print(i)
|
|
||||||
rd3 = residue_depth(res+i, surface)
|
|
||||||
print(rd3)
|
|
|
@ -439,11 +439,11 @@ dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.ls
|
||||||
# extract only the samples/rows with pncA_p.
|
# extract only the samples/rows with pncA_p.
|
||||||
dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)]
|
dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)]
|
||||||
|
|
||||||
print('lengths after tidy split and extracting', gene_match, 'muts:',
|
print('lengths after tidy split and extracting', gene_match, 'muts:'
|
||||||
'\nold length:' , len(meta_pnca_dr),
|
, '\nold length:' , len(meta_pnca_dr)
|
||||||
'\nlen after split:', len(dr_WF0),
|
, '\nlen after split:', len(dr_WF0)
|
||||||
'\ndr_pnca_WF0 length:', len(dr_pnca_WF0),
|
, '\ndr_pnca_WF0 length:', len(dr_pnca_WF0)
|
||||||
'\nexpected len:', dr_pnca_count)
|
, '\nexpected len:', dr_pnca_count)
|
||||||
|
|
||||||
if len(dr_pnca_WF0) == dr_pnca_count:
|
if len(dr_pnca_WF0) == dr_pnca_count:
|
||||||
print('PASS: length of dr_pnca_WF0 match with expected length')
|
print('PASS: length of dr_pnca_WF0 match with expected length')
|
||||||
|
|
|
@ -1,8 +1,22 @@
|
||||||
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
|
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
|
||||||
# Read a DSSP file into a data frame and pretty-print it
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Feb 18 10:10:12 2020
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#=======================================================================
|
||||||
|
# Task: Read a DSSP file into a data frame and output to a csv file
|
||||||
|
|
||||||
|
# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
|
||||||
|
|
||||||
|
# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc.
|
||||||
|
|
||||||
|
# useful links:
|
||||||
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
||||||
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
||||||
|
#=======================================================================
|
||||||
|
#%% load packages
|
||||||
import sys, os
|
import sys, os
|
||||||
import re
|
import re
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
@ -13,16 +27,16 @@ import pprint as pp
|
||||||
#from Bio.PDB.PDBParser import PDBParser
|
#from Bio.PDB.PDBParser import PDBParser
|
||||||
import dms_tools2
|
import dms_tools2
|
||||||
import dms_tools2.dssp
|
import dms_tools2.dssp
|
||||||
|
#=======================================================================#
|
||||||
#%% specify input and output variables
|
#%% specify homedir and curr dir
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
#%% set working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
#gene_match = gene + '_p.'
|
#gene_match = gene + '_p.'
|
||||||
|
@ -57,7 +71,7 @@ print('Output filename:', out_filename
|
||||||
,'\nOutfile: ', outfile)
|
,'\nOutfile: ', outfile)
|
||||||
|
|
||||||
#%% end of variable assignment for input and output files
|
#%% end of variable assignment for input and output files
|
||||||
#================================================================
|
#=======================================================================
|
||||||
# Process dssp output and extract into df
|
# Process dssp output and extract into df
|
||||||
dssp_file = infile
|
dssp_file = infile
|
||||||
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
|
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
|
||||||
|
@ -65,6 +79,11 @@ dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
|
||||||
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
|
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
|
||||||
pp.pprint(dssp_df)
|
pp.pprint(dssp_df)
|
||||||
|
|
||||||
|
#=====================
|
||||||
|
# Renaming amino-acid
|
||||||
|
# and site columns
|
||||||
|
#=====================
|
||||||
|
|
||||||
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
|
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
|
||||||
# to be the same names as used in the file required for merging later.
|
# to be the same names as used in the file required for merging later.
|
||||||
dssp_df.columns
|
dssp_df.columns
|
||||||
|
@ -83,3 +102,5 @@ print('Finished writing:', out_filename
|
||||||
, '\nNo. of rows:', len(dssp_df)
|
, '\nNo. of rows:', len(dssp_df)
|
||||||
, '\nNo. of cols:', len(dssp_df.columns))
|
, '\nNo. of cols:', len(dssp_df.columns))
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
|
#%% end of script
|
||||||
|
#=======================================================================
|
||||||
|
|
|
@ -9,6 +9,10 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
|
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
|
||||||
# Kyt&-Doolittle.
|
# Kyt&-Doolittle.
|
||||||
# Same output as using the expasy server https://web.expasy.org/protscale/
|
# Same output as using the expasy server https://web.expasy.org/protscale/
|
||||||
|
# Input: fasta file
|
||||||
|
|
||||||
|
# Output: csv file with
|
||||||
|
|
||||||
|
|
||||||
# useful links
|
# useful links
|
||||||
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
|
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
|
||||||
|
@ -24,8 +28,8 @@ from Bio import SeqIO
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sys, os
|
import sys, os
|
||||||
|
#=======================================================================
|
||||||
#%% specify input and output variables
|
#%% specify input and curr dir
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# set working dir
|
# set working dir
|
||||||
|
@ -33,7 +37,7 @@ os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
@ -54,6 +58,7 @@ infile = indir + '/' + in_filename
|
||||||
print('Input filename:', in_filename
|
print('Input filename:', in_filename
|
||||||
, '\nInput path:', indir)
|
, '\nInput path:', indir)
|
||||||
|
|
||||||
|
print('======================================================================')
|
||||||
#=======
|
#=======
|
||||||
# output
|
# output
|
||||||
#=======
|
#=======
|
||||||
|
@ -63,8 +68,10 @@ outfile = outdir + '/' + out_filename
|
||||||
print('Output filename:', out_filename
|
print('Output filename:', out_filename
|
||||||
, '\nOutput path:', outdir)
|
, '\nOutput path:', outdir)
|
||||||
|
|
||||||
#%%11
|
print('======================================================================')
|
||||||
# specify window size for hydropathy profile computation
|
#%% end of variable assignment for input and output files
|
||||||
|
#=======================================================================
|
||||||
|
#%%specify window size for hydropathy profile computation
|
||||||
# https://web.expasy.org/protscale/pscale/protscale_help.html
|
# https://web.expasy.org/protscale/pscale/protscale_help.html
|
||||||
my_window = 3
|
my_window = 3
|
||||||
offset = round((my_window/2)-0.5)
|
offset = round((my_window/2)-0.5)
|
||||||
|
@ -115,27 +122,48 @@ dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
|
||||||
max(dfVals['kd_values'])
|
max(dfVals['kd_values'])
|
||||||
min(dfVals['kd_values'])
|
min(dfVals['kd_values'])
|
||||||
|
|
||||||
|
#============
|
||||||
|
# merging dfs
|
||||||
|
#============
|
||||||
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
|
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
|
||||||
# This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
|
# This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
|
||||||
# Conveniently, the last position in this case is not part of the struc, so not much loss of info
|
# Conveniently, the last position in this case is not part of the struc, so not much loss of info
|
||||||
# Needless to state that this will be variable for other targets.
|
# Needless to state that this will be variable for other targets.
|
||||||
|
|
||||||
df = pd.concat([dfSeq, dfVals], axis = 1)
|
kd_df = pd.concat([dfSeq, dfVals], axis = 1)
|
||||||
# rename index to position
|
|
||||||
df = df.rename_axis('position')
|
#============================
|
||||||
print(df)
|
# Renaming index to position
|
||||||
|
#============================
|
||||||
|
kd_df = kd_df.rename_axis('position')
|
||||||
|
kd_df.head
|
||||||
|
print('======================================================================')
|
||||||
|
|
||||||
|
print('position col i.e. index should be numeric')
|
||||||
|
print('======================================================================')
|
||||||
|
if kd_df.index.dtype == 'int64':
|
||||||
|
print('PASS: position col is numeric'
|
||||||
|
, '\ndtype is:', kd_df.index.dtype)
|
||||||
|
else:
|
||||||
|
print('FAIL: position col is not numeric'
|
||||||
|
, '\nConverting to numeric')
|
||||||
|
kd_df.index.astype('int64')
|
||||||
|
print('Checking dtype for after conversion:\n'
|
||||||
|
,'\ndtype is:', kd_df.index.dtype)
|
||||||
|
|
||||||
|
print('======================================================================')
|
||||||
#%% write file
|
#%% write file
|
||||||
print('Writing file:', out_filename
|
print('Writing file:', out_filename
|
||||||
, '\nFilename:', out_filename
|
, '\nFilename:', out_filename
|
||||||
, '\nPath:', outdir)
|
, '\nPath:', outdir)
|
||||||
|
|
||||||
df.to_csv(outfile, header = True, index = True)
|
kd_df.to_csv(outfile, header = True, index = True)
|
||||||
|
|
||||||
print('Finished writing:', out_filename
|
print('Finished writing:', out_filename
|
||||||
, '\nNo. of rows:', len(df)
|
, '\nNo. of rows:', len(kd_df)
|
||||||
, '\nNo. of cols:', len(df.columns))
|
, '\nNo. of cols:', len(kd_df.columns))
|
||||||
|
|
||||||
#%% Plot
|
#%% plot
|
||||||
# http://www.dalkescientific.com/writings/NBN/plotting.html
|
# http://www.dalkescientific.com/writings/NBN/plotting.html
|
||||||
|
|
||||||
# FIXME: save fig
|
# FIXME: save fig
|
||||||
|
@ -147,4 +175,7 @@ xlabel('Residue Number')
|
||||||
ylabel('Hydrophobicity')
|
ylabel('Hydrophobicity')
|
||||||
title('K&D Hydrophobicity for ' + id)
|
title('K&D Hydrophobicity for ' + id)
|
||||||
show()
|
show()
|
||||||
|
|
||||||
|
print('======================================================================')
|
||||||
#%% end of script
|
#%% end of script
|
||||||
|
#=======================================================================
|
|
@ -10,15 +10,17 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
# TASK: calculate how many mutations result in
|
# TASK: calculate how many mutations result in
|
||||||
# electrostatic changes wrt wt
|
# electrostatic changes wrt wt
|
||||||
|
|
||||||
# Input: mcsm and AF_OR file
|
# Input: mcsm and AF_OR file
|
||||||
|
|
||||||
# Output: mut_elec_changes_results.txt
|
# Output: mut_elec_changes_results.txt
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% load libraries
|
#%% load libraries
|
||||||
import os, sys
|
import os, sys
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
#import numpy as np
|
#import numpy as np
|
||||||
|
#=======================================================================
|
||||||
#%% specify homedir as python doesn't recognise tilde
|
#%% specify homedir and curr dir
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# set working dir
|
# set working dir
|
||||||
|
@ -26,7 +28,7 @@ os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
@ -59,7 +61,6 @@ print('Output filename: ', out_filename
|
||||||
#%% end of variable assignment for input and output files
|
#%% end of variable assignment for input and output files
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% Read input files
|
#%% Read input files
|
||||||
#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv'
|
|
||||||
print('Reading input file (merged file):', infile)
|
print('Reading input file (merged file):', infile)
|
||||||
|
|
||||||
comb_df = pd.read_csv(infile, sep = ',')
|
comb_df = pd.read_csv(infile, sep = ',')
|
||||||
|
@ -137,16 +138,10 @@ no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-pol
|
||||||
|
|
||||||
no_change_muts.mut_count.sum()
|
no_change_muts.mut_count.sum()
|
||||||
|
|
||||||
|
#%% output from console
|
||||||
###########
|
|
||||||
# Step 5: output from console
|
|
||||||
###########
|
|
||||||
#sys.stdout = open(file, 'w')
|
#sys.stdout = open(file, 'w')
|
||||||
sys.stdout = open(outfile, 'w')
|
sys.stdout = open(outfile, 'w')
|
||||||
|
|
||||||
#print(no_change_muts, '\n',
|
|
||||||
# all_prop_change)
|
|
||||||
|
|
||||||
print('======================\n'
|
print('======================\n'
|
||||||
,'Unchanged muts'
|
,'Unchanged muts'
|
||||||
,'\n=====================\n'
|
,'\n=====================\n'
|
||||||
|
@ -156,16 +151,11 @@ print('======================\n'
|
||||||
, '\n============================\n'
|
, '\n============================\n'
|
||||||
, all_prop_change)
|
, all_prop_change)
|
||||||
|
|
||||||
#print('======================================================================')
|
|
||||||
#print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
|
|
||||||
#print('Total no. of muts: ', total_muts)
|
|
||||||
#print('Total no. of changed muts: ', all_prop_change.mut_count.sum())
|
|
||||||
#print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() )
|
|
||||||
#print('=======================================================================')
|
|
||||||
|
|
||||||
print('========================================================================'
|
print('========================================================================'
|
||||||
, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
|
, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
|
||||||
, '\nTotal no. of muts: ', total_muts
|
, '\nTotal no. of muts: ', total_muts
|
||||||
, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
|
, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
|
||||||
, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
|
, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
|
||||||
, '\n=========================================================================')
|
, '\n=========================================================================')
|
||||||
|
#%% end of script
|
||||||
|
#=======================================================================
|
||||||
|
|
111
meta_data_analysis/reference_dict.py
Executable file → Normal file
111
meta_data_analysis/reference_dict.py
Executable file → Normal file
|
@ -5,58 +5,82 @@ Created on Tue Jun 18 11:32:28 2019
|
||||||
|
|
||||||
@author: tanushree
|
@author: tanushree
|
||||||
"""
|
"""
|
||||||
############################################
|
#=======================================================================
|
||||||
# load libraries
|
# TASK: creating an aa dict to map 3 letter and other combinations of
|
||||||
|
# aa codes to one-letter aa code and also with aa properties.
|
||||||
|
|
||||||
|
# Input: .csv file containing aa_code
|
||||||
|
|
||||||
|
# Output: is called by other .py script to perform this mapping.
|
||||||
|
|
||||||
|
#=======================================================================
|
||||||
|
#%% load packages
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
#############################################
|
#=======================================================================
|
||||||
|
#%% specify homedir and curr dir
|
||||||
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
#!#########################!
|
# set working dir
|
||||||
# REQUIREMNETS:
|
#os.getcwd()
|
||||||
# Data/ must exist
|
#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
# containing GWAS data
|
#os.getcwd()
|
||||||
#!#########################!
|
#=======================================================================
|
||||||
|
#%% variable assignment: input and output
|
||||||
print(os.getcwd())
|
|
||||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
|
||||||
os.chdir(homedir + '/git/Data/pyrazinamide/input/original')
|
|
||||||
print(os.getcwd())
|
|
||||||
|
|
||||||
#%%
|
|
||||||
############# specify variables for input and output paths and filenames
|
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
#gene = 'pnca'
|
gene = 'pncA'
|
||||||
datadir = homedir + '/git/Data'
|
gene_match = gene + '_p.'
|
||||||
basedir = datadir + '/' + drug + '/input'
|
|
||||||
|
|
||||||
|
#==========
|
||||||
|
# data dir
|
||||||
|
#==========
|
||||||
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
|
#=======
|
||||||
# input
|
# input
|
||||||
inpath = "/original"
|
#=======
|
||||||
in_filename = "/aa_codes.csv"
|
indir = datadir + '/' + drug + 'input'
|
||||||
infile = basedir + inpath + in_filename
|
in_filename = 'aa_codes.csv'
|
||||||
print(infile)
|
infile = indir + '/' + in_filename
|
||||||
|
print('Input filename:', in_filename
|
||||||
|
, '\nInput path:', indir)
|
||||||
|
|
||||||
#==========
|
#=======
|
||||||
#read file
|
# output: No output
|
||||||
#==========
|
#=======
|
||||||
|
|
||||||
|
#outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
|
#out_filename = ''
|
||||||
|
#outfile = outdir + '/' + out_filename
|
||||||
|
#print('Output filename:', out_filename
|
||||||
|
# , '\nOutput path:', outdir)
|
||||||
|
|
||||||
|
#%% end of variable assignment for input and output files
|
||||||
|
#=======================================================================
|
||||||
|
#%% Read input file
|
||||||
my_aa = pd.read_csv(infile) #20, 6
|
my_aa = pd.read_csv(infile) #20, 6
|
||||||
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
|
|
||||||
|
# assign the one_letter code as the row names so that it is easier to create
|
||||||
|
# a dict of dicts using index
|
||||||
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
|
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
|
||||||
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
||||||
|
|
||||||
#=========================================================
|
#==================
|
||||||
#convert file to dict of dicts
|
# convert file
|
||||||
#=========================================================
|
# to dict of dicts
|
||||||
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
|
#====================
|
||||||
#with your choice of column name that you have assigned to index as the "primary key".
|
# convert each row into a dict of dicts so that there are 20 aa and 5 keys within
|
||||||
#using 'index' creates a dict of dicts
|
# with your choice of column name that you have assigned to index as the "primary key".
|
||||||
#using 'records' creates a list of dicts
|
# using 'index' creates a dict of dicts
|
||||||
|
# using 'records' creates a list of dicts
|
||||||
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
||||||
|
|
||||||
#================================================
|
#================================================
|
||||||
#dict of aa with their corresponding properties
|
# dict of aa with their corresponding properties
|
||||||
#This is defined twice
|
# This is defined twice
|
||||||
#================================================
|
#================================================
|
||||||
#7 categories: no overlap
|
# 7 categories: no overlap
|
||||||
qualities1 = { ('R', 'H', 'K'): 'Basic'
|
qualities1 = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('D', 'E'): 'Acidic'
|
, ('D', 'E'): 'Acidic'
|
||||||
, ('N', 'Q'): 'Amidic'
|
, ('N', 'Q'): 'Amidic'
|
||||||
|
@ -66,7 +90,7 @@ qualities1 = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('C', 'M'): 'Sulphur'
|
, ('C', 'M'): 'Sulphur'
|
||||||
}
|
}
|
||||||
|
|
||||||
#9 categories: allowing for overlap
|
# 9 categories: allowing for overlap
|
||||||
qualities2 = { ('R', 'H', 'K'): 'Basic'
|
qualities2 = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('D', 'E'): 'Acidc'
|
, ('D', 'E'): 'Acidc'
|
||||||
, ('S', 'T', 'N', 'Q'): 'Polar'
|
, ('S', 'T', 'N', 'Q'): 'Polar'
|
||||||
|
@ -78,6 +102,7 @@ qualities2 = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('C', 'G', 'P'): 'Special'
|
, ('C', 'G', 'P'): 'Special'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# taylor classification: allowing for overlap
|
||||||
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
|
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('D', 'E'): 'Acidc'
|
, ('D', 'E'): 'Acidc'
|
||||||
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
|
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
|
||||||
|
@ -89,17 +114,19 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic'
|
||||||
, ('C', 'G', 'P'): 'Special'
|
, ('C', 'G', 'P'): 'Special'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# binary classification: hydrophilic or hydrophobic
|
||||||
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
|
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
|
||||||
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
|
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# polarity: no overlap
|
||||||
qualities_polarity = { ('D', 'E'): 'acidic'
|
qualities_polarity = { ('D', 'E'): 'acidic'
|
||||||
, ('H', 'K', 'R'): 'basic'
|
, ('H', 'K', 'R'): 'basic'
|
||||||
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
|
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
|
||||||
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
|
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
|
||||||
}
|
}
|
||||||
|
|
||||||
# almost same as the one above
|
# almost same as the one above but as pos, neg, polar and non-polar
|
||||||
aa_calcprop = { ('D', 'E'): 'neg'
|
aa_calcprop = { ('D', 'E'): 'neg'
|
||||||
, ('H', 'K', 'R'): 'pos'
|
, ('H', 'K', 'R'): 'pos'
|
||||||
, ('N', 'Q', 'S', 'T', 'Y'): 'polar'
|
, ('N', 'Q', 'S', 'T', 'Y'): 'polar'
|
||||||
|
@ -107,7 +134,7 @@ aa_calcprop = { ('D', 'E'): 'neg'
|
||||||
}
|
}
|
||||||
|
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
#adding amino acid properties to my dict of dicts
|
# adding amino acid properties to my dict of dicts
|
||||||
for k, v in my_aa_dict.items():
|
for k, v in my_aa_dict.items():
|
||||||
#print (k,v)
|
#print (k,v)
|
||||||
v['aa_prop1'] = str() #initialise keys
|
v['aa_prop1'] = str() #initialise keys
|
||||||
|
@ -141,7 +168,9 @@ for k, v in my_aa_dict.items():
|
||||||
if v['one_letter_code'] in group:
|
if v['one_letter_code'] in group:
|
||||||
v['aa_calcprop']+= aa_calcprop[group] # += for str concat
|
v['aa_calcprop']+= aa_calcprop[group] # += for str concat
|
||||||
|
|
||||||
#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
|
# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all
|
||||||
|
# associated properties for each aa
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
|
#%% end of script
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue