tidy code and renamed kd.py to kd_df.py

2020-03-26 15:43:13 +00:00 · 2020-03-26 15:43:13 +00:00 · 0b7a938fbd
commit 0b7a938fbd
parent 4c2fa2b600
6 changed files with 156 additions and 194 deletions
--- a/meta_data_analysis/kd_df.py
+++ b/meta_data_analysis/kd_df.py
@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
+# Task: Hydrophobicity (Kd) values for amino acid sequence using the
+# Kyte & Doolittle scale.
+# Same output as using the expasy server https://web.expasy.org/protscale/
+# Input: fasta file
+    
+# Output: csv file with columns: position, wild_type, kd_values
+
+
+# useful links
+# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
+# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
+#=======================================================================
+#%% load packages
+from pylab import *
+from Bio.SeqUtils import ProtParamData
+from Bio.SeqUtils.ProtParam import ProteinAnalysis
+from Bio import SeqIO
+#from Bio.Alphabet.IUPAC import IUPACProtein
+#import pprint as pp
+import pandas as pd
+import numpy as np
+import sys, os
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output 
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = '3pl1.fasta.txt'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
+
+print('======================================================================')
+#=======
+# output 
+#=======
+outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_kd.csv'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir)
+
+print('======================================================================')
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%%specify window size for hydropathy profile computation
+# https://web.expasy.org/protscale/pscale/protscale_help.html
+my_window = 3
+offset = round((my_window/2)-0.5)
+
+fh = open(infile)
+
+for record in SeqIO.parse(fh, 'fasta'):
+    id = record.id
+    seq = record.seq
+    num_residues = len(seq)
+fh.close()
+
+sequence = str(seq)
+
+X = ProteinAnalysis(sequence)
+
+kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to  default (100%)
+
+# sanity checks 
+print('Sequence Length:', num_residues)
+print('kd_values Length:',len(kd_values))
+print('Window Length:', my_window)
+print('Window Offset:', offset)
+print('======================================================================')
+print('Checking:len(kd values) is as expected for the given window size & offset...')
+expected_length =  num_residues - (my_window - offset) 
+if len(kd_values) == expected_length:
+    print('PASS: expected and actual length of kd values match')
+else:
+    print('FAIL: length mismatch'
+          ,'\nExpected length:', expected_length
+          ,'\nActual length:', len(kd_values))
+    
+print('======================================================================')
+#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df 
+# which will allow easy merging of the two dfs.
+
+# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
+# Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
+dfSeq = pd.DataFrame({'wild_type':list(sequence)})
+dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
+
+# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
+dfVals = pd.DataFrame({'kd_values':kd_values})
+dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
+
+# sanity checks
+max(dfVals['kd_values'])
+min(dfVals['kd_values'])
+
+#============
+# merging dfs
+#============
+# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 
+# This will introduce NaN where there are missing values. In our case this will be 2 (first and last ones)
+# Conveniently, the last position in this case is not part of the struc, so not much loss of info
+# Needless to state that this will be variable for other targets.
+
+kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+
+#============================
+# Renaming index to position
+#============================
+kd_df = kd_df.rename_axis('position')
+kd_df.head
+print('======================================================================')
+
+print('position col i.e. index should be numeric')
+print('======================================================================')
+if kd_df.index.dtype == 'int64':
+    print('PASS: position col is numeric'
+          , '\ndtype is:', kd_df.index.dtype)
+else:
+    print('FAIL: position col is not numeric'
+          , '\nConverting to numeric')
+    kd_df.index.astype('int64')
+    print('Checking dtype for after conversion:\n'
+          ,'\ndtype is:', kd_df.index.dtype)
+    
+print('======================================================================')
+#%% write file
+print('Writing file:', out_filename
+      , '\nFilename:', out_filename
+      , '\nPath:',  outdir)
+
+kd_df.to_csv(outfile, header = True, index = True)
+
+print('Finished writing:', out_filename
+      , '\nNo. of rows:', len(kd_df)
+      , '\nNo. of cols:', len(kd_df.columns))
+
+#%% plot
+# http://www.dalkescientific.com/writings/NBN/plotting.html
+
+# FIXME: save fig
+# extract just the pdb id from 'id' to pass to title of plot
+# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
+plot(kd_values, linewidth = 1.0)
+#axis(xmin = 1, xmax = num_residues)
+xlabel('Residue Number')
+ylabel('Hydrophobicity')
+title('K&D Hydrophobicity for ' + id)
+show()
+
+print('======================================================================')
+#%% end of script
+#=======================================================================