LSHTM_analysis/meta_data_analysis/kd.py

#!/usr/bin/python
#%%
# Task: Compute hydrophobicity (KD) values for an amino acid sequence using the Kyte-Doolittle scale.
# Gives the same output as the ExPASy ProtScale server: https://web.expasy.org/protscale/

# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#%%
# load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#%%
# specify input and output variables
# All data locations are resolved relative to the user's home directory.
homedir = os.path.expanduser('~')

#=======
# input
#=======
# FASTA file holding the protein sequence to analyse
indir = 'git/Data/pyrazinamide/input/original'
in_filename = '3pl1.fasta.txt'
infile = '/'.join((homedir, indir, in_filename))
print(infile)

#=======
# output
#=======
# CSV file that will receive the per-position hydrophobicity table
outdir = 'git/Data/pyrazinamide/output'
out_filename = 'kd.csv'
outfile = '/'.join((homedir, outdir, out_filename))
print(outfile)
#%%
# specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3
# Number of residues in the window that precede its central residue; the first
# kd value is later assigned to sequence position offset + 1.
# Integer arithmetic is used instead of round(): round() rounds halves to the
# nearest even integer, which gave inconsistent offsets for even window sizes.
offset = (my_window - 1) // 2

# Read the sequence from the FASTA file. The context manager closes the handle
# even if parsing fails. If the file holds several records, the last one is used.
id = None
seq = None
with open(infile) as fh:
    for record in SeqIO.parse(fh, "fasta"):
        id = record.id
        seq = record.seq

# Fail with a clear message rather than a NameError further down
if seq is None:
    raise ValueError("No FASTA records found in " + infile)

num_residues = len(seq)
sequence = str(seq)

X = ProteinAnalysis(sequence)

# Kyte-Doolittle hydropathy profile over a sliding window of my_window residues
# (edge weight is left at its default, 100%)
kd_values = X.protein_scale(ProtParamData.kd, window = my_window)

# sanity checks: report the sizes that determine how the kd values line up
# with the sequence positions
for label, value in (
    ('Sequence Length:', num_residues),
    ('kd_values Length:', len(kd_values)),
    ('Window Length:', my_window),
    ('Window Offset:', offset),
):
    print(label, value)

# Build one DataFrame for the amino acid sequence and one for the kd_values,
# each indexed by residue position, so the two can be aligned on their index.

# df1: amino acid sequence, indexed from 1 (the actual aa position in the sequence)
dfSeq = pd.DataFrame({'aa_wt': list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1)  # the upper bound of arange is exclusive

# df2: kd_values, indexed from offset + 1 for as many positions as there are values
dfVals = pd.DataFrame({'kd_values': kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

# sanity checks: print the range of the computed values (these were previously
# evaluated as bare expressions, so nothing was shown when run as a script)
print('kd_values Max:', max(dfVals['kd_values']))
print('kd_values Min:', min(dfVals['kd_values']))

# Merge the two on index (both indexes are aa position numbers). Positions that
# have no kd value (the sequence ends not covered by a window) come out as NaN.
df = pd.concat([dfSeq, dfVals], axis = 1)
# rename index to position
df = df.rename_axis('position')
print(df)
#%%
# Save the merged table as CSV, keeping both the column header row and the
# 'position' index column.
df.to_csv(path_or_buf = outfile, index = True, header = True)
#%% Plot
# http://www.dalkescientific.com/writings/NBN/plotting.html
# Plot the values against residue positions (the same numbering given to the
# kd values in dfVals: offset + 1 onwards) instead of the 0-based list index,
# so that the x axis really shows the residue number its label promises.
kd_positions = range(offset + 1, offset + 1 + len(kd_values))
plot(kd_positions, kd_values, linewidth = 1.0)
xlabel("Residue Number")
ylabel("Hydrophobicity")
title("K&D Hydrophobicity for " + id)
show()
#%% end of script