remove old surface_res3.py

This commit is contained in:
Tanushree Tunstall 2020-02-20 12:23:56 +00:00
parent d7ef8ef51e
commit ed8fc4d488
2 changed files with 89 additions and 70 deletions

View file

@ -1,20 +1,94 @@
#!/usr/bin/python
#%%
# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
# Same output as using the expasy server https://web.expasy.org/protscale/
# hydrophobicity and SAA
#https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
import sys, os
import pandas as pd
import pprint as pp
#import dms_tools2
#import dms_tools2.dssp
# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#%%
# load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#%%
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
os.getcwd()
#%%
# specify variables
#=======
# input
#=======
homedir = os.path.expanduser('~')
indir = 'git/Data/pyrazinamide/input/original'
in_filename = "3pl1.fasta.txt"
infile = homedir + '/' + indir + '/' + in_filename
print(infile)
foo = ProtParamData.kd(3pl1.pdb)
#=======
# output
#=======
outdir = 'git/Data/pyrazinamide/output'
out_filename = "kd.csv"
outfile = homedir + '/' + outdir + '/' + out_filename
print(outfile)
#%%
# specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3
offset = round((my_window/2)-0.5)
fh = open(infile)
for record in SeqIO.parse(fh, "fasta"):
id = record.id
seq = record.seq
num_residues = len(seq)
fh.close()
sequence = str(seq)
X = ProteinAnalysis(sequence)
kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to default (100%)
# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:',len(kd_values))
print('Window Length:',my_window)
print('Window Offset:',offset)
# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
dfSeq = pd.DataFrame({'aa_wt':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive
# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
df = pd.concat([dfSeq, dfVals], axis = 1)
# rename index to position
df = df.rename_axis('position')
print(df)
#%%
# write file
df.to_csv(outfile, header = True, index = True)
#%% Plot
# http://www.dalkescientific.com/writings/NBN/plotting.html
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel("Residue Number")
ylabel("Hydrophobicity")
title("K&D Hydrophobicity for " + id)
show()
#%% end of script

View file

@ -1,55 +0,0 @@
#!/usr/bin/python
# Surface calculation
#
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
from Bio.Alphabet.IUPAC import IUPACProtein
import pprint as pp
import pandas as pd
import numpy as np
infile='/home/tanu/git/Data/pyrazinamide/input/original/3pl1.fasta.txt'
window=9
offset=round((window/2)-0.5)
fh = open(infile)
for record in SeqIO.parse(fh, "fasta"):
id = record.id
seq = record.seq
num_residues = len(seq)
fh.close()
sequence = str(seq)
X = ProteinAnalysis(sequence)
values=(X.protein_scale(ProtParamData.kd,window))
print('Sequence Length:', num_residues)
print('Post-Analysis Length:',len(values))
print('Window Length:',window)
print('Window Offset:',offset)
dfSeq=pd.DataFrame({'seq':list(sequence)})
dfVals=pd.DataFrame({'values':values})
# FIXME:
# These need to be offset by 'offset' from the start and finish
# so that the sequence letters line up with the value calculated
df=pd.concat([dfSeq,dfVals], ignore_index=True, axis=1)
print(df)
#df=pd.DataFrame(list(sequence), values)
#plot(185, values, linewidth=1.0)
#axis(xmin = 1, xmax = num_residues)
#xlabel("Residue Number")
#ylabel("Hydrophobicity")
#title("K&D Hydrophobicity for " + id)
#show()