diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd.py index dabeb64..0081eef 100644 --- a/meta_data_analysis/kd.py +++ b/meta_data_analysis/kd.py @@ -1,20 +1,94 @@ #!/usr/bin/python +#%% +# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle +# Same output as using the expasy server https://web.expasy.org/protscale/ -# hydrophobicity and SAA -#https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html -#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html -import sys, os -import pandas as pd -import pprint as pp -#import dms_tools2 -#import dms_tools2.dssp - +# useful links +# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html +# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html +#%% +# load packages +from pylab import * from Bio.SeqUtils import ProtParamData +from Bio.SeqUtils.ProtParam import ProteinAnalysis +from Bio import SeqIO +#from Bio.Alphabet.IUPAC import IUPACProtein +#import pprint as pp +import pandas as pd +import numpy as np +import sys, os #%% -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params') -os.getcwd() -#%% +# specify variables +#======= +# input +#======= +homedir = os.path.expanduser('~') +indir = 'git/Data/pyrazinamide/input/original' +in_filename = "3pl1.fasta.txt" +infile = homedir + '/' + indir + '/' + in_filename +print(infile) -foo = ProtParamData.kd(3pl1.pdb) +#======= +# output +#======= +outdir = 'git/Data/pyrazinamide/output' +out_filename = "kd.csv" +outfile = homedir + '/' + outdir + '/' + out_filename +print(outfile) +#%% +# specify window size for hydropathy profile computation +# https://web.expasy.org/protscale/pscale/protscale_help.html +my_window = 3 +offset = round((my_window/2)-0.5) + +fh = open(infile) + +for record in SeqIO.parse(fh, "fasta"): + id = record.id + seq = record.seq + num_residues = len(seq) +fh.close() + +sequence = str(seq) + +X = ProteinAnalysis(sequence) + +kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to default (100%) + +# sanity checks +print('Sequence Length:', num_residues) +print('kd_values Length:',len(kd_values)) +print('Window Length:',my_window) +print('Window Offset:',offset) + +# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two + +# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) +dfSeq = pd.DataFrame({'aa_wt':list(sequence)}) +dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive + +# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values +dfVals = pd.DataFrame({'kd_values':kd_values}) +dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset) + +# sanity checks +max(dfVals['kd_values']) +min(dfVals['kd_values']) + +# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 +df = pd.concat([dfSeq, dfVals], axis = 1) +# rename index to position +df = df.rename_axis('position') +print(df) +#%% +# write file +df.to_csv(outfile, header = True, index = True) +#%% Plot +# http://www.dalkescientific.com/writings/NBN/plotting.html +plot(kd_values, linewidth = 1.0) +#axis(xmin = 1, xmax = num_residues) +xlabel("Residue Number") +ylabel("Hydrophobicity") +title("K&D Hydrophobicity for " + id) +show() +#%% end of script diff --git a/meta_data_analysis/surface_res3.py b/meta_data_analysis/surface_res3.py deleted file mode 100755 index 9a3ef07..0000000 --- a/meta_data_analysis/surface_res3.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/python -# Surface calculation -# -from pylab import * -from Bio.SeqUtils import ProtParamData -from Bio.SeqUtils.ProtParam import ProteinAnalysis -from Bio import SeqIO -from Bio.Alphabet.IUPAC import IUPACProtein -import pprint as pp -import pandas as pd -import numpy as np - - -infile='/home/tanu/git/Data/pyrazinamide/input/original/3pl1.fasta.txt' - -window=9 -offset=round((window/2)-0.5) - -fh = open(infile) - -for record in SeqIO.parse(fh, "fasta"): - id = record.id - seq = record.seq - num_residues = len(seq) -fh.close() - -sequence = str(seq) - -X = ProteinAnalysis(sequence) - -values=(X.protein_scale(ProtParamData.kd,window)) - -print('Sequence Length:', num_residues) -print('Post-Analysis Length:',len(values)) -print('Window Length:',window) -print('Window Offset:',offset) - -dfSeq=pd.DataFrame({'seq':list(sequence)}) -dfVals=pd.DataFrame({'values':values}) - -# FIXME: -# These need to be offset by 'offset' from the start and finish -# so that the sequence letters line up with the value calculated -df=pd.concat([dfSeq,dfVals], ignore_index=True, axis=1) - -print(df) -#df=pd.DataFrame(list(sequence), values) - -#plot(185, values, linewidth=1.0) -#axis(xmin = 1, xmax = num_residues) -#xlabel("Residue Number") -#ylabel("Hydrophobicity") -#title("K&D Hydrophobicity for " + id) -#show() -