remove old surface_res3.py

2020-02-20 12:23:56 +00:00 · 2020-02-20 12:23:56 +00:00 · ed8fc4d488
commit ed8fc4d488
parent d7ef8ef51e
2 changed files with 89 additions and 70 deletions
--- a/meta_data_analysis/kd.py
+++ b/meta_data_analysis/kd.py
@ -1,20 +1,94 @@
 #!/usr/bin/python
 #%%
 # Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
 # Same output as using the expasy server https://web.expasy.org/protscale/
-# hydrophobicity and SAA
+# useful links
 # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
 # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
-import sys, os
+#%% 
-import pandas as pd
+# load packages
-import pprint as pp
+from pylab import *
 #import dms_tools2
 #import dms_tools2.dssp
 from Bio.SeqUtils import ProtParamData
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
 from Bio import SeqIO
 #from Bio.Alphabet.IUPAC import IUPACProtein
 #import pprint as pp
 import pandas as pd
 import numpy as np
 import sys, os
 #%%
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+# specify variables
-os.getcwd()
+#=======
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
+# input
-os.getcwd()
+#=======
-#%%
+homedir = os.path.expanduser('~')
 indir = 'git/Data/pyrazinamide/input/original'
 in_filename = "3pl1.fasta.txt"
 infile = homedir + '/' + indir + '/' + in_filename
 print(infile)
-foo = ProtParamData.kd(3pl1.pdb)
+#=======
 # output 
 #=======
 outdir =   'git/Data/pyrazinamide/output'
 out_filename = "kd.csv"
 outfile = homedir + '/' + outdir + '/' + out_filename
 print(outfile)
 #%%
 # specify window size for hydropathy profile computation
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 my_window = 3
 offset = round((my_window/2)-0.5)
 fh = open(infile)
 for record in SeqIO.parse(fh, "fasta"):
    id = record.id
    seq = record.seq
    num_residues = len(seq)
 fh.close()
 sequence = str(seq)
 X = ProteinAnalysis(sequence)
 kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to  default (100%)
 # sanity checks 
 print('Sequence Length:', num_residues)
 print('kd_values Length:',len(kd_values))
 print('Window Length:',my_window)
 print('Window Offset:',offset)
 # make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two
 # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
 dfSeq = pd.DataFrame({'aa_wt':list(sequence)})
 dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive
 # df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
 dfVals = pd.DataFrame({'kd_values':kd_values})
 dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
 # sanity checks
 max(dfVals['kd_values'])
 min(dfVals['kd_values'])
 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 
 df = pd.concat([dfSeq, dfVals], axis = 1)
 # rename index to position
 df = df.rename_axis('position')
 print(df)
 #%%
 # write file
 df.to_csv(outfile, header = True, index = True)
 #%% Plot
 # http://www.dalkescientific.com/writings/NBN/plotting.html
 plot(kd_values, linewidth = 1.0)
 #axis(xmin = 1, xmax = num_residues)
 xlabel("Residue Number")
 ylabel("Hydrophobicity")
 title("K&D Hydrophobicity for " + id)
 show()
 #%% end of script
--- a/meta_data_analysis/surface_res3.py
+++ b/meta_data_analysis/surface_res3.py
@ -1,55 +0,0 @@
 #!/usr/bin/python
 # Surface calculation
 #
 from pylab import *
 from Bio.SeqUtils import ProtParamData
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
 from Bio import SeqIO
 from Bio.Alphabet.IUPAC import IUPACProtein
 import pprint as pp
 import pandas as pd
 import numpy as np
 infile='/home/tanu/git/Data/pyrazinamide/input/original/3pl1.fasta.txt'
 window=9
 offset=round((window/2)-0.5)
 fh = open(infile)
 for record in SeqIO.parse(fh, "fasta"):
    id = record.id
    seq = record.seq
    num_residues = len(seq)
 fh.close()
 sequence = str(seq)
 X = ProteinAnalysis(sequence)
 values=(X.protein_scale(ProtParamData.kd,window))
 print('Sequence Length:', num_residues)
 print('Post-Analysis Length:',len(values))
 print('Window Length:',window)
 print('Window Offset:',offset)
 dfSeq=pd.DataFrame({'seq':list(sequence)})
 dfVals=pd.DataFrame({'values':values})
 # FIXME:
 # These need to be offset by 'offset' from the start and finish
 # so that the sequence letters line up with the value calculated
 df=pd.concat([dfSeq,dfVals], ignore_index=True, axis=1)
 print(df)
 #df=pd.DataFrame(list(sequence), values)
 #plot(185, values, linewidth=1.0)
 #axis(xmin = 1, xmax = num_residues)
 #xlabel("Residue Number")
 #ylabel("Hydrophobicity")
 #title("K&D Hydrophobicity for " + id)
 #show()