remove old surface_res3.py

2020-02-20 12:23:56 +00:00 · 2020-02-20 12:23:56 +00:00 · 2ed0df41e2
commit 2ed0df41e2
parent 26e4652d63
2 changed files with 89 additions and 70 deletions
--- a/meta_data_analysis/kd.py
+++ b/meta_data_analysis/kd.py
@ -1,20 +1,94 @@
 #!/usr/bin/python
+#%%
+# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
+# Same output as using the expasy server https://web.expasy.org/protscale/

-# hydrophobicity and SAA
-#https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
-#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
-import sys, os
-import pandas as pd
-import pprint as pp
-#import dms_tools2
-#import dms_tools2.dssp
-
+# useful links
+# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
+# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
+#%% 
+# load packages
+from pylab import *
 from Bio.SeqUtils import ProtParamData
+from Bio.SeqUtils.ProtParam import ProteinAnalysis
+from Bio import SeqIO
+#from Bio.Alphabet.IUPAC import IUPACProtein
+#import pprint as pp
+import pandas as pd
+import numpy as np
+import sys, os
 #%%
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
-os.getcwd()
-#%%
+# specify variables
+#=======
+# input
+#=======
+homedir = os.path.expanduser('~')
+indir = 'git/Data/pyrazinamide/input/original'
+in_filename = "3pl1.fasta.txt"
+infile = homedir + '/' + indir + '/' + in_filename
+print(infile)

-foo = ProtParamData.kd(3pl1.pdb)
+#=======
+# output 
+#=======
+outdir =   'git/Data/pyrazinamide/output'
+out_filename = "kd.csv"
+outfile = homedir + '/' + outdir + '/' + out_filename
+print(outfile)
+#%%
+# specify window size for hydropathy profile computation
+# https://web.expasy.org/protscale/pscale/protscale_help.html
+my_window = 3
+offset = round((my_window/2)-0.5)
+
+fh = open(infile)
+
+for record in SeqIO.parse(fh, "fasta"):
+    id = record.id
+    seq = record.seq
+    num_residues = len(seq)
+fh.close()
+
+sequence = str(seq)
+
+X = ProteinAnalysis(sequence)
+
+kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to  default (100%)
+
+# sanity checks 
+print('Sequence Length:', num_residues)
+print('kd_values Length:',len(kd_values))
+print('Window Length:',my_window)
+print('Window Offset:',offset)
+
+# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two
+
+# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
+dfSeq = pd.DataFrame({'aa_wt':list(sequence)})
+dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive
+
+# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
+dfVals = pd.DataFrame({'kd_values':kd_values})
+dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
+
+# sanity checks
+max(dfVals['kd_values'])
+min(dfVals['kd_values'])
+
+# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 
+df = pd.concat([dfSeq, dfVals], axis = 1)
+# rename index to position
+df = df.rename_axis('position')
+print(df)
+#%%
+# write file
+df.to_csv(outfile, header = True, index = True)
+#%% Plot
+# http://www.dalkescientific.com/writings/NBN/plotting.html
+plot(kd_values, linewidth = 1.0)
+#axis(xmin = 1, xmax = num_residues)
+xlabel("Residue Number")
+ylabel("Hydrophobicity")
+title("K&D Hydrophobicity for " + id)
+show()
+#%% end of script
--- a/meta_data_analysis/surface_res3.py
+++ b/meta_data_analysis/surface_res3.py
@ -1,55 +0,0 @@
-#!/usr/bin/python
-# Surface calculation
-#
-from pylab import *
-from Bio.SeqUtils import ProtParamData
-from Bio.SeqUtils.ProtParam import ProteinAnalysis
-from Bio import SeqIO
-from Bio.Alphabet.IUPAC import IUPACProtein
-import pprint as pp
-import pandas as pd
-import numpy as np
-
-
-infile='/home/tanu/git/Data/pyrazinamide/input/original/3pl1.fasta.txt'
-
-window=9
-offset=round((window/2)-0.5)
-
-fh = open(infile)
-
-for record in SeqIO.parse(fh, "fasta"):
-    id = record.id
-    seq = record.seq
-    num_residues = len(seq)
-fh.close()
-
-sequence = str(seq)
-
-X = ProteinAnalysis(sequence)
-
-values=(X.protein_scale(ProtParamData.kd,window))
-
-print('Sequence Length:', num_residues)
-print('Post-Analysis Length:',len(values))
-print('Window Length:',window)
-print('Window Offset:',offset)
-
-dfSeq=pd.DataFrame({'seq':list(sequence)})
-dfVals=pd.DataFrame({'values':values})
-
-# FIXME:
-# These need to be offset by 'offset' from the start and finish
-# so that the sequence letters line up with the value calculated
-df=pd.concat([dfSeq,dfVals], ignore_index=True, axis=1)
-
-print(df)
-#df=pd.DataFrame(list(sequence), values)
-
-#plot(185, values, linewidth=1.0)
-#axis(xmin = 1, xmax = num_residues)
-#xlabel("Residue Number")
-#ylabel("Hydrophobicity")
-#title("K&D Hydrophobicity for " + id)
-#show()
-