#!/usr/bin/python
#%%
# Task: Hydrophobicity (kd) values for an amino acid sequence using the
# Kyte & Doolittle scale.
# Same output as using the expasy server https://web.expasy.org/protscale/
#
# Useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#%%
# load packages
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
#%%
# specify variables
#=======
# input
#=======
homedir = os.path.expanduser('~')
indir = 'git/Data/pyrazinamide/input/original'
in_filename = "3pl1.fasta.txt"
infile = os.path.join(homedir, indir, in_filename)
print(infile)

#=======
# output
#=======
outdir = 'git/Data/pyrazinamide/output'
out_filename = "kd.csv"
outfile = os.path.join(homedir, outdir, out_filename)
print(outfile)
#%%
# specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3

# Number of residues on each side of the window centre. For odd windows this
# is (window - 1) / 2. Integer division is used instead of round(), because
# round() rounds halves to even and gave inconsistent offsets for even windows.
offset = my_window // 2

# Read the FASTA file; the context manager closes the handle even if parsing
# fails part-way through.
with open(infile) as fh:
    records = list(SeqIO.parse(fh, "fasta"))

if not records:
    sys.exit('No FASTA records found in ' + infile)

# If the file holds several records, the last one is used (this matches the
# behaviour of looping over every record and keeping the final assignment).
record = records[-1]
seq_id = record.id
seq = record.seq
num_residues = len(seq)

sequence = str(seq)

X = ProteinAnalysis(sequence)

# edge weight is left at its default (100%)
kd_values = X.protein_scale(ProtParamData.kd, window=my_window)

# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:', len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)

# Make a df each for the aa sequence and the kd_values, then set the index of
# each so that it holds the residue position; this allows easy merging.

# df1: aa sequence, indexed 1..N (the actual aa position in the sequence).
# np.arange excludes the stop value, hence the + 1.
dfSeq = pd.DataFrame({'aa_wt': list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1)

# df2: kd_values, indexed from offset + 1 so that each value is assigned to
# the residue at the centre of its window.
dfVals = pd.DataFrame({'kd_values': kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

# sanity checks: report the range of the computed values
# (Series.max()/min() give NaN rather than raising when there are no values)
print('kd_values max:', dfVals['kd_values'].max())
print('kd_values min:', dfVals['kd_values'].min())

# Merge df1 and df2 on index (both indexes are aa position numbers).
# Positions that have no kd value (the window edges) are left as NaN.
df = pd.concat([dfSeq, dfVals], axis=1)

# rename index to position
df = df.rename_axis('position')
print(df)

#%%
# write file
df.to_csv(outfile, header=True, index=True)

#%% Plot
# http://www.dalkescientific.com/writings/NBN/plotting.html
# Plot against the residue positions (the dfVals index) so that the x-axis
# really is the residue number, not the 0-based index into kd_values.
plt.plot(dfVals.index, dfVals['kd_values'], linewidth=1.0)
#plt.axis(xmin = 1, xmax = num_residues)
plt.xlabel("Residue Number")
plt.ylabel("Hydrophobicity")
plt.title("K&D Hydrophobicity for " + seq_id)
plt.show()
#%% end of script