#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyte-Doolittle scale.
# Same output as using the expasy server https://web.expasy.org/protscale/
# Input: fasta file
# Output: csv file with one row per residue position, holding the
#         residue ('wild_type_kd') and its windowed kd value ('kd_values')
# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
# NOTE: the star import stays first so that the explicit imports below win
# on any name clash; the plotting calls at the end of the script rely on it.
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import pandas as pd
import numpy as np
import sys
import os
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'meta_data_analysis'))
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = os.path.join(homedir, 'git', 'Data')

#=======
# input
#=======
indir = os.path.join(datadir, drug, 'input')
in_filename = '3pl1.fasta.txt'
infile = os.path.join(indir, in_filename)
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = os.path.join(datadir, drug, 'output')
out_filename = gene.lower() + '_kd.csv'
outfile = os.path.join(outdir, out_filename)
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3

# Number of residues on either side of the window centre.
# int() keeps the offset an integer even if `round` resolves to a
# non-builtin version that returns a float (presumably possible via the
# star import above -- TODO confirm); a float offset would turn the
# position index into floats.
offset = int(round((my_window / 2) - 0.5))

# Read the fasta file; the context manager closes the handle even on error.
with open(infile) as fh:
    records = list(SeqIO.parse(fh, 'fasta'))

if not records:
    sys.exit('No fasta records found in: ' + infile)

# If the file holds several records, the last one is used.
record = records[-1]
seq_id = record.id
seq = record.seq
num_residues = len(seq)

sequence = str(seq)

X = ProteinAnalysis(sequence)

# edge weight is set to default (100%)
kd_values = X.protein_scale(ProtParamData.kd, window = my_window)

# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:', len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking:len(kd values) is as expected for the given window size & offset...')

# A sliding window of size w over n residues yields n - w + 1 values.
expected_length = num_residues - my_window + 1
if len(kd_values) == expected_length:
    print('PASS: expected and actual length of kd values match')
else:
    print('FAIL: length mismatch'
          , '\nExpected length:', expected_length
          , '\nActual length:', len(kd_values)
          , '\n=========================================================')

#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.

# df1: df of aa seq with index reset to start from 1
# (reflective of the actual aa position in a sequence).
# The residue column is named 'wild_type_kd'.
dfSeq = pd.DataFrame({'wild_type_kd': list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # upper bound is exclusive

# df2: df of kd_values with index reset to start from offset + 1, so each
# value is aligned with the residue at the centre of its window.
dfVals = pd.DataFrame({'kd_values': kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

# sanity checks
print('Max kd value:', max(dfVals['kd_values']))
print('Min kd value:', min(dfVals['kd_values']))

#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there are missing values. In our case this
# will be 2 (first and last ones, based on window size and offset).
# For pnca: the last position is not part of the struc, so no info loss.
# Needless to say that this will be variable for other targets.
kd_df = pd.concat([dfSeq, dfVals], axis = 1)

#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
print(kd_df.head())

print('=================================================================')
print('position col i.e. index should be numeric'
      , '\n===============================================================')

if kd_df.index.dtype == 'int64':
    print('PASS: position col is numeric'
          , '\ndtype is:', kd_df.index.dtype)
else:
    print('FAIL: position col is not numeric'
          , '\nConverting to numeric')
    # astype returns a new Index, so it has to be assigned back.
    kd_df.index = kd_df.index.astype('int64')
    print('Checking dtype for after conversion:\n'
          , '\ndtype is:', kd_df.index.dtype
          , '\n=========================================================')

#%% write file
print('Writing file:', out_filename
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

kd_df.to_csv(outfile, header = True, index = True)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\n=============================================================')

#%% plot
# http://www.dalkescientific.com/writings/NBN/plotting.html
# FIXME: save fig
# FIXME: extract just the pdb id from 'seq_id' to pass to the title of the plot

# Plot against the residue positions (not the 0-based list index) so the
# x axis matches its 'Residue Number' label.
plot(dfVals.index, kd_values, linewidth = 1.0)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + seq_id)
show()

print('======================================================================')
#%% end of script
#=======================================================================