#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' #======================================================================= # Task: Hydrophobicity (Kd) values for amino acid sequence using the # Kyt&-Doolittle. # Same output as using the expasy server (link below) # Input: fasta file # Output: csv file with # useful links # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html # https://web.expasy.org/protscale/pscale/protscale_help.html #======================================================================= #%% load packages import sys, os import argparse import pandas as pd import numpy as np from pylab import * from Bio.SeqUtils import ProtParamData from Bio.SeqUtils.ProtParam import ProteinAnalysis from Bio import SeqIO #from Bio.Alphabet.IUPAC import IUPACProtein import pprint as pp #======================================================================= #%% specify homedir and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help='drug name', default = None) arg_parser.add_argument('-g', '--gene', help='gene name', default = None) #arg_parser.add_argument('-p', '--plot', help='show plot', action='store_true') arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('-fasta','--fasta_file', help = 'fasta file. By default, it assmumes a file called .fasta.txt in input_dir') arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' drug = args.drug gene = args.gene gene_match = gene + '_p.' data_dir = args.datadir indir = args.input_dir outdir = args.output_dir fasta_filename = args.fasta_file #plot = args.plot DEBUG = args.debug #============ # directories #============ if data_dir: datadir = data_dir else: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/' + 'input' if not outdir: outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= if fasta_filename: in_filename_fasta = fasta_filename else: in_filename_fasta = gene.lower() + '.fasta.txt' infile_fasta = indir + '/' + in_filename_fasta print('Input fasta file:', infile_fasta , '\n============================================================') #======= # output #======= out_filename_kd = gene.lower() + '_kd.csv' outfile_kd = outdir + '/' + out_filename_kd print('Output file:', outfile_kd , '\n=============================================================') #%% end of variable assignment for input and output files #======================================================================= #%% kd values from fasta file and output csv def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3): """ Calculate kd (hydropathy values) from input fasta file @param inputfasta: fasta file @type: string @param outputkdcsv: csv file with kd values @type: string @param windowsize: windowsize to perform KD calcs on (Kyte&-Doolittle) @type: numeric @return: none, writes kd values df as csv """ #======================== # read input fasta file #======================== fh = open(inputfasta) for record in SeqIO.parse(fh, 'fasta'): id = record.id seq = record.seq num_residues = len(seq) fh.close() sequence = str(seq) X = ProteinAnalysis(sequence) #=================== # calculate KD values: same as the expasy server #=================== my_window = windowsize offset = round((my_window/2)-0.5) # edge weight is set to default (100%) kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # sanity checks print('Sequence Length:', num_residues) print('kd_values Length:',len(kd_values)) print('Window Length:', my_window) print('Window Offset:', offset) print('=================================================================') print('Checking:len(kd values) is as expected for the given window size & offset...') expected_length = num_residues - (my_window - offset) if len(kd_values) == expected_length: print('PASS: expected and actual length of kd values match') else: print('FAIL: length mismatch' ,'\nExpected length:', expected_length ,'\nActual length:', len(kd_values) , '\n=========================================================') #=================== # creating two dfs #=================== # 1) aa sequence and 2) kd_values. Then reset index for each df # which will allow easy merging of the two dfs. # df1: df of aa seq with index reset to start from 1 # (reflective of the actual aa position in a sequence) # Name column of wt as 'wild_type' to be the same name used # in the file required for merging later. dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)}) dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive # df2: df of kd_values with index reset to start from offset + 1 and # subsequent matched length of the kd_values dfVals = pd.DataFrame({'kd_values':kd_values}) dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset) # sanity checks max(dfVals['kd_values']) min(dfVals['kd_values']) #=================== # concatenating dfs #=================== # Merge the two on index # (as these are now reflective of the aa position numbers): df1 and df2 # This will introduce NaN where there is missing values. In our case this # will be 2 (first and last ones based on window size and offset) kd_df = pd.concat([dfSeq, dfVals], axis = 1) #============================ # renaming index to position #============================ kd_df = kd_df.rename_axis('position') kd_df.head print('Checking: position col i.e. index should be numeric') if kd_df.index.dtype == 'int64': print('PASS: position col is numeric' , '\ndtype is:', kd_df.index.dtype) else: print('FAIL: position col is not numeric' , '\nConverting to numeric') kd_df.index.astype('int64') print('Checking dtype for after conversion:\n' , '\ndtype is:', kd_df.index.dtype , '\n=========================================================') # Ensuring lowercase column names for consistency kd_df.columns = kd_df.columns.str.lower() #=============== # writing file #=============== print('Writing file:' , '\nFilename:', outputkdcsv , '\nExpected no. of rows:', len(kd_df) , '\nExpected no. of cols:', len(kd_df.columns) , '\n=============================================================') kd_df.to_csv(outputkdcsv, header = True, index = True) #=============== # plot: optional! #=============== # http://www.dalkescientific.com/writings/NBN/plotting.html # FIXME: save fig # extract just pdb if from 'id' to pass to title of plot # foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1) #if doplot: plot(kd_values, linewidth = 1.0) #axis(xmin = 1, xmax = num_residues) xlabel('Residue Number') ylabel('Hydrophobicity') title('K&D Hydrophobicity for ' + id) show() #%% end of function #======================================================================= def main(): print('Running hydropathy calcs with following params\n' , '\nInput fasta file:', in_filename_fasta , '\nOutput:', out_filename_kd) kd_to_csv(infile_fasta, outfile_kd, 3) print('Finished writing file:' , '\nFile:', outfile_kd , '\n=============================================================') if __name__ == '__main__': main() #%% end of script #=======================================================================