#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for an amino acid sequence using the
# Kyte & Doolittle scale.
# Same output as using the expasy server (link below)
# Input: fasta file

# Output: csv file with columns: position, wild_type_kd, kd_values

# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://web.expasy.org/protscale/pscale/protscale_help.html
#=======================================================================
#%% load packages
import sys
import os
import argparse
import pandas as pd
import numpy as np
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
def kd_to_csv(inputfasta, outputkdcsv, windowsize):
    """
    Calculate kd (hydropathy) values from a fasta file and write them,
    alongside the residue letters, to a csv file indexed by position.

    If the fasta file holds several records, only the last one is used.
    A plot of the kd values is shown once the csv has been written.

    @param inputfasta: path of the fasta file to read
    @type inputfasta: string

    @param outputkdcsv: path of the csv file to write
    @type outputkdcsv: string

    @param windowsize: sliding window size used for the Kyte & Doolittle
    calculation
    @type windowsize: int

    @return: None; writes df of kd values as csv
    @raise ValueError: if the fasta file contains no records
    """
    #===================
    # calculate KD values: same as the expasy server
    #===================
    my_window = windowsize
    # Number of residues before the window centre. Integer arithmetic is
    # used instead of round(), whose round-half-to-even behaviour gives
    # inconsistent offsets for even window sizes.
    offset = (my_window - 1) // 2

    # Iterate through all records so that, as before, the last one wins;
    # the 'with' block guarantees the file handle is closed.
    record = None
    with open(inputfasta) as fh:
        for record in SeqIO.parse(fh, 'fasta'):
            pass
    if record is None:
        raise ValueError('No fasta records found in: ' + str(inputfasta))

    seq_id = record.id
    sequence = str(record.seq)
    num_residues = len(sequence)

    X = ProteinAnalysis(sequence)

    # edge weight is set to default (100%)
    kd_values = X.protein_scale(ProtParamData.kd, window = my_window)

    # sanity checks
    print('Sequence Length:', num_residues)
    print('kd_values Length:', len(kd_values))
    print('Window Length:', my_window)
    print('Window Offset:', offset)
    print('=================================================================')
    print('Checking:len(kd values) is as expected for the given window size & offset...')
    # a sliding window of size w over n residues yields n - w + 1 values
    expected_length = num_residues - my_window + 1
    if len(kd_values) == expected_length:
        print('PASS: expected and actual length of kd values match')
    else:
        print('FAIL: length mismatch'
              ,'\nExpected length:', expected_length
              ,'\nActual length:', len(kd_values)
              , '\n=========================================================')

    #===================
    # creating two dfs
    #===================
    # 1) aa sequence and 2) kd_values. Then reset index for each df
    # which will allow easy merging of the two dfs.

    # df1: df of aa seq with index reset to start from 1
    # (reflective of the actual aa position in a sequence).
    # The column is named 'wild_type_kd'
    # NOTE(review): presumably matches the name expected by a later
    # merge step - confirm against the consumer of this csv.
    dfSeq = pd.DataFrame({'wild_type_kd': list(sequence)})
    dfSeq.index = np.arange(1, len(dfSeq) + 1) # upper bound is exclusive

    # df2: df of kd_values with index reset to start from offset + 1 and
    # subsequent matched length of the kd_values
    dfVals = pd.DataFrame({'kd_values': kd_values})
    dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

    #===================
    # concatenating dfs
    #===================
    # Merge the two on index
    # (as these are now reflective of the aa position numbers): df1 and df2
    # This will introduce NaN where there are missing values, i.e. at the
    # positions at either end that have no full window (the first and
    # last one for a window size of 3).
    kd_df = pd.concat([dfSeq, dfVals], axis = 1)

    #============================
    # renaming index to position
    #============================
    kd_df = kd_df.rename_axis('position')

    print('Checking: position col i.e. index should be numeric')
    if kd_df.index.dtype == 'int64':
        print('PASS: position col is numeric'
              , '\ndtype is:', kd_df.index.dtype)
    else:
        print('FAIL: position col is not numeric'
              , '\nConverting to numeric')
        # astype returns a new Index, so it has to be assigned back
        kd_df.index = kd_df.index.astype('int64')
        print('Checking dtype for after conversion:\n'
              , '\ndtype is:', kd_df.index.dtype
              , '\n=========================================================')

    #===============
    # writing file
    #===============
    # Report and write to the path that was passed in, rather than to
    # the module-level output variables.
    out_name = os.path.basename(outputkdcsv)
    out_path = os.path.dirname(outputkdcsv)
    print('Writing file:', out_name
          , '\nFilename:', out_name
          , '\nPath:', out_path
          , '\n=============================================================')

    kd_df.to_csv(outputkdcsv, header = True, index = True)

    print('Finished writing:', out_name
          , '\nNo. of rows:', len(kd_df)
          , '\nNo. of cols:', len(kd_df.columns)
          , '\n=============================================================')

    #===============
    # plot: optional!
    #===============
    # http://www.dalkescientific.com/writings/NBN/plotting.html

    # FIXME: save fig
    # extract just pdb id from 'seq_id' to pass to title of plot
    # foo = re.match(r'(^[0-9]{1}\w{3})', seq_id).groups(1)
    plot(kd_values, linewidth = 1.0)
    #axis(xmin = 1, xmax = num_residues)
    xlabel('Residue Number')
    ylabel('Hydrophobicity')
    title('K&D Hydrophobicity for ' + seq_id)
    show()
#%% end of function
#=======================================================================
#%% call function
#kd_to_csv(infile, outfile, windowsize = 3)
#=======================================================================
def main():
    """Run the hydropathy calculation on the configured input/output files."""
    print('Running hydropathy calcs', in_filename, 'output csv:', out_filename)
    kd_to_csv(infile, outfile, windowsize = 3)

if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================