adapted kd calc script with command line args and made it into a function

2020-04-07 16:45:59 +01:00 · 2020-04-07 16:45:59 +01:00 · ae541ca16a
commit ae541ca16a
parent ded7307c22
1 changed files with 222 additions and 0 deletions
--- a/scripts/kd_df.py
+++ b/scripts/kd_df.py
@ -0,0 +1,222 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 '''
 Created on Tue Aug  6 12:56:03 2019
@author: tanu
 '''
 #=======================================================================
 # Task: Hydrophobicity (Kd) values for an amino acid sequence using the
 # Kyte & Doolittle scale.
 # Same output as using the expasy server (link below)
 # Input: fasta file
 # Output: csv file with the residue and its kd value for each position
 # useful links
 # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 #=======================================================================
 #%% load packages
 import sys, os
 import argparse
 import pandas as pd
 import numpy as np
 from pylab import *
 from Bio.SeqUtils import ProtParamData
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
 from Bio import SeqIO
 #from Bio.Alphabet.IUPAC import IUPACProtein
 import pprint as pp
 #=======================================================================
 #%% specify homedir and working dir
 # homedir is the root that the data/input/output paths are built from
 homedir = os.path.expanduser('~')
 # run from the scripts dir of the local LSHTM_analysis checkout
 # (the bare os.getcwd() calls that used to surround this line discarded
 # their result, so they have been dropped)
 os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 #=======================================================================
 #%% command line args
 # Both options are optional; their defaults are placeholder strings.
 # Example values: -d pyrazinamide -g pncA (gene name is case sensitive)
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument(
     '-d', '--drug',
     default = 'DRUGNAME',
     help = 'drug name',
 )
 arg_parser.add_argument(
     '-g', '--gene',
     default = 'geneName',
     help = 'gene name',
 )
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output
 # drug and gene come from the command line (e.g. pyrazinamide / pncA)
 drug, gene = args.drug, args.gene
 gene_match = '{}_p.'.format(gene)
 #==========
 # data dir: <home>/git/Data
 #==========
 datadir = '/'.join([homedir, 'git/Data'])
 #=======
 # input: <datadir>/<drug>/input/<fasta file>
 #=======
 in_filename = '3pl1.fasta.txt'
 indir = '/'.join([datadir, drug, 'input'])
 infile = '/'.join([indir, in_filename])
 print(
     'Input filename:', in_filename,
     '\nInput path:', indir,
     '\n============================================================',
 )
 #=======
 # output: <datadir>/<drug>/output/<gene in lower case>_kd.csv
 #=======
 out_filename = '{}_kd.csv'.format(gene.lower())
 outdir = '/'.join([datadir, drug, 'output'])
 outfile = '/'.join([outdir, out_filename])
 print(
     'Output filename:', out_filename,
     '\nOutput path:', outdir,
     '\n=============================================================',
 )
 #%% end of variable assignment for input and output files
 #=======================================================================
 def kd_to_csv(inputfasta, outputkdcsv, windowsize):
     """
     Calculate Kyte & Doolittle hydropathy (kd) values for a protein
     sequence and write them, one row per residue position, to a csv file.

     The values come from ProteinAnalysis.protein_scale with the
     ProtParamData.kd scale; the edge weight is left at its default.

     @param inputfasta: path to the input fasta file. If the file holds
         more than one record, the last record is the one used.
     @type inputfasta: string
     @param outputkdcsv: path of the csv file to write
     @type outputkdcsv: string
     @param windowsize: sliding window size used for the kd calculation
     @type windowsize: int
     @return: None. Writes a csv indexed by 'position' with the columns
         'wild_type_kd' and 'kd_values', then shows a plot of the values.
     @raise ValueError: if inputfasta contains no sequence records
     """
     #===================
     # read the sequence
     #===================
     with open(inputfasta) as fh:
         records = list(SeqIO.parse(fh, 'fasta'))
     if not records:
         raise ValueError('No fasta records found in: ' + str(inputfasta))
     # the last record wins when the file has several (unchanged behaviour)
     record = records[-1]
     seq_id = record.id
     sequence = str(record.seq)
     num_residues = len(sequence)
     #===================
     # calculate KD values: same as the expasy server
     #===================
     my_window = windowsize
     # residues on each side of the window centre, so the first kd value
     # belongs to position offset + 1. For odd windows this is (window-1)/2.
     # NOTE(review): even window sizes have no single centre residue and
     # round() rounds halves to the nearest even number -- confirm the
     # intended alignment before using an even window.
     offset = round((my_window/2)-0.5)
     X = ProteinAnalysis(sequence)
     # edge weight is set to default (100%)
     kd_values = X.protein_scale(ProtParamData.kd, window = my_window)
     # sanity checks
     print('Sequence Length:', num_residues)
     print('kd_values Length:',len(kd_values))
     print('Window Length:', my_window)
     print('Window Offset:', offset)
     print('=================================================================')
     print('Checking:len(kd values) is as expected for the given window size & offset...')
     # a sliding window yields one value per full window:
     # num_residues - window + 1 (none when the window exceeds the sequence)
     expected_length = max(num_residues - my_window + 1, 0)
     if len(kd_values) == expected_length:
         print('PASS: expected and actual length of kd values match')
     else:
         print('FAIL: length mismatch'
               ,'\nExpected length:', expected_length
               ,'\nActual length:', len(kd_values)
               , '\n=========================================================')
     #===================
     # creating two dfs
     #===================
     # df1: aa sequence, indexed from 1 so the index is the aa position.
     # The column is named 'wild_type_kd'.
     dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
     dfSeq.index = np.arange(1, len(dfSeq) + 1) # arange excludes the stop value
     # df2: kd values, indexed from offset + 1 so that each value lines up
     # with the position at the centre of its window.
     dfVals = pd.DataFrame({'kd_values':kd_values})
     dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
     #===================
     # concatenating dfs
     #===================
     # Column-wise concat aligns the two dfs on their index (aa position).
     # Positions without a full window get NaN in 'kd_values'; for a window
     # of 3 these are the first and the last position.
     kd_df = pd.concat([dfSeq, dfVals], axis = 1)
     #============================
     # renaming index to position
     #============================
     kd_df = kd_df.rename_axis('position')
     print('Checking: position col i.e. index should be numeric')
     if kd_df.index.dtype == 'int64':
         print('PASS: position col is numeric'
               , '\ndtype is:', kd_df.index.dtype)
     else:
         print('FAIL: position col is not numeric'
               , '\nConverting to numeric')
         # astype returns a new index, so it has to be assigned back
         kd_df.index = kd_df.index.astype('int64')
         print('Checking dtype for after conversion:\n'
               , '\ndtype is:', kd_df.index.dtype
               , '\n=========================================================')
     #===============
     # writing file
     #===============
     # report and write to the path the caller passed in
     out_dir, out_name = os.path.split(outputkdcsv)
     print('Writing file:', out_name
           , '\nFilename:', out_name
           , '\nPath:',  out_dir
           , '\n=============================================================')
     kd_df.to_csv(outputkdcsv, header = True, index = True)
     print('Finished writing:', out_name
           , '\nNo. of rows:', len(kd_df)
           , '\nNo. of cols:', len(kd_df.columns)
           , '\n=============================================================')
     #===============
     # plot: optional!
     #===============
     # http://www.dalkescientific.com/writings/NBN/plotting.html
     # FIXME: save fig
     # plot against the position index so the x axis really is the residue
     # number (plotting the bare list would start at 0)
     plot(dfVals.index, dfVals['kd_values'], linewidth = 1.0)
     xlabel('Residue Number')
     ylabel('Hydrophobicity')
     title('K&D Hydrophobicity for ' + seq_id)
     show()
 #%% end of function
 #=======================================================================
 #%% call function
 #kd_to_csv(infile, outfile, windowsize = 3)
 #=======================================================================
 def main():
     """
     Script entry point: announce the run, then calculate kd values for
     the configured input fasta with a window size of 3 and write the csv.
     """
     print('Running hydropathy calcs', in_filename, 'output csv:', out_filename)
     kd_to_csv(inputfasta = infile, outputkdcsv = outfile, windowsize = 3)
 # only run when executed as a script, not when imported
 if __name__ == '__main__':
     main()
 #%% end of script	
 #=======================================================================