ran struc param analysis

2020-06-17 19:36:02 +01:00 · 2020-06-17 19:36:02 +01:00 · 8d1daabff4
commit 8d1daabff4
parent e21635fe02
5 changed files with 373 additions and 382 deletions
--- a/scripts/kd_df.py
+++ b/scripts/kd_df.py
@ -39,10 +39,8 @@ os.getcwd()
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME')
-arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName')
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
 #arg_parser.add_argument('-p', '--plot', help='show plot',  action='store_true')
 args = arg_parser.parse_args()
 #=======================================================================
@ -81,8 +79,8 @@ print('Output filename:', out_filename
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% kd values from fasta file and output csv
-def kd_to_csv(inputfasta, outputkdcsv, windowsize):
-	"""
+def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3):
+    """
    Calculate kd (hydropathy values) from input fasta file

    @param inputfasta: fasta file
@ -96,138 +94,137 @@ def kd_to_csv(inputfasta, outputkdcsv, windowsize):

    @return:  none, writes kd values df as csv
    """
-	#========================
-	# read input fasta file
-	#========================
-	fh = open(inputfasta)
+    #========================
+    # read input fasta file
+    #========================
+    fh = open(inputfasta)

-	for record in SeqIO.parse(fh, 'fasta'):
-		id = record.id
-		seq = record.seq
-		num_residues = len(seq)
-	fh.close()
+    for record in SeqIO.parse(fh, 'fasta'):
+        id = record.id
+        seq = record.seq
+        num_residues = len(seq)
+    fh.close()

- 	sequence = str(seq)
-	X = ProteinAnalysis(sequence)
+    sequence = str(seq)
+    X = ProteinAnalysis(sequence)

-   	#===================
+    #===================
    # calculate KD values: same as the expasy server
    #===================
-	my_window = windowsize
-	offset = round((my_window/2)-0.5)
-	# edge weight is set to  default (100%)
-	
-	kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
-	# sanity checks 
-	print('Sequence Length:', num_residues)
-	print('kd_values Length:',len(kd_values))
-	print('Window Length:', my_window)
-	print('Window Offset:', offset)
-	print('=================================================================')
-	print('Checking:len(kd values) is as expected for the given window size & offset...')
-	expected_length =  num_residues - (my_window - offset) 
-	if len(kd_values) == expected_length:
-		print('PASS: expected and actual length of kd values match')
-	else:
-		print('FAIL: length mismatch'
-		      ,'\nExpected length:', expected_length
-		      ,'\nActual length:', len(kd_values)
-		      , '\n=========================================================')
+    my_window = windowsize
+    offset = round((my_window/2)-0.5)
+    # edge weight is set to  default (100%)
+    
+    kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
+    # sanity checks 
+    print('Sequence Length:', num_residues)
+    print('kd_values Length:',len(kd_values))
+    print('Window Length:', my_window)
+    print('Window Offset:', offset)
+    print('=================================================================')
+    print('Checking:len(kd values) is as expected for the given window size & offset...')
+    expected_length =  num_residues - (my_window - offset) 
+    if len(kd_values) == expected_length:
+        print('PASS: expected and actual length of kd values match')
+    else:
+        print('FAIL: length mismatch'
+              ,'\nExpected length:', expected_length
+              ,'\nActual length:', len(kd_values)
+              , '\n=========================================================')
 
- 	#===================
-	# creating two dfs
-	#===================
-	# 1) aa sequence and 2) kd_values. Then reset index for each df 
-	# which will allow easy merging of the two dfs.
+    #===================
+    # creating two dfs
+    #===================
+    # 1) aa sequence and 2) kd_values. Then reset index for each df 
+    # which will allow easy merging of the two dfs.

-	# df1: df of aa seq with index reset to start from 1 
-	# (reflective of the actual aa position in a sequence)
-	# Name column of wt as 'wild_type' to be the same name used 
-	# in the file required for merging later.
-	dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
-	dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
+    # df1: df of aa seq with index reset to start from 1 
+    # (reflective of the actual aa position in a sequence)
+    # Name the wt column 'wild_type_kd' (as created below); presumably this
+    # matches the column name in the file used for merging later -- confirm.
+    dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
+    dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

-	# df2: df of kd_values with index reset to start from offset + 1 and 
-	# subsequent matched length of the kd_values
-	dfVals = pd.DataFrame({'kd_values':kd_values})
-	dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
+    # df2: df of kd_values with index reset to start from offset + 1 and 
+    # subsequent matched length of the kd_values
+    dfVals = pd.DataFrame({'kd_values':kd_values})
+    dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

-	# sanity checks
-	max(dfVals['kd_values'])
-	min(dfVals['kd_values'])
-	
-	#===================
-	# concatenating dfs
-	#===================
-	# Merge the two on index 
-	# (as these are now reflective of the aa position numbers): df1 and df2 
-	# This will introduce NaN where there is missing values. In our case this
-	# will be 2 (first and last ones based on window size and offset)
+    # sanity checks
+    max(dfVals['kd_values'])
+    min(dfVals['kd_values'])
+    
+    #===================
+    # concatenating dfs
+    #===================
+    # Merge the two on index 
+    # (as these are now reflective of the aa position numbers): df1 and df2 
+    # This will introduce NaN where there are missing values. In our case this
+    # will be 2 (first and last ones based on window size and offset)

-	kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+    kd_df = pd.concat([dfSeq, dfVals], axis = 1)

-	#============================
-	# renaming index to position
-	#============================
-	kd_df = kd_df.rename_axis('position')
-	kd_df.head
+    #============================
+    # renaming index to position
+    #============================
+    kd_df = kd_df.rename_axis('position')
+    kd_df.head

-	print('Checking: position col i.e. index should be numeric')
-	if kd_df.index.dtype == 'int64':
-		print('PASS: position col is numeric'
-		      , '\ndtype is:', kd_df.index.dtype)
-	else:
-		print('FAIL: position col is not numeric'
-		      , '\nConverting to numeric')
-		kd_df.index.astype('int64')
-		print('Checking dtype for after conversion:\n'
-		      , '\ndtype is:', kd_df.index.dtype
-		      , '\n=========================================================')
+    print('Checking: position col i.e. index should be numeric')
+    if kd_df.index.dtype == 'int64':
+        print('PASS: position col is numeric'
+              , '\ndtype is:', kd_df.index.dtype)
+    else:
+        print('FAIL: position col is not numeric'
+              , '\nConverting to numeric')
+        kd_df.index.astype('int64')
+        print('Checking dtype for after conversion:\n'
+              , '\ndtype is:', kd_df.index.dtype
+              , '\n=========================================================')

-	#===============
-	# writing file
-	#===============
-	print('Writing file:'
-		  , '\nFilename:', outputkdcsv
-#		  , '\nPath:',  outdir
-		  , '\nExpected no. of rows:', len(kd_df)
-		  , '\nExpected no. of cols:', len(kd_df.columns)
-		  , '\n=============================================================')
+    #===============
+    # writing file
+    #===============
+    print('Writing file:'
+          , '\nFilename:', outputkdcsv
+#          , '\nPath:',  outdir
+          , '\nExpected no. of rows:', len(kd_df)
+          , '\nExpected no. of cols:', len(kd_df.columns)
+          , '\n=============================================================')

-	kd_df.to_csv(outputkdcsv, header = True, index = True)
-		  
-	#===============
-	# plot: optional!
-	#===============
-	# http://www.dalkescientific.com/writings/NBN/plotting.html
+    kd_df.to_csv(outputkdcsv, header = True, index = True)
+          
+    #===============
+    # plot: optional!
+    #===============
+    # http://www.dalkescientific.com/writings/NBN/plotting.html

-	# FIXME: save fig
-	# extract just pdb if from 'id' to pass to title of plot
-	# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
-#	if doplot:
-	plot(kd_values, linewidth = 1.0)
-	#axis(xmin = 1, xmax = num_residues)
-	xlabel('Residue Number')
-	ylabel('Hydrophobicity')
-	title('K&D Hydrophobicity for ' + id)
-	show()
-		
+    # FIXME: save fig
+    # extract just pdb id from 'id' to pass to title of plot
+    # foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
+#    if doplot:
+    plot(kd_values, linewidth = 1.0)
+    #axis(xmin = 1, xmax = num_residues)
+    xlabel('Residue Number')
+    ylabel('Hydrophobicity')
+    title('K&D Hydrophobicity for ' + id)
+    show()
+        
 #%% end of function
 #=======================================================================
 #%% call function
 #kd_to_csv(infile, outfile, windowsize = 3)
 #=======================================================================
 def main():
-	print('Running hydropathy calcs with following params\n'
-		, in_filename
-		, '\noutfile:', out_filename)
-		kd_to_csv(infile, outfile, 3)
-	print('Finished writing file:'
-		, '\nFilename:', out_filename
-		, '\nPath:',  outdir
-		, '\n=============================================================')
-	
+    print('Running hydropathy calcs with following params\n'
+        , in_filename
+        , '\noutfile:', out_filename)
+    kd_to_csv(infile, outfile, 3)
+    print('Finished writing file:'
+        , '\nFilename:', outfile
+        , '\n=============================================================')
+    
 if __name__ == '__main__':
-	main()
-#%% end of script	
+    main()
+#%% end of script    
 #=======================================================================