Fixed whitespace problem in mcsm input that caused issues when merging

2020-07-14 14:07:23 +01:00 · 2020-07-14 14:07:23 +01:00 · 8dc2fa7326
commit 8dc2fa7326
parent 5a2084ba11
6 changed files with 108 additions and 98 deletions
--- a/foldx/runFoldx.py
+++ b/foldx/runFoldx.py
@ -36,7 +36,6 @@ arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assm
 arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')

-
 arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
 arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_snps.csv exists')

@ -50,38 +49,36 @@ args = arg_parser.parse_args()
 #gene = 'pncA'
 #gene_match = gene + '_p.'
 #%%=====================================================================
-# Command Line Options
-drug            = args.drug
-gene            = args.gene
+# Command line options
+drug         = args.drug
+gene         = args.gene

-data_dir        = args.data_dir
-indir           = args.input_dir
-outdir          = args.output_dir
+datadir      = args.datadir
+indir        = args.input_dir
+outdir       = args.output_dir

-mut_filename    = args.mutation_file
-chainA          = args.chain1
-chainB          = args.chain2
-pdb_filename    = args.pdb_file
+mut_filename = args.mutation_file
+chainA       = args.chain1
+chainB       = args.chain2
+pdb_filename = args.pdb_file

 # os.path.splitext will fail interestingly with file.pdb.txt.zip
 #pdb_name = os.path.splitext(pdb_file)[0]
 # Just the filename, thanks
 #pdb_name = Path(in_filename_pdb).stem

-#============
+#==============
 # directories
-#============
-if data_dir:
-    datadir = data_dir
-else:
+#==============
+if not datadir:
    datadir = homedir + '/' + 'git/Data'
-
-if not indir:
-    indir = datadir + '/' + drug + '/' + 'input'
-
-if not outdir:
-    outdir = datadir + '/' + drug + '/' + 'output'
    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
 # FIXME: this is a temporary directory and should be correctly handled
 process_dir = datadir + '/' + drug +'/' + 'processing'

@ -90,7 +87,6 @@ os.mkdir(process_dir)
 # input
 #=======
 # FIXME
-
 if pdb_filename:
    pdb_name = Path(pdb_filename).stem
 else:
--- a/mcsm/ind_scripts/format_results.py
+++ b/mcsm/ind_scripts/format_results.py
@ -74,7 +74,11 @@ def format_mcsm_output(mcsm_outputcsv):
 	#############
 	# Read file
 	#############
-	mcsm_data  = pd.read_csv(mcsm_outputcsv, sep = ',')
+	mcsm_data_raw  = pd.read_csv(mcsm_outputcsv, sep = ',')
+	
+    # strip white space from both ends in all columns
+    mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
+
 	dforig_shape = mcsm_data.shape
 	print('dimensions of input file:', dforig_shape) 
 	
@ -85,7 +89,7 @@ def format_mcsm_output(mcsm_outputcsv):
 	print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
 		  , '\n===================================================================')
 	my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
-		           , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
+		           , 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
 		           , 'Wild-type': 'wild_type' # one letter amino acid code
 		           , 'Position': 'position' # number
 		           , 'Mutant-type': 'mutant_type' # one letter amino acid code
@ -97,19 +101,19 @@ def format_mcsm_output(mcsm_outputcsv):
 	mcsm_data.rename(columns = my_colnames_dict, inplace = True)
 	#%%===========================================================================
 	#################################
-	# populate mutation_information 
+	# populate mutationinformation 
 	# col which is currently blank
 	#################################
-	# populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
-	print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
-	mcsm_data['mutation_information'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
-	print('checking after populating:\n', mcsm_data['mutation_information']
+	# populate mutationinformation column:mcsm style  muts {WT}<POS>{MUT}
+	print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
+	mcsm_data['mutationinformation'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
+	print('checking after populating:\n', mcsm_data['mutationinformation']
 		  , '\n===================================================================')

 	# Remove spaces b/w pasted columns
-	print('removing white space within column: \mutation_information')
-	mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
-	print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
+	print('removing white space within column: \mutationinformation')
+	mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
+	print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
 		  , '\n===================================================================')
 	#%%===========================================================================
 	#############
@ -118,7 +122,7 @@ def format_mcsm_output(mcsm_outputcsv):
 	# shouldn't exist as this should be eliminated at the time of running mcsm
 	print('Sanity check:'
 		  , '\nChecking duplicate mutations')
-	if mcsm_data['mutation_information'].duplicated().sum() == 0:
+	if mcsm_data['mutationinformation'].duplicated().sum() == 0:
 		print('PASS: No duplicate mutations detected (as expected)'
 		      , '\nDim of data:', mcsm_data.shape
 		      , '\n===============================================================')
@ -126,7 +130,7 @@ def format_mcsm_output(mcsm_outputcsv):
 		print('FAIL (but not fatal): Duplicate mutations detected'
 		      , '\nDim of df with duplicates:', mcsm_data.shape
 		      , 'Removing duplicate entries')
-		mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
+		mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
 		print('Dim of data after removing duplicate muts:', mcsm_data.shape
 		      , '\n===============================================================')
 	#%%=========================================================================== 
@ -285,7 +289,7 @@ def format_mcsm_output(mcsm_outputcsv):
 	#=============================================================================
 	#%% ensuring dtypes are string for the non-numeric cols
 	#) char cols
-	char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
+	char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
 		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']

 	#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
@ -309,8 +313,8 @@ def format_mcsm_output(mcsm_outputcsv):
    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
    print('sorted df:\n', mcsm_data_fs.head())
    
-    # Remove white space everywhere before output: bit me when merging!?
-    mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
+    # Ensuring column names are lowercase before output
+    mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
 	#%%===========================================================================
 	#############
 	# sanity check before writing file
--- a/mcsm/ind_scripts/format_results_notdef.py
+++ b/mcsm/ind_scripts/format_results_notdef.py
@ -13,7 +13,6 @@ import pandas as pd
 from pandas.api.types import is_string_dtype
 from pandas.api.types import is_numeric_dtype
 import numpy as np
-
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
@ -25,10 +24,6 @@ os.getcwd()
 #%% variable assignment: input and output 
 drug = 'pyrazinamide'
 gene = 'pncA'
-
-#drug = args.drug
-#gene = args.gene
-
 gene_match = gene + '_p.'
 #==========
 # dirs
@ -41,7 +36,6 @@ outdir = datadir + '/' + drug + '/' + 'output'
 # input:
 #=======
 # 1) result_urls (from outdir)
-
 in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
 infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
 print('Input file:', infile_mcsm_output
@ -57,9 +51,11 @@ print('Output file:', out_filename_mcsm_norm

 #=======================================================================
 print('Reading input file')
-mcsm_data  = pd.read_csv(infile_mcsm_output, sep = ',')
+mcsm_data_raw  = pd.read_csv(infile_mcsm_output, sep = ',')
+
+# strip white space from both ends in all columns
+mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

-mcsm_data.columns
 # PredAffLog = affinity_change_log
 # "DUETStability_Kcalpermol = DUET_change_kcalpermol
 dforig_shape = mcsm_data.shape
@ -72,7 +68,7 @@ print('dim of infile:', dforig_shape)
 print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
 		  , '\n===================================================================')
 my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
-	           , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
+	           , 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
 	           , 'Wild-type': 'wild_type' # one letter amino acid code
 	           , 'Position': 'position' # number
 	           , 'Mutant-type': 'mutant_type' # one letter amino acid code
@ -83,17 +79,17 @@ my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info fr

 mcsm_data.rename(columns = my_colnames_dict, inplace = True)
 #%%===========================================================================
-# populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
-print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
-mcsm_data['mutation_information'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
-print('checking after populating:\n', mcsm_data['mutation_information']
+# populate mutationinformation column:mcsm style  muts {WT}<POS>{MUT}
+print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
+mcsm_data['mutationinformation'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
+print('checking after populating:\n', mcsm_data['mutationinformation']
 	  , '\n===================================================================')

-# Remove spaces b/w pasted columns
-print('removing white space within column: \mutation_information')
-mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
-print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
-	  , '\n===================================================================')
+# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
+#print('removing white space within column: \mutationinformation')
+#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
+#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
+#	  , '\n===================================================================')
 #%% Remove whitespace from column
 #orig_dtypes = mcsm_data.dtypes
 #https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
@ -103,7 +99,7 @@ print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_
 # very important
 print('Sanity check:'
 	  , '\nChecking duplicate mutations')
-if mcsm_data['mutation_information'].duplicated().sum() == 0:
+if mcsm_data['mutationinformation'].duplicated().sum() == 0:
 	print('PASS: No duplicate mutations detected (as expected)'
 	      , '\nDim of data:', mcsm_data.shape
 	      , '\n===============================================================')
@ -111,7 +107,7 @@ else:
 	print('FAIL (but not fatal): Duplicate mutations detected'
 	      , '\nDim of df with duplicates:', mcsm_data.shape
 	      , 'Removing duplicate entries')
-	mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
+	mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
 	print('Dim of data after removing duplicate muts:', mcsm_data.shape
 		, '\n===============================================================')
 #%%=========================================================================== 
@ -248,7 +244,7 @@ print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'
 #=============================================================================
 #%% ensuring dtypes are string for the non-numeric cols
 #) char cols
-char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
+char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']

 #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
@ -298,8 +294,8 @@ else:
    	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
    	, '\n===============================================================')
 #%%============================================================================
-# Remove white space everywhere before output: bit me when merging!?
-mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
+# Ensuring column names are lowercase before output
+mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()

 # writing file
 print('Writing formatted df to csv')
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@ -183,7 +183,11 @@ def format_mcsm_output(mcsm_outputcsv):
    #############
    # Read file
    #############
-    mcsm_data  = pd.read_csv(mcsm_outputcsv, sep = ',')
+    mcsm_data_raw  = pd.read_csv(mcsm_outputcsv, sep = ',')  
+    
+    # strip white space from both ends in all columns
+    mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
+
    dforig_shape = mcsm_data.shape
    print('dimensions of input file:', dforig_shape) 

@ -396,8 +400,7 @@ def format_mcsm_output(mcsm_outputcsv):
    print('removing white space within created column: wild_chain_pos')
    mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
    print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
-	      , '\n=========================================================')
-    
+	      , '\n=========================================================')    
 #%%=====================================================================    
    #############
    # ensuring corrrect dtype in non-numeric cols
@ -426,8 +429,8 @@ def format_mcsm_output(mcsm_outputcsv):
    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
    print('sorted df:\n', mcsm_data_fs.head())
    
-    # Remove white space everywhere before output: bit me when merging!?
-    mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
+    # Ensuring column names are lowercase before output
+    mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
 #%%=====================================================================
    #############
    # sanity check before writing file
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@ -18,29 +18,39 @@ arg_parser.add_argument('-c', '--chain',   help='Chain ID as per PDB, Case sensi
 arg_parser.add_argument('-l','--ligand',   help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
 arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99) 
 arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File') 
-arg_parser.add_argument('--datadir', help = 'Data Directory')
+
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+
 arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')

 args = arg_parser.parse_args()
-
-gene     = args.gene
-drug     = args.drug
-stage    = args.stage
-chain    = args.chain
-ligand   = args.ligand
-affinity = args.affinity
-pdb_filename = args.pdb_file
-data_dir = args.data_dir
-DEBUG    = args.debug
-
-# Actual Globals :-)
-host = args.host
-prediction_url = args.url
-
+#=======================================================================
+#%% variables
 #host = "http://biosig.unimelb.edu.au"
 #prediction_url = f"{host}/mcsm_lig/prediction"
 #drug = 'isoniazid'
 #gene = 'KatG'
+#%%=====================================================================
+# Command line options
+gene         = args.gene
+drug         = args.drug
+stage        = args.stage
+chain        = args.chain
+ligand       = args.ligand
+affinity     = args.affinity
+pdb_filename = args.pdb_file
+
+datadir      = args.datadir
+indir        = args.input_dir
+outdir       = args.output_dir
+
+DEBUG        = args.debug
+
+# Actual Globals :-)
+host = args.host
+prediction_url = args.url

 # submit_mcsm globals
 homedir = os.path.expanduser('~')
@ -51,13 +61,14 @@ gene_match = gene + '_p.'
 #============
 # directories
 #============
-if data_dir:
-    datadir = data_dir
-else:
-    datadir = homedir + '/git/Data'
-
-indir = datadir + '/' + drug + '/' + 'input'
-outdir = datadir + '/' + drug + '/' + 'output'
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'

 #=======
 # input
--- a/scripts/or_kinship_link.py
+++ b/scripts/or_kinship_link.py
@ -46,18 +46,18 @@ args = arg_parser.parse_args()
 #drug = 'pyrazinamide'
 #start_cds = 2288681
 #end_cds = 2289241
-
-# cmd variables
-gene = args.gene
-drug = args.drug
+#%%=====================================================================
+# Command line options
+gene       = args.gene
+drug       = args.drug
 gene_match = gene + '_p.'

-datadir      = args.datadir
-indir        = args.input_dir
-outdir       = args.output_dir
+datadir     = args.datadir
+indir       = args.input_dir
+outdir      = args.output_dir

-start_cds = args.start_coord
-end_cds = args.end_coord
+start_cds   = args.start_coord
+end_cds     = args.end_coord

 #%%=======================================================================
 #==============