From 640299015456c425b812929dcd9096e1ecc9e25d Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Jul 2020 11:15:56 +0100
Subject: [PATCH] minor edits to mcsm data formatting: add wild_chain_pos column and sort df by position

---
 mcsm/ind_scripts/format_results.py        |  44 ++++++---
 mcsm/ind_scripts/format_results_notdef.py | 106 ++++++++++++----------
 mcsm/mcsm.py                              |  56 ++++++++----
 mcsm/mcsm_wrapper.py                      |   6 +-
 4 files changed, 127 insertions(+), 85 deletions(-)

diff --git a/mcsm/ind_scripts/format_results.py b/mcsm/ind_scripts/format_results.py
index ffcf880..e2b05b0 100755
--- a/mcsm/ind_scripts/format_results.py
+++ b/mcsm/ind_scripts/format_results.py
@@ -264,19 +264,29 @@ def format_mcsm_output(mcsm_outputcsv):
 		, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 	#=============================================================================
 	# Adding colname: wild_pos: sometimes useful for plotting and db
-	print('Creating column: wild_position')
-	mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-	print(mcsm_data['wild_position'].head())
+	print('Creating column: wild_pos')
+	mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+	print(mcsm_data['wild_pos'].head())
 	# Remove spaces b/w pasted columns
-	print('removing white space within column: wild_position')
-	mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-	print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+	print('removing white space within column: wild_pos')
+	mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+	print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
 		, '\n===================================================================')
 	#=============================================================================
+	# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
+	print('Creating column: wild_chain_pos')
+	mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+	print(mcsm_data['wild_chain_pos'].head())
+	# Remove any spaces left between the pasted columns
+	print('removing white space within column: wild_chain_pos')
+	mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+	print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
+		, '\n===================================================================')
+	#=============================================================================
 	#%% ensuring dtypes are string for the non-numeric cols
 	#) char cols
 	char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
 
 	#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 	cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@@ -292,7 +302,12 @@ def format_mcsm_output(mcsm_outputcsv):
 	#=============================================================================
 	# Removing PredAff log column as it is not needed?
 	print('Removing col: PredAffLog since relevant info has been extracted from it')
-	mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+	mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+	#=============================================================================
+	#sort df by position for convenience
+	print('Sorting df by position')
+	mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+	print('sorted df:\n', mcsm_data_fs.head())
 	#%%===========================================================================
 	#############
 	# sanity check before writing file
@@ -300,29 +315,28 @@ def format_mcsm_output(mcsm_outputcsv):
 	expected_ncols_toadd = 5 # beware of hardcoded numbers
 	dforig_len = dforig_shape[1]
 	expected_cols = dforig_len + expected_ncols_toadd
-	if len(mcsm_dataf.columns) == expected_cols:
+	if len(mcsm_data_fs.columns) == expected_cols:
 		print('PASS: formatting successful'
 		, '\nformatted df has expected no. of cols:', expected_cols
-		, '\ncolnames:', mcsm_dataf.columns
+		, '\ncolnames:', mcsm_data_fs.columns
 		, '\n----------------------------------------------------------------'
-		, '\ndtypes in cols:', mcsm_dataf.dtypes
+		, '\ndtypes in cols:', mcsm_data_fs.dtypes
 		, '\n----------------------------------------------------------------'
 		, '\norig data shape:', dforig_shape
-		, '\nformatted df shape:', mcsm_dataf.shape
+		, '\nformatted df shape:', mcsm_data_fs.shape
 		, '\n===============================================================')
 	else: 
 		print('FAIL: something went wrong in formatting df'
 		, '\nLen of orig df:', dforig_len
 		, '\nExpected number of cols to add:', expected_ncols_toadd
 		, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-		, '\nGot no. of cols:', len(mcsm_dataf.columns)
+		, '\nGot no. of cols:', len(mcsm_data_fs.columns)
 		, '\nCheck formatting:'
 		, '\ncheck hardcoded value:', expected_ncols_toadd
 		, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
 		, '\n===============================================================')
-
 	      
-	return mcsm_dataf
+	return mcsm_data_fs
 #=======================================================================
 # call function
 mcsm_df_formatted = format_mcsm_output(infile)
diff --git a/mcsm/ind_scripts/format_results_notdef.py b/mcsm/ind_scripts/format_results_notdef.py
index fbf99a0..7dc0450 100755
--- a/mcsm/ind_scripts/format_results_notdef.py
+++ b/mcsm/ind_scripts/format_results_notdef.py
@@ -23,45 +23,41 @@ os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-
-drug = 'rifampicin'
-gene = 'rpoB'
+drug = 'pyrazinamide'
+gene = 'pncA'
 
 #drug = args.drug
 #gene = args.gene
 
 gene_match = gene + '_p.'
 #==========
-# data dir
+# dirs
 #==========
 datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
+outdir = datadir + '/' + drug + '/' + 'output'
 
 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
-outdir = datadir + '/' + drug + '/' + 'output'
-in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
-infile = outdir + '/' + in_filename
-print('Input filename:', in_filename
-      , '\nInput path(from output dir):', outdir
+
+in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
+infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
+print('Input file:', infile_mcsm_output
       , '\n=============================================================')
       
 #=======
 # output 
 #=======
-outdir =   datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_complex_mcsm_norm.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\nOutput path:', outdir
+out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
+outfile_mcsm_norm =  outdir + '/' + out_filename_mcsm_norm
+print('Output file:', outfile_mcsm_norm
       , '\n=============================================================')
 
 #=======================================================================
 print('Reading input file')
-mcsm_data  = pd.read_csv(infile, sep = ',')
+mcsm_data  = pd.read_csv(infile_mcsm_output, sep = ',')
 
 mcsm_data.columns
 # PredAffLog = affinity_change_log
@@ -231,19 +227,29 @@ print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
 	, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 #=============================================================================
 # Adding colname: wild_pos: sometimes useful for plotting and db
-print('Creating column: wild_position')
-mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-print(mcsm_data['wild_position'].head())
+print('Creating column: wild_pos')
+mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+print(mcsm_data['wild_pos'].head())
 # Remove spaces b/w pasted columns
 print('removing white space within column: wild_position')
-mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
+	  , '\n===================================================================')
+#=============================================================================
+#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
+print('Creating column: wild_chain_pos')
+mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+print(mcsm_data['wild_chain_pos'].head())
+# Remove spaces b/w pasted columns
+print('removing white space within column: wild_chain_pos')
+mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	  , '\n===================================================================')
 #=============================================================================
 #%% ensuring dtypes are string for the non-numeric cols
 #) char cols
 char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
 
 #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@@ -258,42 +264,48 @@ else:
 print(mcsm_data.dtypes)
 #%%
 #=============================================================================
-# Removing PredAff log column as it is not needed?
+#%% Removing PredAff log column as it is not needed?
 print('Removing col: PredAffLog since relevant info has been extracted from it')
-mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+print(mcsm_data_f.head())
+#=============================================================================
+#%% sort df by position for convenience
+print('Sorting df by position')
+mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+print('sorted df:\n', mcsm_data_fs.head())
 #%%===========================================================================
-expected_ncols_toadd = 5 # beware of hardcoded numbers
+expected_ncols_toadd = 6 # beware of hardcoded numbers
 dforig_len = dforig_shape[1]
 expected_cols = dforig_len + expected_ncols_toadd
-if len(mcsm_dataf.columns) == expected_cols:
+if len(mcsm_data_fs.columns) == expected_cols:
 	print('PASS: formatting successful'
-	, '\nformatted df has expected no. of cols:', expected_cols
-	, '\ncolnames:', mcsm_dataf.columns
-	, '\n----------------------------------------------------------------'
-	, '\ndtypes in cols:', mcsm_dataf.dtypes
-	, '\n----------------------------------------------------------------'
-	, '\norig data shape:', dforig_shape
-	, '\nformatted df shape:', mcsm_dataf.shape
-	, '\n===============================================================')
+    	, '\nformatted df has expected no. of cols:', expected_cols
+    	, '\ncolnames:', mcsm_data_fs.columns
+    	, '\n----------------------------------------------------------------'
+    	, '\ndtypes in cols:', mcsm_data_fs.dtypes
+    	, '\n----------------------------------------------------------------'
+    	, '\norig data shape:', dforig_shape
+    	, '\nformatted df shape:', mcsm_data_fs.shape
+    	, '\n===============================================================')
 else: 
 	print('FAIL: something went wrong in formatting df'
-	, '\nLen of orig df:', dforig_len
-	, '\nExpected number of cols to add:', expected_ncols_toadd
-	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-	, '\nGot no. of cols:', len(mcsm_dataf.columns)
-	, '\nCheck formatting:'
-	, '\ncheck hardcoded value:', expected_ncols_toadd
-	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
-	, '\n===============================================================')
+        , '\nLen of orig df:', dforig_len
+    	, '\nExpected number of cols to add:', expected_ncols_toadd
+    	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
+    	, '\nGot no. of cols:', len(mcsm_data_fs.columns)
+    	, '\nCheck formatting:'
+    	, '\ncheck hardcoded value:', expected_ncols_toadd
+    	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
+    	, '\n===============================================================')
 #%%============================================================================
 # writing file
 print('Writing formatted df to csv')
-mcsm_dataf.to_csv(outfile, index = False)
+mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
 
 print('Finished writing file:'
-      , '\nFile:', outfile
-      , '\nExpected no. of rows:', len(mcsm_dataf)
-      , '\nExpected no. of cols:', len(mcsm_dataf.columns)
+      , '\nFile:', outfile_mcsm_norm
+      , '\nExpected no. of rows:', len(mcsm_data_fs)
+      , '\nExpected no. of cols:', len(mcsm_data_fs.columns)
       , '\n=============================================================')
 #%%
 #End of script
diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py
index 9eb0e56..16f9004 100644
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@@ -376,23 +376,34 @@ def format_mcsm_output(mcsm_outputcsv):
     # adding column: wild_position
     # useful for plots and db
     #############
-    print('Creating column: wild_position')
-    mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-    print(mcsm_data['wild_position'].head())
+    print('Creating column: wild_pos')
+    mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+    print(mcsm_data['wild_pos'].head())
     # Remove spaces b/w pasted columns
-    print('removing white space within column: wild_position')
-    mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-    print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+    print('removing white space within created column: wild_pos')
+    mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+    print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
           , '\n=========================================================')
+#%%=====================================================================
+    #############
+    # adding column: wild_chain_pos
+    # useful for plots and db and its explicit
+    #############
+    print('Creating column: wild_chain_pos')
+    mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+    print(mcsm_data['wild_chain_pos'].head())
+    # Remove spaces b/w pasted columns
+    print('removing white space within created column: wild_chain_pos')
+    mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+    print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
+	      , '\n=========================================================')
     
 #%%=====================================================================    
-    
     #############
     # ensuring corrrect dtype in non-numeric cols
-    #############
-    
+    #############  
     #) char cols
-    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
 
     #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
     cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
@@ -408,36 +419,41 @@ def format_mcsm_output(mcsm_outputcsv):
 #%%=====================================================================
     # Removing PredAff log column as it is not needed?
     print('Removing col: PredAffLog since relevant info has been extracted from it')
-    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+    mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+#%%=====================================================================
+    # sort df by position for convenience
+    print('Sorting df by position')
+    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+    print('sorted df:\n', mcsm_data_fs.head())
 #%%=====================================================================
     #############
     # sanity check before writing file
     #############
-    expected_ncols_toadd = 5
+    expected_ncols_toadd = 6 # beware hardcoding!
     dforig_len = dforig_shape[1]
     expected_cols = dforig_len + expected_ncols_toadd
-    if len(mcsm_dataf.columns) == expected_cols:
+    if len(mcsm_data_fs.columns) == expected_cols:
         print('PASS: formatting successful'
                 , '\nformatted df has expected no. of cols:', expected_cols
                 , '\n---------------------------------------------------'
-                , '\ncolnames:', mcsm_dataf.columns
+                , '\ncolnames:', mcsm_data_fs.columns
                 , '\n---------------------------------------------------'
-                , '\ndtypes in cols:', mcsm_dataf.dtypes
+                , '\ndtypes in cols:', mcsm_data_fs.dtypes
                 , '\n---------------------------------------------------'
                 , '\norig data shape:', dforig_shape
-                , '\nformatted df shape:', mcsm_dataf.shape
+                , '\nformatted df shape:', mcsm_data_fs.shape
                 , '\n===================================================')
     else: 
-        sys.exit('FAIL: something went wrong in formatting df'
+        print('FAIL: something went wrong in formatting df'
                 , '\nLen of orig df:', dforig_len
                 , '\nExpected number of cols to add:', expected_ncols_toadd
                 , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-                , '\nGot no. of cols:', len(mcsm_dataf.columns)
+                , '\nGot no. of cols:', len(mcsm_data_fs.columns)
                 , '\nCheck formatting:'
                 , '\ncheck hardcoded value:', expected_ncols_toadd
                 , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
                 , '\n===================================================')
+        sys.exit(1) # non-zero exit status so callers can detect the failed sanity check
                 
-                
-    return mcsm_dataf
+    return mcsm_data_fs
 
diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py
index 9d34c4e..3614d66 100755
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@@ -76,7 +76,8 @@ if DEBUG:
     print('DEBUG: mCSM output CSV file:', mcsm_output)
 
 # format_results globals
-out_filename_format = gene.lower() + '_mcsm_processed.csv'
+#out_filename_format = gene.lower() + '_mcsm_processed.csv'
+out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
 outfile_format =  outdir + '/' + out_filename_format
 if DEBUG:
     print('DEBUG: formatted CSV output:', outfile_format)
@@ -111,7 +112,6 @@ def submit_mcsm():
         ,  'minutes, but will be longer for more mutations.')
 #%%=====================================================================
 def get_results():
-
     output_df = pd.DataFrame()
     url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
     success_counter = 1
@@ -152,7 +152,7 @@ def format_results():
     print('Finished writing file:'
           , '\nFile:', outfile_format
           , '\nExpected no. of rows:', len(mcsm_df_formatted)
-          , '\nExpected no. of cols:', len(mcsm_df_formatted)
+          , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
           , '\n=============================================================')
 #%%=====================================================================
 def main():