Minor edits to mcsm data formatting: rename wild_position to wild_pos, add wild_chain_pos, and sort df by position

2020-07-09 11:15:56 +01:00 · 2020-07-09 11:15:56 +01:00 · 6402990154
commit 6402990154
parent 01fbc2a87b
4 changed files with 127 additions and 85 deletions
--- a/mcsm/ind_scripts/format_results.py
+++ b/mcsm/ind_scripts/format_results.py
@ -264,19 +264,29 @@ def format_mcsm_output(mcsm_outputcsv):
 		, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 	#=============================================================================
 	# Adding colname: wild_pos: sometimes useful for plotting and db
-	print('Creating column: wild_position')
-	mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-	print(mcsm_data['wild_position'].head())
+	print('Creating column: wild_pos')
+	mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+	print(mcsm_data['wild_pos'].head())
 	# Remove spaces b/w pasted columns
-	print('removing white space within column: wild_position')
-	mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-	print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+	print('removing white space within column: wild_pos')
+	mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+	print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
+		, '\n===================================================================')
+	#=============================================================================
+	# Adding colname: wild_chain_pos (wild_type + chain + position): explicit id, useful for plotting and db
+	print('Creating column: wild_chain_pos')
+	mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+	print(mcsm_data['wild_chain_pos'].head())
+	# Remove spaces b/w pasted columns
+	print('removing white space within column: wild_chain_pos')
+	mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+	print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	      , '\n===================================================================')
 	#=============================================================================
 	#%% ensuring dtypes are string for the non-numeric cols
 	#) char cols
 	char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']

 	#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 	cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@ -292,7 +302,12 @@ def format_mcsm_output(mcsm_outputcsv):
 	#=============================================================================
 	# Removing PredAff log column as it is not needed?
 	print('Removing col: PredAffLog since relevant info has been extracted from it')
-	mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+	mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+	#=============================================================================
+	# sort df by position for convenience (sort_values returns a new df; mcsm_data_f is left as-is)
+	print('Sorting df by position')
+	mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+	print('sorted df:\n', mcsm_data_fs.head())
 	#%%===========================================================================
 	#############
 	# sanity check before writing file
@ -300,29 +315,28 @@ def format_mcsm_output(mcsm_outputcsv):
 	expected_ncols_toadd = 5 # beware of hardcoded numbers
 	dforig_len = dforig_shape[1]
 	expected_cols = dforig_len + expected_ncols_toadd
-	if len(mcsm_dataf.columns) == expected_cols:
+	if len(mcsm_data_fs.columns) == expected_cols:
 		print('PASS: formatting successful'
 		, '\nformatted df has expected no. of cols:', expected_cols
-		, '\ncolnames:', mcsm_dataf.columns
+		, '\ncolnames:', mcsm_data_fs.columns
 		, '\n----------------------------------------------------------------'
-		, '\ndtypes in cols:', mcsm_dataf.dtypes
+		, '\ndtypes in cols:', mcsm_data_fs.dtypes
 		, '\n----------------------------------------------------------------'
 		, '\norig data shape:', dforig_shape
-		, '\nformatted df shape:', mcsm_dataf.shape
+		, '\nformatted df shape:', mcsm_data_fs.shape
 		, '\n===============================================================')
 	else: 
 		print('FAIL: something went wrong in formatting df'
 		, '\nLen of orig df:', dforig_len
 		, '\nExpected number of cols to add:', expected_ncols_toadd
 		, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-		, '\nGot no. of cols:', len(mcsm_dataf.columns)
+		, '\nGot no. of cols:', len(mcsm_data_fs.columns)
 		, '\nCheck formatting:'
 		, '\ncheck hardcoded value:', expected_ncols_toadd
 		, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
 		, '\n===============================================================')
 	      
-	      
-	return mcsm_dataf
+	return mcsm_data_fs
 #=======================================================================
 # call function
 mcsm_df_formatted = format_mcsm_output(infile)
--- a/mcsm/ind_scripts/format_results_notdef.py
+++ b/mcsm/ind_scripts/format_results_notdef.py
@ -23,45 +23,41 @@ os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-
-drug = 'rifampicin'
-gene = 'rpoB'
+drug = 'pyrazinamide'
+gene = 'pncA'

 #drug = args.drug
 #gene = args.gene

 gene_match = gene + '_p.'
 #==========
-# data dir
+# dirs
 #==========
 datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
+outdir = datadir + '/' + drug + '/' + 'output'

 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
-outdir = datadir + '/' + drug + '/' + 'output'
-in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
-infile = outdir + '/' + in_filename
-print('Input filename:', in_filename
-      , '\nInput path(from output dir):', outdir
+
+in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
+infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
+print('Input file:', infile_mcsm_output
      , '\n=============================================================')
      
 #=======
 # output 
 #=======
-outdir =   datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_complex_mcsm_norm.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\nOutput path:', outdir
+out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
+outfile_mcsm_norm =  outdir + '/' + out_filename_mcsm_norm
+print('Output file:', out_filename_mcsm_norm
      , '\n=============================================================')

 #=======================================================================
 print('Reading input file')
-mcsm_data  = pd.read_csv(infile, sep = ',')
+mcsm_data  = pd.read_csv(infile_mcsm_output, sep = ',')

 mcsm_data.columns
 # PredAffLog = affinity_change_log
@ -231,19 +227,29 @@ print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
 	, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 #=============================================================================
 # Adding colname: wild_pos: sometimes useful for plotting and db
-print('Creating column: wild_position')
-mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-print(mcsm_data['wild_position'].head())
+print('Creating column: wild_pos')
+mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+print(mcsm_data['wild_pos'].head())
 # Remove spaces b/w pasted columns
 print('removing white space within column: wild_position')
-mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
+	  , '\n===================================================================')
+#=============================================================================
+#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
+print('Creating column: wild_chain_pos')
+mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+print(mcsm_data['wild_chain_pos'].head())
+# Remove spaces b/w pasted columns
+print('removing white space within column: wild_chain_pos')
+mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	  , '\n===================================================================')
 #=============================================================================
 #%% ensuring dtypes are string for the non-numeric cols
 #) char cols
 char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']

 #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@ -258,29 +264,35 @@ else:
 print(mcsm_data.dtypes)
 #%%
 #=============================================================================
-# Removing PredAff log column as it is not needed?
+#%% Removing PredAff log column as it is not needed?
 print('Removing col: PredAffLog since relevant info has been extracted from it')
-mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+print(mcsm_data_f.head())
+#=============================================================================
+#%% sort df by position for convenience
+print('Sorting df by position')
+mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+print('sorted df:\n', mcsm_data_fs.head())
 #%%===========================================================================
-expected_ncols_toadd = 5 # beware of hardcoded numbers
+expected_ncols_toadd = 6 # beware of hardcoded numbers
 dforig_len = dforig_shape[1]
 expected_cols = dforig_len + expected_ncols_toadd
-if len(mcsm_dataf.columns) == expected_cols:
+if len(mcsm_data_fs.columns) == expected_cols:
 	print('PASS: formatting successful'
    	, '\nformatted df has expected no. of cols:', expected_cols
-	, '\ncolnames:', mcsm_dataf.columns
+    	, '\ncolnames:', mcsm_data_fs.columns
    	, '\n----------------------------------------------------------------'
-	, '\ndtypes in cols:', mcsm_dataf.dtypes
+    	, '\ndtypes in cols:', mcsm_data_fs.dtypes
    	, '\n----------------------------------------------------------------'
    	, '\norig data shape:', dforig_shape
-	, '\nformatted df shape:', mcsm_dataf.shape
+    	, '\nformatted df shape:', mcsm_data_fs.shape
    	, '\n===============================================================')
 else: 
 	print('FAIL: something went wrong in formatting df'
        , '\nLen of orig df:', dforig_len
    	, '\nExpected number of cols to add:', expected_ncols_toadd
    	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-	, '\nGot no. of cols:', len(mcsm_dataf.columns)
+    	, '\nGot no. of cols:', len(mcsm_data_fs.columns)
    	, '\nCheck formatting:'
    	, '\ncheck hardcoded value:', expected_ncols_toadd
    	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
@ -288,12 +300,12 @@ else:
 #%%============================================================================
 # writing file
 print('Writing formatted df to csv')
-mcsm_dataf.to_csv(outfile, index = False)
+mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)

 print('Finished writing file:'
-      , '\nFile:', outfile
-      , '\nExpected no. of rows:', len(mcsm_dataf)
-      , '\nExpected no. of cols:', len(mcsm_dataf.columns)
+      , '\nFile:', outfile_mcsm_norm
+      , '\nExpected no. of rows:', len(mcsm_data_fs)
+      , '\nExpected no. of cols:', len(mcsm_data_fs.columns)
      , '\n=============================================================')
 #%%
 #End of script
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@ -376,23 +376,34 @@ def format_mcsm_output(mcsm_outputcsv):
    # adding column: wild_position
    # useful for plots and db
    #############
-    print('Creating column: wild_position')
-    mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-    print(mcsm_data['wild_position'].head())
+    print('Creating column: wild_pos')
+    mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+    print(mcsm_data['wild_pos'].head())
    # Remove spaces b/w pasted columns
-    print('removing white space within column: wild_position')
-    mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
-    print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+    print('removing white space within created column: wild_pos')
+    mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
+    print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
+          , '\n=========================================================')
+#%%=====================================================================
+    #############
+    # adding column: wild_chain_pos
+    # useful for plots and db and it's explicit
+    #############
+    print('Creating column: wild_chain_pos')
+    mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
+    print(mcsm_data['wild_chain_pos'].head())
+    # Remove spaces b/w pasted columns
+    print('removing white space within created column: wild_chain_pos')
+    mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
+    print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	      , '\n=========================================================')
    
 #%%=====================================================================    
-    
    #############
    # ensuring corrrect dtype in non-numeric cols
    #############  
-    
    #) char cols
-    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']

    #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
    cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
@ -408,36 +419,41 @@ def format_mcsm_output(mcsm_outputcsv):
 #%%=====================================================================
    # Removing PredAff log column as it is not needed?
    print('Removing col: PredAffLog since relevant info has been extracted from it')
-    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+    mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
+#%%=====================================================================
+    # sort df by position for convenience
+    print('Sorting df by position')
+    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
+    print('sorted df:\n', mcsm_data_fs.head())
 #%%=====================================================================
    #############
    # sanity check before writing file
    #############
-    expected_ncols_toadd = 5
+    expected_ncols_toadd = 6 # beware hardcoding!
    dforig_len = dforig_shape[1]
    expected_cols = dforig_len + expected_ncols_toadd
-    if len(mcsm_dataf.columns) == expected_cols:
+    if len(mcsm_data_fs.columns) == expected_cols:
        print('PASS: formatting successful'
                , '\nformatted df has expected no. of cols:', expected_cols
                , '\n---------------------------------------------------'
-                , '\ncolnames:', mcsm_dataf.columns
+                , '\ncolnames:', mcsm_data_fs.columns
                , '\n---------------------------------------------------'
-                , '\ndtypes in cols:', mcsm_dataf.dtypes
+                , '\ndtypes in cols:', mcsm_data_fs.dtypes
                , '\n---------------------------------------------------'
                , '\norig data shape:', dforig_shape
-                , '\nformatted df shape:', mcsm_dataf.shape
+                , '\nformatted df shape:', mcsm_data_fs.shape
                , '\n===================================================')
    else: 
-        sys.exit('FAIL: something went wrong in formatting df'
+        print('FAIL: something went wrong in formatting df'
                , '\nLen of orig df:', dforig_len
                , '\nExpected number of cols to add:', expected_ncols_toadd
                , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-                , '\nGot no. of cols:', len(mcsm_dataf.columns)
+                , '\nGot no. of cols:', len(mcsm_data_fs.columns)
                , '\nCheck formatting:'
                , '\ncheck hardcoded value:', expected_ncols_toadd
                , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
                , '\n===================================================')
+        sys.exit(1) # non-zero exit status so the shell/caller sees the failed sanity check
                
-                
-    return mcsm_dataf
+    return mcsm_data_fs

--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@ -76,7 +76,8 @@ if DEBUG:
    print('DEBUG: mCSM output CSV file:', mcsm_output)

 # format_results globals
-out_filename_format = gene.lower() + '_mcsm_processed.csv'
+#out_filename_format = gene.lower() + '_mcsm_processed.csv'
+out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
 outfile_format =  outdir + '/' + out_filename_format
 if DEBUG:
    print('DEBUG: formatted CSV output:', outfile_format)
@ -111,7 +112,6 @@ def submit_mcsm():
        ,  'minutes, but will be longer for more mutations.')
 #%%=====================================================================
 def get_results():
-
    output_df = pd.DataFrame()
    url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
    success_counter = 1
@ -152,7 +152,7 @@ def format_results():
    print('Finished writing file:'
          , '\nFile:', outfile_format
          , '\nExpected no. of rows:', len(mcsm_df_formatted)
-          , '\nExpected no. of cols:', len(mcsm_df_formatted)
+          , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
          , '\n=============================================================')
 #%%=====================================================================
 def main():