Minor edits to mCSM data formatting, e.g. sorting the df by position

2020-07-09 11:15:56 +01:00 · 2020-07-09 11:15:56 +01:00 · 6402990154
commit 6402990154
parent 01fbc2a87b
4 changed files with 127 additions and 85 deletions
--- a/mcsm/ind_scripts/format_results.py
+++ b/mcsm/ind_scripts/format_results.py
@ -264,19 +264,29 @@ def format_mcsm_output(mcsm_outputcsv):
 		, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 	#=============================================================================
 	# Adding colname: wild_pos: sometimes useful for plotting and db
-	print('Creating column: wild_position')
+	print('Creating column: wild_pos')
-	mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+	mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-	print(mcsm_data['wild_position'].head())
+	print(mcsm_data['wild_pos'].head())
 	# Remove spaces b/w pasted columns
-	print('removing white space within column: wild_position')
+	print('removing white space within column: wild_pos')
-	mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
+	mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
-	print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+	print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
 		, '\n===================================================================')
 	#=============================================================================
 	# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
    print('Creating column: wild_chain_pos')
    mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
    print(mcsm_data['wild_chain_pos'].head())
    # Remove spaces b/w pasted columns
    print('removing white space within column: wild_chain_pos')
    mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
    print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	      , '\n===================================================================')
 	#=============================================================================
 	#%% ensuring dtypes are string for the non-numeric cols
 	#) char cols
 	char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
 	#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 	cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@ -292,7 +302,12 @@ def format_mcsm_output(mcsm_outputcsv):
 	#=============================================================================
 	# Removing PredAff log column as it is not needed?
 	print('Removing col: PredAffLog since relevant info has been extracted from it')
-	mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+	mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
 	#=============================================================================
 	#sort df by position for convenience
    print('Sorting df by position')
    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
    print('sorted df:\n', mcsm_data_fs.head())
 	#%%===========================================================================
 	#############
 	# sanity check before writing file
@ -300,29 +315,28 @@ def format_mcsm_output(mcsm_outputcsv):
 	expected_ncols_toadd = 5 # beware of hardcoded numbers
 	dforig_len = dforig_shape[1]
 	expected_cols = dforig_len + expected_ncols_toadd
-	if len(mcsm_dataf.columns) == expected_cols:
+	if len(mcsm_data_fs.columns) == expected_cols:
 		print('PASS: formatting successful'
 		, '\nformatted df has expected no. of cols:', expected_cols
-		, '\ncolnames:', mcsm_dataf.columns
+		, '\ncolnames:', mcsm_data_fs.columns
 		, '\n----------------------------------------------------------------'
-		, '\ndtypes in cols:', mcsm_dataf.dtypes
+		, '\ndtypes in cols:', mcsm_data_fs.dtypes
 		, '\n----------------------------------------------------------------'
 		, '\norig data shape:', dforig_shape
-		, '\nformatted df shape:', mcsm_dataf.shape
+		, '\nformatted df shape:', mcsm_data_fs.shape
 		, '\n===============================================================')
 	else: 
 		print('FAIL: something went wrong in formatting df'
 		, '\nLen of orig df:', dforig_len
 		, '\nExpected number of cols to add:', expected_ncols_toadd
 		, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-		, '\nGot no. of cols:', len(mcsm_dataf.columns)
+		, '\nGot no. of cols:', len(mcsm_data_fs.columns)
 		, '\nCheck formatting:'
 		, '\ncheck hardcoded value:', expected_ncols_toadd
 		, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
 		, '\n===============================================================')
-	      
+	return mcsm_data_fs
 	return mcsm_dataf
 #=======================================================================
 # call function
 mcsm_df_formatted = format_mcsm_output(infile)
--- a/mcsm/ind_scripts/format_results_notdef.py
+++ b/mcsm/ind_scripts/format_results_notdef.py
@ -23,45 +23,41 @@ os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
+drug = 'pyrazinamide'
-#gene = 'pncA'
+gene = 'pncA'
 drug = 'rifampicin'
 gene = 'rpoB'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
-# data dir
+# dirs
 #==========
 datadir = homedir + '/' + 'git/Data'
 indir = datadir + '/' + drug + '/' + 'input'
 outdir = datadir + '/' + drug + '/' + 'output'
 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
-outdir = datadir + '/' + drug + '/' + 'output'
+
-in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
+in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
-infile = outdir + '/' + in_filename
+infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
-print('Input filename:', in_filename
+print('Input file:', infile_mcsm_output
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')
 #=======
 # output 
 #=======
-outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
-out_filename = gene.lower() + '_complex_mcsm_norm.csv'
+outfile_mcsm_norm =  outdir + '/' + out_filename_mcsm_norm
-outfile =  outdir + '/' + out_filename
+print('Output file:', out_filename_mcsm_norm
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
 #=======================================================================
 print('Reading input file')
-mcsm_data  = pd.read_csv(infile, sep = ',')
+mcsm_data  = pd.read_csv(infile_mcsm_output, sep = ',')
 mcsm_data.columns
 # PredAffLog = affinity_change_log
@ -231,19 +227,29 @@ print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
 	, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 #=============================================================================
 # Adding colname: wild_pos: sometimes useful for plotting and db
-print('Creating column: wild_position')
+print('Creating column: wild_pos')
-mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-print(mcsm_data['wild_position'].head())
+print(mcsm_data['wild_pos'].head())
 # Remove spaces b/w pasted columns
 print('removing white space within column: wild_position')
-mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
+mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
-print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
 	  , '\n===================================================================')
 #=============================================================================
 #%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
 print('Creating column: wild_chain_pos')
 mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
 print(mcsm_data['wild_chain_pos'].head())
 # Remove spaces b/w pasted columns
 print('removing white space within column: wild_chain_pos')
 mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
 print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	  , '\n===================================================================')
 #=============================================================================
 #%% ensuring dtypes are string for the non-numeric cols
 #) char cols
 char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
-             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
 #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
@ -258,42 +264,48 @@ else:
 print(mcsm_data.dtypes)
 #%%
 #=============================================================================
-# Removing PredAff log column as it is not needed?
+#%% Removing PredAff log column as it is not needed?
 print('Removing col: PredAffLog since relevant info has been extracted from it')
-mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
 print(mcsm_data_f.head())
 #=============================================================================
 #%% sort df by position for convenience
 print('Sorting df by position')
 mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
 print('sorted df:\n', mcsm_data_fs.head())
 #%%===========================================================================
-expected_ncols_toadd = 5 # beware of hardcoded numbers
+expected_ncols_toadd = 6 # beware of hardcoded numbers
 dforig_len = dforig_shape[1]
 expected_cols = dforig_len + expected_ncols_toadd
-if len(mcsm_dataf.columns) == expected_cols:
+if len(mcsm_data_fs.columns) == expected_cols:
 	print('PASS: formatting successful'
-	, '\nformatted df has expected no. of cols:', expected_cols
+    	, '\nformatted df has expected no. of cols:', expected_cols
-	, '\ncolnames:', mcsm_dataf.columns
+    	, '\ncolnames:', mcsm_data_fs.columns
-	, '\n----------------------------------------------------------------'
+    	, '\n----------------------------------------------------------------'
-	, '\ndtypes in cols:', mcsm_dataf.dtypes
+    	, '\ndtypes in cols:', mcsm_data_fs.dtypes
-	, '\n----------------------------------------------------------------'
+    	, '\n----------------------------------------------------------------'
-	, '\norig data shape:', dforig_shape
+    	, '\norig data shape:', dforig_shape
-	, '\nformatted df shape:', mcsm_dataf.shape
+    	, '\nformatted df shape:', mcsm_data_fs.shape
-	, '\n===============================================================')
+    	, '\n===============================================================')
 else: 
 	print('FAIL: something went wrong in formatting df'
-	, '\nLen of orig df:', dforig_len
+        , '\nLen of orig df:', dforig_len
-	, '\nExpected number of cols to add:', expected_ncols_toadd
+    	, '\nExpected number of cols to add:', expected_ncols_toadd
-	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
+    	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-	, '\nGot no. of cols:', len(mcsm_dataf.columns)
+    	, '\nGot no. of cols:', len(mcsm_data_fs.columns)
-	, '\nCheck formatting:'
+    	, '\nCheck formatting:'
-	, '\ncheck hardcoded value:', expected_ncols_toadd
+    	, '\ncheck hardcoded value:', expected_ncols_toadd
-	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
+    	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
-	, '\n===============================================================')
+    	, '\n===============================================================')
 #%%============================================================================
 # writing file
 print('Writing formatted df to csv')
-mcsm_dataf.to_csv(outfile, index = False)
+mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
 print('Finished writing file:'
-      , '\nFile:', outfile
+      , '\nFile:', outfile_mcsm_norm
-      , '\nExpected no. of rows:', len(mcsm_dataf)
+      , '\nExpected no. of rows:', len(mcsm_data_fs)
-      , '\nExpected no. of cols:', len(mcsm_dataf.columns)
+      , '\nExpected no. of cols:', len(mcsm_data_fs.columns)
      , '\n=============================================================')
 #%%
 #End of script
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@ -376,23 +376,34 @@ def format_mcsm_output(mcsm_outputcsv):
    # adding column: wild_pos
    # useful for plots and db
    #############
-    print('Creating column: wild_position')
+    print('Creating column: wild_pos')
-    mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
+    mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
-    print(mcsm_data['wild_position'].head())
+    print(mcsm_data['wild_pos'].head())
    # Remove spaces b/w pasted columns
-    print('removing white space within column: wild_position')
+    print('removing white space within created column: wild_pos')
-    mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
+    mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
-    print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
+    print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
          , '\n=========================================================')
 #%%=====================================================================
    #############
    # adding column: wild_chain_pos
    # useful for plots and db, and is explicit
    #############
    print('Creating column: wild_chain_pos')
    mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
    print(mcsm_data['wild_chain_pos'].head())
    # Remove spaces b/w pasted columns
    print('removing white space within created column: wild_chain_pos')
    mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
    print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
 	      , '\n=========================================================')
 #%%=====================================================================    
    #############
    # ensuring correct dtype in non-numeric cols
    #############  
    #) char cols
-    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
+    char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
    #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
    cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
@ -408,36 +419,41 @@ def format_mcsm_output(mcsm_outputcsv):
 #%%=====================================================================
    # Removing PredAff log column as it is not needed?
    print('Removing col: PredAffLog since relevant info has been extracted from it')
-    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
+    mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
 #%%=====================================================================
    # sort df by position for convenience
    print('Sorting df by position')
    mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
    print('sorted df:\n', mcsm_data_fs.head())
 #%%=====================================================================
    #############
    # sanity check before writing file
    #############
-    expected_ncols_toadd = 5
+    expected_ncols_toadd = 6 # beware hardcoding!
    dforig_len = dforig_shape[1]
    expected_cols = dforig_len + expected_ncols_toadd
-    if len(mcsm_dataf.columns) == expected_cols:
+    if len(mcsm_data_fs.columns) == expected_cols:
        print('PASS: formatting successful'
                , '\nformatted df has expected no. of cols:', expected_cols
                , '\n---------------------------------------------------'
-                , '\ncolnames:', mcsm_dataf.columns
+                , '\ncolnames:', mcsm_data_fs.columns
                , '\n---------------------------------------------------'
-                , '\ndtypes in cols:', mcsm_dataf.dtypes
+                , '\ndtypes in cols:', mcsm_data_fs.dtypes
                , '\n---------------------------------------------------'
                , '\norig data shape:', dforig_shape
-                , '\nformatted df shape:', mcsm_dataf.shape
+                , '\nformatted df shape:', mcsm_data_fs.shape
                , '\n===================================================')
    else: 
-        sys.exit('FAIL: something went wrong in formatting df'
+        print('FAIL: something went wrong in formatting df'
                , '\nLen of orig df:', dforig_len
                , '\nExpected number of cols to add:', expected_ncols_toadd
                , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
-                , '\nGot no. of cols:', len(mcsm_dataf.columns)
+                , '\nGot no. of cols:', len(mcsm_data_fs.columns)
                , '\nCheck formatting:'
                , '\ncheck hardcoded value:', expected_ncols_toadd
                , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
                , '\n===================================================')
        sys.exit()        
-                
+    return mcsm_data_fs
    return mcsm_dataf
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@ -76,7 +76,8 @@ if DEBUG:
    print('DEBUG: mCSM output CSV file:', mcsm_output)
 # format_results globals
-out_filename_format = gene.lower() + '_mcsm_processed.csv'
+#out_filename_format = gene.lower() + '_mcsm_processed.csv'
 out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
 outfile_format =  outdir + '/' + out_filename_format
 if DEBUG:
    print('DEBUG: formatted CSV output:', outfile_format)
@ -111,7 +112,6 @@ def submit_mcsm():
        ,  'minutes, but will be longer for more mutations.')
 #%%=====================================================================
 def get_results():
    output_df = pd.DataFrame()
    url_counter = 1 # NOTE: the count starts at one, not zero
    success_counter = 1
@ -152,7 +152,7 @@ def format_results():
    print('Finished writing file:'
          , '\nFile:', outfile_format
          , '\nExpected no. of rows:', len(mcsm_df_formatted)
-          , '\nExpected no. of cols:', len(mcsm_df_formatted)
+          , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
          , '\n=============================================================')
 #%%=====================================================================
 def main():