From 640299015456c425b812929dcd9096e1ecc9e25d Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Jul 2020 11:15:56 +0100 Subject: [PATCH] minor edits to format mcsm data like sorting df --- mcsm/ind_scripts/format_results.py | 44 ++++++--- mcsm/ind_scripts/format_results_notdef.py | 106 ++++++++++++---------- mcsm/mcsm.py | 56 ++++++++---- mcsm/mcsm_wrapper.py | 6 +- 4 files changed, 127 insertions(+), 85 deletions(-) diff --git a/mcsm/ind_scripts/format_results.py b/mcsm/ind_scripts/format_results.py index ffcf880..e2b05b0 100755 --- a/mcsm/ind_scripts/format_results.py +++ b/mcsm/ind_scripts/format_results.py @@ -264,19 +264,29 @@ def format_mcsm_output(mcsm_outputcsv): , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled']) #============================================================================= # Adding colname: wild_pos: sometimes useful for plotting and db - print('Creating column: wild_position') - mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) - print(mcsm_data['wild_position'].head()) + print('Creating column: wild_pos') + mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + print(mcsm_data['wild_pos'].head()) # Remove spaces b/w pasted columns - print('removing white space within column: wild_position') - mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '') - print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head() + print('removing white space within column: wild_pos') + mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '') + print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head() , '\n===================================================================') #============================================================================= + # Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit + print('Creating column: wild_chain_pos') + mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str) + print(mcsm_data['wild_chain_pos'].head()) + # Remove spaces b/w pasted columns + print('removing white space within column: wild_chain_pos') + mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '') + print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head() + , '\n===================================================================') + #============================================================================= #%% ensuring dtypes are string for the non-numeric cols #) char cols char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' - , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position'] + , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols) @@ -292,7 +302,12 @@ def format_mcsm_output(mcsm_outputcsv): #============================================================================= # Removing PredAff log column as it is not needed? print('Removing col: PredAffLog since relevant info has been extracted from it') - mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) + mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog']) + #============================================================================= + #sort df by position for convenience + print('Sorting df by position') + mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) + print('sorted df:\n', mcsm_data_fs.head()) #%%=========================================================================== ############# # sanity check before writing file @@ -300,29 +315,28 @@ def format_mcsm_output(mcsm_outputcsv): expected_ncols_toadd = 5 # beware of hardcoded numbers dforig_len = dforig_shape[1] expected_cols = dforig_len + expected_ncols_toadd - if len(mcsm_dataf.columns) == expected_cols: + if len(mcsm_data_fs.columns) == expected_cols: print('PASS: formatting successful' , '\nformatted df has expected no. of cols:', expected_cols - , '\ncolnames:', mcsm_dataf.columns + , '\ncolnames:', mcsm_data_fs.columns , '\n----------------------------------------------------------------' - , '\ndtypes in cols:', mcsm_dataf.dtypes + , '\ndtypes in cols:', mcsm_data_fs.dtypes , '\n----------------------------------------------------------------' , '\norig data shape:', dforig_shape - , '\nformatted df shape:', mcsm_dataf.shape + , '\nformatted df shape:', mcsm_data_fs.shape , '\n===============================================================') else: print('FAIL: something went wrong in formatting df' , '\nLen of orig df:', dforig_len , '\nExpected number of cols to add:', expected_ncols_toadd , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')' - , '\nGot no. of cols:', len(mcsm_dataf.columns) + , '\nGot no. of cols:', len(mcsm_data_fs.columns) , '\nCheck formatting:' , '\ncheck hardcoded value:', expected_ncols_toadd , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' , '\n===============================================================') - - return mcsm_dataf + return mcsm_data_fs #======================================================================= # call function mcsm_df_formatted = format_mcsm_output(infile) diff --git a/mcsm/ind_scripts/format_results_notdef.py b/mcsm/ind_scripts/format_results_notdef.py index fbf99a0..7dc0450 100755 --- a/mcsm/ind_scripts/format_results_notdef.py +++ b/mcsm/ind_scripts/format_results_notdef.py @@ -23,45 +23,41 @@ os.chdir(homedir + '/git/LSHTM_analysis/mcsm') os.getcwd() #======================================================================= #%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' - -drug = 'rifampicin' -gene = 'rpoB' +drug = 'pyrazinamide' +gene = 'pncA' #drug = args.drug #gene = args.gene gene_match = gene + '_p.' #========== -# data dir +# dirs #========== datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/' + 'input' +outdir = datadir + '/' + drug + '/' + 'output' #======= # input: #======= # 1) result_urls (from outdir) -outdir = datadir + '/' + drug + '/' + 'output' -in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py) -infile = outdir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path(from output dir):', outdir + +in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py) +infile_mcsm_output = outdir + '/' + in_filename_mcsm_output +print('Input file:', infile_mcsm_output , '\n=============================================================') #======= # output #======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_complex_mcsm_norm.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir +out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv' +outfile_mcsm_norm = outdir + '/' + out_filename_mcsm_norm +print('Output file:', out_filename_mcsm_norm , '\n=============================================================') #======================================================================= print('Reading input file') -mcsm_data = pd.read_csv(infile, sep = ',') +mcsm_data = pd.read_csv(infile_mcsm_output, sep = ',') mcsm_data.columns # PredAffLog = affinity_change_log @@ -231,19 +227,29 @@ print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change'] , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled']) #============================================================================= # Adding colname: wild_pos: sometimes useful for plotting and db -print('Creating column: wild_position') -mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) -print(mcsm_data['wild_position'].head()) +print('Creating column: wild_pos') +mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) +print(mcsm_data['wild_pos'].head()) # Remove spaces b/w pasted columns print('removing white space within column: wild_position') -mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '') -print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head() +mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '') +print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head() + , '\n===================================================================') +#============================================================================= +#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit +print('Creating column: wild_chain_pos') +mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str) +print(mcsm_data['wild_chain_pos'].head()) +# Remove spaces b/w pasted columns +print('removing white space within column: wild_chain_pos') +mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '') +print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head() , '\n===================================================================') #============================================================================= #%% ensuring dtypes are string for the non-numeric cols #) char cols char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' - , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position'] + , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols) @@ -258,42 +264,48 @@ else: print(mcsm_data.dtypes) #%% #============================================================================= -# Removing PredAff log column as it is not needed? +#%% Removing PredAff log column as it is not needed? print('Removing col: PredAffLog since relevant info has been extracted from it') -mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) +mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog']) +print(mcsm_data_f.head()) +#============================================================================= +#%% sort df by position for convenience +print('Sorting df by position') +mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) +print('sorted df:\n', mcsm_data_fs.head()) #%%=========================================================================== -expected_ncols_toadd = 5 # beware of hardcoded numbers +expected_ncols_toadd = 6 # beware of hardcoded numbers dforig_len = dforig_shape[1] expected_cols = dforig_len + expected_ncols_toadd -if len(mcsm_dataf.columns) == expected_cols: +if len(mcsm_data_fs.columns) == expected_cols: print('PASS: formatting successful' - , '\nformatted df has expected no. of cols:', expected_cols - , '\ncolnames:', mcsm_dataf.columns - , '\n----------------------------------------------------------------' - , '\ndtypes in cols:', mcsm_dataf.dtypes - , '\n----------------------------------------------------------------' - , '\norig data shape:', dforig_shape - , '\nformatted df shape:', mcsm_dataf.shape - , '\n===============================================================') + , '\nformatted df has expected no. of cols:', expected_cols + , '\ncolnames:', mcsm_data_fs.columns + , '\n----------------------------------------------------------------' + , '\ndtypes in cols:', mcsm_data_fs.dtypes + , '\n----------------------------------------------------------------' + , '\norig data shape:', dforig_shape + , '\nformatted df shape:', mcsm_data_fs.shape + , '\n===============================================================') else: print('FAIL: something went wrong in formatting df' - , '\nLen of orig df:', dforig_len - , '\nExpected number of cols to add:', expected_ncols_toadd - , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')' - , '\nGot no. of cols:', len(mcsm_dataf.columns) - , '\nCheck formatting:' - , '\ncheck hardcoded value:', expected_ncols_toadd - , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' - , '\n===============================================================') + , '\nLen of orig df:', dforig_len + , '\nExpected number of cols to add:', expected_ncols_toadd + , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')' + , '\nGot no. of cols:', len(mcsm_data_fs.columns) + , '\nCheck formatting:' + , '\ncheck hardcoded value:', expected_ncols_toadd + , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' + , '\n===============================================================') #%%============================================================================ # writing file print('Writing formatted df to csv') -mcsm_dataf.to_csv(outfile, index = False) +mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False) print('Finished writing file:' - , '\nFile:', outfile - , '\nExpected no. of rows:', len(mcsm_dataf) - , '\nExpected no. of cols:', len(mcsm_dataf.columns) + , '\nFile:', outfile_mcsm_norm + , '\nExpected no. of rows:', len(mcsm_data_fs) + , '\nExpected no. of cols:', len(mcsm_data_fs.columns) , '\n=============================================================') #%% #End of script diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py index 9eb0e56..16f9004 100644 --- a/mcsm/mcsm.py +++ b/mcsm/mcsm.py @@ -376,23 +376,34 @@ def format_mcsm_output(mcsm_outputcsv): # adding column: wild_position # useful for plots and db ############# - print('Creating column: wild_position') - mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) - print(mcsm_data['wild_position'].head()) + print('Creating column: wild_pos') + mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + print(mcsm_data['wild_pos'].head()) # Remove spaces b/w pasted columns - print('removing white space within column: wild_position') - mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '') - print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head() + print('removing white space within created column: wild_pos') + mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '') + print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head() , '\n=========================================================') +#%%===================================================================== + ############# + # adding column: wild_chain_pos + # useful for plots and db and its explicit + ############# + print('Creating column: wild_chain_pos') + mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str) + print(mcsm_data['wild_chain_pos'].head()) + # Remove spaces b/w pasted columns + print('removing white space within created column: wild_chain_pos') + mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '') + print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head() + , '\n=========================================================') #%%===================================================================== - ############# # ensuring corrrect dtype in non-numeric cols - ############# - + ############# #) char cols - char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position'] + char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols) @@ -408,36 +419,41 @@ def format_mcsm_output(mcsm_outputcsv): #%%===================================================================== # Removing PredAff log column as it is not needed? print('Removing col: PredAffLog since relevant info has been extracted from it') - mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) + mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog']) +#%%===================================================================== + # sort df by position for convenience + print('Sorting df by position') + mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) + print('sorted df:\n', mcsm_data_fs.head()) #%%===================================================================== ############# # sanity check before writing file ############# - expected_ncols_toadd = 5 + expected_ncols_toadd = 6 # beware hardcoding! dforig_len = dforig_shape[1] expected_cols = dforig_len + expected_ncols_toadd - if len(mcsm_dataf.columns) == expected_cols: + if len(mcsm_data_fs.columns) == expected_cols: print('PASS: formatting successful' , '\nformatted df has expected no. of cols:', expected_cols , '\n---------------------------------------------------' - , '\ncolnames:', mcsm_dataf.columns + , '\ncolnames:', mcsm_data_fs.columns , '\n---------------------------------------------------' - , '\ndtypes in cols:', mcsm_dataf.dtypes + , '\ndtypes in cols:', mcsm_data_fs.dtypes , '\n---------------------------------------------------' , '\norig data shape:', dforig_shape - , '\nformatted df shape:', mcsm_dataf.shape + , '\nformatted df shape:', mcsm_data_fs.shape , '\n===================================================') else: - sys.exit('FAIL: something went wrong in formatting df' + print('FAIL: something went wrong in formatting df' , '\nLen of orig df:', dforig_len , '\nExpected number of cols to add:', expected_ncols_toadd , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')' - , '\nGot no. of cols:', len(mcsm_dataf.columns) + , '\nGot no. of cols:', len(mcsm_data_fs.columns) , '\nCheck formatting:' , '\ncheck hardcoded value:', expected_ncols_toadd , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' , '\n===================================================') + sys.exit() - - return mcsm_dataf + return mcsm_data_fs diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py index 9d34c4e..3614d66 100755 --- a/mcsm/mcsm_wrapper.py +++ b/mcsm/mcsm_wrapper.py @@ -76,7 +76,8 @@ if DEBUG: print('DEBUG: mCSM output CSV file:', mcsm_output) # format_results globals -out_filename_format = gene.lower() + '_mcsm_processed.csv' +#out_filename_format = gene.lower() + '_mcsm_processed.csv' +out_filename_format = gene.lower() + '_complex_mcsm_norm.csv' outfile_format = outdir + '/' + out_filename_format if DEBUG: print('DEBUG: formatted CSV output:', outfile_format) @@ -111,7 +112,6 @@ def submit_mcsm(): , 'minutes, but will be longer for more mutations.') #%%===================================================================== def get_results(): - output_df = pd.DataFrame() url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 success_counter = 1 @@ -152,7 +152,7 @@ def format_results(): print('Finished writing file:' , '\nFile:', outfile_format , '\nExpected no. of rows:', len(mcsm_df_formatted) - , '\nExpected no. of cols:', len(mcsm_df_formatted) + , '\nExpected no. of cols:', len(mcsm_df_formatted.columns) , '\n=============================================================') #%%===================================================================== def main():