From 8dc2fa732657bab133e91e542fa79402fdfb44f8 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 14 Jul 2020 14:07:23 +0100 Subject: [PATCH] fixed white space prob with mcsm input with merge --- foldx/runFoldx.py | 42 ++++++++--------- mcsm/ind_scripts/format_results.py | 34 ++++++++------ mcsm/ind_scripts/format_results_notdef.py | 42 ++++++++--------- mcsm/mcsm.py | 13 ++++-- mcsm/run_mcsm.py | 57 ++++++++++++++--------- scripts/or_kinship_link.py | 18 +++---- 6 files changed, 108 insertions(+), 98 deletions(-) diff --git a/foldx/runFoldx.py b/foldx/runFoldx.py index ed50594..ffd24e8 100755 --- a/foldx/runFoldx.py +++ b/foldx/runFoldx.py @@ -36,7 +36,6 @@ arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assm arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') - arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _snps.csv exists') @@ -50,38 +49,36 @@ args = arg_parser.parse_args() #gene = 'pncA' #gene_match = gene + '_p.' #%%===================================================================== -# Command Line Options -drug = args.drug -gene = args.gene +# Command line options +drug = args.drug +gene = args.gene -data_dir = args.data_dir -indir = args.input_dir -outdir = args.output_dir +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir -mut_filename = args.mutation_file -chainA = args.chain1 -chainB = args.chain2 -pdb_filename = args.pdb_file +mut_filename = args.mutation_file +chainA = args.chain1 +chainB = args.chain2 +pdb_filename = args.pdb_file # os.path.splitext will fail interestingly with file.pdb.txt.zip #pdb_name = os.path.splitext(pdb_file)[0] # Just the filename, thanks #pdb_name = Path(in_filename_pdb).stem -#============ +#============== # directories -#============ -if data_dir: - datadir = data_dir -else: +#============== +if not datadir: datadir = homedir + '/' + 'git/Data' - -if not indir: - indir = datadir + '/' + drug + '/' + 'input' - -if not outdir: - outdir = datadir + '/' + drug + '/' + 'output' +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + # FIXME: this is a temporary directory and should be correctly handled process_dir = datadir + '/' + drug +'/' + 'processing' @@ -90,7 +87,6 @@ os.mkdir(process_dir) # input #======= # FIXME - if pdb_filename: pdb_name = Path(pdb_filename).stem else: diff --git a/mcsm/ind_scripts/format_results.py b/mcsm/ind_scripts/format_results.py index e80a1c6..d802e72 100755 --- a/mcsm/ind_scripts/format_results.py +++ b/mcsm/ind_scripts/format_results.py @@ -74,7 +74,11 @@ def format_mcsm_output(mcsm_outputcsv): ############# # Read file ############# - mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',') + mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',') + + # strip white space from both ends in all columns + mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) + dforig_shape = mcsm_data.shape print('dimensions of input file:', dforig_shape) @@ -85,7 +89,7 @@ def format_mcsm_output(mcsm_outputcsv): print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' , '\n===================================================================') my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded - , 'Mutation information': 'mutation_information' # {wild_type}{mutant_type} + , 'Mutation information': 'mutationinformation' # {wild_type}{mutant_type} , 'Wild-type': 'wild_type' # one letter amino acid code , 'Position': 'position' # number , 'Mutant-type': 'mutant_type' # one letter amino acid code @@ -97,19 +101,19 @@ def format_mcsm_output(mcsm_outputcsv): mcsm_data.rename(columns = my_colnames_dict, inplace = True) #%%=========================================================================== ################################# - # populate mutation_information + # populate mutationinformation # col which is currently blank ################################# - # populate mutation_information column:mcsm style muts {WT}{MUT} - print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) - mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] - print('checking after populating:\n', mcsm_data['mutation_information'] + # populate mutationinformation column:mcsm style muts {WT}{MUT} + print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation']) + mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] + print('checking after populating:\n', mcsm_data['mutationinformation'] , '\n===================================================================') # Remove spaces b/w pasted columns - print('removing white space within column: \mutation_information') - mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') - print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] + print('removing white space within column: \mutationinformation') + mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '') + print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation'] , '\n===================================================================') #%%=========================================================================== ############# @@ -118,7 +122,7 @@ def format_mcsm_output(mcsm_outputcsv): # shouldn't exist as this should be eliminated at the time of running mcsm print('Sanity check:' , '\nChecking duplicate mutations') - if mcsm_data['mutation_information'].duplicated().sum() == 0: + if mcsm_data['mutationinformation'].duplicated().sum() == 0: print('PASS: No duplicate mutations detected (as expected)' , '\nDim of data:', mcsm_data.shape , '\n===============================================================') @@ -126,7 +130,7 @@ def format_mcsm_output(mcsm_outputcsv): print('FAIL (but not fatal): Duplicate mutations detected' , '\nDim of df with duplicates:', mcsm_data.shape , 'Removing duplicate entries') - mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) + mcsm_data = mcsm_data.drop_duplicates(['mutationinformation']) print('Dim of data after removing duplicate muts:', mcsm_data.shape , '\n===============================================================') #%%=========================================================================== @@ -285,7 +289,7 @@ def format_mcsm_output(mcsm_outputcsv): #============================================================================= #%% ensuring dtypes are string for the non-numeric cols #) char cols - char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' + char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain' , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) @@ -309,8 +313,8 @@ def format_mcsm_output(mcsm_outputcsv): mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) print('sorted df:\n', mcsm_data_fs.head()) - # Remove white space everywhere before output: bit me when merging!? - mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') + # Ensuring column names are lowercase before output + mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower() #%%=========================================================================== ############# # sanity check before writing file diff --git a/mcsm/ind_scripts/format_results_notdef.py b/mcsm/ind_scripts/format_results_notdef.py index 8d442d7..b3e784f 100755 --- a/mcsm/ind_scripts/format_results_notdef.py +++ b/mcsm/ind_scripts/format_results_notdef.py @@ -13,7 +13,6 @@ import pandas as pd from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype import numpy as np - #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') @@ -25,10 +24,6 @@ os.getcwd() #%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' - -#drug = args.drug -#gene = args.gene - gene_match = gene + '_p.' #========== # dirs @@ -41,7 +36,6 @@ outdir = datadir + '/' + drug + '/' + 'output' # input: #======= # 1) result_urls (from outdir) - in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py) infile_mcsm_output = outdir + '/' + in_filename_mcsm_output print('Input file:', infile_mcsm_output @@ -57,9 +51,11 @@ print('Output file:', out_filename_mcsm_norm #======================================================================= print('Reading input file') -mcsm_data = pd.read_csv(infile_mcsm_output, sep = ',') +mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',') + +# strip white space from both ends in all columns +mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) -mcsm_data.columns # PredAffLog = affinity_change_log # "DUETStability_Kcalpermol = DUET_change_kcalpermol dforig_shape = mcsm_data.shape @@ -72,7 +68,7 @@ print('dim of infile:', dforig_shape) print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' , '\n===================================================================') my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded - , 'Mutation information': 'mutation_information' # {wild_type}{mutant_type} + , 'Mutation information': 'mutationinformation' # {wild_type}{mutant_type} , 'Wild-type': 'wild_type' # one letter amino acid code , 'Position': 'position' # number , 'Mutant-type': 'mutant_type' # one letter amino acid code @@ -83,17 +79,17 @@ my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info fr mcsm_data.rename(columns = my_colnames_dict, inplace = True) #%%=========================================================================== -# populate mutation_information column:mcsm style muts {WT}{MUT} -print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) -mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] -print('checking after populating:\n', mcsm_data['mutation_information'] +# populate mutationinformation column:mcsm style muts {WT}{MUT} +print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation']) +mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] +print('checking after populating:\n', mcsm_data['mutationinformation'] , '\n===================================================================') -# Remove spaces b/w pasted columns -print('removing white space within column: \mutation_information') -mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') -print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] - , '\n===================================================================') +# Remove spaces b/w pasted columns: not needed as white space removed at the time of import +#print('removing white space within column: \mutationinformation') +#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '') +#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation'] +# , '\n===================================================================') #%% Remove whitespace from column #orig_dtypes = mcsm_data.dtypes #https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292 @@ -103,7 +99,7 @@ print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_ # very important print('Sanity check:' , '\nChecking duplicate mutations') -if mcsm_data['mutation_information'].duplicated().sum() == 0: +if mcsm_data['mutationinformation'].duplicated().sum() == 0: print('PASS: No duplicate mutations detected (as expected)' , '\nDim of data:', mcsm_data.shape , '\n===============================================================') @@ -111,7 +107,7 @@ else: print('FAIL (but not fatal): Duplicate mutations detected' , '\nDim of df with duplicates:', mcsm_data.shape , 'Removing duplicate entries') - mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) + mcsm_data = mcsm_data.drop_duplicates(['mutationinformation']) print('Dim of data after removing duplicate muts:', mcsm_data.shape , '\n===============================================================') #%%=========================================================================== @@ -248,7 +244,7 @@ print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos' #============================================================================= #%% ensuring dtypes are string for the non-numeric cols #) char cols -char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' +char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain' , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) @@ -298,8 +294,8 @@ else: , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' , '\n===============================================================') #%%============================================================================ -# Remove white space everywhere before output: bit me when merging!? -mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') +# Ensuring column names are lowercase before output +mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower() # writing file print('Writing formatted df to csv') diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py index 3656fd7..b098636 100644 --- a/mcsm/mcsm.py +++ b/mcsm/mcsm.py @@ -183,7 +183,11 @@ def format_mcsm_output(mcsm_outputcsv): ############# # Read file ############# - mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',') + mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',') + + # strip white space from both ends in all columns + mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) + dforig_shape = mcsm_data.shape print('dimensions of input file:', dforig_shape) @@ -396,8 +400,7 @@ def format_mcsm_output(mcsm_outputcsv): print('removing white space within created column: wild_chain_pos') mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '') print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head() - , '\n=========================================================') - + , '\n=========================================================') #%%===================================================================== ############# # ensuring corrrect dtype in non-numeric cols @@ -426,8 +429,8 @@ def format_mcsm_output(mcsm_outputcsv): mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) print('sorted df:\n', mcsm_data_fs.head()) - # Remove white space everywhere before output: bit me when merging!? - mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') + # Ensuring column names are lowercase before output + mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower() #%%===================================================================== ############# # sanity check before writing file diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py index 8434091..b6ccb31 100755 --- a/mcsm/run_mcsm.py +++ b/mcsm/run_mcsm.py @@ -18,29 +18,39 @@ arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensi arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None) arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99) arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File') -arg_parser.add_argument('--datadir', help = 'Data Directory') + +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') + arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() - -gene = args.gene -drug = args.drug -stage = args.stage -chain = args.chain -ligand = args.ligand -affinity = args.affinity -pdb_filename = args.pdb_file -data_dir = args.data_dir -DEBUG = args.debug - -# Actual Globals :-) -host = args.host -prediction_url = args.url - +#======================================================================= +#%% variables #host = "http://biosig.unimelb.edu.au" #prediction_url = f"{host}/mcsm_lig/prediction" #drug = 'isoniazid' #gene = 'KatG' +#%%===================================================================== +# Command line options +gene = args.gene +drug = args.drug +stage = args.stage +chain = args.chain +ligand = args.ligand +affinity = args.affinity +pdb_filename = args.pdb_file + +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir + +DEBUG = args.debug + +# Actual Globals :-) +host = args.host +prediction_url = args.url # submit_mcsm globals homedir = os.path.expanduser('~') @@ -51,13 +61,14 @@ gene_match = gene + '_p.' #============ # directories #============ -if data_dir: - datadir = data_dir -else: - datadir = homedir + '/git/Data' - -indir = datadir + '/' + drug + '/' + 'input' -outdir = datadir + '/' + drug + '/' + 'output' +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' #======= # input diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index f248b0b..c70616b 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -46,18 +46,18 @@ args = arg_parser.parse_args() #drug = 'pyrazinamide' #start_cds = 2288681 #end_cds = 2289241 - -# cmd variables -gene = args.gene -drug = args.drug +#%%===================================================================== +# Command line options +gene = args.gene +drug = args.drug gene_match = gene + '_p.' -datadir = args.datadir -indir = args.input_dir -outdir = args.output_dir +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir -start_cds = args.start_coord -end_cds = args.end_coord +start_cds = args.start_coord +end_cds = args.end_coord #%%======================================================================= #==============