Fixed whitespace problem with mcsm input when merging

This commit is contained in:
Tanushree Tunstall 2020-07-14 14:07:23 +01:00
parent 5a2084ba11
commit 8dc2fa7326
6 changed files with 108 additions and 98 deletions

View file

@ -36,7 +36,6 @@ arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assm
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir') arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_snps.csv exists') arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_snps.csv exists')
@ -50,38 +49,36 @@ args = arg_parser.parse_args()
#gene = 'pncA' #gene = 'pncA'
#gene_match = gene + '_p.' #gene_match = gene + '_p.'
#%%===================================================================== #%%=====================================================================
# Command Line Options # Command line options
drug = args.drug drug = args.drug
gene = args.gene gene = args.gene
data_dir = args.data_dir datadir = args.datadir
indir = args.input_dir indir = args.input_dir
outdir = args.output_dir outdir = args.output_dir
mut_filename = args.mutation_file mut_filename = args.mutation_file
chainA = args.chain1 chainA = args.chain1
chainB = args.chain2 chainB = args.chain2
pdb_filename = args.pdb_file pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip # os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0] #pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks # Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem #pdb_name = Path(in_filename_pdb).stem
#============ #==============
# directories # directories
#============ #==============
if data_dir: if not datadir:
datadir = data_dir
else:
datadir = homedir + '/' + 'git/Data' datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/' + 'input'
if not outdir:
outdir = datadir + '/' + drug + '/' + 'output'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
# FIXME: this is a temporary directory and should be correctly handled # FIXME: this is a temporary directory and should be correctly handled
process_dir = datadir + '/' + drug +'/' + 'processing' process_dir = datadir + '/' + drug +'/' + 'processing'
@ -90,7 +87,6 @@ os.mkdir(process_dir)
# input # input
#======= #=======
# FIXME # FIXME
if pdb_filename: if pdb_filename:
pdb_name = Path(pdb_filename).stem pdb_name = Path(pdb_filename).stem
else: else:

View file

@ -74,7 +74,11 @@ def format_mcsm_output(mcsm_outputcsv):
############# #############
# Read file # Read file
############# #############
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',') mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape) print('dimensions of input file:', dforig_shape)
@ -85,7 +89,7 @@ def format_mcsm_output(mcsm_outputcsv):
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
, '\n===================================================================') , '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type} , 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code , 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number , 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code , 'Mutant-type': 'mutant_type' # one letter amino acid code
@ -97,19 +101,19 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data.rename(columns = my_colnames_dict, inplace = True) mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%=========================================================================== #%%===========================================================================
################################# #################################
# populate mutation_information # populate mutationinformation
# col which is currently blank # col which is currently blank
################################# #################################
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT} # populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutation_information'] print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================') , '\n===================================================================')
# Remove spaces b/w pasted columns # Remove spaces b/w pasted columns
print('removing white space within column: \mutation_information') print('removing white space within column: \mutationinformation')
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n===================================================================') , '\n===================================================================')
#%%=========================================================================== #%%===========================================================================
############# #############
@ -118,7 +122,7 @@ def format_mcsm_output(mcsm_outputcsv):
# shouldn't exist as this should be eliminated at the time of running mcsm # shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:' print('Sanity check:'
, '\nChecking duplicate mutations') , '\nChecking duplicate mutations')
if mcsm_data['mutation_information'].duplicated().sum() == 0: if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)' print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape , '\nDim of data:', mcsm_data.shape
, '\n===============================================================') , '\n===============================================================')
@ -126,7 +130,7 @@ def format_mcsm_output(mcsm_outputcsv):
print('FAIL (but not fatal): Duplicate mutations detected' print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape , '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries') , 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================') , '\n===============================================================')
#%%=========================================================================== #%%===========================================================================
@ -285,7 +289,7 @@ def format_mcsm_output(mcsm_outputcsv):
#============================================================================= #=============================================================================
#%% ensuring dtypes are string for the non-numeric cols #%% ensuring dtypes are string for the non-numeric cols
#) char cols #) char cols
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
@ -309,8 +313,8 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head()) print('sorted df:\n', mcsm_data_fs.head())
# Remove white space everywhere before output: bit me when merging!? # Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%=========================================================================== #%%===========================================================================
############# #############
# sanity check before writing file # sanity check before writing file

View file

@ -13,7 +13,6 @@ import pandas as pd
from pandas.api.types import is_string_dtype from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype from pandas.api.types import is_numeric_dtype
import numpy as np import numpy as np
#======================================================================= #=======================================================================
#%% specify input and curr dir #%% specify input and curr dir
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
@ -25,10 +24,6 @@ os.getcwd()
#%% variable assignment: input and output #%% variable assignment: input and output
drug = 'pyrazinamide' drug = 'pyrazinamide'
gene = 'pncA' gene = 'pncA'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.' gene_match = gene + '_p.'
#========== #==========
# dirs # dirs
@ -41,7 +36,6 @@ outdir = datadir + '/' + drug + '/' + 'output'
# input: # input:
#======= #=======
# 1) result_urls (from outdir) # 1) result_urls (from outdir)
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py) in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
print('Input file:', infile_mcsm_output print('Input file:', infile_mcsm_output
@ -57,9 +51,11 @@ print('Output file:', out_filename_mcsm_norm
#======================================================================= #=======================================================================
print('Reading input file') print('Reading input file')
mcsm_data = pd.read_csv(infile_mcsm_output, sep = ',') mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
mcsm_data.columns
# PredAffLog = affinity_change_log # PredAffLog = affinity_change_log
# "DUETStability_Kcalpermol = DUET_change_kcalpermol # "DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape dforig_shape = mcsm_data.shape
@ -72,7 +68,7 @@ print('dim of infile:', dforig_shape)
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
, '\n===================================================================') , '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type} , 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code , 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number , 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code , 'Mutant-type': 'mutant_type' # one letter amino acid code
@ -83,17 +79,17 @@ my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info fr
mcsm_data.rename(columns = my_colnames_dict, inplace = True) mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%=========================================================================== #%%===========================================================================
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT} # populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutation_information'] print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================') , '\n===================================================================')
# Remove spaces b/w pasted columns # Remove spaces b/w pasted columns: not needed as white space removed at the time of import
print('removing white space within column: \mutation_information') #print('removing white space within column: \mutationinformation')
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') #mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] #print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n===================================================================') # , '\n===================================================================')
#%% Remove whitespace from column #%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes #orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292 #https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
@ -103,7 +99,7 @@ print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_
# very important # very important
print('Sanity check:' print('Sanity check:'
, '\nChecking duplicate mutations') , '\nChecking duplicate mutations')
if mcsm_data['mutation_information'].duplicated().sum() == 0: if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)' print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape , '\nDim of data:', mcsm_data.shape
, '\n===============================================================') , '\n===============================================================')
@ -111,7 +107,7 @@ else:
print('FAIL (but not fatal): Duplicate mutations detected' print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape , '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries') , 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================') , '\n===============================================================')
#%%=========================================================================== #%%===========================================================================
@ -248,7 +244,7 @@ print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'
#============================================================================= #=============================================================================
#%% ensuring dtypes are string for the non-numeric cols #%% ensuring dtypes are string for the non-numeric cols
#) char cols #) char cols
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain' char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos'] , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str) #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
@ -298,8 +294,8 @@ else:
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?' , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================') , '\n===============================================================')
#%%============================================================================ #%%============================================================================
# Remove white space everywhere before output: bit me when merging!? # Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
# writing file # writing file
print('Writing formatted df to csv') print('Writing formatted df to csv')

View file

@ -183,7 +183,11 @@ def format_mcsm_output(mcsm_outputcsv):
############# #############
# Read file # Read file
############# #############
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',') mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape) print('dimensions of input file:', dforig_shape)
@ -396,8 +400,7 @@ def format_mcsm_output(mcsm_outputcsv):
print('removing white space within created column: wild_chain_pos') print('removing white space within created column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '') mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head() print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n=========================================================') , '\n=========================================================')
#%%===================================================================== #%%=====================================================================
############# #############
# ensuring correct dtype in non-numeric cols # ensuring correct dtype in non-numeric cols
@ -426,8 +429,8 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position']) mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head()) print('sorted df:\n', mcsm_data_fs.head())
# Remove white space everywhere before output: bit me when merging!? # Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '') mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%===================================================================== #%%=====================================================================
############# #############
# sanity check before writing file # sanity check before writing file

View file

@ -18,29 +18,39 @@ arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensi
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None) arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99) arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99)
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File') arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
arg_parser.add_argument('--datadir', help = 'Data Directory')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args() args = arg_parser.parse_args()
#=======================================================================
gene = args.gene #%% variables
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
data_dir = args.data_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
#host = "http://biosig.unimelb.edu.au" #host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction" #prediction_url = f"{host}/mcsm_lig/prediction"
#drug = 'isoniazid' #drug = 'isoniazid'
#gene = 'KatG' #gene = 'KatG'
#%%=====================================================================
# Command line options
gene = args.gene
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
# submit_mcsm globals # submit_mcsm globals
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
@ -51,13 +61,14 @@ gene_match = gene + '_p.'
#============ #============
# directories # directories
#============ #============
if data_dir: if not datadir:
datadir = data_dir datadir = homedir + '/' + 'git/Data'
else:
datadir = homedir + '/git/Data' if not indir:
indir = datadir + '/' + drug + '/input'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output' if not outdir:
outdir = datadir + '/' + drug + '/output'
#======= #=======
# input # input

View file

@ -46,18 +46,18 @@ args = arg_parser.parse_args()
#drug = 'pyrazinamide' #drug = 'pyrazinamide'
#start_cds = 2288681 #start_cds = 2288681
#end_cds = 2289241 #end_cds = 2289241
#%%=====================================================================
# cmd variables # Command line options
gene = args.gene gene = args.gene
drug = args.drug drug = args.drug
gene_match = gene + '_p.' gene_match = gene + '_p.'
datadir = args.datadir datadir = args.datadir
indir = args.input_dir indir = args.input_dir
outdir = args.output_dir outdir = args.output_dir
start_cds = args.start_coord start_cds = args.start_coord
end_cds = args.end_coord end_cds = args.end_coord
#%%======================================================================= #%%=======================================================================
#============== #==============