Fixed whitespace problem in mCSM input that broke merging
This commit is contained in:
parent
46b1505fdf
commit
7d36e0e36b
6 changed files with 108 additions and 98 deletions
|
@ -36,7 +36,6 @@ arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assm
|
||||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||||
|
|
||||||
|
|
||||||
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
||||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_snps.csv exists')
|
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_snps.csv exists')
|
||||||
|
|
||||||
|
@ -50,37 +49,35 @@ args = arg_parser.parse_args()
|
||||||
#gene = 'pncA'
|
#gene = 'pncA'
|
||||||
#gene_match = gene + '_p.'
|
#gene_match = gene + '_p.'
|
||||||
#%%=====================================================================
|
#%%=====================================================================
|
||||||
# Command Line Options
|
# Command line options
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
|
|
||||||
data_dir = args.data_dir
|
datadir = args.datadir
|
||||||
indir = args.input_dir
|
indir = args.input_dir
|
||||||
outdir = args.output_dir
|
outdir = args.output_dir
|
||||||
|
|
||||||
mut_filename = args.mutation_file
|
mut_filename = args.mutation_file
|
||||||
chainA = args.chain1
|
chainA = args.chain1
|
||||||
chainB = args.chain2
|
chainB = args.chain2
|
||||||
pdb_filename = args.pdb_file
|
pdb_filename = args.pdb_file
|
||||||
|
|
||||||
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
||||||
#pdb_name = os.path.splitext(pdb_file)[0]
|
#pdb_name = os.path.splitext(pdb_file)[0]
|
||||||
# Just the filename, thanks
|
# Just the filename, thanks
|
||||||
#pdb_name = Path(in_filename_pdb).stem
|
#pdb_name = Path(in_filename_pdb).stem
|
||||||
|
|
||||||
#============
|
#==============
|
||||||
# directories
|
# directories
|
||||||
#============
|
#==============
|
||||||
if data_dir:
|
if not datadir:
|
||||||
datadir = data_dir
|
|
||||||
else:
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
if not indir:
|
if not indir:
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
indir = datadir + '/' + drug + '/input'
|
||||||
|
|
||||||
if not outdir:
|
if not outdir:
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
outdir = datadir + '/' + drug + '/output'
|
||||||
|
|
||||||
# FIXME: this is a temporary directory and should be correctly handled
|
# FIXME: this is a temporary directory and should be correctly handled
|
||||||
process_dir = datadir + '/' + drug +'/' + 'processing'
|
process_dir = datadir + '/' + drug +'/' + 'processing'
|
||||||
|
@ -90,7 +87,6 @@ os.mkdir(process_dir)
|
||||||
# input
|
# input
|
||||||
#=======
|
#=======
|
||||||
# FIXME
|
# FIXME
|
||||||
|
|
||||||
if pdb_filename:
|
if pdb_filename:
|
||||||
pdb_name = Path(pdb_filename).stem
|
pdb_name = Path(pdb_filename).stem
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -74,7 +74,11 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
#############
|
#############
|
||||||
# Read file
|
# Read file
|
||||||
#############
|
#############
|
||||||
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
|
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
|
||||||
|
|
||||||
|
# strip white space from both ends in all columns
|
||||||
|
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||||
|
|
||||||
dforig_shape = mcsm_data.shape
|
dforig_shape = mcsm_data.shape
|
||||||
print('dimensions of input file:', dforig_shape)
|
print('dimensions of input file:', dforig_shape)
|
||||||
|
|
||||||
|
@ -85,7 +89,7 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||||
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
|
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
|
||||||
, 'Wild-type': 'wild_type' # one letter amino acid code
|
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||||
, 'Position': 'position' # number
|
, 'Position': 'position' # number
|
||||||
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||||
|
@ -97,19 +101,19 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
#################################
|
#################################
|
||||||
# populate mutation_information
|
# populate mutationinformation
|
||||||
# col which is currently blank
|
# col which is currently blank
|
||||||
#################################
|
#################################
|
||||||
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT}
|
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
|
||||||
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
|
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
|
||||||
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||||
print('checking after populating:\n', mcsm_data['mutation_information']
|
print('checking after populating:\n', mcsm_data['mutationinformation']
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
|
||||||
# Remove spaces b/w pasted columns
|
# Remove spaces b/w pasted columns
|
||||||
print('removing white space within column: \mutation_information')
|
print('removing white space within column: \mutationinformation')
|
||||||
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
|
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
|
||||||
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
|
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
#############
|
#############
|
||||||
|
@ -118,7 +122,7 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
# shouldn't exist as this should be eliminated at the time of running mcsm
|
# shouldn't exist as this should be eliminated at the time of running mcsm
|
||||||
print('Sanity check:'
|
print('Sanity check:'
|
||||||
, '\nChecking duplicate mutations')
|
, '\nChecking duplicate mutations')
|
||||||
if mcsm_data['mutation_information'].duplicated().sum() == 0:
|
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
|
||||||
print('PASS: No duplicate mutations detected (as expected)'
|
print('PASS: No duplicate mutations detected (as expected)'
|
||||||
, '\nDim of data:', mcsm_data.shape
|
, '\nDim of data:', mcsm_data.shape
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
|
@ -126,7 +130,7 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
print('FAIL (but not fatal): Duplicate mutations detected'
|
print('FAIL (but not fatal): Duplicate mutations detected'
|
||||||
, '\nDim of df with duplicates:', mcsm_data.shape
|
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||||
, 'Removing duplicate entries')
|
, 'Removing duplicate entries')
|
||||||
mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
|
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
|
||||||
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
|
@ -285,7 +289,7 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
#%% ensuring dtypes are string for the non-numeric cols
|
#%% ensuring dtypes are string for the non-numeric cols
|
||||||
#) char cols
|
#) char cols
|
||||||
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
|
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
|
||||||
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
||||||
|
|
||||||
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
||||||
|
@ -309,8 +313,8 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
||||||
print('sorted df:\n', mcsm_data_fs.head())
|
print('sorted df:\n', mcsm_data_fs.head())
|
||||||
|
|
||||||
# Remove white space everywhere before output: bit me when merging!?
|
# Ensuring column names are lowercase before output
|
||||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
|
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
#############
|
#############
|
||||||
# sanity check before writing file
|
# sanity check before writing file
|
||||||
|
|
|
@ -13,7 +13,6 @@ import pandas as pd
|
||||||
from pandas.api.types import is_string_dtype
|
from pandas.api.types import is_string_dtype
|
||||||
from pandas.api.types import is_numeric_dtype
|
from pandas.api.types import is_numeric_dtype
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% specify input and curr dir
|
#%% specify input and curr dir
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
@ -25,10 +24,6 @@ os.getcwd()
|
||||||
#%% variable assignment: input and output
|
#%% variable assignment: input and output
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
|
|
||||||
#drug = args.drug
|
|
||||||
#gene = args.gene
|
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
#==========
|
#==========
|
||||||
# dirs
|
# dirs
|
||||||
|
@ -41,7 +36,6 @@ outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
# input:
|
# input:
|
||||||
#=======
|
#=======
|
||||||
# 1) result_urls (from outdir)
|
# 1) result_urls (from outdir)
|
||||||
|
|
||||||
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
|
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
|
||||||
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
|
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
|
||||||
print('Input file:', infile_mcsm_output
|
print('Input file:', infile_mcsm_output
|
||||||
|
@ -57,9 +51,11 @@ print('Output file:', out_filename_mcsm_norm
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
print('Reading input file')
|
print('Reading input file')
|
||||||
mcsm_data = pd.read_csv(infile_mcsm_output, sep = ',')
|
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
|
||||||
|
|
||||||
|
# strip white space from both ends in all columns
|
||||||
|
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||||
|
|
||||||
mcsm_data.columns
|
|
||||||
# PredAffLog = affinity_change_log
|
# PredAffLog = affinity_change_log
|
||||||
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
|
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
|
||||||
dforig_shape = mcsm_data.shape
|
dforig_shape = mcsm_data.shape
|
||||||
|
@ -72,7 +68,7 @@ print('dim of infile:', dforig_shape)
|
||||||
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||||
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
|
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
|
||||||
, 'Wild-type': 'wild_type' # one letter amino acid code
|
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||||
, 'Position': 'position' # number
|
, 'Position': 'position' # number
|
||||||
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||||
|
@ -83,17 +79,17 @@ my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info fr
|
||||||
|
|
||||||
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT}
|
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
|
||||||
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
|
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
|
||||||
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||||
print('checking after populating:\n', mcsm_data['mutation_information']
|
print('checking after populating:\n', mcsm_data['mutationinformation']
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
|
||||||
# Remove spaces b/w pasted columns
|
# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
|
||||||
print('removing white space within column: \mutation_information')
|
#print('removing white space within column: \mutationinformation')
|
||||||
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
|
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
|
||||||
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
|
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
|
||||||
, '\n===================================================================')
|
# , '\n===================================================================')
|
||||||
#%% Remove whitespace from column
|
#%% Remove whitespace from column
|
||||||
#orig_dtypes = mcsm_data.dtypes
|
#orig_dtypes = mcsm_data.dtypes
|
||||||
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
|
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
|
||||||
|
@ -103,7 +99,7 @@ print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_
|
||||||
# very important
|
# very important
|
||||||
print('Sanity check:'
|
print('Sanity check:'
|
||||||
, '\nChecking duplicate mutations')
|
, '\nChecking duplicate mutations')
|
||||||
if mcsm_data['mutation_information'].duplicated().sum() == 0:
|
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
|
||||||
print('PASS: No duplicate mutations detected (as expected)'
|
print('PASS: No duplicate mutations detected (as expected)'
|
||||||
, '\nDim of data:', mcsm_data.shape
|
, '\nDim of data:', mcsm_data.shape
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
|
@ -111,7 +107,7 @@ else:
|
||||||
print('FAIL (but not fatal): Duplicate mutations detected'
|
print('FAIL (but not fatal): Duplicate mutations detected'
|
||||||
, '\nDim of df with duplicates:', mcsm_data.shape
|
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||||
, 'Removing duplicate entries')
|
, 'Removing duplicate entries')
|
||||||
mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
|
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
|
||||||
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
#%%===========================================================================
|
#%%===========================================================================
|
||||||
|
@ -248,7 +244,7 @@ print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
#%% ensuring dtypes are string for the non-numeric cols
|
#%% ensuring dtypes are string for the non-numeric cols
|
||||||
#) char cols
|
#) char cols
|
||||||
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
|
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
|
||||||
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
||||||
|
|
||||||
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
||||||
|
@ -298,8 +294,8 @@ else:
|
||||||
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
|
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Remove white space everywhere before output: bit me when merging!?
|
# Ensuring column names are lowercase before output
|
||||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
|
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||||
|
|
||||||
# writing file
|
# writing file
|
||||||
print('Writing formatted df to csv')
|
print('Writing formatted df to csv')
|
||||||
|
|
11
mcsm/mcsm.py
11
mcsm/mcsm.py
|
@ -183,7 +183,11 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
#############
|
#############
|
||||||
# Read file
|
# Read file
|
||||||
#############
|
#############
|
||||||
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
|
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
|
||||||
|
|
||||||
|
# strip white space from both ends in all columns
|
||||||
|
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||||
|
|
||||||
dforig_shape = mcsm_data.shape
|
dforig_shape = mcsm_data.shape
|
||||||
print('dimensions of input file:', dforig_shape)
|
print('dimensions of input file:', dforig_shape)
|
||||||
|
|
||||||
|
@ -397,7 +401,6 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
|
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
|
||||||
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
|
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
|
||||||
, '\n=========================================================')
|
, '\n=========================================================')
|
||||||
|
|
||||||
#%%=====================================================================
|
#%%=====================================================================
|
||||||
#############
|
#############
|
||||||
# ensuring correct dtype in non-numeric cols
|
# ensuring correct dtype in non-numeric cols
|
||||||
|
@ -426,8 +429,8 @@ def format_mcsm_output(mcsm_outputcsv):
|
||||||
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
||||||
print('sorted df:\n', mcsm_data_fs.head())
|
print('sorted df:\n', mcsm_data_fs.head())
|
||||||
|
|
||||||
# Remove white space everywhere before output: bit me when merging!?
|
# Ensuring column names are lowercase before output
|
||||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
|
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||||
#%%=====================================================================
|
#%%=====================================================================
|
||||||
#############
|
#############
|
||||||
# sanity check before writing file
|
# sanity check before writing file
|
||||||
|
|
|
@ -18,29 +18,39 @@ arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensi
|
||||||
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
|
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
|
||||||
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99)
|
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99)
|
||||||
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
|
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
|
||||||
arg_parser.add_argument('--datadir', help = 'Data Directory')
|
|
||||||
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||||
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||||
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||||
|
|
||||||
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
|
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
|
||||||
|
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
|
#=======================================================================
|
||||||
gene = args.gene
|
#%% variables
|
||||||
drug = args.drug
|
|
||||||
stage = args.stage
|
|
||||||
chain = args.chain
|
|
||||||
ligand = args.ligand
|
|
||||||
affinity = args.affinity
|
|
||||||
pdb_filename = args.pdb_file
|
|
||||||
data_dir = args.data_dir
|
|
||||||
DEBUG = args.debug
|
|
||||||
|
|
||||||
# Actual Globals :-)
|
|
||||||
host = args.host
|
|
||||||
prediction_url = args.url
|
|
||||||
|
|
||||||
#host = "http://biosig.unimelb.edu.au"
|
#host = "http://biosig.unimelb.edu.au"
|
||||||
#prediction_url = f"{host}/mcsm_lig/prediction"
|
#prediction_url = f"{host}/mcsm_lig/prediction"
|
||||||
#drug = 'isoniazid'
|
#drug = 'isoniazid'
|
||||||
#gene = 'KatG'
|
#gene = 'KatG'
|
||||||
|
#%%=====================================================================
|
||||||
|
# Command line options
|
||||||
|
gene = args.gene
|
||||||
|
drug = args.drug
|
||||||
|
stage = args.stage
|
||||||
|
chain = args.chain
|
||||||
|
ligand = args.ligand
|
||||||
|
affinity = args.affinity
|
||||||
|
pdb_filename = args.pdb_file
|
||||||
|
|
||||||
|
datadir = args.datadir
|
||||||
|
indir = args.input_dir
|
||||||
|
outdir = args.output_dir
|
||||||
|
|
||||||
|
DEBUG = args.debug
|
||||||
|
|
||||||
|
# Actual Globals :-)
|
||||||
|
host = args.host
|
||||||
|
prediction_url = args.url
|
||||||
|
|
||||||
# submit_mcsm globals
|
# submit_mcsm globals
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
@ -51,13 +61,14 @@ gene_match = gene + '_p.'
|
||||||
#============
|
#============
|
||||||
# directories
|
# directories
|
||||||
#============
|
#============
|
||||||
if data_dir:
|
if not datadir:
|
||||||
datadir = data_dir
|
datadir = homedir + '/' + 'git/Data'
|
||||||
else:
|
|
||||||
datadir = homedir + '/git/Data'
|
|
||||||
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
if not indir:
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
indir = datadir + '/' + drug + '/input'
|
||||||
|
|
||||||
|
if not outdir:
|
||||||
|
outdir = datadir + '/' + drug + '/output'
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# input
|
# input
|
||||||
|
|
|
@ -46,18 +46,18 @@ args = arg_parser.parse_args()
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
#start_cds = 2288681
|
#start_cds = 2288681
|
||||||
#end_cds = 2289241
|
#end_cds = 2289241
|
||||||
|
#%%=====================================================================
|
||||||
# cmd variables
|
# Command line options
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
datadir = args.datadir
|
datadir = args.datadir
|
||||||
indir = args.input_dir
|
indir = args.input_dir
|
||||||
outdir = args.output_dir
|
outdir = args.output_dir
|
||||||
|
|
||||||
start_cds = args.start_coord
|
start_cds = args.start_coord
|
||||||
end_cds = args.end_coord
|
end_cds = args.end_coord
|
||||||
|
|
||||||
#%%=======================================================================
|
#%%=======================================================================
|
||||||
#==============
|
#==============
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue