Merge branch 'embb_dev'
commit c32de1bf0f
7 changed files with 238 additions and 116 deletions
@@ -17,61 +17,52 @@ os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
 from format_results_dynamut import *
 from format_results_dynamut2 import *
 ########################################################################
-# variables
-# TODO: add cmd line args
-
-#gene =
-#drug =
-
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
-arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
-arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
-#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere!
+#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
+arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
-arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
 
 args = arg_parser.parse_args()
-#=======================================================================
+#%%============================================================================
-#%% variable assignment: input and output paths & filenames
+# variable assignment: input and output paths & filenames
 drug = args.drug
 gene = args.gene
 datadir = args.datadir
 indir = args.input_dir
 outdir = args.output_dir
-#make_dirs = args.make_dirs
+#outdir_dynamut2 = args.mkdir_name
+make_dirs = args.make_dirs
 
-#%% input and output dirs and files
 #=======
 # dirs
 #=======
 if not datadir:
-    datadir = homedir + '/' + 'git/Data'
+    datadir = homedir + '/git/Data/'
 
 if not indir:
-    indir = datadir + '/' + drug + '/input'
+    indir = datadir + drug + '/input/'
 
 if not outdir:
-    outdir = datadir + '/' + drug + '/output'
+    outdir = datadir + drug + '/output/'
 
-#%%=====================================================================
-datadir = homedir + '/git/Data'
-indir = datadir + '/' + drug + '/input'
-outdir = datadir + '/' + drug + '/output'
-outdir_dynamut = outdir + '/dynamut_results/'
-outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
+#if not mkdir_name:
+outdir_dynamut = outdir + 'dynamut_results/'
+outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'
 
 # Input file
 #infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
 infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
 
 # Formatted output filename
-#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
+outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
-outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv'
+outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
+#%%========================================================================
 
 #===============================
 # CALL: format_results_dynamut
@@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
 #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 # TODO: add cmd line args
-# gene =
+#gene = ''
-# drug =
+#drug = ''
 datadir = homedir + '/git/Data/'
 indir = datadir + drug + '/input/'
 outdir = datadir + drug + '/output/'

@@ -16,7 +16,7 @@ arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http:/
 arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
 arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
 arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
-arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 0.99)
+arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
 
 arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
 arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')

@@ -42,8 +42,8 @@ args = arg_parser.parse_args()
 #%% variables
 #host = "http://biosig.unimelb.edu.au"
 #prediction_url = f"{host}/mcsm_lig/prediction"
-#drug = 'isoniazid'
+#drug = ''
-#gene = 'KatG'
+#gene = ''
 #%%=====================================================================
 # Command line options
 gene = args.gene
@@ -22,12 +22,11 @@ from pandas.api.types import is_numeric_dtype
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
 from reference_dict import up_3letter_aa_dict
 from reference_dict import oneletter_aa_dict
-#%%#####################################################################
+#%%============================================================================
 
 def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
     """
-    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts
+    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
     which is the result of combining all mcsm_ppi2 batch results, and using
     bash scripts to combine all the batch results into one file.
     Formatting df to a pandas df and output as csv.
@@ -19,19 +19,22 @@ arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)
 arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
 arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+arg_parser.add_argument('--input_file' , help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
 
 #arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
 arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
 
 arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
 
 args = arg_parser.parse_args()
 #%%============================================================================
 # variable assignment: input and output paths & filenames
 drug = args.drug
 gene = args.gene
 datadir = args.datadir
 indir = args.input_dir
 outdir = args.output_dir
+infile_mcsm_ppi2 = args.input_file
 
 #outdir_ppi2 = args.mkdir_name
 make_dirs = args.make_dirs

@@ -53,7 +56,8 @@ if not outdir:
 outdir_ppi2 = outdir + 'mcsm_ppi2/'
 
 # Input file
-infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
+if not infile_mcsm_ppi2:
+    infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
 
 # Formatted output file
 outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'

@@ -75,4 +79,4 @@ print('Finished writing file:'
       , '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
       , '\n=============================================================')
 
 #%%#####################################################################
@@ -116,9 +116,10 @@ if not outdir:
 #=======
 gene_list_normal = ["pnca", "katg", "rpob", "alr"]
 
+#FIXME: for gid, this should be SRY as this is the drug...please check!!!!
 if gene.lower() == "gid":
     print("\nReading mCSM file for gene:", gene)
-    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv'
+    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
 if gene.lower() == "embb":
     print("\nReading mCSM file for gene:", gene)
     in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv'

@@ -139,7 +140,7 @@ deepddg_df = pd.read_csv(infile_deepddg, sep = ',')
 
 in_filename_dssp = gene.lower() + '_dssp.csv'
 infile_dssp = outdir + in_filename_dssp
-dssp_df = pd.read_csv(infile_dssp, sep = ',')
+dssp_df_raw = pd.read_csv(infile_dssp, sep = ',')
 
 in_filename_kd = gene.lower() + '_kd.csv'
 infile_kd = outdir + in_filename_kd

@@ -163,10 +164,13 @@ infilename_dynamut2 = gene.lower() + '_dynamut2_norm.csv'
 infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
 dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',')
 
-#------------------------------------------------------------
+infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
+infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
+mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
 
+#------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
-geneL_dy_na = ["pnca", "gid"]
-#if gene.lower() == "pnca" or "gid" :
+geneL_dy_na = ['gid']
 if gene.lower() in geneL_dy_na :
     print("\nGene:", gene.lower()
           , "\nReading Dynamut and mCSM_na files")

@@ -179,24 +183,27 @@ if gene.lower() in geneL_dy_na :
     mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
 
 # ONLY:for gene embb and alr: End logic should pick this up!
-geneL_ppi2 = ["embb", "alr"]
+geneL_ppi2 = ['embb', 'alr']
 #if gene.lower() == "embb" or "alr":
-if gene.lower() in "embb" or "alr":
+if gene.lower() in geneL_ppi2:
     infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
     infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
     mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
-#--------------------------------------------------------------
-infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
-infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
-mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
+if gene.lower() == "embb":
+    sel_chain = "B"
+else:
+    sel_chain = "A"
+#------------------------------------------------------------------------------
 #=======
 # output
 #=======
 out_filename_comb = gene.lower() + '_all_params.csv'
 outfile_comb = outdir + out_filename_comb
-print('Output filename:', outfile_comb
+print('\nOutput filename:', outfile_comb
       , '\n===================================================================')
 
 # end of variable assignment for input and output files
 #%%############################################################################
 #=====================
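Aside: the sel_chain assignment added in the hunk above (chain "B" for embB, chain "A" for all other genes) is what the later DeepDDG and DSSP hunks use to subset their dataframes. Below is a minimal, self-contained sketch of that pattern, not part of the commit, assuming only a pandas DataFrame with a chain_id column; the toy data and helper name are made up for illustration.

# Illustrative sketch only -- not part of the commit. Mirrors the chain
# selection introduced above: embB uses chain "B", all other genes chain "A".
import pandas as pd

def subset_by_chain(df, gene):
    sel_chain = "B" if gene.lower() == "embb" else "A"
    if df['chain_id'].nunique() > 1:
        print("\nChains detected: >1", "\nGene:", gene,
              "\nChains:", list(df['chain_id'].value_counts().index))
    print("\nSelecting chain:", sel_chain, "for gene:", gene)
    return df[df['chain_id'] == sel_chain]

# toy example (made-up values)
toy = pd.DataFrame({'mutationinformation': ['A10V', 'A10V', 'L55P'],
                    'chain_id': ['A', 'B', 'B'],
                    'deepddg': [-0.4, 0.2, 1.1]})
print(subset_by_chain(toy, 'embb'))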
@@ -230,7 +237,7 @@ len(foldx_df.loc[foldx_df['ddg_foldx'] < 0])
 foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
 
 foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
-print('Raw foldx scores:\n', foldx_df['ddg_foldx']
+print('\nRaw foldx scores:\n', foldx_df['ddg_foldx']
       , '\n---------------------------------------------------------------'
       , '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])

@@ -273,9 +280,42 @@ else:
 
 #=======================
 # Deepddg
+# TODO: RERUN 'gid'
 #=======================
 deepddg_df.shape
 
+#--------------------------
+# check if >1 chain
+#--------------------------
+deepddg_df.loc[:,'chain_id'].value_counts()
+
+if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
+    print("\nChains detected: >1"
+          , "\nGene:", gene
+          , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)
+
+print('\nSelecting chain:', sel_chain, 'for gene:', gene)
+
+deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
+
+#--------------------------
+# Check for duplicates
+#--------------------------
+if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
+    print("\nFAIL: Duplicates detected in DeepDDG infile"
+          , "\nNo. of duplicates:"
+          , deepddg_df['mutationinformation'].duplicated().value_counts()[1]
+          , "\nformat deepDDG infile before proceeding")
+    sys.exit()
+else:
+    print("\nPASS: No duplicates detected in DeepDDG infile")
+
+#--------------------------
+# Drop chain id col as other targets don't have it.Check for duplicates
+#--------------------------
+col_to_drop = ['chain_id']
+deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
+
 #-------------------------
 # scale Deepddg values
 #-------------------------

@@ -287,7 +327,7 @@ deepddg_max = deepddg_df['deepddg'].max()
 deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed')
 
 deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale)
-print('Raw deepddg scores:\n', deepddg_df['deepddg']
+print('\nRaw deepddg scores:\n', deepddg_df['deepddg']
       , '\n---------------------------------------------------------------'
      , '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled'])

@@ -307,8 +347,8 @@ else:
     print('\nFAIL: deepddg values scaled numbers MISmatch'
           , '\nExpected number:', deepddg_pos
           , '\nGot:', deepddg_pos2
           , '\n======================================================')
 
 #--------------------------
 # Deepddg outcome category
 #--------------------------
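Aside: the foldx_scale and deepddg_scale lambdas in the hunks above normalise raw ddG values to [-1, 1] by dividing negative values by |min| and non-negative values by max. A standalone sketch with made-up numbers, not part of the commit, kept deliberately close to those lambdas:

# Illustrative sketch only -- not part of the commit. Same piecewise scaling
# as foldx_scale / deepddg_scale above, applied to a toy pandas Series.
import pandas as pd

ddg = pd.Series([-2.0, -0.5, 0.0, 1.5, 3.0])   # made-up ddG values
ddg_min, ddg_max = ddg.min(), ddg.max()

ddg_scale = lambda x: x/abs(ddg_min) if x < 0 else (x/ddg_max if x >= 0 else 'failed')

ddg_scaled = ddg.apply(ddg_scale)
print(ddg_scaled.tolist())   # negatives map into [-1, 0), non-negatives into [0, 1]
assert ddg_scaled.min() == -1 and ddg_scaled.max() == 1   # matches the PASS check added in the next hunk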
@@ -324,48 +364,15 @@ else:
           , '\nGot:', doc[0]
           , '\n======================================================')
     sys.exit()
 
-#--------------------------
-# check if >1 chain
-#--------------------------
-deepddg_df.loc[:,'chain_id'].value_counts()
-
-if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
-    print("\nChains detected: >1"
-          , "\nGene:", gene
-          , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)
-
-#--------------------------
-# subset chain
-#--------------------------
-if gene.lower() == "embb":
-    sel_chain = "B"
-else:
-    sel_chain = "A"
 
-deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
+if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
+    print('\nPASS: Deepddg data is scaled between -1 and 1',
+          '\nproceeding with merge')
 
-#--------------------------
-# Check for duplicates
-#--------------------------
-if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
-    print("\nFAIL: Duplicates detected in DeepDDG infile"
-          , "\nNo. of duplicates:"
-          , deepddg_df['mutationinformation'].duplicated().value_counts()[1]
-          , "\nformat deepDDG infile before proceeding")
-    sys.exit()
-else:
-    print("\nPASS: No duplicates detected in DeepDDG infile")
-
-#--------------------------
-# Drop chain id col as other targets don't have itCheck for duplicates
-#--------------------------
-col_to_drop = ['chain_id']
-deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
-
-#%%=============================================================================
+#%%============================================================================
 # Now merges begin
-#%%=============================================================================
 print('==================================='
       , '\nFirst merge: mcsm + foldx'
       , '\n===================================')
@@ -395,10 +402,11 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
 mcsm_foldx_dfs[merging_cols_m1].apply(len)
 mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
 
-#%% for embB and any other targets where mCSM-lig hasn't run for
+#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs.
-# get the empty cells to be full of meaningful info
+# Get the empty cells to be full of meaningful info
 if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
-    print ("NAs detected in mcsm cols after merge")
+    print ('\nNAs detected in mcsm cols after merge.'
+           , '\nCleaning data before merging deepddg_df')
 
     ##############################
     # Extract relevant col values

@@ -442,6 +450,8 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
     mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict)
     mut = mcsm_foldx_dfs['mutant_type'].squeeze()
     mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict)
+else:
+    print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
 
 #%%
 print('==================================='

@@ -460,14 +470,18 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
 mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
 
 #%%============================================================================
+#FIXME: select df with 'chain' to allow corret dim merging!
 print('==================================='
-      , '\Third merge: dssp + kd'
+      , '\nThird merge: dssp + kd'
       , '\n===================================')
 
-dssp_df.shape
+dssp_df_raw.shape
 kd_df.shape
 rd_df.shape
 
+dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
+dssp_df['chain_id'].value_counts()
+
 #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
 merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df
@@ -553,17 +567,19 @@ combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
 foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]
 
 # Drop cols
-cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps' ]
+cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
 combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
 
 del(foo)
 #%%============================================================================
 # Output columns
 out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
-outfile_stab_struc = outdir + '/' + out_filename_stab_struc
+outfile_stab_struc = outdir + out_filename_stab_struc
 print('Output filename:', outfile_stab_struc
       , '\n===================================================================')
 
+combined_df_clean
+
 # write csv
 print('\nWriting file: combined stability and structural parameters')
 combined_df_clean.to_csv(outfile_stab_struc, index = False)

@@ -642,7 +658,7 @@ else:
 #%%============================================================================
 # Output columns: when dynamut, dynamut2 and others weren't being combined
 out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
-outfile_comb_afor = outdir + '/' + out_filename_comb_afor
+outfile_comb_afor = outdir + out_filename_comb_afor
 print('Output filename:', outfile_comb_afor
       , '\n===================================================================')

@@ -657,7 +673,7 @@ print('\nFinished writing file:'
 #dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid
 
 if gene.lower() == "pnca":
-    dfs_list = [dynamut_df, dynamut2_df]
+    dfs_list = [dynamut2_df]
 if gene.lower() == "gid":
     dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
 if gene.lower() == "embb":
@@ -709,4 +725,4 @@ combined_all_params.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_all_params.shape[0]
       , '\nNo. of cols:', combined_all_params.shape[1])
 #%% end of script
scripts/plotting/TESTING_PLOTS.R (new executable file, +112 lines)
@@ -0,0 +1,112 @@
+#!/usr/bin/env Rscript
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+source("Header_TT.R")
+
+drug = 'streptomycin'
+gene = 'gid'
+
+spec = matrix(c(
+  "drug"       , "d", 1, "character",
+  "gene"       , "g", 1, "character",
+  "data_file1" , "fa", 2, "character",
+  "data_file2" , "fb", 2, "character"
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug = opt$drug
+gene = opt$gene
+infile_params = opt$data_file1
+infile_metadata = opt$data_file2
+
+if(is.null(drug)|is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#===========
+# Input
+#===========
+source("get_plotting_dfs.R")
+
+#===========
+# output
+#===========
+# PS
+bp_subcols_duet = "barplot_coloured_PS.svg"
+plot_bp_subcols_duet = paste0(plotdir, "/", bp_subcols_duet)
+
+##############################################################################
+# add frequency of positions (from lib data.table)
+
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+
+foo = data.frame(subcols_df_ps$mutationinformation
+                 , subcols_df_ps$position
+                 , subcols_df_ps$pos_count)
+
+#snpsBYpos_df <- subcols_df_ps %>%
+#  group_by(position) %>%
+#  summarize(snpsBYpos = mean(pos_count))
+
+
+#********************
+# generate plot: PS
+# NO axis colours
+#********************
+g = ggplot(subcols_df_ps
+           , aes(x = factor(position, ordered = T)))
+g2 = g + geom_bar()
+
+g2
+
+foo = g2 + geom_text(stat='count', aes(label = ..count..))
+foo
+
+######
+bp_subcols_duet = "TEST_PS.svg"
+plot_bp_subcols_duet = paste0(plotdir, "/", bp_subcols_duet)
+print(paste0("plot name:", plot_bp_subcols_duet))
+svg(plot_bp_subcols_duet, width = 26, height = 4)
+
+g1 = ggplot(subcols_df_ps, aes(x = factor(position, ordered = T)
+                               , y = pos_count)) +
+  geom_bar(stat = "summary"
+           , aes(fill = group), colour = "grey") +
+
+
+
+
+
+###################################
+
+g = ggplot(subcols_df_ps
+           , aes(x = factor(position, ordered = T)))
+outPlot_bp_ps = g +
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = subcols_ps
+                     , guide = "none") +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = ""
+       #title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+print(outPlot_bp_ps)
+#dev.off()
+
+######################################################################=
+# End of script
+######################################################################=