saving work after running combining_dfs.py

This commit is contained in:
Tanushree Tunstall 2021-11-12 14:16:48 +00:00
parent dad8f526a2
commit 4eaa0b5d2b
7 changed files with 136 additions and 92 deletions

0
dynamut/format_results_dynamut.py Normal file → Executable file
View file

0
dynamut/format_results_dynamut2.py Normal file → Executable file
View file

51
dynamut/run_format_results_dynamut.py Normal file → Executable file
View file

@ -17,24 +17,53 @@ os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import * from format_results_dynamut import *
from format_results_dynamut2 import * from format_results_dynamut2 import *
######################################################################## ########################################################################
# variables #%% command line args
# TODO: add cmd line args arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
gene = 'gid' arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
drug = 'streptomycin'
datadir = homedir + '/git/Data' args = arg_parser.parse_args()
indir = datadir + '/' + drug + '/input' #%%============================================================================
outdir = datadir + '/' + drug + '/output' # variable assignment: input and output paths & filenames
outdir_dynamut = outdir + '/dynamut_results/' drug = args.drug
outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_dynamut2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
outdir_dynamut = outdir + 'dynamut_results/'
outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'
# Input file # Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
# Formatted output filename # Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv' outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv' outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
#%%========================================================================
#=============================== #===============================
# CALL: format_results_dynamut # CALL: format_results_dynamut

View file

@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"} #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# TODO: add cmd line args # TODO: add cmd line args
#gene = 'gid' #gene = ''
drug = 'streptomycin' #drug = ''
datadir = homedir + '/git/Data/' datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/' indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/' outdir = datadir + drug + '/output/'

View file

@ -22,12 +22,11 @@ from pandas.api.types import is_numeric_dtype
sys.path.append(homedir + '/git/LSHTM_analysis/scripts') sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
from reference_dict import up_3letter_aa_dict from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict from reference_dict import oneletter_aa_dict
#%%============================================================================
#%%#####################################################################
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
""" """
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
which is the result of combining all mcsm_ppi2 batch results, and using which is the result of combining all mcsm_ppi2 batch results, and using
bash scripts to combine all the batch results into one file. bash scripts to combine all the batch results into one file.
Formatting df to a pandas df and output as csv. Formatting df to a pandas df and output as csv.

View file

@ -19,9 +19,10 @@ arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input') arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('--input_file' , help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist') #arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true') arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode') arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args() args = arg_parser.parse_args()
@ -32,6 +33,8 @@ gene = args.gene
datadir = args.datadir datadir = args.datadir
indir = args.input_dir indir = args.input_dir
outdir = args.output_dir outdir = args.output_dir
infile_mcsm_ppi2 = args.input_file
#outdir_ppi2 = args.mkdir_name #outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs make_dirs = args.make_dirs
@ -53,6 +56,7 @@ if not outdir:
outdir_ppi2 = outdir + 'mcsm_ppi2/' outdir_ppi2 = outdir + 'mcsm_ppi2/'
# Input file # Input file
if not infile_mcsm_ppi2:
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv' infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
# Formatted output file # Formatted output file

View file

@ -119,7 +119,7 @@ gene_list_normal = ["pnca", "katg", "rpob", "alr"]
#FIXME: for gid, this should be SRY as this is the drug...please check!!!! #FIXME: for gid, this should be SRY as this is the drug...please check!!!!
if gene.lower() == "gid": if gene.lower() == "gid":
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
if gene.lower() == "embb": if gene.lower() == "embb":
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv'
@ -140,7 +140,7 @@ deepddg_df = pd.read_csv(infile_deepddg, sep = ',')
in_filename_dssp = gene.lower() + '_dssp.csv' in_filename_dssp = gene.lower() + '_dssp.csv'
infile_dssp = outdir + in_filename_dssp infile_dssp = outdir + in_filename_dssp
dssp_df = pd.read_csv(infile_dssp, sep = ',') dssp_df_raw = pd.read_csv(infile_dssp, sep = ',')
in_filename_kd = gene.lower() + '_kd.csv' in_filename_kd = gene.lower() + '_kd.csv'
infile_kd = outdir + in_filename_kd infile_kd = outdir + in_filename_kd
@ -164,10 +164,13 @@ infilename_dynamut2 = gene.lower() + '_dynamut2_norm.csv'
infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2 infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',') dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',')
#------------------------------------------------------------ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
#------------------------------------------------------------------------------
# ONLY:for gene pnca and gid: End logic should pick this up! # ONLY:for gene pnca and gid: End logic should pick this up!
geneL_dy_na = ["pnca", "gid"] geneL_dy_na = ['gid']
#if gene.lower() == "pnca" or "gid" :
if gene.lower() in geneL_dy_na : if gene.lower() in geneL_dy_na :
print("\nGene:", gene.lower() print("\nGene:", gene.lower()
, "\nReading Dynamut and mCSM_na files") , "\nReading Dynamut and mCSM_na files")
@ -179,27 +182,28 @@ if gene.lower() in geneL_dy_na :
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
# FIXME: ppi2, not extracted as expected for alr
# TODO: get mcsm_ppi2 data for alr
# ONLY:for gene embb and alr: End logic should pick this up! # ONLY:for gene embb and alr: End logic should pick this up!
geneL_ppi2 = ["embb", "alr"] geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr": #if gene.lower() == "embb" or "alr":
if gene.lower() in "embb" or "alr": if gene.lower() in geneL_ppi2:
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv' infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2 infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',') mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
#--------------------------------------------------------------
infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
if gene.lower() == "embb":
sel_chain = "B"
else:
sel_chain = "A"
#------------------------------------------------------------------------------
#======= #=======
# output # output
#======= #=======
out_filename_comb = gene.lower() + '_all_params.csv' out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + out_filename_comb outfile_comb = outdir + out_filename_comb
print('Output filename:', outfile_comb print('\nOutput filename:', outfile_comb
, '\n===================================================================') , '\n===================================================================')
# end of variable assignment for input and output files # end of variable assignment for input and output files
#%%############################################################################ #%%############################################################################
#===================== #=====================
@ -233,7 +237,7 @@ len(foldx_df.loc[foldx_df['ddg_foldx'] < 0])
foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale) foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
print('Raw foldx scores:\n', foldx_df['ddg_foldx'] print('\nRaw foldx scores:\n', foldx_df['ddg_foldx']
, '\n---------------------------------------------------------------' , '\n---------------------------------------------------------------'
, '\nScaled foldx scores:\n', foldx_df['foldx_scaled']) , '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])
@ -276,9 +280,42 @@ else:
#======================= #=======================
# Deepddg # Deepddg
# TODO: RERUN 'gid'
#======================= #=======================
deepddg_df.shape deepddg_df.shape
#--------------------------
# check if >1 chain
#--------------------------
deepddg_df.loc[:,'chain_id'].value_counts()
if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
print("\nChains detected: >1"
, "\nGene:", gene
, "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)
print('\nSelecting chain:', sel_chain, 'for gene:', gene)
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
#--------------------------
# Check for duplicates
#--------------------------
if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
print("\nFAIL: Duplicates detected in DeepDDG infile"
, "\nNo. of duplicates:"
, deepddg_df['mutationinformation'].duplicated().value_counts()[1]
, "\nformat deepDDG infile before proceeding")
sys.exit()
else:
print("\nPASS: No duplicates detected in DeepDDG infile")
#--------------------------
# Drop chain id col as other targets don't have it.Check for duplicates
#--------------------------
col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
#------------------------- #-------------------------
# scale Deepddg values # scale Deepddg values
#------------------------- #-------------------------
@ -290,7 +327,7 @@ deepddg_max = deepddg_df['deepddg'].max()
deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed') deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed')
deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale) deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale)
print('Raw deepddg scores:\n', deepddg_df['deepddg'] print('\nRaw deepddg scores:\n', deepddg_df['deepddg']
, '\n---------------------------------------------------------------' , '\n---------------------------------------------------------------'
, '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled']) , '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled'])
@ -328,48 +365,14 @@ else:
, '\n======================================================') , '\n======================================================')
sys.exit() sys.exit()
#--------------------------
# check if >1 chain
#--------------------------
deepddg_df.loc[:,'chain_id'].value_counts()
if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1: if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print("\nChains detected: >1" print('\nPASS: Deepddg data is scaled between -1 and 1',
, "\nGene:", gene '\nproceeding with merge')
, "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)
#-------------------------- #%%============================================================================
# FIXME: This needs to happen BEFORE scaling as it will vary
# subset chain
#--------------------------
if gene.lower() == "embb":
sel_chain = "B"
else:
sel_chain = "A"
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
#--------------------------
# Check for duplicates
#--------------------------
if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
print("\nFAIL: Duplicates detected in DeepDDG infile"
, "\nNo. of duplicates:"
, deepddg_df['mutationinformation'].duplicated().value_counts()[1]
, "\nformat deepDDG infile before proceeding")
sys.exit()
else:
print("\nPASS: No duplicates detected in DeepDDG infile")
#--------------------------
# Drop chain id col as other targets don't have itCheck for duplicates
#--------------------------
col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
#%%=============================================================================
# Now merges begin # Now merges begin
#%%=============================================================================
print('===================================' print('==================================='
, '\nFirst merge: mcsm + foldx' , '\nFirst merge: mcsm + foldx'
, '\n===================================') , '\n===================================')
@ -399,10 +402,11 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
mcsm_foldx_dfs[merging_cols_m1].apply(len) mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
#%% for embB and any other targets where mCSM-lig hasn't run for #%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs.
# get the empty cells to be full of meaningful info # Get the empty cells to be full of meaningful info
if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any(): if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
print ("NAs detected in mcsm cols after merge") print ('\nNAs detected in mcsm cols after merge.'
, '\nCleaning data before merging deepddg_df')
############################## ##############################
# Extract relevant col values # Extract relevant col values
@ -446,6 +450,8 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict) mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict)
mut = mcsm_foldx_dfs['mutant_type'].squeeze() mut = mcsm_foldx_dfs['mutant_type'].squeeze()
mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict) mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict)
else:
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
#%% #%%
print('===================================' print('==================================='
@ -464,14 +470,18 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64') mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
#%%============================================================================ #%%============================================================================
#FIXME: select df with 'chain' to allow corret dim merging!
print('===================================' print('==================================='
, '\Third merge: dssp + kd' , '\nThird merge: dssp + kd'
, '\n===================================') , '\n===================================')
dssp_df.shape dssp_df_raw.shape
kd_df.shape kd_df.shape
rd_df.shape rd_df.shape
dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
dssp_df['chain_id'].value_counts()
#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer") #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
merging_cols_m2 = detect_common_cols(dssp_df, kd_df) merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df dssp_kd_dfs = pd.merge(dssp_df
@ -564,10 +574,12 @@ del(foo)
#%%============================================================================ #%%============================================================================
# Output columns # Output columns
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv' out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = outdir + '/' + out_filename_stab_struc outfile_stab_struc = outdir + out_filename_stab_struc
print('Output filename:', outfile_stab_struc print('Output filename:', outfile_stab_struc
, '\n===================================================================') , '\n===================================================================')
combined_df_clean
# write csv # write csv
print('\nWriting file: combined stability and structural parameters') print('\nWriting file: combined stability and structural parameters')
combined_df_clean.to_csv(outfile_stab_struc, index = False) combined_df_clean.to_csv(outfile_stab_struc, index = False)
@ -646,7 +658,7 @@ else:
#%%============================================================================ #%%============================================================================
# Output columns: when dynamut, dynamut2 and others weren't being combined # Output columns: when dynamut, dynamut2 and others weren't being combined
out_filename_comb_afor = gene.lower() + '_comb_afor.csv' out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
outfile_comb_afor = outdir + '/' + out_filename_comb_afor outfile_comb_afor = outdir + out_filename_comb_afor
print('Output filename:', outfile_comb_afor print('Output filename:', outfile_comb_afor
, '\n===================================================================') , '\n===================================================================')
@ -661,7 +673,7 @@ print('\nFinished writing file:'
#dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid #dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid
if gene.lower() == "pnca": if gene.lower() == "pnca":
dfs_list = [dynamut_df, dynamut2_df] dfs_list = [dynamut2_df]
if gene.lower() == "gid": if gene.lower() == "gid":
dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
if gene.lower() == "embb": if gene.lower() == "embb":