From 4eaa0b5d2b817ccadda36529042a371a539c7d6b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 12 Nov 2021 14:16:48 +0000 Subject: [PATCH] saving work after running combining_dfs.py --- dynamut/format_results_dynamut.py | 0 dynamut/format_results_dynamut2.py | 0 dynamut/run_format_results_dynamut.py | 53 ++++++-- dynamut/run_get_results_dynamut.py | 6 +- mcsm_ppi2/format_results_mcsm_ppi2.py | 5 +- mcsm_ppi2/run_format_results_mcsm_ppi2.py | 18 +-- scripts/combining_dfs.py | 146 ++++++++++++---------- 7 files changed, 136 insertions(+), 92 deletions(-) mode change 100644 => 100755 dynamut/format_results_dynamut.py mode change 100644 => 100755 dynamut/format_results_dynamut2.py mode change 100644 => 100755 dynamut/run_format_results_dynamut.py diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py old mode 100644 new mode 100755 diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py old mode 100644 new mode 100755 diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py old mode 100644 new mode 100755 index 02af524..a626d14 --- a/dynamut/run_format_results_dynamut.py +++ b/dynamut/run_format_results_dynamut.py @@ -17,24 +17,53 @@ os.chdir (homedir + '/git/LSHTM_analysis/dynamut') from format_results_dynamut import * from format_results_dynamut2 import * ######################################################################## -# variables -# TODO: add cmd line args +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist') +arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true') -gene = 'gid' -drug = 'streptomycin' -datadir = homedir + '/git/Data' -indir = datadir + '/' + drug + '/input' -outdir = datadir + '/' + drug + '/output' -outdir_dynamut = outdir + '/dynamut_results/' -outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' +arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode') + +args = arg_parser.parse_args() +#%%============================================================================ +# variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +#outdir_dynamut2 = args.mkdir_name +make_dirs = args.make_dirs + +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/git/Data/' + +if not indir: + indir = datadir + drug + '/input/' + +if not outdir: + outdir = datadir + drug + '/output/' + +#if not mkdir_name: +outdir_dynamut = outdir + 'dynamut_results/' +outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/' # Input file infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' # Formatted output filename -outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv' -outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv' +outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv' +outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv' +#%%======================================================================== #=============================== # CALL: format_results_dynamut @@ -69,4 +98,4 @@ print('Finished writing file:' , '\nExpected no. of cols:', len(dynamut2_df_f.columns) , '\n=============================================================') -#%%##################################################################### \ No newline at end of file +#%%##################################################################### diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py index e9e82ef..b8e6662 100755 --- a/dynamut/run_get_results_dynamut.py +++ b/dynamut/run_get_results_dynamut.py @@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au' #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"} # TODO: add cmd line args -#gene = 'gid' -drug = 'streptomycin' +#gene = '' +#drug = '' datadir = homedir + '/git/Data/' indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' @@ -41,4 +41,4 @@ get_results(url_file = my_url_file , output_dir = outdir , outfile_suffix = my_suffix) -######################################################################## \ No newline at end of file +######################################################################## diff --git a/mcsm_ppi2/format_results_mcsm_ppi2.py b/mcsm_ppi2/format_results_mcsm_ppi2.py index 69d4835..49c05b2 100755 --- a/mcsm_ppi2/format_results_mcsm_ppi2.py +++ b/mcsm_ppi2/format_results_mcsm_ppi2.py @@ -22,12 +22,11 @@ from pandas.api.types import is_numeric_dtype sys.path.append(homedir + '/git/LSHTM_analysis/scripts') from reference_dict import up_3letter_aa_dict from reference_dict import oneletter_aa_dict - -#%%##################################################################### +#%%============================================================================ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): """ - @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts + @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps which is the result of combining all mcsm_ppi2 batch results, and using bash scripts to combine all the batch results into one file. Formatting df to a pandas df and output as csv. diff --git a/mcsm_ppi2/run_format_results_mcsm_ppi2.py b/mcsm_ppi2/run_format_results_mcsm_ppi2.py index c0340ad..ab036f9 100755 --- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py +++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py @@ -19,19 +19,22 @@ arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive) arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('--input_file' , help = 'Output dir for results. By default, it assmes homedir + + output') + #arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist') arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true') - arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode') args = arg_parser.parse_args() #%%============================================================================ # variable assignment: input and output paths & filenames -drug = args.drug -gene = args.gene -datadir = args.datadir -indir = args.input_dir -outdir = args.output_dir +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +infile_mcsm_ppi2 = args.input_file + #outdir_ppi2 = args.mkdir_name make_dirs = args.make_dirs @@ -53,7 +56,8 @@ if not outdir: outdir_ppi2 = outdir + 'mcsm_ppi2/' # Input file -infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv' +if not infile_mcsm_ppi2: + infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv' # Formatted output file outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv' diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 2573f5d..cb47d50 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -119,7 +119,7 @@ gene_list_normal = ["pnca", "katg", "rpob", "alr"] #FIXME: for gid, this should be SRY as this is the drug...please check!!!! if gene.lower() == "gid": print("\nReading mCSM file for gene:", gene) - in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' + in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously if gene.lower() == "embb": print("\nReading mCSM file for gene:", gene) in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' @@ -140,7 +140,7 @@ deepddg_df = pd.read_csv(infile_deepddg, sep = ',') in_filename_dssp = gene.lower() + '_dssp.csv' infile_dssp = outdir + in_filename_dssp -dssp_df = pd.read_csv(infile_dssp, sep = ',') +dssp_df_raw = pd.read_csv(infile_dssp, sep = ',') in_filename_kd = gene.lower() + '_kd.csv' infile_kd = outdir + in_filename_kd @@ -164,10 +164,13 @@ infilename_dynamut2 = gene.lower() + '_dynamut2_norm.csv' infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2 dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',') -#------------------------------------------------------------ +infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv' +infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps +mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) + +#------------------------------------------------------------------------------ # ONLY:for gene pnca and gid: End logic should pick this up! -geneL_dy_na = ["pnca", "gid"] -#if gene.lower() == "pnca" or "gid" : +geneL_dy_na = ['gid'] if gene.lower() in geneL_dy_na : print("\nGene:", gene.lower() , "\nReading Dynamut and mCSM_na files") @@ -179,27 +182,28 @@ if gene.lower() in geneL_dy_na : infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') -# FIXME: ppi2, not extracted as expected for alr -# TODO: get mcsm_ppi2 data for alr # ONLY:for gene embb and alr: End logic should pick this up! -geneL_ppi2 = ["embb", "alr"] +geneL_ppi2 = ['embb', 'alr'] #if gene.lower() == "embb" or "alr": -if gene.lower() in "embb" or "alr": +if gene.lower() in geneL_ppi2: infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv' infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2 mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',') -#-------------------------------------------------------------- -infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv' -infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps -mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) - + + +if gene.lower() == "embb": + sel_chain = "B" +else: + sel_chain = "A" +#------------------------------------------------------------------------------ #======= # output #======= out_filename_comb = gene.lower() + '_all_params.csv' outfile_comb = outdir + out_filename_comb -print('Output filename:', outfile_comb +print('\nOutput filename:', outfile_comb , '\n===================================================================') + # end of variable assignment for input and output files #%%############################################################################ #===================== @@ -233,7 +237,7 @@ len(foldx_df.loc[foldx_df['ddg_foldx'] < 0]) foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale) -print('Raw foldx scores:\n', foldx_df['ddg_foldx'] +print('\nRaw foldx scores:\n', foldx_df['ddg_foldx'] , '\n---------------------------------------------------------------' , '\nScaled foldx scores:\n', foldx_df['foldx_scaled']) @@ -276,9 +280,42 @@ else: #======================= # Deepddg +# TODO: RERUN 'gid' #======================= deepddg_df.shape +#-------------------------- +# check if >1 chain +#-------------------------- +deepddg_df.loc[:,'chain_id'].value_counts() + +if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1: + print("\nChains detected: >1" + , "\nGene:", gene + , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index) + +print('\nSelecting chain:', sel_chain, 'for gene:', gene) + +deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain] + +#-------------------------- +# Check for duplicates +#-------------------------- +if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1: + print("\nFAIL: Duplicates detected in DeepDDG infile" + , "\nNo. of duplicates:" + , deepddg_df['mutationinformation'].duplicated().value_counts()[1] + , "\nformat deepDDG infile before proceeding") + sys.exit() +else: + print("\nPASS: No duplicates detected in DeepDDG infile") + +#-------------------------- +# Drop chain id col as other targets don't have it.Check for duplicates +#-------------------------- +col_to_drop = ['chain_id'] +deepddg_df = deepddg_df.drop(col_to_drop, axis = 1) + #------------------------- # scale Deepddg values #------------------------- @@ -290,7 +327,7 @@ deepddg_max = deepddg_df['deepddg'].max() deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed') deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale) -print('Raw deepddg scores:\n', deepddg_df['deepddg'] +print('\nRaw deepddg scores:\n', deepddg_df['deepddg'] , '\n---------------------------------------------------------------' , '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled']) @@ -310,8 +347,8 @@ else: print('\nFAIL: deepddg values scaled numbers MISmatch' , '\nExpected number:', deepddg_pos , '\nGot:', deepddg_pos2 - , '\n======================================================') - + , '\n======================================================') + #-------------------------- # Deepddg outcome category #-------------------------- @@ -327,49 +364,15 @@ else: , '\nGot:', doc[0] , '\n======================================================') sys.exit() - -#-------------------------- -# check if >1 chain -#-------------------------- -deepddg_df.loc[:,'chain_id'].value_counts() - -if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1: - print("\nChains detected: >1" - , "\nGene:", gene - , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index) -#-------------------------- -# FIXME: This needs to happen BEFORE scaling as it will vary -# subset chain -#-------------------------- -if gene.lower() == "embb": - sel_chain = "B" -else: - sel_chain = "A" -deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain] +if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1: + print('\nPASS: Deepddg data is scaled between -1 and 1', + '\nproceeding with merge') -#-------------------------- -# Check for duplicates -#-------------------------- -if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1: - print("\nFAIL: Duplicates detected in DeepDDG infile" - , "\nNo. of duplicates:" - , deepddg_df['mutationinformation'].duplicated().value_counts()[1] - , "\nformat deepDDG infile before proceeding") - sys.exit() -else: - print("\nPASS: No duplicates detected in DeepDDG infile") - -#-------------------------- -# Drop chain id col as other targets don't have itCheck for duplicates -#-------------------------- -col_to_drop = ['chain_id'] -deepddg_df = deepddg_df.drop(col_to_drop, axis = 1) - -#%%============================================================================= +#%%============================================================================ # Now merges begin -#%%============================================================================= + print('===================================' , '\nFirst merge: mcsm + foldx' , '\n===================================') @@ -399,10 +402,11 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape mcsm_foldx_dfs[merging_cols_m1].apply(len) mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) -#%% for embB and any other targets where mCSM-lig hasn't run for -# get the empty cells to be full of meaningful info +#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs. +# Get the empty cells to be full of meaningful info if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any(): - print ("NAs detected in mcsm cols after merge") + print ('\nNAs detected in mcsm cols after merge.' + , '\nCleaning data before merging deepddg_df') ############################## # Extract relevant col values @@ -446,6 +450,8 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any(): mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict) mut = mcsm_foldx_dfs['mutant_type'].squeeze() mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict) +else: + print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df') #%% print('===================================' @@ -464,14 +470,18 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64') #%%============================================================================ +#FIXME: select df with 'chain' to allow corret dim merging! print('===================================' - , '\Third merge: dssp + kd' + , '\nThird merge: dssp + kd' , '\n===================================') -dssp_df.shape +dssp_df_raw.shape kd_df.shape rd_df.shape +dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain] +dssp_df['chain_id'].value_counts() + #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer") merging_cols_m2 = detect_common_cols(dssp_df, kd_df) dssp_kd_dfs = pd.merge(dssp_df @@ -557,17 +567,19 @@ combined_df['wild_type'].equals(combined_df['wild_type_dssp']) foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] # Drop cols -cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps' ] +cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps'] combined_df_clean = combined_df.drop(cols_to_drop, axis = 1) del(foo) #%%============================================================================ # Output columns out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv' -outfile_stab_struc = outdir + '/' + out_filename_stab_struc +outfile_stab_struc = outdir + out_filename_stab_struc print('Output filename:', outfile_stab_struc , '\n===================================================================') +combined_df_clean + # write csv print('\nWriting file: combined stability and structural parameters') combined_df_clean.to_csv(outfile_stab_struc, index = False) @@ -646,7 +658,7 @@ else: #%%============================================================================ # Output columns: when dynamut, dynamut2 and others weren't being combined out_filename_comb_afor = gene.lower() + '_comb_afor.csv' -outfile_comb_afor = outdir + '/' + out_filename_comb_afor +outfile_comb_afor = outdir + out_filename_comb_afor print('Output filename:', outfile_comb_afor , '\n===================================================================') @@ -661,7 +673,7 @@ print('\nFinished writing file:' #dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid if gene.lower() == "pnca": - dfs_list = [dynamut_df, dynamut2_df] + dfs_list = [dynamut2_df] if gene.lower() == "gid": dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] if gene.lower() == "embb":