From 1f266c4cb8124bea491f285c0308eab647a4ceb1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 11 Jan 2022 17:51:28 +0000 Subject: [PATCH] more tidying and formatting for combining_dfs.py. Hopefully no more after today --- scripts/combining_dfs.py | 297 ++++++++++++++++++++-------- scripts/plotting/get_plotting_dfs.R | 4 +- 2 files changed, 217 insertions(+), 84 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index fce24e8..c18b8ae 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -227,26 +227,97 @@ outfile_stab_struc = outdir + out_filename_stab_struc # end of variable assignment for input and output files #%%############################################################################ +print('\n===================================================' + , '\nCombining dfs for gene:', gene.lower() + , '\nDrug name:', drug + , '\n====================================================') #===================== -# some preprocessing +# variable setting #===================== +gene_pdb_f = gene.lower() + '_complex.pdb' +gene_with_offset = ['alr', 'katg', 'rpob'] +#geneL_snap2 = ['alr', 'katg', 'rpob'] +gene_with_offset_missing_aa = ['embb'] + +if gene.lower() == 'alr': + offset_val = 34 + +if gene.lower() == 'katg': + offset_val = 23 + +if gene.lower() == 'rpob': + offset_val = 28 + +if gene.lower() == 'embb': + offset_val = 23 + missing_aa_start = 248 + missing_aa_end = 269 + missing_res_offset = missing_aa_end - missing_aa_start + +if gene.lower() == 'gid': + offset_val = 0 + +if gene.lower() == 'pnca': + offset_val = 0 + +#=========== +# DSSP +#=========== +print('\nFormatting dssp_df:' + , '\nSelecting chain:', sel_chain, 'for gene', gene.lower()) + +dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain] +print('\nDim of dssp_df:', dssp_df.shape) +dssp_df['chain_id'].value_counts() + #=========== # KD #=========== -kd_df.shape - -# geneL_kd = ['alr'] -# if gene.lower() in geneL_kd: -# print('\nRunning gene:', gene.lower() -# ,'\nChecking start numbering') +if gene.lower() in gene_with_offset: + print ('\nRunning analysis for gene with offset:', gene.lower() + , '\nOffset value:', offset_val + , '\nFormatting kd_df with offset value' + , kd_df.shape) + kd_df['position'] = kd_df['position'] + offset_val -if kd_df['wild_type_kd'].str.contains('X').any(): - print('\nDetected X in wild_type_kd' - , '\nRunning gene:', gene.lower() - , '\nChecking start numbering') - kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')] +if gene.lower() in gene_with_offset_missing_aa: + print ('\nRunning analysis for gene with offset and missing aa:', gene.lower() + , '\nOffset value:', offset_val + , '\nMissing residues gap:', missing_res_offset + , '\nFormatting kd_df with offset value' + , '\nDim of kd_df:', kd_df.shape) + + # First correct offset, then correct GAP + print('\nFirst correcting offset') + kd_df['position'] = kd_df['position'] + offset_val + print('\nNow correcting gap') + kd_df.loc[kd_df['position'] >= 248, 'position'] = kd_df['position'] + missing_res_offset + +else: + print ('\nRunning analysis for gene WITHOUT offset:', gene.lower() + , '\nNo offset value:' + , '\nUsing kd_df as is read' + , kd_df.shape) + +print('\nChecking the position numbers from the 3 dfs: dssp, kd and rd') +dssp_kd_rd_pos_checks = pd.DataFrame ({"dssp_pos": dssp_df['position'] + , "kd_pos": kd_df['position'] + , "rd_pos": rd_df['position']}) + +dkr_pos_check = [ + dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['kd_pos']) + , dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['rd_pos']) + , dssp_kd_rd_pos_checks['kd_pos'].equals(dssp_kd_rd_pos_checks['rd_pos']) + ] +if all(dkr_pos_check): + print('\nPASS: position check passed for dssp, rd and kd dfs ') +else: + print('\nFAIL: position check Failed for dssp, rd and kd dfs' + , '\nNumbering mismatch. Please check these dfs.') + sys.exit() +#%%============================================================================ #=========== # FoldX #=========== @@ -439,27 +510,15 @@ consurf_df.shape consurf_df.columns print('\nRenaming cols and assigning pretty column names') -geneL_consurf = ['alr', 'katg', 'rpob'] - -if gene.lower() in geneL_consurf: +if gene.lower() in gene_with_offset: consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'}) #--------------------------- # Specify the offset #--------------------------- - print('\nAdding offset value for gene:', gene.lower()) + print('\nFormatting consurf df' + , '\nAdding offset value for gene:', gene.lower() + , '\nUsing offset val:', offset_val) - if gene.lower() == 'alr': - offset_val = 34 - print('\nUsing offset val:', offset_val) - - if gene.lower() == 'katg': - offset_val = 23 - print('\nUsing offset val:', offset_val) - - if gene.lower() == 'rpob': - offset_val = 28 - print('\nUsing offset val:', offset_val) - consurf_df['position'] = consurf_df['position_consurf'] + offset_val else: @@ -586,9 +645,7 @@ snap2_df.shape #---------------------- # rename colums #---------------------- -geneL_snap2 = ['alr', 'katg', 'rpob'] - -if gene.lower() in geneL_snap2: +if gene.lower() in gene_with_offset: print('\nReading SNAP2 for gene:', gene.lower() , '\nOffset column also being read' , '\nRenaming columns...' @@ -665,9 +722,7 @@ else: snap2_df.dtypes snap2_df.columns -geneL_snap2 = ['alr', 'katg', 'rpob'] - -if gene.lower() in geneL_snap2: +if gene.lower() in gene_with_offset : print('\nSelecting cols SNAP2 for gene:', gene.lower()) snap2_df_f = snap2_df[['mutationinformation' , 'mutationinformation_snap2' @@ -685,7 +740,6 @@ else: #%%============================================================================ # Now merges begin - print('===================================' , '\nFirst merge: mcsm + foldx' , '\n===================================') @@ -715,7 +769,7 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape mcsm_foldx_dfs[merging_cols_m1].apply(len) mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) -#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs. +#%% for embB: mCSM-lig hasn't run for ALL nsSNPs. # Get the empty cells to be full of meaningful info if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any(): print ('\nNAs detected in mcsm cols after merge.' @@ -776,9 +830,8 @@ mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs , deepddg_df , on = 'mutationinformation' , how = "left") -mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() -ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) +mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64') @@ -792,9 +845,6 @@ dssp_df_raw.shape kd_df.shape rd_df.shape -dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain] -dssp_df['chain_id'].value_counts() - #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer") merging_cols_m2 = detect_common_cols(dssp_df, kd_df) dssp_kd_dfs = pd.merge(dssp_df @@ -826,18 +876,16 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) #%%============================================================================ print('===================================' - , '\nFourth merge*: fourth merge + consurf_df' + , '\nFifth merge: fourth merge + consurf_df' , '\dssp_kd_rd_dfs + consurf_df' , '\n===================================') #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer") merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df) dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs - , consurf_df + , consurf_df_f , on = merging_cols_m3_v2 , how = "outer") -ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns) - print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape , '\n===================================================================') dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) @@ -845,8 +893,8 @@ dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs) #%%============================================================================ print('=======================================' - , '\nFifth merge: Second merge + fourth merge' - , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs' + , '\nSixth merge: Second merge + fifth merge' + , '\nmcsm_foldx_dfs + dssp_kd_rd_con_dfs' , '\n=======================================') #combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = "inner") @@ -854,56 +902,109 @@ print('=======================================' #combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = "inner") #combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4) -# with deepddg values -merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs) -combined_df = pd.merge(mcsm_foldx_deepddg_dfs - , dssp_kd_rd_dfs +# Detect merging cols +merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs) + +if len(merging_cols_m4) > 1 and all(x in merging_cols_m4 for x in ['chain', 'position', 'wild_type']) : + print('\nlength of merging_cols_m4 is > 1 and has mixed dtype' + , '\nDropping chain and wild_type from dssp_kd_rd_con_dfs') + + dssp_kd_rd_con_dfs = dssp_kd_rd_con_dfs.drop(['chain', 'wild_type'], axis = 1) + print('\nDetecting merging cols again') + merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs) + print('\nMerging cols length:', len(merging_cols_m4)) + +if len(merging_cols_m4) == 1 and 'position' in merging_cols_m4: + print('\nMerging column length == 1. Using it' + , '\nMerging column name:', merging_cols_m4) +else: + sys.exit('\nFAIL: to merge mcsm_foldx_deepddg and dssp_kd_rd_con_dfs. Please check') + + +combined_7dfs = pd.merge(mcsm_foldx_deepddg_dfs + , dssp_kd_rd_con_dfs , on = merging_cols_m4 , how = "inner") -combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4) +if len(combined_7dfs.columns) == len(mcsm_foldx_deepddg_dfs.columns) + len(dssp_kd_rd_con_dfs.columns) - len(merging_cols_m4): + print('\n Seven dfs successfully combined:' + , '\nmcsm-lig', '\nfoldx', '\ndeepddg' + , '\ndssp', '\nkd', '\nrd' + , '\nconsurf') +else: + print('\nSomething went wrong with the seventh merge' + , '\nPlease check individual merges') + sys.exit() +#%%============================================================================ +print('=======================================' + , '\nSeventh merge: Sixth merge + snap2_df_f' + , '\ncombined_7dfs + snap_df_f' + , '\n=======================================') -# Check: whether logic effects anything else! +# Detect merging cols +merging_cols_m4_v2 = detect_common_cols(combined_7dfs, snap2_df_f) + +if len(merging_cols_m4_v2) == 1: + print ('\nProceeding with merging SNAP2 df with 7 already combined dfs') +else: + print('\nHalting Seventh merge. Please check merging cols' + ,'\nLength of merging_cols_m4_v2 is:', len(merging_cols_m4_v2)) + sys.exit() + +combined_df = pd.merge(combined_7dfs + , snap2_df_f + , on = merging_cols_m4_v2 + , how = "inner") + +combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2) + +#%% Dimension checks for combined_df: +# CHECK whether logic effects anything else! if not gene == "embB": print("\nGene is:", gene) if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: - print('PASS: successfully combined 5 dfs' + print('PASS: successfully combined 8 dfs' , '\nNo. of rows combined_df:', len(combined_df) , '\nNo. of cols combined_df:', len(combined_df.columns)) - else: - #sys.exit('FAIL: check individual df merges') - print("\nGene is:", gene - , "\ncombined_df length:", len(combined_df) - , "\nmcsm_df_length:", len(mcsm_df) - ) - if len(combined_df.columns) == combined_df_expected_cols: - print('PASS: successfully combined 5 dfs' - , '\nNo. of rows combined_df:', len(combined_df) - , '\nNo. of cols combined_df:', len(combined_df.columns)) - else: - sys.exit('FAIL: check individual merges') +elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols: + print('PASS: successfully combined 8 dfs' + , '\nNo. of rows combined_df:', len(combined_df) + , '\nNo. of cols combined_df:', len(combined_df.columns)) +else: + sys.exit('FAIL: check individual merges for seventh merge') -print('\nResult of Fourth merge:', combined_df.shape +print('\nResult of Seventh merge:', combined_df.shape , '\n===================================================================') -combined_df[merging_cols_m4].apply(len) -combined_df[merging_cols_m4].apply(len) == len(combined_df) +# should be TRUE and length match mcsm_df len +combined_df[merging_cols_m4_v2].apply(len) +combined_df[merging_cols_m4_v2].apply(len) == len(combined_df) + #%%============================================================================ # Format the combined df columns combined_df_colnames = combined_df.columns # check redundant columns combined_df['chain'].equals(combined_df['chain_id']) -combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan +combined_df['wild_type'].equals(combined_df['wild_type_kd']) # should not have nan combined_df['wild_type'].equals(combined_df['wild_type_dssp']) # sanity check foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] # Drop cols -cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps'] +cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp' + , 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower'] + +print('\nDropping:', len(cols_to_drop), 'cols to get clean_df for futher combining' + , '\nncols before dropping:', len(combined_df.columns) + , '\nDropped column names:', cols_to_drop) + combined_df_clean = combined_df.drop(cols_to_drop, axis = 1) -combined_df_clean.columns + +print('\nncols after dropping:', len(combined_df_clean.columns)) + +combined_df_clean_colnames = combined_df_clean.columns del(foo) #%%============================================================================ #--------------------- @@ -966,7 +1067,7 @@ if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor. else: sys.exit('\nFAIL: check individual df merges') -print('\n\nResult of Fifth merge:', combined_stab_afor.shape +print('\n\nResult of Seventh merge:', combined_stab_afor.shape , '\n===================================================================') combined_stab_afor[merging_cols_m5].apply(len) @@ -1029,8 +1130,7 @@ dfs_merged = reduce(lambda left,right: pd.merge(left , right , on = ['mutationinformation'] #, how = 'inner') - , how = join_type) - , dfs_list) + , how = join_type) , dfs_list) # drop excess columns drop_cols = detect_common_cols(dfs_merged, combined_stab_afor) drop_cols.remove('mutationinformation') @@ -1038,8 +1138,8 @@ drop_cols.remove('mutationinformation') dfs_merged_clean = dfs_merged.drop(drop_cols, axis = 1) merging_cols_m6 = detect_common_cols(dfs_merged_clean, combined_stab_afor) -len(dfs_merged_clean.columns) len(combined_stab_afor.columns) +len(dfs_merged_clean.columns) combined_all_params = pd.merge(combined_stab_afor , dfs_merged_clean @@ -1064,12 +1164,47 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any(): , '\nDropping these before writing file') extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0]) print('\nTotal cols:', len(combined_all_params.columns) - ,'\nDropping:', len(extra_cols_to_drop), 'columns') + ,'\nDropping:', len(extra_cols_to_drop), 'columns' + , '\nThese are:', extra_cols_to_drop) #extra_cols_to_drop = ['chain_x', 'chain_y'] combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1) else: print('\nNo duplicate column names detected, just writing file' , '\nTotal cols:', len(combined_all_params.columns) ) +#%%============================================================================== +# Some final formatting for combined_all_params df + +#--------------------------------------- +# Add: Map 1 letter +# code to 3Upper for Mutant aa +#--------------------------------------- +# initialise a sub dict that is lookup dict for +# 3-LETTER aa code to 1-LETTER aa code + +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_upper'] + mut = combined_all_params['mutant_type'].squeeze() + combined_all_params['mut_3upper'] = mut.map(lookup_dict) +else: + print('\nFailed to add 3 letter upper code') + +# Add offset value +combined_all_params['seq_offset4pdb'] = offset_val + +# Drop extra columns +redundant_colsL = ['mutationinformation_snap2' + , 'wt_upper' + , 'mut_upper' + , 'pdb_file'] + +combined_all_params_f = combined_all_params.drop(redundant_colsL + , axis = 1 + , errors = 'ignore') + +# Add pdb_file name at the end +combined_all_params_f['pdb_file'] = gene_pdb_f + #%%============================================================================ #--------------------- # Output 3: write csv @@ -1078,9 +1213,9 @@ print('\nWriting file: all params') print('\nOutput 3 filename:', outfile_comb , '\n===================================================================\n') -combined_all_params.to_csv(outfile_comb, index = False) +combined_all_params_f.to_csv(outfile_comb, index = False) print('\nFinished writing file:' - , '\nNo. of rows:', combined_all_params.shape[0] - , '\nNo. of cols:', combined_all_params.shape[1]) -#%% end of script \ No newline at end of file + , '\nNo. of rows:', combined_all_params_f.shape[0] + , '\nNo. of cols:', combined_all_params_f.shape[1]) +#%% end of script diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index b55f98a..23a99c7 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -5,8 +5,6 @@ ######################################################### # working dir and loading libraries getwd() -#setwd("~/git/LSHTM_analysis/scripts/plotting") - source("/home/tanu/git/LSHTM_analysis/scripts/plotting/Header_TT.R") #******************** @@ -53,7 +51,7 @@ my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() max_ang <- round(max(my_df_u[LigDist_colname])) min_ang <- round(min(my_df_u[LigDist_colname])) -cat("\nLigand distance cut off, colname:", LigDist_colname +cat("\nLigand distance colname:", LigDist_colname , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b" , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")