From 1f266c4cb8124bea491f285c0308eab647a4ceb1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 11 Jan 2022 17:51:28 +0000
Subject: [PATCH] more tidying and formatting for combining_dfs.py. Hopefully
 no more after today

---
 scripts/combining_dfs.py            | 297 ++++++++++++++++++++--------
 scripts/plotting/get_plotting_dfs.R |   4 +-
 2 files changed, 217 insertions(+), 84 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index fce24e8..c18b8ae 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -227,26 +227,97 @@ outfile_stab_struc =  outdir + out_filename_stab_struc
 
 # end of variable assignment for input and output files
 #%%############################################################################  
+print('\n==================================================='
+      , '\nCombining dfs for gene:', gene.lower()
+      , '\nDrug name:', drug
+      , '\n====================================================')
 #=====================
-# some preprocessing
+# variable setting
 #=====================
+gene_pdb_f = gene.lower() + '_complex.pdb'
 
+gene_with_offset = ['alr', 'katg', 'rpob']
+#geneL_snap2 = ['alr', 'katg', 'rpob']
+gene_with_offset_missing_aa = ['embb']
+
+if gene.lower() == 'alr':
+    offset_val = 34
+
+if gene.lower() == 'katg':
+    offset_val = 23
+
+if gene.lower() == 'rpob':
+   offset_val = 28
+
+if gene.lower() == 'embb':
+   offset_val = 23
+   missing_aa_start = 248
+   missing_aa_end = 269
+   missing_res_offset = missing_aa_end - missing_aa_start
+   
+if gene.lower() == 'gid':
+   offset_val = 0
+   
+if gene.lower() == 'pnca':
+   offset_val = 0
+
+#===========
+# DSSP
+#===========
+print('\nFormatting dssp_df:'
+      , '\nSelecting chain:', sel_chain, 'for gene', gene.lower())
+
+dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
+print('\nDim of dssp_df:', dssp_df.shape)
+dssp_df['chain_id'].value_counts()
+   
 #===========
 # KD
 #===========
-kd_df.shape
-
-# geneL_kd = ['alr']
-# if gene.lower() in geneL_kd:
-#     print('\nRunning gene:', gene.lower()
-#           ,'\nChecking start numbering')
+if gene.lower() in gene_with_offset:
+    print ('\nRunning analysis for gene with offset:', gene.lower()
+           , '\nOffset value:', offset_val  
+           , '\nFormatting kd_df with offset value'
+           , kd_df.shape)
+    kd_df['position'] = kd_df['position'] + offset_val
     
-if kd_df['wild_type_kd'].str.contains('X').any():
-    print('\nDetected X in wild_type_kd'
-          , '\nRunning gene:', gene.lower()
-          , '\nChecking start numbering')
-    kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
+if gene.lower() in gene_with_offset_missing_aa:
+   print ('\nRunning analysis for gene with offset and missing aa:', gene.lower()
+           , '\nOffset value:', offset_val  
+           , '\nMissing residues gap:', missing_res_offset
+           , '\nFormatting kd_df with offset value'
+           , '\nDim of kd_df:', kd_df.shape)
+   
+   # First correct offset, then correct GAP
+   print('\nFirst correcting offset')
+   kd_df['position'] = kd_df['position'] + offset_val
 
+   print('\nNow correcting gap')
+   kd_df.loc[kd_df['position'] >= 248, 'position'] = kd_df['position'] + missing_res_offset
+
+else:
+    print ('\nRunning analysis for gene WITHOUT offset:', gene.lower()
+           , '\nNo offset value:'
+           , '\nUsing kd_df as is read'
+           , kd_df.shape)
+    
+print('\nChecking the position numbers from the 3 dfs: dssp, kd and rd')   
+dssp_kd_rd_pos_checks =  pd.DataFrame ({"dssp_pos": dssp_df['position']
+                                       , "kd_pos": kd_df['position']
+                                       , "rd_pos": rd_df['position']})
+                                       
+dkr_pos_check = [
+    dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['kd_pos'])
+    , dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['rd_pos'])
+    , dssp_kd_rd_pos_checks['kd_pos'].equals(dssp_kd_rd_pos_checks['rd_pos'])
+       ]
+if all(dkr_pos_check):
+    print('\nPASS: position check passed for dssp, rd and kd dfs ')
+else:
+    print('\nFAIL: position check Failed for dssp, rd and kd dfs'
+             , '\nNumbering mismatch. Please check these dfs.')
+    sys.exit()                                
+#%%============================================================================   
 #===========
 # FoldX
 #===========
@@ -439,27 +510,15 @@ consurf_df.shape
 consurf_df.columns
 print('\nRenaming cols and assigning pretty column names')
  
-geneL_consurf = ['alr', 'katg', 'rpob']
-
-if gene.lower() in geneL_consurf:
+if gene.lower() in gene_with_offset:
     consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
     #---------------------------
     # Specify the offset 
     #---------------------------
-    print('\nAdding offset value for gene:', gene.lower())
+    print('\nFormatting consurf df'
+          , '\nAdding offset value for gene:', gene.lower()
+          , '\nUsing offset val:', offset_val)
         
-    if gene.lower() == 'alr':
-        offset_val = 34      
-        print('\nUsing offset val:', offset_val)
-    
-    if gene.lower() == 'katg':
-        offset_val = 23
-        print('\nUsing offset val:', offset_val)
-
-    if gene.lower() == 'rpob':
-       offset_val = 28
-       print('\nUsing offset val:', offset_val)
-
     consurf_df['position'] = consurf_df['position_consurf'] + offset_val
     
 else:
@@ -586,9 +645,7 @@ snap2_df.shape
 #----------------------
 # rename colums
 #---------------------- 
-geneL_snap2 = ['alr', 'katg', 'rpob']
-
-if gene.lower() in geneL_snap2:
+if gene.lower() in gene_with_offset:
     print('\nReading SNAP2 for gene:', gene.lower()
           , '\nOffset column also being read'
           , '\nRenaming columns...'
@@ -665,9 +722,7 @@ else:
 snap2_df.dtypes
 snap2_df.columns
 
-geneL_snap2 = ['alr', 'katg', 'rpob']
-
-if gene.lower() in geneL_snap2:
+if gene.lower() in gene_with_offset :
     print('\nSelecting cols SNAP2 for gene:', gene.lower())
     snap2_df_f = snap2_df[['mutationinformation'
                        , 'mutationinformation_snap2'
@@ -685,7 +740,6 @@ else:
     
 #%%============================================================================
 # Now merges begin
-
 print('==================================='
       , '\nFirst merge: mcsm + foldx'
       , '\n===================================')
@@ -715,7 +769,7 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
 mcsm_foldx_dfs[merging_cols_m1].apply(len)
 mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
 
-#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs.
+#%% for embB: mCSM-lig hasn't run for ALL nsSNPs.
 # Get the empty cells to be full of meaningful info
 if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():  
     print ('\nNAs detected in mcsm cols after merge.'
@@ -776,9 +830,8 @@ mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs
                                   , deepddg_df
                                   , on = 'mutationinformation'
                                   , how = "left")
-mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
 
-ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
+mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
 
 mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
 
@@ -792,9 +845,6 @@ dssp_df_raw.shape
 kd_df.shape
 rd_df.shape
 
-dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
-dssp_df['chain_id'].value_counts()
-
 #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
 merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df
@@ -826,18 +876,16 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
 
 #%%============================================================================
 print('==================================='
-      , '\nFourth merge*: fourth merge + consurf_df' 
+      , '\nFifth merge: fourth merge + consurf_df' 
       , '\dssp_kd_rd_dfs + consurf_df'
       , '\n===================================')
 #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
 merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs,  consurf_df)
 dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
-                          , consurf_df
+                          , consurf_df_f
                           , on = merging_cols_m3_v2 
                           , how = "outer")
 
-ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
-
 print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
       , '\n===================================================================')
 dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
@@ -845,8 +893,8 @@ dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
 
 #%%============================================================================
 print('======================================='
-      , '\nFifth merge: Second merge + fourth merge'
-      , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
+      , '\nSixth merge: Second merge + fifth merge'
+      , '\nmcsm_foldx_dfs + dssp_kd_rd_con_dfs'
       , '\n=======================================')
 
 #combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = "inner")
@@ -854,56 +902,109 @@ print('======================================='
 #combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how  = "inner")
 #combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
 
-# with deepddg values
-merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs)
-combined_df = pd.merge(mcsm_foldx_deepddg_dfs
-                       , dssp_kd_rd_dfs
+# Detect merging cols
+merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs)
+
+if len(merging_cols_m4) > 1 and all(x in merging_cols_m4 for x in ['chain', 'position', 'wild_type']) :
+    print('\nlength of merging_cols_m4 is > 1 and has mixed dtype'
+          , '\nDropping chain and wild_type from dssp_kd_rd_con_dfs')
+    
+    dssp_kd_rd_con_dfs = dssp_kd_rd_con_dfs.drop(['chain', 'wild_type'], axis = 1)
+    print('\nDetecting merging cols again')
+    merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs)
+    print('\nMerging cols length:', len(merging_cols_m4))
+
+if len(merging_cols_m4) == 1 and 'position' in merging_cols_m4:
+    print('\nMerging column length == 1. Using it'
+        , '\nMerging column name:', merging_cols_m4)
+else:
+    sys.exit('\nFAIL: to merge mcsm_foldx_deepddg and dssp_kd_rd_con_dfs. Please check')
+    
+    
+combined_7dfs = pd.merge(mcsm_foldx_deepddg_dfs
+                       , dssp_kd_rd_con_dfs
                        , on = merging_cols_m4
                        , how  = "inner")
 
-combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
+if len(combined_7dfs.columns) == len(mcsm_foldx_deepddg_dfs.columns) + len(dssp_kd_rd_con_dfs.columns) - len(merging_cols_m4):
+    print('\n Seven dfs successfully combined:'
+          , '\nmcsm-lig', '\nfoldx', '\ndeepddg'
+          , '\ndssp', '\nkd', '\nrd'
+          , '\nconsurf')
+else:
+    print('\nSomething went wrong with the seventh merge'
+          , '\nPlease check individual merges')
+    sys.exit()  
+#%%============================================================================
+print('======================================='
+      , '\nSeventh merge: Sixth merge + snap2_df_f'
+      , '\ncombined_7dfs + snap_df_f'
+      , '\n=======================================')
 
-# Check: whether logic effects anything else!
+# Detect merging cols
+merging_cols_m4_v2 = detect_common_cols(combined_7dfs, snap2_df_f)
+
+if len(merging_cols_m4_v2) == 1:
+    print ('\nProceeding with merging SNAP2 df with 7 already combined dfs')
+else:
+    print('\nHalting Seventh merge. Please check merging cols'
+          ,'\nLength of merging_cols_m4_v2 is:', len(merging_cols_m4_v2))
+    sys.exit()
+
+combined_df = pd.merge(combined_7dfs
+                       , snap2_df_f
+                       , on = merging_cols_m4_v2
+                       , how  = "inner")    
+
+combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2)
+   
+#%% Dimension checks for combined_df:
+# CHECK whether logic effects anything else!
 if not gene == "embB":
     print("\nGene is:", gene)
     if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
-        print('PASS: successfully combined 5 dfs'
+        print('PASS: successfully combined 8 dfs'
               , '\nNo. of rows combined_df:', len(combined_df)
               , '\nNo. of cols combined_df:', len(combined_df.columns))   
-    else:
-        #sys.exit('FAIL: check individual df merges')
-        print("\nGene is:", gene
-              , "\ncombined_df length:", len(combined_df)
-              , "\nmcsm_df_length:", len(mcsm_df)
-              )
-        if len(combined_df.columns) == combined_df_expected_cols:
-            print('PASS: successfully combined 5 dfs'
-                  , '\nNo. of rows combined_df:', len(combined_df)
-                  , '\nNo. of cols combined_df:', len(combined_df.columns))
-        else:
-            sys.exit('FAIL: check individual merges')        
+elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols:
+    print('PASS: successfully combined 8 dfs'
+               , '\nNo. of rows combined_df:', len(combined_df)
+               , '\nNo. of cols combined_df:', len(combined_df.columns))
+else:
+    sys.exit('FAIL: check individual merges for seventh merge')        
             
-print('\nResult of Fourth merge:', combined_df.shape
+print('\nResult of Seventh merge:', combined_df.shape
       , '\n===================================================================')
 
-combined_df[merging_cols_m4].apply(len)
-combined_df[merging_cols_m4].apply(len) == len(combined_df)
+# should be TRUE and length match mcsm_df len
+combined_df[merging_cols_m4_v2].apply(len)
+combined_df[merging_cols_m4_v2].apply(len) == len(combined_df)
+
 #%%============================================================================
 # Format the combined df columns
 combined_df_colnames = combined_df.columns
 
 # check redundant columns
 combined_df['chain'].equals(combined_df['chain_id'])
-combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
+combined_df['wild_type'].equals(combined_df['wild_type_kd']) # should not have nan
 combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
 
 # sanity check
 foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] 
 
 # Drop cols
-cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
+cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp'
+                , 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']
+
+print('\nDropping:', len(cols_to_drop), 'cols to get clean_df for futher combining'
+      , '\nncols before dropping:', len(combined_df.columns)
+      , '\nDropped column names:', cols_to_drop)
+
 combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
-combined_df_clean.columns
+
+print('\nncols after dropping:', len(combined_df_clean.columns))
+
+combined_df_clean_colnames = combined_df_clean.columns
 del(foo)
 #%%============================================================================
 #---------------------
@@ -966,7 +1067,7 @@ if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.
 else:
     sys.exit('\nFAIL: check individual df merges')
 
-print('\n\nResult of Fifth merge:', combined_stab_afor.shape
+print('\n\nResult of Seventh merge:', combined_stab_afor.shape
       , '\n===================================================================')
 
 combined_stab_afor[merging_cols_m5].apply(len)
@@ -1029,8 +1130,7 @@ dfs_merged = reduce(lambda  left,right: pd.merge(left
                                                 , right
                                                 , on = ['mutationinformation']
                                                 #, how = 'inner')
-                                                , how = join_type) 
-                   , dfs_list)
+                                                , how = join_type) , dfs_list)
 # drop excess columns
 drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
 drop_cols.remove('mutationinformation')
@@ -1038,8 +1138,8 @@ drop_cols.remove('mutationinformation')
 dfs_merged_clean = dfs_merged.drop(drop_cols, axis = 1)
 merging_cols_m6 = detect_common_cols(dfs_merged_clean, combined_stab_afor)
 
-len(dfs_merged_clean.columns)
 len(combined_stab_afor.columns)
+len(dfs_merged_clean.columns)
 
 combined_all_params = pd.merge(combined_stab_afor
                                , dfs_merged_clean
@@ -1064,12 +1164,47 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
            , '\nDropping these before writing file')
      extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0])
      print('\nTotal cols:',  len(combined_all_params.columns)
-         ,'\nDropping:', len(extra_cols_to_drop), 'columns')
+         ,'\nDropping:', len(extra_cols_to_drop), 'columns'
+         , '\nThese are:', extra_cols_to_drop)
      #extra_cols_to_drop = ['chain_x', 'chain_y']
      combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1)
 else:
     print('\nNo duplicate column names detected, just writing file'
           , '\nTotal cols:', len(combined_all_params.columns) )
+#%%==============================================================================
+# Some final formatting for combined_all_params df
+
+#---------------------------------------
+# Add: Map 1 letter 
+# code to 3Upper for Mutant aa
+#---------------------------------------
+# initialise a sub dict that is lookup dict for 
+# 3-LETTER aa code to 1-LETTER aa code
+
+lookup_dict = dict()
+for k, v in oneletter_aa_dict.items():
+    lookup_dict[k] = v['three_letter_code_upper']
+    mut = combined_all_params['mutant_type'].squeeze()
+    combined_all_params['mut_3upper'] = mut.map(lookup_dict)
+else:
+    print('\nFailed to add 3 letter upper code')
+
+# Add offset value
+combined_all_params['seq_offset4pdb'] = offset_val
+
+# Drop extra columns
+redundant_colsL = ['mutationinformation_snap2'
+                   , 'wt_upper'
+                   , 'mut_upper'
+                   , 'pdb_file']
+
+combined_all_params_f = combined_all_params.drop(redundant_colsL
+                                                 , axis = 1
+                                                 , errors = 'ignore')
+
+# Add pdb_file name at the end
+combined_all_params_f['pdb_file'] = gene_pdb_f
+
 #%%============================================================================
 #---------------------
 # Output 3: write csv
@@ -1078,9 +1213,9 @@ print('\nWriting file: all params')
 print('\nOutput 3 filename:', outfile_comb
       , '\n===================================================================\n')
 
-combined_all_params.to_csv(outfile_comb, index = False)
+combined_all_params_f.to_csv(outfile_comb, index = False)
 
 print('\nFinished writing file:'
-      , '\nNo. of rows:', combined_all_params.shape[0]
-      , '\nNo. of cols:', combined_all_params.shape[1])
-#%% end of script
\ No newline at end of file
+      , '\nNo. of rows:', combined_all_params_f.shape[0]
+      , '\nNo. of cols:', combined_all_params_f.shape[1])
+#%% end of script
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index b55f98a..23a99c7 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -5,8 +5,6 @@
 #########################################################
 # working dir and loading libraries
 getwd()
-#setwd("~/git/LSHTM_analysis/scripts/plotting")
-
 source("/home/tanu/git/LSHTM_analysis/scripts/plotting/Header_TT.R")
 
 #********************
@@ -53,7 +51,7 @@ my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
 max_ang <- round(max(my_df_u[LigDist_colname]))
 min_ang <- round(min(my_df_u[LigDist_colname]))
 
-cat("\nLigand distance cut off, colname:", LigDist_colname
+cat("\nLigand distance colname:", LigDist_colname
     , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
     , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")