handled rpob 5uhc position offset in mcsm_ppi2

2022-01-04 10:45:29 +00:00 · 2022-01-04 10:45:29 +00:00 · 00b84ccb1c
commit 00b84ccb1c
parent 46e2c93885
30 changed files with 395 additions and 63 deletions
--- a/mcsm_ppi2/format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/format_results_mcsm_ppi2.py
@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
 from reference_dict import oneletter_aa_dict
 #%%============================================================================    

-def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
+def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
    """
    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps 
     which is the result of combining all mcsm_ppi2 batch results, and using
@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
    
    # # check
    # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
-    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
-#%%============================================================================    
-    #############
-    # rename cols
-    #############
-    # format colnames: all lowercase and consistent colnames
-    mcsm_ppi2_data.columns
-    print('Assigning meaningful colnames'
-            , '\n=======================================================')
-    
-    my_colnames_dict = {'chain': 'chain'
-        , 'wild-type': 'wt_upper'
-        , 'res-number': 'position'
-        , 'mutant': 'mut_upper'
-        , 'distance-to-interface': 'interface_dist'
-        , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
-        , 'affinity': 'mcsm_ppi2_outcome'
-        , 'w_type': 'wild_type' # one letter amino acid code
-        , 'm_type': 'mutant_type' # one letter amino acid code  
-} 
+    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])    
+#%%=====================================================================
+# rpob only: the analysis was run on 5uhc chain 'C', so add an offset-corrected
+# position number (and the original chain id) to the results

+    geneL_sp = ['rpob']
+    if gene_name.lower() in geneL_sp:
+        offset = 6
+        chain_orig = 'A'
+        
+        # Add the offset-corrected position number and the corresponding chain id,
+        # both matching the rpob nsSNPs used for mCSM-lig
+        mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
+        mcsm_ppi2_data['chain'] = chain_orig
+        mcsm_ppi2_data['5uhc_offset'] = offset
+    
+        #############
+        # rename cols
+        #############
+        # format colnames: all lowercase and consistent colnames
+        mcsm_ppi2_data.columns
+        print('Assigning meaningful colnames'
+              , '\n=======================================================')
+     
+        my_colnames_dict = {'chain'                  : 'chain'
+                            , 'position'             : 'position'
+                            , '5uhc_offset'          : '5uhc_offset'
+                            , 'wild-type'            : 'wt_upper'
+                            , 'res-number'           : '5uhc_position'
+                            , 'mutant'               : 'mut_upper'
+                            , 'distance-to-interface': 'interface_dist'
+                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
+                            , 'affinity'             : 'mcsm_ppi2_outcome'
+                            , 'w_type'               : 'wild_type' # one letter amino acid code
+                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
+                            } 
+    else:
+        my_colnames_dict = {'chain'                  : 'chain'
+                            , 'wild-type'            : 'wt_upper'
+                            , 'res-number'           : 'position'
+                            , 'mutant'               : 'mut_upper'
+                            , 'distance-to-interface': 'interface_dist'
+                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
+                            , 'affinity'             : 'mcsm_ppi2_outcome'
+                            , 'w_type'               : 'wild_type' # one letter amino acid code
+                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
+                            }
+#%%==============================================================================        
    mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
    mcsm_ppi2_data.columns
-
+         
    #############
    # create mutationinformation column
    #############    
@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
              , '\nExpected number:', mcsm_ppi2_pos
              , '\nGot:', mcsm_ppi2_pos2
              , '\n======================================================')
-
 #%%=====================================================================
-    #############
+    ###################
    # reorder columns
-    #############
+    ###################
    mcsm_ppi2_data.columns
-    mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
-                                , 'mcsm_ppi2_affinity'
-                                , 'mcsm_ppi2_scaled'
-                                , 'mcsm_ppi2_outcome'
-                                , 'interface_dist'
-                                , 'wild_type'
-                                , 'position'
-                                , 'mutant_type'
-                                , 'wt_upper'
-                                , 'mut_upper'
-                                , 'chain']]
+    
+    #---------------------
+    # Determine col order
+    #---------------------
+    
+    core_cols = ['mutationinformation'
+                , 'mcsm_ppi2_affinity'
+                , 'mcsm_ppi2_scaled'
+                , 'mcsm_ppi2_outcome'
+                , 'interface_dist'
+                , 'wild_type'
+                , 'position'
+                , 'mutant_type'
+                , 'wt_upper'
+                , 'mut_upper'
+                , 'chain']
+    
+    if gene_name.lower() in geneL_sp:
+        
+        column_order = core_cols + ['5uhc_offset', '5uhc_position']
+    
+    else:
+        
+        column_order = core_cols.copy()
+        
+    #--------------
+    # reorder now
+    #--------------    
+    mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
+
+#%%============================================================================
+    ###################
+    # Sort df by position,
+    # then by mutant_type
+    ###################
+    mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
+    
    return(mcsm_ppi2_dataf)
-#%%##################################################################### 
+#%%##################################################################### 
--- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
 # Data: gid+streptomycin
 #==========================
 print('Formatting results for:', infile_mcsm_ppi2)
-mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
+mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)

 # writing file
 print('Writing formatted df to csv')