handled rpob 5uhc position offset in mcsm_ppi2

2022-01-04 10:45:29 +00:00 · 2022-01-04 10:45:29 +00:00 · 00b84ccb1c
commit 00b84ccb1c
parent 46e2c93885
30 changed files with 395 additions and 63 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')

 # set working dir
 os.getcwd()
-#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()

 # FIXME: local imports
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
 infile_mcsm_f_snps     = outdir + infilename_mcsm_f_snps
 mcsm_f_snps            = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)

+# Additional per-gene inputs: ConSurf grades and SNAP2 predictions
+## consurf: read <gene>_consurf_grades_f.csv [columns are renamed further down]
+
+infilename_consurf     = gene.lower() + '_consurf_grades_f.csv'
+infile_consurf         = outdir + 'consurf/'+ infilename_consurf  # plain string concatenation: assumes outdir ends with '/'
+consurf_df             = pd.read_csv(infile_consurf, sep = ',')
+
+## SNAP2: read <gene>_snap2_output.csv [a normalised score is added further down]
+infilename_snap2       = gene.lower() + '_snap2_output.csv'
+infile_snap2           = outdir + 'snap2/'+ infilename_snap2  # plain string concatenation: assumes outdir ends with '/'
+snap2_df               = pd.read_csv(infile_snap2, sep = ',')
+
 #------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
 geneL_na = ['gid', 'rpob']
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
    # infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
    # mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')

-# ONLY:for gene embb and alr: End logic should pick this up!
+# ONLY:for gene embb and alr and katg: End logic should pick this up! NOTE(review): 'katg' is not in geneL_ppi2 below - confirm whether it should be
 geneL_ppi2 = ['embb', 'alr']
 #if gene.lower() == "embb" or "alr":
 if gene.lower() in geneL_ppi2:
@ -381,6 +393,247 @@ else:
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
    print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
+     
+#=======================
+# Consurf
+#=======================
+consurf_df.shape  # bare expression: only useful when run interactively, result is discarded
+
+# drop the row labelled 0: it contains no value but leftover text
+consurf_df = consurf_df.drop(index=0)
+
+#----------------------
+# rename columns
+#----------------------   
+consurf_df.columns
+print('\nRenaming cols and assigning pretty column names')
+ 
+geneL_consurf = ['alr', 'katg', 'rpob']  # genes whose consurf positions get an offset added
+
+if gene.lower() in geneL_consurf:
+    consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})  # keep the original numbering under its own name
+    #---------------------------
+    # Gene-specific offset added to position_consurf to build 'position'
+    #---------------------------
+    print('\nAdding offset value for gene:', gene.lower())
+        
+    if gene.lower() == 'alr':
+        offset_val = 34
+        
+        print('\nUsing offset val:', offset_val)
+    if gene.lower() == 'katg':
+        offset_val = 23
+        print('\nUsing offset val:', offset_val)
+
+    if gene.lower() == 'rpob':
+       offset_val = 28
+       print('\nUsing offset val:', offset_val)
+
+    consurf_df['position'] = consurf_df['position_consurf'] + offset_val  # presumably aligns with the numbering of the other dfs - TODO confirm
+    
+else:
+    consurf_df = consurf_df.rename(columns={'POS' : 'position'})  # no offset: POS is used as the position directly
+
+consurf_df = consurf_df.rename(columns={'SEQ'     : 'wild_type'
+                                        , '3LATOM': 'wt_3upper'
+                                        , 'SCORE' : 'consurf_score'
+                                        , 'COLOR' : 'consurf_colour_str'
+                                        , 'CONFIDENCEINTERVAL'       : 'consurf_ci'
+                                        , 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
+                                        , 'MSADATA'  : 'consurf_msa_data'
+                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
+# quick check: row count must equal that of rd_df (a mismatch is only printed, execution continues)
+if len(consurf_df) == len(rd_df):
+    print('\nPASS: length of consurf df is as expected'
+          , '\nProceeding to format consurf df')
+else:
+    print('\nFAIL: length mismatch'
+          , '\nExpected nrows:', len(rd_df)
+          , '\nGot:', len(consurf_df))
+
+consurf_df.dtypes
+consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)
+
+consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*')  # first digit of the colour string
+consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
+
+consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0', regex=True)  # '<char>*' becomes 0; regex=True is required on pandas >= 2.0
+consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
+
+consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')  # text before the ':'
+consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)
+
+consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)')  # text after the ':'
+consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
+
+# Take the chain (text after ':') BEFORE trimming wt_3upper: the trim below removes
+# the '<digits>:<rest>' tail, after which no ':' is left and chain would be all NaN.
+consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
+
+consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '', regex=True)  # keep only the text before the digits
+
+#-------------------------
+# scale consurf values
+#-------------------------
+# Rescale consurf_score to b/w -1 and 1: negative values are divided by |min| and
+# non-negative values by max, so neg numbers stay neg and pos numbers stay positive
+consurf_min = consurf_df['consurf_score'].min()
+consurf_max = consurf_df['consurf_score'].max() 
+consurf_min
+consurf_max
+
+# quick check: counts of non-negative and negative scores (results are not stored)
+len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
+len(consurf_df.loc[consurf_df['consurf_score'] < 0])
+
+consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed')  # NaN fails both comparisons and yields the string 'failed'
+
+consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
+print('\nRaw consurf scores:\n', consurf_df['consurf_score']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])
+
+# additional check: scaled range must be exactly -1..1 and the count of non-negative values unchanged
+csmi = consurf_df['consurf_scaled'].min()
+csma = consurf_df['consurf_scaled'].max()
+
+c = consurf_df[consurf_df['consurf_score']>=0].count()
+consurf_pos = c.get(key = 'consurf_score')  # number of non-negative raw scores
+
+c2 = consurf_df[consurf_df['consurf_scaled']>=0].count()
+consurf_pos2 = c2.get(key = 'consurf_scaled')  # number of non-negative scaled scores
+
+if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
+    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: Consurf values scaled numbers MISmatch'
+          , '\nExpected number:', consurf_pos
+          , '\nGot:', consurf_pos2
+          , '\n======================================================')
+
+consurf_df.dtypes
+consurf_df.columns
+
+#---------------------------
+# select columns into consurf_df_f
+# (and also determine order)
+#---------------------------
+consurf_df_f = consurf_df[['position'
+                           , 'wild_type'
+                           , 'chain'
+                           , 'wt_3upper'
+                           , 'consurf_score'
+                           , 'consurf_scaled'
+                           , 'consurf_colour'
+                           , 'consurf_colour_rev'
+                           , 'consurf_ci_upper'
+                           , 'consurf_ci_lower'
+                           , 'consurf_ci_colour'
+                           , 'consurf_msa_data'
+                           , 'consurf_aa_variety']]
+
+#=======================
+# SNAP2
+#=======================
+snap2_df.shape  # bare expression: only useful when run interactively, result is discarded
+
+#----------------------
+# rename columns
+#---------------------- 
+geneL_snap2 = ['alr', 'katg', 'rpob']  # genes whose SNAP2 file is expected to already carry a mutationinformation column
+
+if gene.lower() in geneL_snap2:
+    print('\nReading SNAP2 for gene:', gene.lower()
+          , '\nOffset column also being read'
+          , '\nRenaming columns...'
+          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
+    
+    snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation'  # identity mapping: the column keeps its name
+                                          , 'Variant'          : 'mutationinformation_snap2'
+                                          , 'Predicted Effect' : 'snap2_outcome'
+                                          , 'Score'            : 'snap2_score'
+                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
+else:
+    print('\nReading SNAP2 for gene:', gene.lower()
+          , '\nNo offset column for SNAP2'
+          , '\nRenaming columns...'
+          , '\nRenaming SNAP2 column variant --> mutationinformation')
+
+    snap2_df = snap2_df.rename(columns = {'Variant'            : 'mutationinformation'  # SNAP2's Variant becomes the merge key directly
+                                          , 'Predicted Effect' : 'snap2_outcome'
+                                          , 'Score'            : 'snap2_score'
+                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
+    
+snap2_df.columns
+snap2_df.head()
+snap2_df.dtypes
+
+snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','')  # strip the '%' sign so the value can be cast to int
+snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)
+
+#-------------------------
+# scale snap2 values
+#-------------------------
+# Rescale snap2_score to b/w -1 and 1: negative values are divided by |min| and
+# non-negative values by max, so neg numbers stay neg and pos numbers stay positive
+snap2_min = snap2_df['snap2_score'].min()
+snap2_max = snap2_df['snap2_score'].max() 
+snap2_min
+snap2_max
+
+# quick check: counts of non-negative and negative scores (results are not stored)
+len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
+len(snap2_df.loc[snap2_df['snap2_score'] < 0])
+
+snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed')  # NaN fails both comparisons and yields the string 'failed'
+
+snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
+print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])
+
+# additional check: the SNAP2 scaled range (ssmi/ssma) must be exactly -1..1 and the count of non-negative values unchanged
+ssmi = snap2_df['snap2_scaled'].min()
+ssma = snap2_df['snap2_scaled'].max()
+
+sn = snap2_df[snap2_df['snap2_score']>=0].count()
+snap2_pos = sn.get(key = 'snap2_score')  # number of non-negative raw scores
+
+sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
+snap2_pos2 = sn2.get(key = 'snap2_scaled')  # number of non-negative scaled scores
+
+if snap2_pos == snap2_pos2 and ssmi == -1 and ssma == 1:  # test the snap2 range, not the consurf one (csmi/csma)
+    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: snap2 values scaled numbers MISmatch'
+          , '\nExpected number:', snap2_pos
+          , '\nGot:', snap2_pos2
+          , '\n======================================================')
+
+#---------------------------
+# select columns into snap2_df_f
+# (and also determine order)
+#---------------------------
+snap2_df.dtypes
+snap2_df.columns
+
+geneL_snap2 = ['alr', 'katg', 'rpob']  # same gene list as in the SNAP2 renaming step
+
+if gene.lower() in geneL_snap2:
+    print('\nSelecting cols SNAP2 for gene:', gene.lower())
+    snap2_df_f = snap2_df[['mutationinformation'
+                       , 'mutationinformation_snap2'  # only exists for the genes in geneL_snap2
+                       , 'snap2_score'
+                       , 'snap2_scaled'
+                       , 'snap2_accuracy_pc'
+                       , 'snap2_outcome']]
+else:
+    print('\nSelecting cols SNAP2 for gene:', gene.lower())
+    snap2_df_f = snap2_df[['mutationinformation'
+                       , 'snap2_score'
+                       , 'snap2_scaled'
+                       , 'snap2_accuracy_pc'
+                       , 'snap2_outcome']]
    
 #%%============================================================================
 # Now merges begin
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df
                       , kd_df
                       , on = merging_cols_m2
-                       , how = "outer")
+                       #, how = "outer")
+                       , how = "inner")
+

 print('\n\nResult of third merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
      , '\n===================================================================')
 dssp_kd_rd_dfs[merging_cols_m3].apply(len)
 dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
+
+#%%============================================================================
+print('==================================='
+      , '\nFourth merge*: fourth merge + consurf_df' 
+      , '\ndssp_kd_rd_dfs + consurf_df'
+      , '\n===================================')
+#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
+merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs,  consurf_df)  # merge on the columns the two dfs have in common
+dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
+                          , consurf_df  # NOTE(review): full consurf_df is merged, not the trimmed consurf_df_f - confirm this is intended
+                          , on = merging_cols_m3_v2 
+                          , how = "outer")
+
+ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
+
+print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
+      , '\n===================================================================')
+dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
+dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)  # interactive check: every merge column spans all rows
+
 #%%============================================================================
 print('======================================='
      , '\nFifth merge: Second merge + fourth merge'