From 00b84ccb1c22a27c800bf86448a219fefcb92eda Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 4 Jan 2022 10:45:29 +0000
Subject: [PATCH] handled rpob 5uhc position offset in mcsm_ppi2

---
 dynamut/format_results_dynamut.py           |   0
 dynamut/format_results_dynamut2.py          |   0
 dynamut/run_format_results_dynamut.py       |   0
 dynamut/split_csv_chain.sh                  |   6 +-
 mcsm_na/examples.py                         |   0
 mcsm_na/format_results_mcsm_na.py           |   0
 mcsm_na/get_results_mcsm_na.py              |   0
 mcsm_na/run_format_results_mcsm_na.py       |   0
 mcsm_na/submit_mcsm_na.py                   |   0
 mcsm_ppi2/format_results_mcsm_ppi2.py       | 126 ++++++---
 mcsm_ppi2/run_format_results_mcsm_ppi2.py   |   2 +-
 scripts/combining_dfs.py                    | 281 +++++++++++++++++++-
 scripts/data_extraction_epistasis.py        |  41 +--
 scripts/kd_df.py                            |   2 +-
 scripts/plotting/basic_barplots_combined.R  |   0
 scripts/plotting/corr_adjusted_PS_LIG.R     |   0
 scripts/plotting/dirs.R                     |   0
 scripts/plotting/dist_plots_check.R         |   0
 scripts/plotting/extreme_muts.R             |   0
 scripts/plotting/get_plotting_dfs.R         |   0
 scripts/plotting/ggcorr_all_PS_LIG.R        |   0
 scripts/plotting/hist_af_or_base.R          |   0
 scripts/plotting/hist_af_or_combined.R      |   0
 scripts/plotting/legend_adjustment.R        |   0
 scripts/plotting/opp_mcsm_muts.R            |   0
 scripts/plotting/or_plots_combined.R        |   0
 scripts/plotting/other_plots_combined.R     |   0
 scripts/plotting/output_tables.R            |   0
 scripts/plotting/ps_plots_combined.R        |   0
 scripts/plotting/resolving_ambiguous_muts.R |   0
 30 files changed, 395 insertions(+), 63 deletions(-)
 mode change 100755 => 100644 dynamut/format_results_dynamut.py
 mode change 100755 => 100644 dynamut/format_results_dynamut2.py
 mode change 100755 => 100644 dynamut/run_format_results_dynamut.py
 mode change 100755 => 100644 mcsm_na/examples.py
 mode change 100755 => 100644 mcsm_na/format_results_mcsm_na.py
 mode change 100755 => 100644 mcsm_na/get_results_mcsm_na.py
 mode change 100755 => 100644 mcsm_na/run_format_results_mcsm_na.py
 mode change 100755 => 100644 mcsm_na/submit_mcsm_na.py
 mode change 100755 => 100644 scripts/plotting/basic_barplots_combined.R
 mode change 100755 => 100644 scripts/plotting/corr_adjusted_PS_LIG.R
 mode change 100755 => 100644 scripts/plotting/dirs.R
 mode change 100755 => 100644 scripts/plotting/dist_plots_check.R
 mode change 100755 => 100644 scripts/plotting/extreme_muts.R
 mode change 100755 => 100644 scripts/plotting/get_plotting_dfs.R
 mode change 100755 => 100644 scripts/plotting/ggcorr_all_PS_LIG.R
 mode change 100755 => 100644 scripts/plotting/hist_af_or_base.R
 mode change 100755 => 100644 scripts/plotting/hist_af_or_combined.R
 mode change 100755 => 100644 scripts/plotting/legend_adjustment.R
 mode change 100755 => 100644 scripts/plotting/opp_mcsm_muts.R
 mode change 100755 => 100644 scripts/plotting/or_plots_combined.R
 mode change 100755 => 100644 scripts/plotting/other_plots_combined.R
 mode change 100755 => 100644 scripts/plotting/output_tables.R
 mode change 100755 => 100644 scripts/plotting/ps_plots_combined.R
 mode change 100755 => 100644 scripts/plotting/resolving_ambiguous_muts.R

diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py
old mode 100755
new mode 100644
diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py
old mode 100755
new mode 100644
diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py
old mode 100755
new mode 100644
diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh
index ac60faa..0512c1f 100755
--- a/dynamut/split_csv_chain.sh
+++ b/dynamut/split_csv_chain.sh
@@ -31,7 +31,11 @@ split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
 #~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50  
 
 # Date: 05/10/2021
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20    
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20   
+
+# Date: 30/11/2021
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
+for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
   
 # add .txt to the files
 ########################################################################
diff --git a/mcsm_na/examples.py b/mcsm_na/examples.py
old mode 100755
new mode 100644
diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py
old mode 100755
new mode 100644
diff --git a/mcsm_na/get_results_mcsm_na.py b/mcsm_na/get_results_mcsm_na.py
old mode 100755
new mode 100644
diff --git a/mcsm_na/run_format_results_mcsm_na.py b/mcsm_na/run_format_results_mcsm_na.py
old mode 100755
new mode 100644
diff --git a/mcsm_na/submit_mcsm_na.py b/mcsm_na/submit_mcsm_na.py
old mode 100755
new mode 100644
diff --git a/mcsm_ppi2/format_results_mcsm_ppi2.py b/mcsm_ppi2/format_results_mcsm_ppi2.py
index 49c05b2..656e47b 100755
--- a/mcsm_ppi2/format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/format_results_mcsm_ppi2.py
@@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
 from reference_dict import oneletter_aa_dict
 #%%============================================================================    
 
-def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
+def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
     """
     @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps 
      which is the result of combining all mcsm_ppi2 batch results, and using
@@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
     
     # # check
     # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
-    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
-#%%============================================================================    
-    #############
-    # rename cols
-    #############
-    # format colnames: all lowercase and consistent colnames
-    mcsm_ppi2_data.columns
-    print('Assigning meaningful colnames'
-            , '\n=======================================================')
-    
-    my_colnames_dict = {'chain': 'chain'
-        , 'wild-type': 'wt_upper'
-        , 'res-number': 'position'
-        , 'mutant': 'mut_upper'
-        , 'distance-to-interface': 'interface_dist'
-        , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
-        , 'affinity': 'mcsm_ppi2_outcome'
-        , 'w_type': 'wild_type' # one letter amino acid code
-        , 'm_type': 'mutant_type' # one letter amino acid code  
-} 
+    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])    
+#%%=====================================================================
+# add offset specified position number for rpob since 5uhc with chain 'C' was
+# used to run the analysis
 
+    geneL_sp = ['rpob']
+    if gene_name.lower() in geneL_sp:
+        offset = 6
+        chain_orig = 'A'
+        
+        # Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
+        # and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
+        mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
+        mcsm_ppi2_data['chain'] = chain_orig
+        mcsm_ppi2_data['5uhc_offset'] = offset
+    
+        #############
+        # rename cols
+        #############
+        # format colnames: all lowercase and consistent colnames
+        mcsm_ppi2_data.columns
+        print('Assigning meaningful colnames'
+              , '\n=======================================================')
+     
+        my_colnames_dict = {'chain'                  : 'chain'
+                            , 'position'             : 'position'
+                            , '5uhc_offset'          : '5uhc_offset'
+                            , 'wild-type'            : 'wt_upper'
+                            , 'res-number'           : '5uhc_position'
+                            , 'mutant'               : 'mut_upper'
+                            , 'distance-to-interface': 'interface_dist'
+                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
+                            , 'affinity'             : 'mcsm_ppi2_outcome'
+                            , 'w_type'               : 'wild_type' # one letter amino acid code
+                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
+                            } 
+    else:
+        my_colnames_dict = {'chain'                  : 'chain'
+                            , 'wild-type'            : 'wt_upper'
+                            , 'res-number'           : 'position'
+                            , 'mutant'               : 'mut_upper'
+                            , 'distance-to-interface': 'interface_dist'
+                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
+                            , 'affinity'             : 'mcsm_ppi2_outcome'
+                            , 'w_type'               : 'wild_type' # one letter amino acid code
+                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
+                            }
+#%%==============================================================================        
     mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
     mcsm_ppi2_data.columns
-
+         
     #############
     # create mutationinformation column
     #############    
@@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
               , '\nExpected number:', mcsm_ppi2_pos
               , '\nGot:', mcsm_ppi2_pos2
               , '\n======================================================')
-
 #%%=====================================================================
-    #############
+    ###################
     # reorder columns
-    #############
+    ###################
     mcsm_ppi2_data.columns
-    mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
-                                , 'mcsm_ppi2_affinity'
-                                , 'mcsm_ppi2_scaled'
-                                , 'mcsm_ppi2_outcome'
-                                , 'interface_dist'
-                                , 'wild_type'
-                                , 'position'
-                                , 'mutant_type'
-                                , 'wt_upper'
-                                , 'mut_upper'
-                                , 'chain']]
+    
+    #---------------------
+    # Determine col order
+    #---------------------
+    
+    core_cols = ['mutationinformation'
+                , 'mcsm_ppi2_affinity'
+                , 'mcsm_ppi2_scaled'
+                , 'mcsm_ppi2_outcome'
+                , 'interface_dist'
+                , 'wild_type'
+                , 'position'
+                , 'mutant_type'
+                , 'wt_upper'
+                , 'mut_upper'
+                , 'chain']
+    
+    if gene_name.lower() in geneL_sp:
+        
+        column_order = core_cols + ['5uhc_offset', '5uhc_position']
+    
+    else:
+        
+        column_order = core_cols.copy()
+        
+    #--------------
+    # reorder now
+    #--------------    
+    mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
+
+#%%============================================================================
+    ###################
+    # Sort df based on 
+    # position columns
+    ###################
+    mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
+    
     return(mcsm_ppi2_dataf)
-#%%##################################################################### 
+#%%##################################################################### 
\ No newline at end of file
diff --git a/mcsm_ppi2/run_format_results_mcsm_ppi2.py b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
index ab036f9..e0f3612 100755
--- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
@@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
 # Data: gid+streptomycin
 #==========================
 print('Formatting results for:', infile_mcsm_ppi2)
-mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
+mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
 
 # writing file
 print('Writing formatted df to csv')
diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index d6cb2fd..b33d36f 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
 
 # set working dir
 os.getcwd()
-#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 
 # FIXME: local imports
@@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
 infile_mcsm_f_snps     = outdir + infilename_mcsm_f_snps
 mcsm_f_snps            = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
 
+# more output added
+## consurf [change colnames]
+
+infilename_consurf     = gene.lower() + '_consurf_grades_f.csv'
+infile_consurf         = outdir + 'consurf/'+ infilename_consurf
+consurf_df             = pd.read_csv(infile_consurf, sep = ',')
+
+## SNAP2 [add normalised score]
+infilename_snap2       = gene.lower() + '_snap2_output.csv'
+infile_snap2           = outdir + 'snap2/'+ infilename_snap2
+snap2_df               = pd.read_csv(infile_snap2, sep = ',')
+
 #------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
 geneL_na = ['gid', 'rpob']
@@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
     # infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
     # mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
 
-# ONLY:for gene embb and alr: End logic should pick this up!
+# ONLY:for gene embb and alr and katg: End logic should pick this up!
 geneL_ppi2 = ['embb', 'alr']
 #if gene.lower() == "embb" or "alr":
 if gene.lower() in geneL_ppi2:
@@ -381,6 +393,247 @@ else:
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
     print('\nPASS: Deepddg data is scaled between -1 and 1',
            '\nproceeding with merge')
+     
+#=======================
+# Consurf
+#=======================
+consurf_df.shape
+
+# drop row 0: as it contains no value but hangover text
+consurf_df = consurf_df.drop(index=0)
+
+#----------------------
+# rename colums
+#----------------------   
+consurf_df.columns
+print('\nRenaming cols and assigning pretty column names')
+ 
+geneL_consurf = ['alr', 'katg', 'rpob']
+
+if gene.lower() in geneL_consurf:
+    consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
+    #---------------------------
+    # Specify the offset 
+    #---------------------------
+    print('\nAdding offset value for gene:', gene.lower())
+        
+    if gene.lower() == 'alr':
+        offset_val = 34
+        
+        print('\nUsing offset val:', offset_val)
+    if gene.lower() == 'katg':
+        offset_val = 23
+        print('\nUsing offset val:', offset_val)
+
+    if gene.lower() == 'rpob':
+       offset_val = 28
+       print('\nUsing offset val:', offset_val)
+
+    consurf_df['position'] = consurf_df['position_consurf'] + offset_val
+    
+else:
+    consurf_df = consurf_df.rename(columns={'POS' : 'position'})
+
+consurf_df = consurf_df.rename(columns={'SEQ'     : 'wild_type'
+                                        , '3LATOM': 'wt_3upper'
+                                        , 'SCORE' : 'consurf_score'
+                                        , 'COLOR' : 'consurf_colour_str'
+                                        , 'CONFIDENCEINTERVAL'       : 'consurf_ci'
+                                        , 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
+                                        , 'MSADATA'  : 'consurf_msa_data'
+                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
+# quick check
+if len(consurf_df) == len(rd_df):
+    print('\nPASS: length of consurf df is as expected'
+          , '\nProceeding to format consurf df')
+else:
+    print('\nFAIL: length mismatch'
+          , '\nExpected nrows:', len(rd_df)
+          , '\nGot:', len(consurf_df))
+
+consurf_df.dtypes
+consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)
+
+consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*')
+consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
+
+consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
+consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
+
+consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
+consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)
+
+consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)')
+consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
+
+#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
+#consurf_df['wt_3upper_f']
+consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
+
+consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
+
+#-------------------------
+# scale consurf values
+#-------------------------
+# Rescale values in consurf_score col b/w -1 and 1 so negative numbers
+# stay neg and pos numbers stay positive
+consurf_min = consurf_df['consurf_score'].min()
+consurf_max = consurf_df['consurf_score'].max() 
+consurf_min
+consurf_max
+
+# quick check
+len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
+len(consurf_df.loc[consurf_df['consurf_score'] < 0])
+
+consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed')
+
+consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
+print('\nRaw consurf scores:\n', consurf_df['consurf_score']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])
+
+# additional check added
+csmi = consurf_df['consurf_scaled'].min()
+csma = consurf_df['consurf_scaled'].max()
+
+c = consurf_df[consurf_df['consurf_score']>=0].count()
+consurf_pos = c.get(key = 'consurf_score')
+
+c2 = consurf_df[consurf_df['consurf_scaled']>=0].count()
+consurf_pos2 = c2.get(key = 'consurf_scaled')
+
+if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
+    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: Consurf values scaled numbers MISmatch'
+          , '\nExpected number:', consurf_pos
+          , '\nGot:', consurf_pos2
+          , '\n======================================================')
+
+consurf_df.dtypes
+consurf_df.columns
+
+#---------------------------
+# select columns 
+# (and also determine order)
+#---------------------------
+consurf_df_f = consurf_df[['position'
+                           , 'wild_type'
+                           , 'chain'
+                           , 'wt_3upper'
+                           , 'consurf_score'
+                           , 'consurf_scaled'
+                           , 'consurf_colour'
+                           , 'consurf_colour_rev'
+                           , 'consurf_ci_upper'
+                           , 'consurf_ci_lower'
+                           , 'consurf_ci_colour'
+                           , 'consurf_msa_data'
+                           , 'consurf_aa_variety']]
+
+#=======================
+# SNAP2
+#=======================
+snap2_df.shape
+
+#----------------------
+# rename colums
+#---------------------- 
+geneL_snap2 = ['alr', 'katg', 'rpob']
+
+if gene.lower() in geneL_snap2:
+    print('\nReading SNAP2 for gene:', gene.lower()
+          , '\nOffset column also being read'
+          , '\nRenaming columns...'
+          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
+    
+    snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation'
+                                          , 'Variant'          : 'mutationinformation_snap2'
+                                          , 'Predicted Effect' : 'snap2_outcome'
+                                          , 'Score'            : 'snap2_score'
+                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
+else:
+    print('\nReading SNAP2 for gene:', gene.lower()
+          , '\nNo offset column for SNAP2'
+          , '\nRenaming columns...'
+          , '\nRenaming SNAP2 column variant --> mutationinformation')
+
+    snap2_df = snap2_df.rename(columns = {'Variant'            : 'mutationinformation'
+                                          , 'Predicted Effect' : 'snap2_outcome'
+                                          , 'Score'            : 'snap2_score'
+                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
+    
+snap2_df.columns
+snap2_df.head()
+snap2_df.dtypes
+
+snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','')
+snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)
+
+#-------------------------
+# scale snap2 values
+#-------------------------
+# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
+# stay neg and pos numbers stay positive
+snap2_min = snap2_df['snap2_score'].min()
+snap2_max = snap2_df['snap2_score'].max() 
+snap2_min
+snap2_max
+
+# quick check
+len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
+len(snap2_df.loc[snap2_df['snap2_score'] < 0])
+
+snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed')
+
+snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
+print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])
+
+# additional check added
+ssmi = snap2_df['snap2_scaled'].min()
+ssma = snap2_df['snap2_scaled'].max()
+
+sn = snap2_df[snap2_df['snap2_score']>=0].count()
+snap2_pos = sn.get(key = 'snap2_score')
+
+sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
+snap2_pos2 = sn2.get(key = 'snap2_scaled')
+
+if snap2_pos == snap2_pos2 and csmi == -1 and csma == 1:
+    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: snap2 values scaled numbers MISmatch'
+          , '\nExpected number:', snap2_pos
+          , '\nGot:', snap2_pos2
+          , '\n======================================================')
+
+#---------------------------
+# select columns 
+# (and also determine order)
+#---------------------------
+snap2_df.dtypes
+snap2_df.columns
+
+geneL_snap2 = ['alr', 'katg', 'rpob']
+
+if gene.lower() in geneL_snap2:
+    print('\nSelecting cols SNAP2 for gene:', gene.lower())
+    snap2_df_f = snap2_df[['mutationinformation'
+                       , 'mutationinformation_snap2'
+                       , 'snap2_score'
+                       , 'snap2_scaled'
+                       , 'snap2_accuracy_pc'
+                       , 'snap2_outcome']]
+else:
+    print('\nSelecting cols SNAP2 for gene:', gene.lower())
+    snap2_df_f = snap2_df[['mutationinformation'
+                       , 'snap2_score'
+                       , 'snap2_scaled'
+                       , 'snap2_accuracy_pc'
+                       , 'snap2_outcome']]
     
 #%%============================================================================
 # Now merges begin
@@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df
                        , kd_df
                        , on = merging_cols_m2
-                       , how = "outer")
+                       #, how = "outer")
+                       , how = "inner")
+
 
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
       , '\n===================================================================')
@@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
       , '\n===================================================================')
 dssp_kd_rd_dfs[merging_cols_m3].apply(len)
 dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
+
+#%%============================================================================
+print('==================================='
+      , '\nFourth merge*: fourth merge + consurf_df' 
+      , '\dssp_kd_rd_dfs + consurf_df'
+      , '\n===================================')
+#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
+merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs,  consurf_df)
+dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
+                          , consurf_df
+                          , on = merging_cols_m3_v2 
+                          , how = "outer")
+
+ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
+
+print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
+      , '\n===================================================================')
+dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
+dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
+
 #%%============================================================================
 print('======================================='
       , '\nFifth merge: Second merge + fourth merge'
diff --git a/scripts/data_extraction_epistasis.py b/scripts/data_extraction_epistasis.py
index 6626338..094a82b 100755
--- a/scripts/data_extraction_epistasis.py
+++ b/scripts/data_extraction_epistasis.py
@@ -75,15 +75,14 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene
 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 
 nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)
 
+nssnp_match2 = re.compile(nssnp_match)
+
 wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)
 
@@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
 
 #%% TEST
 # formatting, replace !nssnp_match  with nothing
-foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
-foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
+#foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
+#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
 
 
-foo1_s = foo1.split(';')
-foo1_s
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
-arse=list(filter(nssnp_match2.match, foo1_s))
-arse
+#foo1_s = foo1.split(';')
+#foo1_s
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#arse=list(filter(nssnp_match2.match, foo1_s))
+#arse
+
+#foo1_s2 = ';'.join(arse)
+#foo1_s2
 
-foo1_s2 = ';'.join(arse)
-foo1_s2
 #%%
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
 
 # dr_muts_col
 dr_clean_col = dr_muts_col + '_clean'
@@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
     dr2_s = v.split(';')
     print(dr2_s)
     dr2_sf = list(filter(nssnp_match2.match, dr2_s))
+    #dr2_sf = list(filter(nssnp_match.match, dr2_s))
     print(dr2_sf)
     dr2_sf2  = ';'.join(dr2_sf)
     meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
 
 for i, v in enumerate(meta_gene_epi[other_muts_col]):
     #print(i, v)
-    print('======================================================')
-    print(i)
-    print(v)
+    #print('======================================================')
+    #print(i)
+    #print(v)
     other2_s = v.split(';')
-    print(other2_s)
+    #print(other2_s)
     other2_sf = list(filter(nssnp_match2.match,  other2_s))
-    print(other2_sf)
+    #print(other2_sf)
     other2_sf2  = ';'.join(other2_sf)
     meta_gene_epi[other_clean_col].iloc[i] =  other2_sf2
 
@@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
                                , 'dr_mult_snp_count'
                                , other_muts_col, other_clean_col
                                , 'other_mult_snp_count']]
-meta_gene_epi_f.columns
+#print(meta_gene_epi_f.columns)
+print(meta_gene_epi_f)
 
 cols_to_output = ['id', 'sample'
                    , dr_clean_col
@@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
 meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
 
 
-
 #%%
 # formatting, replace !nssnp_match  with nothing
 #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
diff --git a/scripts/kd_df.py b/scripts/kd_df.py
index 53b1264..50748de 100755
--- a/scripts/kd_df.py
+++ b/scripts/kd_df.py
@@ -92,7 +92,7 @@ else:
 infile_fasta = indir + '/' + in_filename_fasta
 print('Input fasta file:', infile_fasta
       , '\n============================================================')
-
+ 
 #=======
 # output 
 #=======
diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/corr_adjusted_PS_LIG.R b/scripts/plotting/corr_adjusted_PS_LIG.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/dist_plots_check.R b/scripts/plotting/dist_plots_check.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/ggcorr_all_PS_LIG.R b/scripts/plotting/ggcorr_all_PS_LIG.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/legend_adjustment.R b/scripts/plotting/legend_adjustment.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R
old mode 100755
new mode 100644
diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R
old mode 100755
new mode 100644