handled rpob 5uhc position offset in mcsm_ppi2

2022-01-04 10:45:29 +00:00 · 2022-01-04 10:45:29 +00:00 · 00b84ccb1c
commit 00b84ccb1c
parent 46e2c93885
30 changed files with 395 additions and 63 deletions
--- a/dynamut/format_results_dynamut.py
+++ b/dynamut/format_results_dynamut.py
--- a/dynamut/format_results_dynamut2.py
+++ b/dynamut/format_results_dynamut2.py
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
--- a/dynamut/split_csv_chain.sh
+++ b/dynamut/split_csv_chain.sh
@ -31,7 +31,11 @@ split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
 #~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50  
 # Date: 05/10/2021
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20    
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20   
 # Date: 30/11/2021
 #~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
 for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
 # add .txt to the files
 ########################################################################
--- a/mcsm_na/examples.py
+++ b/mcsm_na/examples.py
--- a/mcsm_na/format_results_mcsm_na.py
+++ b/mcsm_na/format_results_mcsm_na.py
--- a/mcsm_na/get_results_mcsm_na.py
+++ b/mcsm_na/get_results_mcsm_na.py
--- a/mcsm_na/run_format_results_mcsm_na.py
+++ b/mcsm_na/run_format_results_mcsm_na.py
--- a/mcsm_na/submit_mcsm_na.py
+++ b/mcsm_na/submit_mcsm_na.py
--- a/mcsm_ppi2/format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/format_results_mcsm_ppi2.py
@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
 from reference_dict import oneletter_aa_dict
 #%%============================================================================    
-def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
+def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
    """
    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps 
     which is the result of combining all mcsm_ppi2 batch results, and using
@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
    # # check
    # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
-    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
+    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])    
-#%%============================================================================    
+#%%=====================================================================
-    #############
+# add offset specified position number for rpob since 5uhc with chain 'C' was
-    # rename cols
+# used to run the analysis
    #############
    # format colnames: all lowercase and consistent colnames
    mcsm_ppi2_data.columns
    print('Assigning meaningful colnames'
            , '\n=======================================================')
    my_colnames_dict = {'chain': 'chain'
        , 'wild-type': 'wt_upper'
        , 'res-number': 'position'
        , 'mutant': 'mut_upper'
        , 'distance-to-interface': 'interface_dist'
        , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
        , 'affinity': 'mcsm_ppi2_outcome'
        , 'w_type': 'wild_type' # one letter amino acid code
        , 'm_type': 'mutant_type' # one letter amino acid code  
 } 
    geneL_sp = ['rpob']
    if gene_name.lower() in geneL_sp:
        offset = 6
        chain_orig = 'A'
        # Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
        # and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
        mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
        mcsm_ppi2_data['chain'] = chain_orig
        mcsm_ppi2_data['5uhc_offset'] = offset
        #############
        # rename cols
        #############
        # format colnames: all lowercase and consistent colnames
        mcsm_ppi2_data.columns
        print('Assigning meaningful colnames'
              , '\n=======================================================')
        my_colnames_dict = {'chain'                  : 'chain'
                            , 'position'             : 'position'
                            , '5uhc_offset'          : '5uhc_offset'
                            , 'wild-type'            : 'wt_upper'
                            , 'res-number'           : '5uhc_position'
                            , 'mutant'               : 'mut_upper'
                            , 'distance-to-interface': 'interface_dist'
                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
                            , 'affinity'             : 'mcsm_ppi2_outcome'
                            , 'w_type'               : 'wild_type' # one letter amino acid code
                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
                            } 
    else:
        my_colnames_dict = {'chain'                  : 'chain'
                            , 'wild-type'            : 'wt_upper'
                            , 'res-number'           : 'position'
                            , 'mutant'               : 'mut_upper'
                            , 'distance-to-interface': 'interface_dist'
                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
                            , 'affinity'             : 'mcsm_ppi2_outcome'
                            , 'w_type'               : 'wild_type' # one letter amino acid code
                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
                            }
 #%%==============================================================================        
    mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
    mcsm_ppi2_data.columns
-
+         
    #############
    # create mutationinformation column
    #############    
@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
              , '\nExpected number:', mcsm_ppi2_pos
              , '\nGot:', mcsm_ppi2_pos2
              , '\n======================================================')
 #%%=====================================================================
-    #############
+    ###################
    # reorder columns
-    #############
+    ###################
    mcsm_ppi2_data.columns
-    mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
+    
-                                , 'mcsm_ppi2_affinity'
+    #---------------------
-                                , 'mcsm_ppi2_scaled'
+    # Determine col order
-                                , 'mcsm_ppi2_outcome'
+    #---------------------
-                                , 'interface_dist'
+    
-                                , 'wild_type'
+    core_cols = ['mutationinformation'
-                                , 'position'
+                , 'mcsm_ppi2_affinity'
-                                , 'mutant_type'
+                , 'mcsm_ppi2_scaled'
-                                , 'wt_upper'
+                , 'mcsm_ppi2_outcome'
-                                , 'mut_upper'
+                , 'interface_dist'
-                                , 'chain']]
+                , 'wild_type'
                , 'position'
                , 'mutant_type'
                , 'wt_upper'
                , 'mut_upper'
                , 'chain']
    if gene_name.lower() in geneL_sp:
        column_order = core_cols + ['5uhc_offset', '5uhc_position']
    else:
        column_order = core_cols.copy()
    #--------------
    # reorder now
    #--------------    
    mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
 #%%============================================================================
    ###################
    # Sort df based on 
    # position columns
    ###################
    mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
    return(mcsm_ppi2_dataf)
-#%%##################################################################### 
+#%%##################################################################### 
--- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
 # Data: gid+streptomycin
 #==========================
 print('Formatting results for:', infile_mcsm_ppi2)
-mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
+mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
 # writing file
 print('Writing formatted df to csv')
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
-#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 # FIXME: local imports
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
 infile_mcsm_f_snps     = outdir + infilename_mcsm_f_snps
 mcsm_f_snps            = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
 # more output added
 ## consurf [change colnames]
 infilename_consurf     = gene.lower() + '_consurf_grades_f.csv'
 infile_consurf         = outdir + 'consurf/'+ infilename_consurf
 consurf_df             = pd.read_csv(infile_consurf, sep = ',')
 ## SNAP2 [add normalised score]
 infilename_snap2       = gene.lower() + '_snap2_output.csv'
 infile_snap2           = outdir + 'snap2/'+ infilename_snap2
 snap2_df               = pd.read_csv(infile_snap2, sep = ',')
 #------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
 geneL_na = ['gid', 'rpob']
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
    # infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
    # mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
-# ONLY:for gene embb and alr: End logic should pick this up!
+# ONLY:for gene embb and alr and katg: End logic should pick this up!
 geneL_ppi2 = ['embb', 'alr']
 #if gene.lower() == "embb" or "alr":
 if gene.lower() in geneL_ppi2:
@ -381,6 +393,247 @@ else:
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
    print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
 #=======================
 # Consurf
 #=======================
 consurf_df.shape
 # drop row 0: as it contains no value but hangover text
 consurf_df = consurf_df.drop(index=0)
 #----------------------
 # rename colums
 #----------------------   
 consurf_df.columns
 print('\nRenaming cols and assigning pretty column names')
 geneL_consurf = ['alr', 'katg', 'rpob']
 if gene.lower() in geneL_consurf:
    consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
    #---------------------------
    # Specify the offset 
    #---------------------------
    print('\nAdding offset value for gene:', gene.lower())
    if gene.lower() == 'alr':
        offset_val = 34
        print('\nUsing offset val:', offset_val)
    if gene.lower() == 'katg':
        offset_val = 23
        print('\nUsing offset val:', offset_val)
    if gene.lower() == 'rpob':
       offset_val = 28
       print('\nUsing offset val:', offset_val)
    consurf_df['position'] = consurf_df['position_consurf'] + offset_val
 else:
    consurf_df = consurf_df.rename(columns={'POS' : 'position'})
 consurf_df = consurf_df.rename(columns={'SEQ'     : 'wild_type'
                                        , '3LATOM': 'wt_3upper'
                                        , 'SCORE' : 'consurf_score'
                                        , 'COLOR' : 'consurf_colour_str'
                                        , 'CONFIDENCEINTERVAL'       : 'consurf_ci'
                                        , 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
                                        , 'MSADATA'  : 'consurf_msa_data'
                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
 # quick check
 if len(consurf_df) == len(rd_df):
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
 else:
    print('\nFAIL: length mismatch'
          , '\nExpected nrows:', len(rd_df)
          , '\nGot:', len(consurf_df))
 consurf_df.dtypes
 consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)
 consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*')
 consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
 consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
 consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)
 consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)')
 consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
 #consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
 #consurf_df['wt_3upper_f']
 consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
 consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
 #-------------------------
 # scale consurf values
 #-------------------------
 # Rescale values in consurf_score col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
 consurf_min = consurf_df['consurf_score'].min()
 consurf_max = consurf_df['consurf_score'].max() 
 consurf_min
 consurf_max
 # quick check
 len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
 len(consurf_df.loc[consurf_df['consurf_score'] < 0])
 consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed')
 consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
 print('\nRaw consurf scores:\n', consurf_df['consurf_score']
    , '\n---------------------------------------------------------------'
    , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])
 # additional check added
 csmi = consurf_df['consurf_scaled'].min()
 csma = consurf_df['consurf_scaled'].max()
 c = consurf_df[consurf_df['consurf_score']>=0].count()
 consurf_pos = c.get(key = 'consurf_score')
 c2 = consurf_df[consurf_df['consurf_scaled']>=0].count()
 consurf_pos2 = c2.get(key = 'consurf_scaled')
 if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
 else:
    print('\nFAIL: Consurf values scaled numbers MISmatch'
          , '\nExpected number:', consurf_pos
          , '\nGot:', consurf_pos2
          , '\n======================================================')
 consurf_df.dtypes
 consurf_df.columns
 #---------------------------
 # select columns 
 # (and also determine order)
 #---------------------------
 consurf_df_f = consurf_df[['position'
                           , 'wild_type'
                           , 'chain'
                           , 'wt_3upper'
                           , 'consurf_score'
                           , 'consurf_scaled'
                           , 'consurf_colour'
                           , 'consurf_colour_rev'
                           , 'consurf_ci_upper'
                           , 'consurf_ci_lower'
                           , 'consurf_ci_colour'
                           , 'consurf_msa_data'
                           , 'consurf_aa_variety']]
 #=======================
 # SNAP2
 #=======================
 snap2_df.shape
 #----------------------
 # rename colums
 #---------------------- 
 geneL_snap2 = ['alr', 'katg', 'rpob']
 if gene.lower() in geneL_snap2:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nOffset column also being read'
          , '\nRenaming columns...'
          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
    snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation'
                                          , 'Variant'          : 'mutationinformation_snap2'
                                          , 'Predicted Effect' : 'snap2_outcome'
                                          , 'Score'            : 'snap2_score'
                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
 else:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nNo offset column for SNAP2'
          , '\nRenaming columns...'
          , '\nRenaming SNAP2 column variant --> mutationinformation')
    snap2_df = snap2_df.rename(columns = {'Variant'            : 'mutationinformation'
                                          , 'Predicted Effect' : 'snap2_outcome'
                                          , 'Score'            : 'snap2_score'
                                          , 'Expected Accuracy': 'snap2_accuracy_pc'})
 snap2_df.columns
 snap2_df.head()
 snap2_df.dtypes
 snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','')
 snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)
 #-------------------------
 # scale snap2 values
 #-------------------------
 # Rescale values in snap2_score col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
 snap2_min = snap2_df['snap2_score'].min()
 snap2_max = snap2_df['snap2_score'].max() 
 snap2_min
 snap2_max
 # quick check
 len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
 len(snap2_df.loc[snap2_df['snap2_score'] < 0])
 snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed')
 snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
 print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
    , '\n---------------------------------------------------------------'
    , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])
 # additional check added
 ssmi = snap2_df['snap2_scaled'].min()
 ssma = snap2_df['snap2_scaled'].max()
 sn = snap2_df[snap2_df['snap2_score']>=0].count()
 snap2_pos = sn.get(key = 'snap2_score')
 sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
 snap2_pos2 = sn2.get(key = 'snap2_scaled')
 if snap2_pos == snap2_pos2 and csmi == -1 and csma == 1:
    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
 else:
    print('\nFAIL: snap2 values scaled numbers MISmatch'
          , '\nExpected number:', snap2_pos
          , '\nGot:', snap2_pos2
          , '\n======================================================')
 #---------------------------
 # select columns 
 # (and also determine order)
 #---------------------------
 snap2_df.dtypes
 snap2_df.columns
 geneL_snap2 = ['alr', 'katg', 'rpob']
 if gene.lower() in geneL_snap2:
    print('\nSelecting cols SNAP2 for gene:', gene.lower())
    snap2_df_f = snap2_df[['mutationinformation'
                       , 'mutationinformation_snap2'
                       , 'snap2_score'
                       , 'snap2_scaled'
                       , 'snap2_accuracy_pc'
                       , 'snap2_outcome']]
 else:
    print('\nSelecting cols SNAP2 for gene:', gene.lower())
    snap2_df_f = snap2_df[['mutationinformation'
                       , 'snap2_score'
                       , 'snap2_scaled'
                       , 'snap2_accuracy_pc'
                       , 'snap2_outcome']]
 #%%============================================================================
 # Now merges begin
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df
                       , kd_df
                       , on = merging_cols_m2
-                       , how = "outer")
+                       #, how = "outer")
                       , how = "inner")
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
      , '\n===================================================================')
 dssp_kd_rd_dfs[merging_cols_m3].apply(len)
 dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
 #%%============================================================================
 print('==================================='
      , '\nFourth merge*: fourth merge + consurf_df' 
      , '\dssp_kd_rd_dfs + consurf_df'
      , '\n===================================')
 #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
 merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs,  consurf_df)
 dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
                          , consurf_df
                          , on = merging_cols_m3_v2 
                          , how = "outer")
 ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
 print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
      , '\n===================================================================')
 dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
 dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
 #%%============================================================================
 print('======================================='
      , '\nFifth merge: Second merge + fourth merge'
--- a/scripts/data_extraction_epistasis.py
+++ b/scripts/data_extraction_epistasis.py
@ -75,15 +75,14 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)
 nssnp_match2 = re.compile(nssnp_match)
 wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)
@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
 #%% TEST
 # formatting, replace !nssnp_match  with nothing
-foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
+#foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
-foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
+#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
-foo1_s = foo1.split(';')
+#foo1_s = foo1.split(';')
-foo1_s
+#foo1_s
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
-arse=list(filter(nssnp_match2.match, foo1_s))
+#arse=list(filter(nssnp_match2.match, foo1_s))
-arse
+#arse
 #foo1_s2 = ';'.join(arse)
 #foo1_s2
 foo1_s2 = ';'.join(arse)
 foo1_s2
 #%%
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
 # dr_muts_col
 dr_clean_col = dr_muts_col + '_clean'
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
    dr2_s = v.split(';')
    print(dr2_s)
    dr2_sf = list(filter(nssnp_match2.match, dr2_s))
    #dr2_sf = list(filter(nssnp_match.match, dr2_s))
    print(dr2_sf)
    dr2_sf2  = ';'.join(dr2_sf)
    meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
 for i, v in enumerate(meta_gene_epi[other_muts_col]):
    #print(i, v)
-    print('======================================================')
+    #print('======================================================')
-    print(i)
+    #print(i)
-    print(v)
+    #print(v)
    other2_s = v.split(';')
-    print(other2_s)
+    #print(other2_s)
    other2_sf = list(filter(nssnp_match2.match,  other2_s))
-    print(other2_sf)
+    #print(other2_sf)
    other2_sf2  = ';'.join(other2_sf)
    meta_gene_epi[other_clean_col].iloc[i] =  other2_sf2
@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
                               , 'dr_mult_snp_count'
                               , other_muts_col, other_clean_col
                               , 'other_mult_snp_count']]
-meta_gene_epi_f.columns
+#print(meta_gene_epi_f.columns)
 print(meta_gene_epi_f)
 cols_to_output = ['id', 'sample'
                   , dr_clean_col
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
 meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
 #%%
 # formatting, replace !nssnp_match  with nothing
 #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
--- a/scripts/kd_df.py
+++ b/scripts/kd_df.py
@ -92,7 +92,7 @@ else:
 infile_fasta = indir + '/' + in_filename_fasta
 print('Input fasta file:', infile_fasta
      , '\n============================================================')
-
+ 
 #=======
 # output 
 #=======
--- a/scripts/plotting/basic_barplots_combined.R
+++ b/scripts/plotting/basic_barplots_combined.R
--- a/scripts/plotting/corr_adjusted_PS_LIG.R
+++ b/scripts/plotting/corr_adjusted_PS_LIG.R
--- a/scripts/plotting/dirs.R
+++ b/scripts/plotting/dirs.R
--- a/scripts/plotting/dist_plots_check.R
+++ b/scripts/plotting/dist_plots_check.R
--- a/scripts/plotting/extreme_muts.R
+++ b/scripts/plotting/extreme_muts.R
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
--- a/scripts/plotting/ggcorr_all_PS_LIG.R
+++ b/scripts/plotting/ggcorr_all_PS_LIG.R
--- a/scripts/plotting/hist_af_or_base.R
+++ b/scripts/plotting/hist_af_or_base.R
--- a/scripts/plotting/hist_af_or_combined.R
+++ b/scripts/plotting/hist_af_or_combined.R
--- a/scripts/plotting/legend_adjustment.R
+++ b/scripts/plotting/legend_adjustment.R
--- a/scripts/plotting/opp_mcsm_muts.R
+++ b/scripts/plotting/opp_mcsm_muts.R
--- a/scripts/plotting/or_plots_combined.R
+++ b/scripts/plotting/or_plots_combined.R
--- a/scripts/plotting/other_plots_combined.R
+++ b/scripts/plotting/other_plots_combined.R
--- a/scripts/plotting/output_tables.R
+++ b/scripts/plotting/output_tables.R
--- a/scripts/plotting/ps_plots_combined.R
+++ b/scripts/plotting/ps_plots_combined.R
--- a/scripts/plotting/resolving_ambiguous_muts.R
+++ b/scripts/plotting/resolving_ambiguous_muts.R