From f290d8ec9ec8c234a4b316e5bf5546d491658c52 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 25 Aug 2022 22:21:05 +0100 Subject: [PATCH] addded appendix tables and frequent muts --- scripts/.swp | Bin 0 -> 12288 bytes scripts/DE_CHECK_DEL | 1590 +++++++++++++++++ .../gid/gid_ORandSNP_results.R | 204 +++ .../plotting_thesis/gid/gid_appendix_tables.R | 426 +++++ .../plotting_thesis/gid/gid_ks_test_all_PS.R | 545 ++++++ .../plotting/plotting_thesis/ks_test_all_PS.R | 552 ++++++ .../rpob/rpob_appendix_tables.R | 482 +++++ scripts/rgb_hex.R | 33 + 8 files changed, 3832 insertions(+) create mode 100644 scripts/.swp create mode 100644 scripts/DE_CHECK_DEL create mode 100644 scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R create mode 100644 scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R create mode 100755 scripts/plotting/plotting_thesis/gid/gid_ks_test_all_PS.R create mode 100755 scripts/plotting/plotting_thesis/ks_test_all_PS.R create mode 100644 scripts/plotting/plotting_thesis/rpob/rpob_appendix_tables.R create mode 100644 scripts/rgb_hex.R diff --git a/scripts/.swp b/scripts/.swp new file mode 100644 index 0000000000000000000000000000000000000000..47bc07535235558148fed777bd05f03a2473f997 GIT binary patch literal 12288 zcmeI%PfEi;6bA6u?gXu(7l?Lap@d1&OeSkrf`1lL*QQoQkffBjcIzQLghz1U0bF_p z@eqDPW+J-i&KBVtcyD+wFPY)DE1jtG!{fk&v0!c4)U4jihSWuQ@9RbTtSE2GYS9Fr zQGf!g7g!r7air`$`g^;<&h~iqSKrP3Q&Lo6rcbFC_sTF3S3rq)qHw&9UPpVoQeEJDKfvA z?5j|XhAJH8NNF^1z1Wu8rY!Y6ouV`vDObK{?7MlBnqCwtHEffg%Z;6jZPGt_nSEfs Yd+l9x=I3;Rk;tB#xL*5t^wNAFUnAgNS^xk5 literal 0 HcmV?d00001 diff --git a/scripts/DE_CHECK_DEL b/scripts/DE_CHECK_DEL new file mode 100644 index 0000000..1af8c68 --- /dev/null +++ b/scripts/DE_CHECK_DEL @@ -0,0 +1,1590 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jun 28 14:15:40 2022 + +@author: tanu +""" +############################################################################### +homedir = os.path.expanduser("~") 
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/') + +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +from tidy_split import tidy_split +############################################################################### +#%% creating required variables: gene and drug dependent, and input filename +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +nssnp_match_captureG = "(" + nssnp_match + ")" + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n' + , resistance_col + , '\n===============================================================') +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#end of variable assignment for input and output files +############################################################################### +#%% Read input file +master_data = pd.read_csv(infile_master, sep = ',') + +# column names +#list(master_data.columns) + +# extract elevant columns to extract from 
meta data related to the drug +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , dr_muts_col + , other_muts_col]] +else: + core_cols = ['id' + , 'sample' + #, 'patient_id' + #, 'strain' + , 'lineage' + , 'sublineage' + , 'country_code' + #, 'geographic_source' + , resistance_col] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] + + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') + + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) + +# checks and results +total_samples = meta_data['id'].nunique() +print('RESULT: Total samples:', total_samples + , '\n===========================================================') + +# counts NA per column +meta_data.isna().sum() +print('No. 
of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================\n') +############################################################################### +#%% Quick checks: +meta_data['lineage'].value_counts() +meta_data['lineage'].value_counts().sum() +meta_data['lineage'].nunique() + +# replace lineage with 'L' in lineage_labels +#meta_data['lineage_labels'] = meta_data['lineage'] +#meta_data['lineage_labels'].equals(meta_data['lineage']) +#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts()) +#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L') +#meta_data['lineage'].value_counts() +#meta_data['lineage_labels'].value_counts() + +meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L') +meta_data['lineage'].value_counts() +print("\n================================" + , "\nLineage numbers" + , "\nComplete lineage samples:", meta_data['lineage'].value_counts().sum() + , "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum() + , "\n================================") + +meta_data['id'].nunique() +meta_data['sample'].nunique() +meta_data['id'].equals(meta_data['sample']) +foo = meta_data.copy() +foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0') +foo['Diff'].value_counts() + +meta_data['drtype'].value_counts() +meta_data['drtype'].value_counts().sum() +print("\n================================" + , "\ndrtype numbers" + , "\nComplete drtype samples:", meta_data['drtype'].value_counts().sum() + , "\nMissing drtype samples:", meta_data['id'].nunique() - meta_data['drtype'].value_counts().sum() + , "\n================================") + +meta_data['drug_name'] = meta_data[drug].map({1:'R' + , 0:'S'}) +meta_data['drug_name'].value_counts() +meta_data[drug].value_counts() +meta_data[drug].value_counts().sum() +print("\n================================" + , "\ndrug", drug, "numbers" + , 
"\nComplete drug samples:", meta_data[drug].value_counts().sum() + , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data[drug].value_counts().sum() + , "\n================================") + +print("\n================================" + , "\ndrug", drug, "numbers" + , "\nComplete drug samples:", meta_data['drug_name'].value_counts().sum() + , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data['drug_name'].value_counts().sum() + , "\n================================") + +#%% Quick counts: All samples, drug: +print('===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, ':', meta_data[drug].value_counts().sum() + , '\n===========================================================\n' + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n===========================================================') + +print('===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, ':', meta_data['drug_name'].value_counts().sum() + , '\n===========================================================\n' + , 'RESULT: No. 
of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts(normalize = True)*100 + , '\n===========================================================') + +############################################################################## +#%% Extracting nsSNP for gene (meta_gene_all): from dr_muts col and other_muts_col +#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] +# so downstream code doesn't change +meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] +#%% DF: with dr_muts_col, meta_gene_dr +meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] + +if meta_gene_dr.equals(meta_gene_dr1): + print('\nPASS: DF with column:', dr_muts_col, 'extracted successfully' + , '\ngene_snp_match in column:',dr_muts_col, meta_gene_dr.shape) +else: + sys.exit('\nFAIL: DF with column:', dr_muts_col,'could not be extracted' + , '\nshape of df1:', meta_gene_dr.shape + , '\nshape of df2:', meta_gene_dr1.shape + , '\nCheck again!') +############################################################################## +#%% DF: with other_muts_col, other_gene_dr +meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, 
regex = True, case = False)] + +print('gene_snp_match in dr:', len(meta_gene_other)) +meta_gene_other.equals(meta_gene_other1) + +if meta_gene_other.equals(meta_gene_other1): + print('\nPASS: DF with column:', other_muts_col,'extracted successfully' + , '\ngene_snp_match in column:',other_muts_col, meta_gene_other.shape) +else: + sys.exit('\nFAIL: DF with column:', other_muts_col,'could not be extracted' + , '\nshape of df1:', meta_gene_other.shape + , '\nshape of df2:', meta_gene_other1.shape + , '\nCheck again!') +############################################################################## +#%% Quick counts: nsSNP samples, drug +meta_gene_all[drug].value_counts() + +print('===========================================================\n' + , 'RESULT: Total no. of samples for', drug, 'with nsSNP mutations:', meta_gene_all['id'].nunique() + , '\n===========================================================\n' + + , '===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all[drug].value_counts().sum() + , '\n===========================================================\n' + + , '===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all['drug_name'].value_counts().sum() + + , '\n===========================================================\n' + , 'RESULT: No. 
of Sus and Res', drug, 'samples with nsSNP:\n', meta_gene_all['drug_name'].value_counts() + + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples with nsSNP mutations:\n', meta_gene_all['drug_name'].value_counts(normalize = True)*100 + , '\n===========================================================') +############################################################################### +#%% Create a copy of indices for downstream mergeing +#meta_gene_all['index_orig'] = meta_gene_all.index +#meta_gene_all['index_orig_copy'] = meta_gene_all.index + +#all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) +#all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) +############################################################################## +#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. +# Split based on semi colon dr_muts_col +search = ";" + +# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy() +count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I) +dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index() +dr_sc_C + +dr_sc_C['index_semicolon'] = (dr_sc_C['index'] + 1) *dr_sc_C['dr_semicolon_count'] +dr_sc_C +expected_drC = dr_sc_C['index_semicolon'].sum() +expected_drC + +# count no. 
of nsSNPs and extract those nsSNPs +count_df_dr['dr_geneSNP_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(nssnp_match, re.I) +dr_gene_count1 = count_df_dr['dr_geneSNP_count'].sum() + +############################################################################### +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. +# This will have: unique COMBINATION of sample id and mutations +#======== +# First: counting mutations in dr_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[dr_muts_col]) + +# sanity check: count na +na_count = meta_data[dr_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples + , '\n==========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +dr_gene_count = 0 +wt = 0 +id_dr = [] +id2_dr = [] +dr_gene_mutsL = [] +#nssnp_match_regex = re.compile(nssnp_match) + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_dr.append(id) + #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE)) + gene_drL = re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE) + #print(count_gene_dr) + if count_gene_dr > 0: + id_dr.append(id) + if count_gene_dr > 1: + id2_dr.append(id) + #print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + dr_gene_mutsL = dr_gene_mutsL + gene_drL + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') + wt = wt + count_wt + +print('RESULTS:') +print('Total WT in 
dr_muts_col:', wt) +print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) ) +print('Total matches of UNIQUE', gene, 'SNP matches in', dr_muts_col, ':', len(set(dr_gene_mutsL))) +print('=================================================================') + +if dr_gene_count == dr_gene_count1: + print('\nPass: dr gene SNP count match') +else: + sys.exit('\nFAIL: dr gene SNP count MISmatch') + +del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) + +#%% tidy_split(): dr_muts_col +#========= +# DF1: dr_muts_col +# and remove leading white spaces +#========= +#col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():' , dr_muts_col + , '\n============================================================') + +# apply tidy_split() +dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';') +# remove leading white space else these are counted as distinct mutations as well +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() + +if len(dr_WF0) == expected_drC: + print('\nPass: tidy split on', dr_muts_col, 'worked' + , '\nExpected nrows:', expected_drC + , '\nGot nrows:', len(dr_WF0) ) +else: + print('\nFAIL: tidy split on', dr_muts_col, 'did not work' + , '\nExpected nrows:', expected_drC + , '\nGot nrows:', len(dr_WF0) ) + +# Extract only the samples/rows with nssnp_match +#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)] +#dr_gene_WF0_v2 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match_captureG, regex = True, case = False)] + +print('Lengths after tidy split and extracting', nssnp_match, 'muts:' + , '\nOld length:' , len(meta_gene_dr) + , '\nLength after split:', len(dr_WF0) + , '\nLength of nssnp df:', len(dr_gene_WF0) + , '\nExpected len:', 
dr_gene_count + , '\n=============================================================') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) +print('Dim of dr_df:', dr_df.shape + , '\n==============================================================' + , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' + , '\n===============================================================') + +if other_muts_col in dr_df.columns: + print('Dropping:', other_muts_col, 'from WF gene dr_df') + dr_df = dr_df.drop([other_muts_col], axis = 1) +######################################################################## +#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc. +# Split based on semi colon on other_muts_col +# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_other = meta_gene_other[['id', other_muts_col]].copy() +count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I) +other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index() +other_sc_C +other_sc_C['index_semicolon'] = (other_sc_C['index']+1)*other_sc_C['other_semicolon_count'] +other_sc_C +expected_otherC = other_sc_C['index_semicolon'].sum() +expected_otherC + +# count no. of nsSNPs and extract those nsSNPs +count_df_other['other_geneSNP_count'] = meta_gene_other.loc[:, other_muts_col].str.count(nssnp_match, re.I) +other_gene_count1 = count_df_other['other_geneSNP_count'].sum() + +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. 
+# This will have: unique COMBINATION of sample id and mutations + +#======== +# Second: counting mutations in other_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[other_muts_col]) + +# sanity check: count na +na_count = meta_data[other_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs =', na_count, '/', total_samples + , '\n=========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +other_gene_count = 0 +wt_other = 0 +id_other = [] +id2_other = [] +other_gene_mutsL = [] + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_other.append(id) + #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE)) + gene_otherL = re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE) + #print(count_gene_other) + if count_gene_other > 0: + id_other.append(id) + if count_gene_other > 1: + id2_other.append(id) + #print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + other_gene_mutsL = other_gene_mutsL + gene_otherL + count_wt = clean_df[other_muts_col].iloc[i].count('WT') + wt_other = wt_other + count_wt + +print('RESULTS:') +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) ) +print('Total matches of UNIQUE', gene, 'SNP matches in', other_muts_col, ':', len(set(other_gene_mutsL))) +print('=================================================================') + +if other_gene_count == other_gene_count1: + print('\nPass: other gene SNP count 
match') +else: + sys.exit('\nFAIL: other gene SNP count MISmatch') + +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) +#%% tidy_split(): other_muts_col +#========= +# DF2: other_muts_col +# and remove leading white spaces +#========= +#col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', other_muts_col + , '\n============================================================') + +# apply tidy_split() +other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';') +# remove the leading white spaces in the column +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() + +# extract only the samples/rows with nssnp_match +#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)] + +print('Lengths after tidy split and extracting', gene_match, 'muts:', + '\nOld length:' , len(meta_gene_other), + '\nLength after split:', len(other_WF1), + '\nLength of nssnp df:', len(other_gene_WF1), + '\nExpected len:', other_gene_count + , '\n=============================================================') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df.shape + , '\n===============================================================' + , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' + , '\n===============================================================') + +if dr_muts_col in other_df.columns: + print('Dropping:', dr_muts_col, 'from WF gene other_df') + other_df = other_df.drop([dr_muts_col], axis = 1) 
+############################################################################### +#%% Finding ambiguous muts +common_snps_dr_other = set(dr_gene_mutsL).intersection(set(other_gene_mutsL)) + +#%% More sanity checks: expected unique snps and nrows in LF data +expected_unique_snps = len(set(dr_gene_mutsL)) + len(set(other_gene_mutsL)) - len(common_snps_dr_other) +expected_rows = dr_gene_count + other_gene_count + +#%% Useful results to note==> counting dr, other and common muts +print('\n===================================================================' + , '\nCount unique nsSNPs for', gene, ':' + , expected_unique_snps + , '\n===================================================================') + +Vcounts_dr = pd.Series(dr_gene_mutsL).value_counts() +Vcounts_common_dr = Vcounts_dr.get(list(common_snps_dr_other)) +print('\n===================================================================' + , "\nCount of samples for common muts in dr muts\n" + , Vcounts_common_dr + , '\n===================================================================') + +Vcounts_other = pd.Series(other_gene_mutsL).value_counts() +Vcounts_common_other = Vcounts_other.get(list(common_snps_dr_other)) + +print('\n===================================================================' + , '\nCount of samples for common muts in other muts\n' + , Vcounts_common_other + , '\n===================================================================') + +print('\n===================================================================' + , '\nPredicting total no. 
of rows in the curated df:', expected_rows + , '\n===================================================================') + +#%% another way: Add value checks for dict so you can know if its correct for LF data count below +dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict() +other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict() + +for k, v in dr_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +for k, v in other_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +# using defaultdict +Cdict = collections.defaultdict(int) + +# iterating key, val with chain() +for key, val in itertools.chain(dr_snps_vc_dict.items(), other_snps_vc_dict.items()): + if key in common_snps_dr_other: + Cdict[key] += val + else: + Cdict[key] = val + +print(dict(Cdict)) + +for k, v in Cdict.items(): + if k in common_snps_dr_other: + print(k,v) + +# convert defaultDict to dict +SnpFDict_orig = dict(Cdict) + +def lower_dict(d): + new_dict = dict((k.lower(), v) for k, v in d.items()) + return new_dict + +SnpFDict = lower_dict(SnpFDict_orig) +############################################################################### +# USE Vcounts to get expected curated df +# resolve dm om muts funda +#%% Concatenating two dfs: gene_LF0 +# equivalent of rbind in R +#========== +# Concatentating the two dfs: equivalent of rbind in R +#========== +# Important: Change column names to allow concat: +# dr_muts.. 
& other_muts : 'mutation' +print('Now concatenating the two dfs by row' + , '\nFirst assigning a common colname: "mutation" to the col containing muts' + , '\nThis is done for both dfs' + , '\n===================================================================') + +dr_df.columns +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) +dr_df.columns + +other_df.columns +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) +other_df.columns + +if len(dr_df.columns) == len(other_df.columns): + print('Checking dfs for concatening by rows:' + , '\nDim of dr_df:', dr_df.shape + , '\nDim of other_df:', other_df.shape + , '\nExpected nrows:', len(dr_df) + len(other_df) + , '\n=============================================================') +else: + sys.exit('FAIL: No. of cols mismatch for concatenating') + +# checking colnames before concat +print('Checking colnames BEFORE concatenating the two dfs...') +if (set(dr_df.columns) == set(other_df.columns)): + print('PASS: column names match necessary for merging two dfs') +else: + sys.exit('FAIL: Colnames mismatch for concatenating!') + +# concatenate (axis = 0): Rbind, adn keeo original index +print('Now appending the two dfs: Rbind') +gene_LF_comb = pd.concat([dr_df, other_df] + #, ignore_index = True + , axis = 0) + +if gene_LF_comb.index.nunique() == len(meta_gene_all.index): + print('\nPASS:length of index in LF data:', len(gene_LF_comb.index) + , '\nLength of unique index in LF data:', gene_LF_comb.index.nunique() + ) +else: + sys.exit('\nFAIL: expected length for combined LF data mismatch:' + , '\nExpected length:', len(meta_gene_all.index) + , '\nGot:', gene_LF_comb.index.nunique() ) + +print('Finding stop mutations in concatenated df') +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +if stop_muts == 0: + print('PASS: No stop mutations detected') +else: + print('stop mutations detected' + , '\nNo. 
of stop muts:', stop_muts, '\n' + , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + , '\nNow removing them') + +gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] +print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) + +gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] +print('snps only by direct regex:', len(gene_LF0)) + +if gene_LF0_nssnp.equals(gene_LF0): + print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed' + , '\nUsing the regex extracted df') +else: + sys.exit('FAIL: posssibly regex or no of stop mutations' + , 'Regex being used:', nssnp_match) + #sys.exit() + +# checking colnames and length after concat +print('Checking colnames AFTER concatenating the two dfs...') +if (set(dr_df.columns) == set(gene_LF0.columns)): + print('PASS: column names match' + , '\n=============================================================') +else: + sys.exit('FAIL: Colnames mismatch AFTER concatenating') + +print('Checking concatenated df') +if len(gene_LF0) == (len(dr_df) + len(other_df))- stop_muts: + print('PASS:length of df after concat match' + , '\n===============================================================') +else: + sys.exit('FAIL: length mismatch') + +#%% NEW check2:id, sample, etc. +gene_LF0['id'].count() +gene_LF0['id'].nunique() +gene_LF0['sample'].nunique() +gene_LF0['mutation_info'].value_counts() +gene_LF0[drug].isna().sum() +#%% Concatenating two dfs sanity checks: gene_LF1 +#========================================================= +# This is hopefully clean data, but just double check +# Filter LF data so that you only have +# mutations corresponding to nssnp_match (case insensitive) +# this will be your list you run OR calcs +#========================================================= +print('Length of gene_LF0:', len(gene_LF0) + , '\nThis should be what we need. 
But just double checking and extracting nsSNP for', gene + , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) + +#gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy() + +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', + '\nConfirming extraction and concatenating worked correctly' + , '\n==========================================================') +else: + print('FAIL: BUT NOT FATAL!' + , '\ngene_LF0 and gene_LF1 lengths differ' + , '\nsuggesting error in extraction process' + , ' use gene_LF1 for downstreama analysis' + , '\n=========================================================') + +print('Length of dfs pre and post processing...' + , '\ngene WF data (unique samples in each row):',len(meta_gene_all) + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) + , '\n=============================================================') + +# sanity check for extraction +# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' +print('Verifying whether extraction process worked correctly...') +if len(gene_LF1) == expected_rows: + print('PASS: extraction process performed correctly' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) + , '\n=========================================================') +else: + print('FAIL: extraction process has bugs' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nDebug please' + , '\n=========================================================') + +# more sanity checks +print('Performing some more sanity checks...') + +# From LF1: useful for OR counts +# no. 
of unique muts +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) + +# no. of samples contributing the unique muts +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique()) + +# no. of dr and other muts +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique()) + +# sanity check +if len(distinct_muts) == mut_grouped.sum() : + print('PASS:length of LF1 is as expected' + , '\n===============================================================') +else: + print('FAIL: Mistmatch in count of muts' + , '\nExpected count:', len(distinct_muts) + , '\nActual count:', mut_grouped.sum() + , '\nMuts should be distinct within dr* and other* type' + , '\nInspecting...possibly ambiguous muts' + , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts) + , '\n=========================================================') + +muts_split = list(gene_LF1.groupby('mutation_info')) +dr_muts = muts_split[0][1].mutation +other_muts = muts_split[1][1].mutation +print('splitting muts by mut_info:', muts_split) +print('no.of dr_muts samples:', len(dr_muts)) +print('no. of other_muts samples', len(other_muts)) +#%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others' +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' + , '\n===============================================================') +else: + print('PASS: NO ambiguous muts detected' + , '\nMuts are unique to dr_ and other_ mutation class' + , '\n=========================================================') + +# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected! +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('Finding ambiguous muts...' 
+ , '\n=========================================================' + , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() + , '\nThese are:', dr_muts[dr_muts.isin(other_muts)] + , '\n=========================================================' + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] + , '\n=========================================================') + + print('Counting no. of ambiguous muts...' + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + + if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): + common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') +else: + #sys.exit('Error: ambiguous muts present, but extraction failed. 
Debug!') + print('Ambiguous muts are NOT present') + +#%% DOES NOT depend on common_muts +gene_LF1_orig = gene_LF1.copy() +gene_LF1_orig.equals(gene_LF1) + +# copy the old columns for checking +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info'].value_counts() + +#%% Ambiguous muts: revised annotation for mutation_info +if 'common_muts' in globals(): + ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() + ambiguous_muts_value_counts + + # gene_LF1_orig = gene_LF1.copy() + # gene_LF1_orig.equals(gene_LF1) + + # # copy the old columns for checking + # gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info'].value_counts() + #%% Inspect ambiguous muts + #===================================== + # Now create a new df that will have: + # ambiguous muts + # mutation_info + # revised mutation_info + # The revised label is based on value_counts + # for mutaiton_info. 
The corresponding mutation_info: + # label is chosen that corresponds to the max of value counts + #===================================== + ambig_muts_rev_df = pd.DataFrame() + changes_val = [] + changes_dict = {} + + common_muts + gene_LF1['mutation'].head() + #common_muts_lower = list((map(lambda x: x.lower(), common_muts))) + #common_muts_lower + + for i in common_muts: + #for i in common_muts_lower: + #print(i) + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] + temp_df + # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() + max_info_table + revised_label = max_info_table[[0]].index[0][1] # max value_count + old_label = max_info_table[[1]].index[0][1] # min value_count + print('\nAmbiguous mutation labels...' + , '\nSetting mutation_info for', i, 'to', revised_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) + , revised_label + , temp_df['mutation_info_orig']) + ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + f = max_info_table[[1]] + old_label_count = f[[0]][0] + changes_val.append(old_label_count) + cc_dict = f.to_dict() + changes_dict.update(cc_dict) + + ambig_muts_rev_df['mutation_info_REV'].value_counts() + ambig_muts_rev_df['mutation_info_orig'].value_counts() + changes_val + changes_total = sum(changes_val) + changes_dict + n_changes = sum(changes_dict.values()) + #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts + #================== + # ambiguous muts + #================== + #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) + #other_muts.XXXX_csvXXX('other_muts.csv', header = True) + if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + 
print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = True) + + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) + + #%% OUTFILE 2, write file: ambiguous mut counts + #====================== + # ambiguous mut counts + #====================== + out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' + outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts + print('\n----------------------------------' + , '\nWriting file: ambiguous mut counts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_mut_counts) + + ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) + #%% FIXME: Add sanity check to make sure you can add value_count checks + #%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations + #================= + # Merge ambig muts + # with gene_LF1 + #=================== + ambig_muts_rev_df.index + gene_LF1.index + all(ambig_muts_rev_df.index.isin(gene_LF1.index)) + + any(gene_LF1.index.isin(ambig_muts_rev_df.index)) + # if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)): + if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()): + + print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') + else: + sys.exit('\nFAIL:ambiguous mut indices MISmatch. 
Check section Resolving ambiguous muts') + + ########################################################################## + for index, row in ambig_muts_rev_df.iterrows(): + curr_mut = row['mutation'] + curr_rev = row['mutation_info_REV'] + print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' ) + print('\n-----\nReplacing original: index:', index, '\nmutation: ' + , gene_LF1.loc[index,'mutation'] + , '\nmutation_info to replace:' + , gene_LF1.loc[index,'mutation_info'] + , '\nwith:', curr_rev, '\n-----') + replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut) + gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev + + ########################################################################### + + gene_LF1['mutation_info_orig'].value_counts() + gene_LF1['mutation_info_v1'].value_counts() + gene_LF1['mutation_info'].value_counts() + + # Sanity check1: if there are still any ambiguous muts + #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) + muts_split_rev = list(gene_LF1.groupby('mutation_info')) + dr_muts_rev = muts_split_rev[0][1].mutation + other_muts_rev = muts_split_rev[1][1].mutation + print('splitting muts by mut_info:', muts_split_rev) + print('no.of dr_muts samples:', len(dr_muts_rev)) + print('no. of other_muts samples', len(other_muts_rev)) + + if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: + print('\nAmbiguous muts corrected. Proceeding with downstream analysis') + else: + print('\nAmbiguous muts NOT corrected. Quitting!') + sys.exit() +else: + print('Mutations ARE NOT ambiguous, proceeding to downstream analyses') + +#gene_LF1['mutation_info_v1'].value_counts() +gene_LF1['mutation_info'].value_counts() + +# reassign +#%% PHEW! 
Good to go for downstream stuff +#%% Add column: Mutationinformation ==> gene_LF1 +# splitting mutation column to get mCSM style muts +#===================================================== +# Formatting df: read aa dict and pull relevant info +#===================================================== +print('Now some more formatting:' + , '\nRead aa dict and pull relevant info' + , '\nFormat mutations:' + , '\nsplit mutation into mCSM style muts: ' + , '\nFormatting mutation in mCSM style format: {WT}{MUT}' + , '\nAssign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + +#====================================================================== +# Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one using +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for +# compatibility with dict +#======================================================================= +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() + +print('wt regex being used:', wt_regex + , '\nmut regex being used:', mut_regex + , '\nposition regex being used:', pos_regex) + +mylen0 = len(gene_LF1.columns) + +#========================================================= +# Iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. 
+# The three letter code is extracted using a string match match from the +# dataframe and then converted +# to 'pandas series'since map only works in pandas series +#=========================================================== +print('Adding', ncol_mutf_add, 'more cols:\n') + +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wild_type'] = wt.map(lookup_dict) + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mutant_type'] = mut.map(lookup_dict) + +#------------------- +# extract position +# info from mutation +# column separetly using string match +#------------------- +#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) + +#mylen1 = len(gene_LF1.columns) +mylen0_v2 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt & mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +#if mylen1 == mylen0 + ncol_mutf_add: +if mylen0_v2 == mylen0 + ncol_mutf_add: + print('PASS: successfully added', ncol_mutf_add, 'cols' + , '\nold length:', mylen0 + , '\nnew len:', mylen0_v2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen0 + , '\nnew len:', mylen0_v2) + +# 
clear variables +del(k, v, wt, mut, lookup_dict) + +########################################################################## +# combine the wild_type+poistion+mutant_type columns to generate +# mutationinformation (matches mCSM output field) +# Remember to use .map(str) for int col types to allow string concatenation +########################################################################### +gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] +print('Created column: mutationinformation' + , '\n=====================================================================\n' + , gene_LF1.mutationinformation.head(10)) + +# order by position for convenience +gene_LF1.dtypes + +# converting position to numeric +gene_LF1['position'] = pd.to_numeric(gene_LF1['position']) + +# sort by position inplace +foo = gene_LF1['position'].value_counts() +foo = foo.sort_index() + +gene_LF1.sort_values(by = ['position'], inplace = True) +bar = gene_LF1['position'].value_counts() +bar = bar.sort_index() + +if all(foo == bar): + print('PASS: df ordered by position') + print(gene_LF1['position'].head()) +else: + sys.exit('FAIL: df could not be ordered. 
Check source') + +print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n') + +#%% Create a copy of mutationinformation column for downstream mergeing +gene_LF1['Mut'] = gene_LF1['mutationinformation'] +gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] + +#%% Create a copy of indices for downstream mergeing +gene_LF1['index_orig'] = gene_LF1.index +gene_LF1['index_orig_copy'] = gene_LF1.index + +all(gene_LF1.index.values == gene_LF1['index_orig'].values) +all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) + +#%% Quick sanity check for position freq count +# count the freq of 'other muts' samples +test_df = gene_LF1.copy() +test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] +# add freq column +#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') +#print('Revised dim of other_muts_df:',test_df.shape) +test_df['scount'] = test_df['mutation'].map(SnpFDict) +test_df = test_df.sort_values(['position', 'mutationinformation']) +#%% Map mutation frequency count as a column +gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) +#%% Map position frequency count as a column +z = gene_LF1['position'].value_counts() +z1 = z.to_dict() +gene_LF1['pos_count'] = gene_LF1['position'].map(z1) + +#test_df2 = test_df.loc[test_df['position'] == 10] +############################################################################### +cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency'] +print('\nAdded', len(cols_added), 'more cols:\n' + , '\nDim of new gene_LF1:', len(gene_LF1.columns)) +mylen1 = len(gene_LF1.columns) # updated my_len1 +############################################################################### +#%% Add column: aa property_water ==> gene_LF1 +#========= +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. 
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# Build the full lookup dict {three_letter_code: aa_prop_water} first, then
# run the column-wide extract + map ONCE.
# (Originally the extract/map lines sat INSIDE this loop, redoing the whole
# column's work for every entry of my_aa_dict; only the final iteration's
# result survived, so hoisting them out is behaviour-neutral and faster.)
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_water']

# .squeeze() turns the one-column extract result into a Series so .map works
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)

mylen2 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
# BUG FIX: the original condition 'wt.isna().sum() & mut.isna().sum() == 0'
# parses as '(wt_NAs & mut_NAs) == 0' because bitwise & binds tighter than
# ==, so e.g. 1 NA in wt and 2 NAs in mut gave 1 & 2 == 0 -> spurious PASS.
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    # BUG FIX: this branch previously printed 'No NAs detected' too
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen2 == mylen1 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)

# clear variables
del(k, v, wt, mut, lookup_dict)

#%% Add column: aa_prop_polarity ==> gene_LF1
#========
# iterate through the dict, create a lookup dict i.e.
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# Build the full lookup dict {three_letter_code: aa_prop_polarity} first,
# then run the column-wide extract + map ONCE.
# (Originally the extract/map lines sat INSIDE this loop, redoing the whole
# column's work for every entry of my_aa_dict; only the final iteration's
# result survived, so hoisting them out is behaviour-neutral and faster.)
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_polarity']

# .squeeze() turns the one-column extract result into a Series so .map works
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)

mylen3 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
# BUG FIX: the original condition 'wt.isna().sum() & mut.isna().sum() == 0'
# parses as '(wt_NAs & mut_NAs) == 0' because bitwise & binds tighter than
# ==, so e.g. 1 NA in wt and 2 NAs in mut gave 1 & 2 == 0 -> spurious PASS.
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    # BUG FIX: this branch previously printed 'No NAs detected' too
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

# BUG FIX: the PASS/FAIL messages below previously printed mylen1/mylen2
# (the water-property step's counts) instead of this step's mylen2/mylen3.
if mylen3 == mylen2 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)

# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop ==> gene_LF1
#========
# iterate through the dict, create a lookup dict i.e.
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
+#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wt_calcprop'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mut_calcprop'] = mut.map(lookup_dict) + +mylen4 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen4 == mylen3 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#%% NEW mappings: gene_LF2 +# gene_LF2: copy gene_LF1 +gene_LF2 = gene_LF1.copy() +gene_LF2.index + +#%% Add total unique id count +gene_LF2['id'].nunique() +gene_LF2['Mut'].nunique() +total_id_ucount = gene_LF2['id'].nunique() +total_id_ucount +total_id_ucount2 = gene_LF2['sample'].nunique() +total_id_ucount2 + +if total_id_ucount == total_id_ucount2: + print('\nPASS: sample and id unique counts match') +else: + print('\nFAIL: sample and id unique counts DO NOT match!' 
+ , '\nGWAS worry!?') + +# Add all sample ids in a list for sanity checks +#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list)) +#========================================== +# Add column: total unique id/sample count +#========================================== +gene_LF2['total_id_ucount'] = total_id_ucount + +#========================================== +# DELETE as already mapped: Add column: mutation count in all samples +#========================================== +# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) +# gene_LF2['mut_id_ucount'] + +gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) + +# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) + +#%% AF for gene +#================= +# Add column: AF +#================= +gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] +gene_LF2['maf'].head() +############################################################################### +#%% Further mappings: gene_LF3, with mutationinformation as INDEX +gene_LF3 = gene_LF2.copy() + +# Assign index: mutationinformation for mapping +gene_LF3 = gene_LF3.set_index(['mutationinformation']) +gene_LF3.index +gene_LF3['id'].nunique() +gene_LF3['Mut'].nunique() +gene_LF3.index.nunique() + +all(gene_LF3['Mut'] == gene_LF3.index) + +#%% Mapping 1: column '' +#============================ +# column name: +#============================ +gene_LF3['drtype'].value_counts() + +# mapping 2.1: numeric +drtype_map = {'XDR': 5 + , 'Pre-XDR': 4 + , 'MDR': 3 + , 'Pre-MDR': 2 + , 'Other': 1 + , 'Sensitive': 0} + +gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map) + +gene_LF3['drtype'].value_counts() +gene_LF3['drtype_numeric'].value_counts() + +# Multimode: drtype +#============================= +# Recalculation: Revised drtype +# max(multimode) 
+#============================= +#-------------------------------- +# drtype: ALL values: +# numeric and names in an array +#-------------------------------- +gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric'] +gene_LF3['drtype_all_names'] = gene_LF3['drtype'] + +gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) +gene_LF3['drtype_all_vals'].head() + +gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) +gene_LF3['drtype_all_names'].head() + +#--------------------------------- +# Revised drtype: max(Multimode) +#-------------------------------- +gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode) +gene_LF3['drtype_multimode'] + +# Now get the max from multimode +gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3.head() + +#---------------------- +# Revised drtype: Max +#---------------------- +gene_LF3.head() +gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max() +gene_LF3.head() + +foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']] +foo2 = foo.sort_values(['position', 'Mut']) +############################################################################### +#%% Mapping 2: column '', drug + +#======================= +# column name: +#======================= +# mapping 1.1: labels +dm_om_label_map = {dr_muts_col: 'DM' + , other_muts_col: 'OM'} +dm_om_label_map + +gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map) + +# mapping 1.2: numeric +dm_om_num_map = {dr_muts_col: 1 + , other_muts_col: 0} + +gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map) +gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map) + +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() 
+gene_LF3['mutation_info_orig'].value_counts() +gene_LF3['dm_om_numeric'].value_counts() +gene_LF3['dm_om_numeric_orig'].value_counts() + +# Check value_counts: column '', mutation_info +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_v1'].value_counts() +gene_LF3['mutation_info_orig'].value_counts() + +#============================ +# column name: +#============================ +# copy dst column +gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking +gene_LF3['dst'].equals(gene_LF3[drug]) + +# populate dst_multimode +gene_LF3['dst_multimode'] = gene_LF3[drug] + +gene_LF3[drug].isnull().sum() +gene_LF3['dst_multimode'].isnull().sum() + +gene_LF3['dst_multimode'].value_counts() +gene_LF3['dst_multimode'].value_counts().sum() +#%% Multimode: dst +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +#============================= +# Recalculation: Revised dst +# max(multimode) +#============================= +# Get multimode for dm_om_numeric column +#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +dm_om_multimode_LF3 +dm_om_multimode_LF3.isnull().sum() + +gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +gene_LF3['dst_multimode_all'] + +# Fill using multimode ONLY where NA in dst_multimode column +gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) +gene_LF3['dst_multimode'] +gene_LF3['dst_multimode'].value_counts() + +#---------------------------------- +# Revised dst column: max of mode +#---------------------------------- +# Finally created a revised dst with the max from the multimode +# Now get the max from multimode +#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right! 
+#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) #ML + +#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- +# NOTE: unexpected weirdness with above, so redoing it! +mmdf = pd.DataFrame(gene_LF3.groupby('mutationinformation')['dst_mode'].agg(multimode)) +mmdf['dst2'] = mmdf['dst_mode'].apply(lambda x: int(max(x))) +mmdf=mmdf.reset_index() + +# rename cols to make sure merge will have the names you expect +mmdf2 = mmdf.rename(columns = {'dst_mode':'dst_multimode', 'dst2':'dst_mode'}) + +# IMPORTANT! +gene_LF3_copy = gene_LF3.copy() +gene_LF3_copy.drop(["dst_mode", "dst_multimode", "dst_multimode_all"], axis = 1, inplace = True) + +# Now merge gene_LF3.copy and mmdf2 +gene_LF3_merged = pd.merge(gene_LF3_copy, mmdf2, on='mutationinformation') +df_check4 = gene_LF3_merged[['mutationinformation', 'dst', 'dst_multimode', 'dst_mode', 'position' ]] + +# now reassign the merged df to gene_LF3 for integration with downstream +gene_LF3 = gene_LF3_merged.copy() +#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- + +# sanity checks +#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode']) +gene_LF3[drug].value_counts() +#gene_LF3['dst_noNA'].value_counts() +gene_LF3['dst_mode'].value_counts() + +if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]: + print('\nPASS: revised dst mode created successfully and the counts are more than col count') +else: + print('\nFAIL: revised dst mode numbers MISmatch') + +#foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] +#foo2 = 
foo.sort_values(['position', 'Mut']) + +print('\n------------------------------------------------------' + , '\nRevised counting: mutation_info i.e dm om column\n' + , '\n-----------------------------------------------------\n' + + , '\n----------------------------------' + , '\nOriginal drug column count' + , '\n----------------------------------\n' + , gene_LF3[drug].value_counts() + , '\nTotal samples [original]', gene_LF3[drug].value_counts().sum() + + , '\n----------------------------------' + , '\nRevised drug column count\n' + , '\n----------------------------------\n' + , gene_LF3['dst_mode'].value_counts() + , '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum() + + # , '\n----------------------------------' + # , '\nRevised drug column count: dst_noNA\n' + # , '\n----------------------------------\n' + # , gene_LF3['dst_noNA'].value_counts() + ) +#%% Create revised mutation_info_column based on dst_mode +#--------------------------------------- +# Create revised mutation_info_column +#--------------------------------------- +# Will need to overwrite column 'mutation_info_labels', since downstream depends on it + +# Make a copy you before overwriting +#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels'] +gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts() + +# Now overwrite +gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) +gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) + +if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): + print('\nPASS: Revised mutation_info column created successfully') + gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) +else: + print('\nmutation info labels numbers mismatch' + , '\nPlease check section for mapping dst_mode 
to labels') + +gene_LF3['mutation_info_orig'].value_counts() +#gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_orig'].value_counts() + +#%% FIXME: Get multimode for dm_om_numeric column +#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +#dm_om_multimode_LF4 +#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4) + +# %% sanity check for the revised dst +gene_LF3[drug].value_counts() +gene_LF3[drug].value_counts().sum() + +gene_LF3['mutation_info_labels_orig'].value_counts() + +gene_LF3['dst_mode'].value_counts() +gene_LF3['dst_mode'].value_counts().sum() + +# direct comparision +gene_LF3['dst'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_v1'].value_counts() + +#%% Lineage +gene_LF3['lineage'].value_counts() +# lineage_label_numeric = {'lineage1' : 1 +# , 'lineage2' : 2 +# , 'lineage3' : 3 +# , 'lineage4' : 4 +# , 'lineage5' : 5 +# , 'lineage6' : 6 +# , 'lineage7' : 7 +# , 'lineageBOV' : 8} + +lineage_label_numeric = {'L1' : 1 + , 'L2' : 2 + , 'L3' : 3 + , 'L4' : 4 + , 'L5' : 5 + , 'L6' : 6 + , 'L7' : 7 + , 'LBOV' : 8} + +lineage_label_numeric +#%% Lineage cols selected: gene_LF3_ColsSel +gene_LF3_ColsSel = gene_LF3.copy() +gene_LF3_ColsSel.index +gene_LF3_ColsSel.columns +#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'] # delete + +# copy column to allow cross checks after stripping white space +gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip() +gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage'] +all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts()) +gene_LF3_ColsSel['lineage_corrupt'].value_counts() + +gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'position', 'snp_frequency', 'lineage', 
'lineage_corrupt']] +gene_LF3_ColsSel.columns + +#%% tidy_split(): Lineage +# Create df with tidy_split: lineage +lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';') +lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip() +lf_lin_split['lineage_corrupt'].value_counts() +#%% Important sanity check for tidy_split(): lineage +# Split based on semi colon dr_muts_col +search = ";" + +# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_lin = gene_LF3_ColsSel[['lineage']].copy() +count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I) +lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index() +lin_sc_C + +lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count'] +lin_sc_C +expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum() +expected_linC + +if (len(lf_lin_split) == expected_linC): + print('\nPASS: tidy_split() length match for lineage') +else: + sys.exit('\nFAIL: tidy_split() length MISMATCH. 
Check lineage semicolon count') diff --git a/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R b/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R new file mode 100644 index 0000000..972def0 --- /dev/null +++ b/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R @@ -0,0 +1,204 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +################################################################### +# FIXME: ADD distance to NA when SP replies + +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + +# LigDist_colname # from globals used +# ppi2Dist_colname #from globals used +# naDist_colname #from globals used + +common_cols = c("mutationinformation" + , "X5uhc_position" + , "X5uhc_offset" + , "position" + , "dst_mode" + , "mutation_info_labels" + , "sensitivity", dist_columns ) + + +######################################## +categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) +fact_cols = colnames(merged_df3)[categ_cols_to_factor] + +if (any(lapply(merged_df3[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) + if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +}else{ + cat("\nRequested cols aready factors") +} + +cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) + +#################################### +# merged_df3: NECESSARY pre-processing +################################### +#df3 = merged_df3 +plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" + , all_cols) + +all_cols = c(common_cols + , all_stability_cols + , 
all_affinity_cols + , all_conserv_cols) + + +# counting +foo = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , "avg_lig_affinity" + , "avg_lig_affinity_scaled" + , "avg_lig_affinity_outcome" + , "ligand_distance" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome" + , "consurf_colour_rev" + , "consurf_outcome")] + +table(foo$consurf_outcome) +foo2 = foo[foo$ligand_distance<10,] + +table(foo2$ligand_outcome) + +############################# +# wide plots SNP +# DRUG +length(aa_pos_drug); aa_pos_drug +drug = foo[foo$position%in%aa_pos_drug,] +drug$wild_pos +#CA + + +############################################### +# OR plot + +bar = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + , "pval_fisher" + #, "pval_chisq" + , "neglog_pval_fisher" + , "log10_or_mychisq")] + +# bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni") +# bar$signif_bon = bar$p_adj_bonferroni +# bar = dplyr::mutate(bar +# , signif_bon = case_when(signif_bon == 0.05 ~ "." +# , signif_bon <=0.0001 ~ '****' +# , signif_bon <=0.001 ~ '***' +# , signif_bon <=0.01 ~ '**' +# , signif_bon <0.05 ~ '*' +# , TRUE ~ 'ns')) + +bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH") +bar$signif_fdr = bar$p_adj_fdr +bar = dplyr::mutate(bar + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." 
+ , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) + +# sort df +bar = bar[order(bar$or_mychisq, decreasing = T), ] +bar = bar[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + #, "pval_fisher" + #, "pval_chisq" + #, "neglog_pval_fisher" + #, "log10_or_mychisq" + #, "signif_bon" + , "p_adj_fdr" + , "signif_fdr")] + +table(bar$sensitivity) + +table(bar$or_mychisq>1&bar$signif_fdr) # sen and res ~ OR + +str(bar) +sen = bar[bar$or_mychisq<1,] +sen = na.omit(sen) + +res = bar[bar$or_mychisq>1,] +res = na.omit(res) + +# comp +bar_or = bar[!is.na(bar$or_mychisq),] +table(bar_or$sensitivity) + +sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR +res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR + +# sanity check +if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){ + cat("\nPASS: df with or successfully sourced" + , "\nCalculating % of muts with OR>1") +}else{ + stop("Abort: df with or numbers mimatch") +} + +# percent for OR muts +pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR +cat("\nPercentage of muts with OR>1 i.e resistant:" + , pc_orR *100 ) + +# muts with highest OR +head(bar_or$mutationinformation, 10) + +# sort df +bar_or = bar_or[order(bar_or$or_mychisq + , bar_or$ligand_distance + , bar_or$interface_dist + , decreasing = T), ] + +bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no") +table(bar_or$drug_site) + +bar_or$dsl_site = ifelse(bar_or$position%in%aa_pos_dsl, "dsl", "no") +table(bar_or$dsl_site) + +bar_or$ca_site = ifelse(bar_or$position%in%aa_pos_ca, "ca", "no") +table(bar_or$ca_site) + +bar_or$cdl_site = ifelse(bar_or$position%in%aa_pos_cdl, "cdl", "no") +table(bar_or$cdl_site) + + +top10_or = bar_or[1:10,] + +# are these active sites +top10_or$position[top10_or$position%in%active_aa_pos] + + +# clostest most sig +bar_or_lig = bar_or[bar_or$ligand_distance<10,] +bar_or_lig 
= bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ] +table(bar_or_lig$signif_fdr) + + +bar_or_ppi = bar_or[bar_or$interface_dist<10,] +bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ] +table(bar_or_ppi$signif_fdr) diff --git a/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R b/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R new file mode 100644 index 0000000..eb37c9f --- /dev/null +++ b/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R @@ -0,0 +1,426 @@ +#!/usr/bin/env Rscript + +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +outdir_stats = paste0(outdir_images,"stats/") +cat("\nOutput dir for stats:", outdir_stats) +################################################################### +geneL_normal = c("pnca") +#geneL_na = c("gid", "rpob") +geneL_na_v2 = c("gid") +geneL_nca = c("alr", "embb", "katg", "rpob") +geneL_both = c("rpob") + + +if (tolower(gene)%in%geneL_na_v2) { + gene_colnames = c("mcsm_na_affinity", "mcsm_na_outcome") +} + +if (tolower(gene)%in%geneL_nca) { + gene_colnames = c("mcsm_nca_affinity", "mcsm_nca_outcome") +} + +#from plotting_globals() +LigDist_colname +ppi2Dist_colname +naDist_colname + +delta_symbol #delta_symbol = "\u0394"; delta_symbol +angstroms_symbol + +cat("\nAffinity Distance colnames:", length(affinity_dist_colnames) + , "\nThese are:", affinity_dist_colnames) +#=========== +# Data used +#=========== +df3 = merged_df3 + +cols_to_output = c("position" + , "sensitivity" + , "mutationinformation" + , affinity_dist_colnames[1] + , "ligand_affinity_change" + , "ligand_outcome" + , "mmcsm_lig" + , "mmcsm_lig_outcome" + , affinity_dist_colnames[2] + # #, affinity_dist_colnames[3] + # , "mcsm_na_affinity" + # , "mcsm_na_outcome" + # #, "mcsm_nca_affinity" + # #, 
"mcsm_nca_outcome" + , gene_colnames + , "maf" + , "or_mychisq" + , "pval_fisher") + +cols_to_output +df3_output = df3[, cols_to_output] +colnames(df3_output) +cat("\nSelecting columns:", length(colnames(df3_output))) +#=============================================== +# Add COLS and rounding: adjusted P-values + MAF +#============================================== +#----------------------------- +# adjusted P-values +#----------------------------- +# add cols: p_adj_fdr and signif_fdr +df3_output$p_adj_fdr = p.adjust(df3_output$pval_fisher, method = "fdr") +df3_output$signif_fdr = df3_output$p_adj_fdr +df3_output = dplyr::mutate(df3_output + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." + , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) +# rounding +df3_output$or_mychisq = round(df3_output$or_mychisq,2) +df3_output$p_adj_fdr = round(df3_output$p_adj_fdr,2) +head(df3_output) + +#---------- +# MAF (%) +#---------- +# add col maf_percent +df3_output$maf_percent = df3_output$maf*100 + +# rounding +df3_output$maf_percent = round(df3_output$maf_percent,2) +head(df3_output$af); head(df3_output$maf);head(df3_output$maf_percent) + +#---------- +# P-value +#---------- +df3_output$pval_fisher = round(df3_output$pval_fisher,2) + +class(df3_output) +head(df3_output) + +#################################### +# Appendix: ligand affinity +#################################### +df_lig = df3_output[df3_output[[LigDist_colname]]