diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
old mode 100755
new mode 100644
index dc89813..de3791e
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -1,11 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 '''
-Created on Tue Aug  6 12:56:03 2019
-
 @author: tanu
 '''
-
+#%% Description
 # FIXME: include error checking to enure you only
 # concentrate on positions that have structural info?
 
@@ -43,6 +41,9 @@ Created on Tue Aug  6 12:56:03 2019
 #-------------
 #reference_dict.py
 #tidy_split.py
+
+# bash counting for cross checks: example
+#grep -oP 'pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3}' mtb_gwas_meta_v6.csv  | sort | uniq -c | wc -l
 #=======================================================================
 #%% load libraries
 import os, sys
@@ -50,6 +51,8 @@ import re
 import pandas as pd
 import numpy as np
 import argparse
+from statistics import mean, median, mode
+from statistics import multimode
 #=======================================================================
 #%% dir and local imports
 homedir = os.path.expanduser('~') 
@@ -57,7 +60,6 @@ homedir = os.path.expanduser('~')
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
-
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@@ -71,7 +73,7 @@ arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and outp
 arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
 
 args = arg_parser.parse_args()
-#=======================================================================
+###############################################################################
 #%% variable assignment: input and output paths & filenames
 drug = args.drug
 gene = args.gene
@@ -79,8 +81,8 @@ datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
 make_dirs  = args.make_dirs
-
-#%% input and output dirs and files
+###############################################################################
+#%% variable assignment: input and output dirs and files
 #=======
 # dirs
 #=======
@@ -88,10 +90,10 @@ if not datadir:
     datadir = homedir + '/' + 'git/Data'
     
 if not indir:
-    indir = datadir + '/' + drug + '/input'
+    indir = datadir + '/' + drug + '/input_v2'
     
 if not outdir:
-    outdir = datadir + '/' + drug + '/output'
+    outdir = datadir + '/' + drug + '/output_v2'
 
 if make_dirs:
     print('make_dirs is turned on, creating data dir:', datadir)
@@ -128,18 +130,20 @@ if not os.path.isdir(outdir):
     print('ERROR: Output directory does not exist:', outdir
     , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
     sys.exit()
-    
-# Requires
+###############################################################################    
+#%% required local imports
 from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
 from tidy_split import tidy_split
-
-#=======================================================================    
+###############################################################################
+#%% creating required variables: gene and drug dependent, and input filename
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 
 nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)
 
+nssnp_match_captureG = "(" + nssnp_match + ")"
+
 wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)
 
@@ -163,8 +167,6 @@ print('Extracting columns based on variables:\n'
     , '\n'
     , resistance_col
 	, '\n===============================================================')
-#=======================================================================
-
 #=======
 # input
 #=======
@@ -182,8 +184,8 @@ print('Output filename: in the respective sections'
       , '\nOutput path: ', outdir
       , '\n=============================================================')
 
-#%%end of variable assignment for input and output files
-#=======================================================================
+#end of variable assignment for input and output files
+###############################################################################
 #%% Read input file
 master_data  = pd.read_csv(infile_master, sep = ',')  
 
@@ -207,20 +209,8 @@ else:
                  #, 'strain'
                  , 'lineage'
                  , 'sublineage' 
-                 #, 'country'
                  , 'country_code'
-                 , 'geographic_source'
-                 #, 'region'
-                 #, 'location'
-                 #, 'host_body_site'
-                 #, 'environment_material'
-                 #, 'host_status'
-                 #, 'host_sex'
-                 #, 'submitted_host_sex'
-                 #, 'hiv_status'
-                 #, 'HIV_status'
-                 #, 'tissue_type'
-                 #, 'isolation_source'
+                 #, 'geographic_source'
                  ,  resistance_col]
     
     variable_based_cols = [drug
@@ -245,33 +235,171 @@ print('RESULT: Total samples:', total_samples
 meta_data.isna().sum()
 print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 		, '\n===========================================================\n')
+###############################################################################
+#%% Quick checks:
+meta_data['lineage'].value_counts()
+meta_data['lineage'].value_counts().sum()    
+meta_data['lineage'].nunique()
 
-#%% Write check file
-check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file, index = False)
-print('\n----------------------------------'
-      , '\nWriting subsetted gwas data:', gene
-      , '\n----------------------------------'
-      , '\nFile', check_file
-      , '\nDim:', meta_data.shape)
+# replace lineage with 'L' in lineage_labels
+#meta_data['lineage_labels'] = meta_data['lineage']
+#meta_data['lineage_labels'].equals(meta_data['lineage'])
+#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts())
+#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L')
+#meta_data['lineage'].value_counts() 
+#meta_data['lineage_labels'].value_counts()
 
-# equivalent of table in R
-# drug counts: complete samples for OR calcs
-meta_data[drug].value_counts() 
+meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L')
+meta_data['lineage'].value_counts()
+print("\n================================"
+      , "\nLineage numbers"
+      , "\nComplete lineage samples:", meta_data['lineage'].value_counts().sum()
+      , "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
+      , "\n================================")
+
+
+meta_data['id'].nunique()
+meta_data['sample'].nunique()
+meta_data['id'].equals(meta_data['sample'])
+foo = meta_data.copy()
+foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0')
+foo['Diff'].value_counts()
+
+meta_data['drtype'].value_counts()
+meta_data['drtype'].value_counts().sum()
+print("\n================================"
+      , "\ndrtype numbers"
+      , "\nComplete drtype samples:", meta_data['drtype'].value_counts().sum()
+      , "\nMissing drtype samples:", meta_data['id'].nunique() - meta_data['drtype'].value_counts().sum()
+      , "\n================================")
+
+meta_data['drug_name'] = meta_data[drug].map({1:'R'
+                                              , 0:'S'})
+meta_data['drug_name'].value_counts()
+meta_data[drug].value_counts()
+meta_data[drug].value_counts().sum()
+print("\n================================"
+      , "\ndrug", drug, "numbers"
+      , "\nComplete drug samples:", meta_data[drug].value_counts().sum()
+      , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data[drug].value_counts().sum()
+      , "\n================================")
+
+print("\n================================"
+      , "\ndrug", drug, "numbers"
+      , "\nComplete drug samples:", meta_data['drug_name'].value_counts().sum()
+      , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data['drug_name'].value_counts().sum()
+      , "\n================================")
+
+#%% Quick counts: All samples, drug:
 print('===========================================================\n'
+      , 'RESULT: Total no. of samples tested for', drug, ':', meta_data[drug].value_counts().sum()
+	  , '\n===========================================================\n'
       , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n===========================================================\n'
       , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
-		, '\n===========================================================')
+	  , '\n===========================================================')
 
-#%%
-# IMPORTANT sanity check:
+print('===========================================================\n'
+      , 'RESULT: Total no. of samples tested for', drug, ':', meta_data['drug_name'].value_counts().sum()
+	  , '\n===========================================================\n'
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts()
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts(normalize = True)*100
+	  , '\n===========================================================')
+
+##############################################################################
+#%% Extracting nsSNP for gene (meta_gene_all): from dr_muts col and other_muts_col
+#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
+# so downstream code doesn't change
+meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
+#%% DF: with dr_muts_col
+meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
+meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
+
+if meta_gene_dr.equals(meta_gene_dr1):
+    print('\nPASS: DF with column:', dr_muts_col, 'extracted successfully'
+          , '\ngene_snp_match in column:',dr_muts_col, meta_gene_dr.shape)
+else:
+    sys.exit('\nFAIL: DF with column:', dr_muts_col,'could not be extracted'
+             , '\nshape of df1:', meta_gene_dr.shape
+             , '\nshape of df2:', meta_gene_dr1.shape
+             , '\nCheck again!')
+##############################################################################    
+#%% DF: with other_muts_col
+meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
+meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
+
+print('gene_snp_match in dr:', len(meta_gene_other))
+meta_gene_other.equals(meta_gene_other1)
+
+if meta_gene_other.equals(meta_gene_other1):
+    print('\nPASS: DF with column:', other_muts_col,'extracted successfully'
+          , '\ngene_snp_match in column:',other_muts_col, meta_gene_other.shape)
+else:
+    sys.exit('\nFAIL: DF with column:', other_muts_col,'could not be extracted'
+             , '\nshape of df1:', meta_gene_other.shape
+             , '\nshape of df2:', meta_gene_other1.shape
+             , '\nCheck again!')
+############################################################################## 
+#%% Quick counts: <gene>nsSNP samples, drug
+meta_gene_all[drug].value_counts() 
+
+print('===========================================================\n'
+      , 'RESULT: Total no. of samples for', drug, 'with nsSNP mutations:', meta_gene_all['id'].nunique()
+	  , '\n===========================================================\n'
+      
+      , '===========================================================\n'
+      , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all[drug].value_counts().sum()
+	  , '\n===========================================================\n'
+      
+      , '===========================================================\n'
+      , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all['drug_name'].value_counts().sum()
+
+	  , '\n===========================================================\n'
+      , 'RESULT: No. of Sus and Res', drug, 'samples with nsSNP:\n', meta_gene_all['drug_name'].value_counts()
+
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples with nsSNP mutations:\n', meta_gene_all['drug_name'].value_counts(normalize = True)*100
+	  , '\n===========================================================')
+###############################################################################
+#%% Create a copy of indices for downstream mergeing
+meta_gene_all['index_orig']      = meta_gene_all.index
+meta_gene_all['index_orig_copy'] = meta_gene_all.index
+
+all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
+all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
+##############################################################################   
+#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc.
+# Split based on semi colon dr_muts_col and other_muts_col
+search = ";"
+ 
+# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
+count_df_dr = meta_gene_dr[['id', dr_muts_col]]
+count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I)
+dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index()
+dr_sc_C 
+
+dr_sc_C['index_semicolon'] = (dr_sc_C['index'] + 1) *dr_sc_C['dr_semicolon_count']
+dr_sc_C 
+expected_drC = dr_sc_C['index_semicolon'].sum()
+expected_drC
+
+# count no. of nsSNPs and extract those nsSNPs
+count_df_dr['dr_geneSNP_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(nssnp_match, re.I)
+dr_gene_count1 = count_df_dr['dr_geneSNP_count'].sum()
+# DEL: need to count by find_all
+# dr_snps_df = meta_gene_dr.loc[:, dr_muts_col].str.extract(nssnp_match_captureG, re.I)
+# dr_snps_df.columns = [dr_muts_col]
+# dr_snps_countU = dr_snps_df[dr_muts_col].nunique()
+# dr_snps_countU
+# dr_snps_count_dict = dr_snps_df[dr_muts_col].value_counts().to_dict()
+# dr_snps_count_dict
+###############################################################################
 # This is to find out how many samples have 1 and more than 1 mutation,so you
 # can use it to check if your data extraction process for dr_muts 
 # and other_muts has worked correctly AND also to check the dim of the 
 # final formatted data.
 # This will have: unique COMBINATION of sample id and <gene_match> mutations
-
 #========
 # First: counting <gene_match> mutations in dr_muts_col column
 #========
@@ -293,6 +421,7 @@ dr_gene_count = 0
 wt = 0
 id_dr = []
 id2_dr = []
+dr_gene_mutsL = []
 #nssnp_match_regex = re.compile(nssnp_match)
 
 for i, id in enumerate(clean_df.id):
@@ -300,6 +429,7 @@ for i, id in enumerate(clean_df.id):
     #id_dr.append(id)
     #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts
     count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE))
+    gene_drL = re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE)
     #print(count_gene_dr)
     if count_gene_dr > 0:
         id_dr.append(id)
@@ -307,6 +437,7 @@ for i, id in enumerate(clean_df.id):
         id2_dr.append(id)
         #print(id, count_gene_dr)    
     dr_gene_count = dr_gene_count + count_gene_dr
+    dr_gene_mutsL = dr_gene_mutsL + gene_drL
     count_wt = clean_df[dr_muts_col].iloc[i].count('WT')
     wt = wt + count_wt
     
@@ -314,270 +445,44 @@ print('RESULTS:')
 print('Total WT in dr_muts_col:', wt)
 print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
 print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) )
+print('Total matches of UNIQUE', gene, 'SNP matches in', dr_muts_col, ':', len(set(dr_gene_mutsL)))
 print('=================================================================')
 
+if dr_gene_count == dr_gene_count1:
+    print('\nPass: dr gene SNP count match')
+else:
+    sys.exit('\nFAIL: dr gene SNP count MISmatch')
+    
 del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
 
-#========
-# Second: counting <gene_match> mutations in other_muts_col column
-#========
-print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)    
-# drop na and extract a clean df
-clean_df =  meta_data.dropna(subset=[other_muts_col])
-
-# sanity check: count na
-na_count = meta_data[other_muts_col].isna().sum()
-
-if len(clean_df) == (total_samples - na_count):
-   print('PASS: clean_df extracted: length is', len(clean_df)
-         , '\nNo.of NAs =', na_count, '/', total_samples
-         , '\n=========================================================')
-else:
-    sys.exit('FAIL: Could not drop NAs')
-
-other_gene_count = 0
-wt_other = 0
-id_other = []
-id2_other = []
-        
-for i, id in enumerate(clean_df.id):
-    #print (i, id)
-    #id_other.append(id)
-    #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts
-    count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE))
-    if count_gene_other > 0:
-        id_other.append(id)
-    if count_gene_other > 1:
-        id2_other.append(id)
-        #print(id, count_gene_other)    
-    other_gene_count = other_gene_count + count_gene_other
-    count_wt = clean_df[other_muts_col].iloc[i].count('WT')
-    wt_other = wt_other + count_wt        
-     
-print('RESULTS:')
-print('Total WT in other_muts_col:', wt_other)
-print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
-print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) )
-print('=================================================================')
-
-print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
-      , '\n===================================================================')
-expected_rows = dr_gene_count + other_gene_count
-
-#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
-del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
-
-#%%
-############
-# extracting dr and other muts separately along with the common cols
-#############
-print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data')
-print('muts to extract:', nssnp_match )
-
-#===============
-# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col
-#===============
-if in_filename_master == 'original_tanushree_data_v2.csv':
-    meta_data_dr = meta_data[['id'
-                              ,'country'
-                              ,'lineage'
-                              ,'sublineage'
-                              ,'drtype'
-                              , drug
-                              , dr_muts_col]]
-else:
-    dr_based_cols = [drug, dr_muts_col]
-    cols_to_extract = core_cols + dr_based_cols
-    print('Extracting', len(cols_to_extract), 'columns from meta data')
-    meta_data_dr = meta_data[cols_to_extract]
-    del(dr_based_cols, cols_to_extract)
-
-if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
-    print('PASS: Dimensions match'
-          , '\n===============================================================')
-else:
-    print('FAIL: Dimensions mismatch:'
-          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
-          , '\nGot:', meta_data_dr.shape
-	      , '\n===============================================================')
-    sys.exit()
-
-# Extract within this the nsSNPs for gene of interest using string match
-#meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
-meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
-print('gene_snp_match in dr:', len(meta_gene_dr))
-  
-dr_id = meta_gene_dr['id'].unique()
-print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
-
-if len(id_dr) == len(meta_gene_dr):
-    print('PASS: lengths match'
-    , '\n===============================================================')
-else:
-    print('FAIL: length mismatch'
-             , '\nExpected len:', len(id_dr)
-             , '\nGot:', len(meta_gene_dr))
-    sys.exit()
-
-dr_id = pd.Series(dr_id)
-
-#=================
-# other mutations: extract nssnp_match entries from other_muts_col
-#==================
-print('Extracting other_muts from:', other_muts_col,'with other meta_data')
-print('muts to extract:', nssnp_match)
-
-if in_filename_master ==  'original_tanushree_data_v2.csv':
-    meta_data_other = meta_data[['id'
-                                 , 'country'
-                                 , 'lineage'
-                                 , 'sublineage'
-                                 , 'drtype'
-                                 , drug
-                                 , other_muts_col]]
-else:
-    other_based_cols = [drug, other_muts_col]
-    cols_to_extract = core_cols + other_based_cols
-    print('Extracting', len(cols_to_extract), 'columns from meta data')
-    meta_data_other = meta_data[cols_to_extract]
-    del(other_based_cols, cols_to_extract)
-
-if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
-    print('PASS: Dimensions match'
-          , '\n===============================================================')
-else:
-    print('FAIL: Dimensions mismatch:'
-          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
-          , '\nGot:', meta_data_other.shape
-	      , '\n===============================================================')
-    sys.exit()
-
-# Extract within this nSSNP for gene of interest using string match
-#meta_gene_other  = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] 
-meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] 
-print('gene_snp_match in other:', len(meta_gene_other))
-
-other_id = meta_gene_other['id'].unique()
-print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
-
-if len(id_other) == len(meta_gene_other):
-    print('PASS: lengths match'
-    , '\n==============================================================')
-else:
-    print('FAIL: length mismatch'
-          , '\nExpected len:', len(id_other)
-          , '\nGot:', len(meta_gene_other))
-    sys.exit()
-
-other_id = pd.Series(other_id)
-#%% Find common IDs
-print('Now extracting common_ids...')
-common_mut_ids = dr_id.isin(other_id).sum() 
-print('RESULT: No. of common ids:', common_mut_ids)
-
-# sanity checks
-# check if True: should be since these are common
-if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
-    print('PASS: Cross check on no. of common ids')
-else:
-    sys.exit('FAIL: Cross check on no. of common ids failed')
-    
-# check if the common are indeed the same!
-# bit of a tautology, but better safe than sorry!
-common_ids = dr_id[dr_id.isin(other_id)]
-common_ids = common_ids.reset_index()
-common_ids.columns = ['index', 'id']
-
-common_ids2 = other_id[other_id.isin(dr_id)]
-common_ids2 = common_ids2.reset_index()
-common_ids2.columns = ['index', 'id2']
-
-# should be True
-if common_ids['id'].equals(common_ids2['id2']):
-    print('PASS: Further cross checks on common ids')
-    nu_common_ids = common_ids.nunique()
-else:
-    sys.exit('FAIL: Further cross checks on common ids')
-
-# good sanity check: use it later to check gene_sample_counts
-expected_gene_samples = (len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids) 
-print('Expected no. of gene samples:', expected_gene_samples
-      , '\n=================================================================')
-#%% write file
-#print(outdir)
-out_filename_cid = gene.lower() + '_common_ids.csv'
-outfile_cid =  outdir + '/' + out_filename_cid
-print('\n----------------------------------'
-      '\nWriting file: common_ids'
-      '\n----------------------------------'
-      , '\nFile:', outfile_cid
-      , '\nNo. of rows:', len(common_ids)
-      , '\n=============================================================')
-
-common_ids.to_csv(outfile_cid, index = False)
-
-del(out_filename_cid)
-
-# clear variables
-del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
-
-#%% Now extract 'all' gene specific nsSNP mutations: i.e 'nssnp_match'
-print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
-      , dr_muts_col, 'and', other_muts_col, 'using string match:'
-      , '\n===================================================================')
-#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ]
-meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
-
-extracted_gene_samples = meta_gene_all['id'].nunique()
-print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
-      , '\n===================================================================')
-
-# sanity check: length of gene samples
-print('Performing sanity check:')
-if extracted_gene_samples == expected_gene_samples:
-    print('PASS: expected & actual no. of nssnp gene samples match'
-          , '\nNo. of gene samples:', len(meta_gene_all)
-          , '\n=========================================================')
-else:
-    sys.exit('FAIL: Length mismatch in gene samples!')
-
-# count NA  in drug column
-gene_na = meta_gene_all[drug].isna().sum() 
-print('No. of gene samples without', drug, 'testing:', gene_na)
-
-# use it later to check number of complete samples from LF data
-comp_gene_samples = len(meta_gene_all) - gene_na
-print('Complete gene samples tested for', drug, ':', comp_gene_samples)
-print('=================================================================')
-
-# Comment: This is still dirty data since these
-# are samples that have nsSNP muts, but can have others as well
-# since the format for mutations is mut1; mut2, etc.
-print('This is still dirty data: samples have ', nssnp_match, 'muts but may have others as well'
-      , '\nsince the format for mutations is mut1; mut2, etc.'
-      , '\n=============================================================')
-
-print('Performing tidy_split(): to separate the mutations into indivdual rows')
-
+#%% tidy_split(): dr_muts_col
 #=========
 # DF1: dr_muts_col
+# and remove leading white spaces
 #=========
-########
-# tidy_split(): on dr_muts_col column and remove leading white spaces
-########    
 col_to_split1 = dr_muts_col
 print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
-       , '\ncolumn name to apply tidy_split():', col_to_split1
+       , '\ncolumn name to apply tidy_split():'    , col_to_split1
        , '\n============================================================')
+
 # apply tidy_split()
 dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') 
 # remove leading white space else these are counted as distinct mutations as well
-#dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() 
 dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() 
 
-# extract only the samples/rows with nssnp_match
+if len(dr_WF0) == expected_drC:
+    print('\nPass: tidy split on', dr_muts_col, 'worked'
+          , '\nExpected nrows:', expected_drC
+          , '\nGot nrows:', len(dr_WF0) )
+else:
+    print('\nFAIL: tidy split on', dr_muts_col, 'did not work'
+          , '\nExpected nrows:', expected_drC
+          , '\nGot nrows:', len(dr_WF0) )
+
+# Extract only the samples/rows with nssnp_match
 #dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]
 dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)]
+dr_gene_WF0_v2 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match_captureG, regex = True, case = False)]
 
 print('Lengths after tidy split and extracting', nssnp_match, 'muts:'
       , '\nOld length:' , len(meta_gene_dr)
@@ -585,6 +490,7 @@ print('Lengths after tidy split and extracting', nssnp_match, 'muts:'
       , '\nLength of nssnp df:', len(dr_gene_WF0)
       , '\nExpected len:', dr_gene_count
       , '\n=============================================================')
+
 if len(dr_gene_WF0) == dr_gene_count:
     print('PASS: length matches expected length'
     , '\n===============================================================')
@@ -619,752 +525,28 @@ print('Dim of dr_df:', dr_df.shape
 	, '\n=============================================================='
 	, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
 	, '\n===============================================================')
-
-#%%
-#=========
-# DF2: other_muts_col
-#=========
-########
-# tidy_split(): on other_muts_col column and remove leading white spaces
-######## 
-col_to_split2 = other_muts_col
-print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
-       , '\ncolumn name to apply tidy_split():', col_to_split2
-       , '\n============================================================')
-
-# apply tidy_split()
-other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') 
-# remove the leading white spaces in the column
-other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() 
-
-# extract only the samples/rows with nssnp_match
-#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]
-other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)]
-
-print('Lengths after tidy split and extracting', gene_match, 'muts:',
-      '\nOld length:' , len(meta_gene_other),
-      '\nLength after split:', len(other_WF1),
-      '\nLength of nssnp df:', len(other_gene_WF1),
-      '\nExpected len:', other_gene_count
-      , '\n=============================================================')
-if len(other_gene_WF1) == other_gene_count:
-    print('PASS: length matches expected length'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: lengths mismatch')
-
-# count the freq of 'other muts' samples
-other_muts_df = other_gene_WF1 [['id', other_muts_col]] 
-print('Dim of other_muts_df:', other_muts_df.shape) 
-
-# add freq column
-other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
-print('Revised dim of other_muts_df:', other_muts_df.shape) 
-
-c2 = other_muts_df.other_sample_freq.value_counts()
-print('Counting no. of sample frequency:\n', c2)
-print('=================================================================')
-# sanity check: length of gene samples
-if len(other_gene_WF1) == c2.sum():
-    print('PASS: WF data has expected length'
-    , '\nLength of other_gene WFO:', c2.sum()
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: Length mismatch')
-    
-# Important: Assign 'column name' on which split was performed as an extra column
-# This is so you can identify if mutations are dr_type or other in the final df
-other_df = other_gene_WF1.assign(mutation_info = other_muts_col)
-print('dim of other_df:', other_df.shape
-	, '\n==============================================================='
-	, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
-	, '\n===============================================================')
-
-#%%
-#==========
-# Concatentating the two dfs: equivalent of rbind in R
-#==========
-# Important: Change column names to allow concat:
-# dr_muts.. & other_muts : 'mutation'
-print('Now concatenating the two dfs by row'
-      , '\nFirst assigning a common colname: "mutation" to the col containing muts'
-      , '\nThis is done for both dfs'
-      , '\n===================================================================')
-
-dr_df.columns
-dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True)
-dr_df.columns
-
-other_df.columns
-other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
-other_df.columns
-
-if len(dr_df.columns) == len(other_df.columns):
-    print('Checking dfs for concatening by rows:'
-          , '\nDim of dr_df:', dr_df.shape
-          , '\nDim of other_df:', other_df.shape
-          , '\nExpected nrows:', len(dr_df) +  len(other_df)
-          , '\n=============================================================')
-else:
-    sys.exit('FAIL: No. of cols mismatch for concatenating')
-
-# checking colnames before concat
-print('Checking colnames BEFORE concatenating the two dfs...')
-if (set(dr_df.columns) == set(other_df.columns)):
-    print('PASS: column names match necessary for merging two dfs')
-else:
-    sys.exit('FAIL: Colnames mismatch for concatenating!')
-
-# concatenate (axis = 0): Rbind
-print('Now appending the two dfs: Rbind')
-gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
-
-print('Finding stop mutations in concatenated df')
-stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
-if stop_muts == 0:
-    print('PASS: No stop mutations detected')
-else:
-    print('stop mutations detected'
-          , '\nNo. of stop muts:', stop_muts, '\n'
-          , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
-          , '\nNow removing them')    
-    
-gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')]
-print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp))
-
-gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
-print('snps only by direct regex:', len(gene_LF0))
-
-if gene_LF0_nssnp.equals(gene_LF0):
-    print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed'
-          , '\nUsing the regex extracted df')
-else:
-    sys.exit('FAIL: posssibly regex or no of stop mutations'
-             , 'Regex being used:', nssnp_match)
-    #sys.exit()
-
-# checking colnames and length after concat
-print('Checking colnames AFTER concatenating the two dfs...')
-if (set(dr_df.columns) == set(gene_LF0.columns)):
-    print('PASS: column names match'
-          , '\n=============================================================')
-else:
-    sys.exit('FAIL: Colnames mismatch AFTER concatenating')
-    
-print('Checking concatenated df')
-if len(gene_LF0) == (len(dr_df) +  len(other_df))- stop_muts :
-    print('PASS:length of df after concat match'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: length mismatch')
-
-#%%
-###########
-# This is hopefully clean data, but just double check
-# Filter LF data so that you only have
-# mutations corresponding to nssnp_match (case insensitive)
-# this will be your list you run OR calcs 
-###########
-print('Length of gene_LF0:', len(gene_LF0)
-      , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
-      , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
-
-gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]   
-
-if len(gene_LF0) == len(gene_LF1):
-    print('PASS: length of gene_LF0 and gene_LF1 match',
-          '\nConfirming extraction and concatenating worked correctly'
-          , '\n==========================================================')
-else:
-    print('FAIL: BUT NOT FATAL!'
-          , '\ngene_LF0 and gene_LF1 lengths differ'
-          , '\nsuggesting error in extraction process'
-          , ' use gene_LF1 for downstreama analysis'
-          , '\n=========================================================')   
-    
-print('Length of dfs pre and post processing...'
-      , '\ngene WF data (unique samples in each row):', extracted_gene_samples
-      , '\ngene LF data (unique mutation in each row):', len(gene_LF1)
-      , '\n=============================================================')
-  
-#%% sanity check for extraction
-# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
-print('Verifying whether extraction process worked correctly...')
-if len(gene_LF1) == expected_rows:
-    print('PASS: extraction process performed correctly'
-          , '\nExpected length:', expected_rows
-          , '\nGot:', len(gene_LF1)
-          , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1)
-          , '\n=========================================================')
-else:
-    print('FAIL: extraction process has bugs'
-          , '\nExpected length:', expected_rows
-          , '\nGot:', len(gene_LF1)
-          , '\nDebug please'
-          , '\n=========================================================')
-#%%
-print('Performing some more sanity checks...')
-
-# From LF1: useful for OR counts
-# no. of unique muts
-distinct_muts = gene_LF1.mutation.value_counts()
-print('Distinct genomic mutations:', len(distinct_muts))
-
-# no. of samples contributing the unique muts
-gene_LF1.id.nunique()
-print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique())
-
-# no. of dr and other muts
-mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique()
-print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique())
-
-# sanity check
-if len(distinct_muts) == mut_grouped.sum() :
-    print('PASS:length of LF1 is as expected'
-    , '\n===============================================================')
-else:
-    print('FAIL: Mistmatch in count of muts'
-          , '\nExpected count:', len(distinct_muts)
-          , '\nActual count:', mut_grouped.sum()
-          , '\nMuts should be distinct within dr* and other* type'
-          , '\nInspecting...possibly ambiguous muts'
-          , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts)
-          , '\n=========================================================')
-    
-muts_split = list(gene_LF1.groupby('mutation_info'))
-dr_muts = muts_split[0][1].mutation 
-other_muts =  muts_split[1][1].mutation
-print('splitting muts by mut_info:', muts_split)
-print('no.of dr_muts samples:', len(dr_muts))
-print('no. of other_muts samples', len(other_muts))       
-
-#%%
-# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
-          , '\n===============================================================')
-else:
-    print('PASS: NO ambiguous muts detected'
-          , '\nMuts are unique to dr_ and other_ mutation class'
-          , '\n=========================================================')
-
-# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected!
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('Finding ambiguous muts...'
-          , '\n========================================================='
-          , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
-          , '\nThese are:', dr_muts[dr_muts.isin(other_muts)]
-          , '\n========================================================='
-          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
-          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
-          , '\n=========================================================')
-    
-    print('Counting no. of ambiguous muts...'
-          , '\nNo. of ambiguous muts in dr:'
-          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-          , '\nNo. of ambiguous muts in other:'
-          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
-          , '\n=========================================================')
-    
-    if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
-        common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-        print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
-              , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
-        print('\n===========================================================') 
-else:
-    #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
-    print('No: ambiguous muts present')
-    
-#print('Counting no. of ambiguous muts...')
-#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
-#    common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-#    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
-#          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
-#    print('\n===========================================================')
-#else:
-#   print('Error: ambiguous muts detected, but extraction failed. Debug!'
-#         , '\nNo. of ambiguous muts in dr:'
-#         , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-#         , '\nNo. of ambiguous muts in other:'
-#         , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
-#         , '\n=========================================================')       
-         
-#%% clear variables
-del(id_dr, id_other
-    #, meta_data
-    , meta_gene_dr, meta_gene_other
-    #, mut_grouped
-    , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)  
-
-del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)                     
- 
-#%%: write file: ambiguous muts
-# uncomment as necessary
-#print(outdir)
-#dr_muts.to_csv('dr_muts.csv', header = True)
-#other_muts.to_csv('other_muts.csv', header = True)
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
-    outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
-    print('\n----------------------------------'
-          , '\nWriting file: ambiguous muts'
-          , '\n----------------------------------'
-          , '\nFilename:', outfile_ambig_muts)
-    inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-    inspect.to_csv(outfile_ambig_muts, index = False)
-
-    print('Finished writing:', out_filename_ambig_muts
-          , '\nNo. of rows:', len(inspect)
-          , '\nNo. of cols:', len(inspect.columns)
-          , '\nNo. of rows = no. of samples with the ambiguous muts present:'
-          , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
-          , '\n=============================================================')
-    del(out_filename_ambig_muts)
-
-#%% end of data extraction and some files writing. Below are some more files writing.
-#=============================================================================
-#%% Formatting df: read aa dict and pull relevant info
-print('Now some more formatting:'
-      , '\nread aa dict and pull relevant info'
-      , '\nformat mutations:'
-      , '\nsplit mutation into mCSM style muts: '
-      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
-      , '\nassign aa properties: adding 2 cols at a time for each prop'
-      , '\n===================================================================')
-
-# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
-# in each lookup cycle 
-ncol_mutf_add = 3 # mut split into 3 cols
-ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
-
-#===========
-# Split 'mutation' column into three:  wild_type, position and
-# mutant_type separately. Then map three letter code to one using
-# reference_dict imported at the beginning.
-# After importing, convert to mutation to lowercase for compatibility with dict 
-#===========
-gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
-
-print('wt regex being used:', wt_regex
-      , '\nmut regex being used:', mut_regex
-      , '\nposition regex being used:', pos_regex)
-
-mylen0 = len(gene_LF1.columns)
-#=======
-# Iterate through the dict, create a lookup dict i.e
-# lookup_dict = {three_letter_code: one_letter_code}.
-# lookup dict should be the key and the value (you want to create a column for)
-# Then use this to perform the mapping separetly for wild type and mutant cols.
-# The three letter code is extracted using a string match match from the dataframe and then converted
-# to 'pandas series'since map only works in pandas series
-#=======
-print('Adding', ncol_mutf_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to 1-letter code
-# adding three more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['one_letter_code']
-    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wild_type'] = wt.map(lookup_dict)   
-    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mutant_type'] = mut.map(lookup_dict)
-
-# extract position info from mutation column separetly using string match
-#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
-gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
-
-mylen1 = len(gene_LF1.columns)
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen1 ==  mylen0 + ncol_mutf_add:
-    print('PASS: successfully added', ncol_mutf_add, 'cols'
-      , '\nold length:', mylen0
-      , '\nnew len:', mylen1)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen0
-          , '\nnew len:', mylen1)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
-#=========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_prop_water} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-# adding two more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_prop_water']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)   
-    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
-
-mylen2 = len(gene_LF1.columns)    
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen2 == mylen1 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen1
-      , '\nnew len:', mylen2)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen1
-          , '\nnew len:', mylen2)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_prop_polarity} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-# adding two more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_prop_polarity']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
-   #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
-    
-mylen3 = len(gene_LF1.columns)       
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen3 ==  mylen2 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen1
-      , '\nnew len:', mylen2)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen1
-          , '\nnew len:', mylen2)
-   
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_calcprop} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_calcprop']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)   
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
- 
-mylen4 = len(gene_LF1.columns)   
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen4 ==  mylen3 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen3
-      , '\nnew len:', mylen4)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen3
-          , '\nnew len:', mylen4)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
-########
-# combine the wild_type+poistion+mutant_type columns to generate 
-# mutationinformation (matches mCSM output field)
-# Remember to use .map(str) for int col types to allow string concatenation
-#########
-gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
-print('Created column: mutationinformation'
-	, '\n=====================================================================\n'
-    , gene_LF1.mutationinformation.head(10))
-
-#order by position for convenience
-gene_LF1.dtypes
-
-# converting position to numeric
-gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
-
-# sort by position inplace 
-foo = gene_LF1['position'].value_counts()
-gene_LF1.sort_values(by = ['position'], inplace = True)
-bar = gene_LF1['position'].value_counts()
-
-if (foo == bar).all():
-    print('PASS: df ordered by position')
-    print(gene_LF1['position'].head())
-else:
-    print('FAIL: df could not be orderd. Check source')
-    sys.exit()
-
-#%% Write file: mCSM muts
-snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
-snps_only.head()
-# assign column name
-snps_only.columns = ['mutationinformation']
-
-# count how many positions this corresponds to
-pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
-
-print('Checking NA in snps...')# should be 0
-if snps_only.mutationinformation.isna().sum() == 0:
-    print ('PASS: NO NAs/missing entries for SNPs'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
-
-out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
-outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
-
-print('\n----------------------------------'
-      , '\nWriting file: mCSM style muts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_mcsmsnps
-      , '\nmutation format (SNP): {WT}<POS>{MUT}'
-      , '\nNo. of distinct muts:', len(snps_only)
-      , '\nNo. of distinct positions:', len(pos_only)
-      , '\n=============================================================')
-
-snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
-
-print('Finished writing:', outfile_mcsmsnps
-      , '\nNo. of rows:', len(snps_only)
-      , '\nNo. of cols:', len(snps_only.columns)
-      , '\n=============================================================')
-del(out_filename_mcsmsnps)
-
-#%% write frequency of position counts
-metadata_pos = pd.DataFrame(gene_LF1['position'])
-z =  gene_LF1['position'].value_counts()
-z1 = z.to_dict()
-metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
-metadata_pos['meta_pos_count'].value_counts()
-
-metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
-
-out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
-outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
-print('\n----------------------------------'
-      , '\nWriting file: Metadata poscounts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_metadata_poscounts
-      , '\n============================================================')
-
-metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
-print('Finished writing:', outfile_metadata_poscounts
-      , '\nNo. of rows:', len(metadata_pos)
-      , '\nNo. of cols:', len(metadata_pos.columns)
-      , '\n=============================================================')
-del(out_filename_metadata_poscounts)
-
-#%% Write file: gene_metadata (i.e gene_LF1)
-# where each row has UNIQUE mutations NOT unique sample ids
-#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
-out_filename_metadata = gene.lower() + '_metadata.csv'
-outfile_metadata = outdir + '/' + out_filename_metadata
-print('\n----------------------------------'
-      , '\nWriting file: LF formatted data'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_metadata
-      , '\n============================================================')
-
-gene_LF1.to_csv(outfile_metadata, header = True, index = False)
-print('Finished writing:', outfile_metadata
-      , '\nNo. of rows:', len(gene_LF1)
-      , '\nNo. of cols:', len(gene_LF1.columns)
-      , '\n=============================================================')
-del(out_filename_metadata)
-
-#%% write file: mCSM style but with repitions for MSA and logo plots
-all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation']) 
-all_muts_msa.head()
-# assign column name
-all_muts_msa.columns = ['mutationinformation']
-
-# make sure it is string
-all_muts_msa.columns.dtype
-
-# sort the column
-all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
-
-# create an extra column with protein name
-#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
-#all_muts_msa_sorted.head()
-
-# rearrange columns so the fasta name is the first column (required for mutate.script)
-#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
-all_muts_msa_sorted.head()
-
-print('Checking NA in snps...')# should be 0
-if all_muts_msa.mutationinformation.isna().sum() == 0:
-    print ('PASS: NO NAs/missing entries for SNPs'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
-
-out_filename_msa = gene.lower() +'_all_muts_msa.csv'
-outfile_msa = outdir + '/' + out_filename_msa
-
-print('\n----------------------------------'
-      , '\nWriting file: mCSM style muts for msa'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_msa
-      , '\nmutation format (SNP): {WT}<POS>{MUT}'
-      , '\nNo.of lines of msa:', len(all_muts_msa))
-
-all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
-
-print('Finished writing:', outfile_msa
-      , '\nNo. of rows:', len(all_muts_msa)
-      , '\nNo. of cols:', len(all_muts_msa.columns)
-      , '\n=============================================================')
-
-del(out_filename_msa)
-
-#%% write file for mutational positions
-# count how many positions this corresponds to
-pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
-# assign column name
-pos_only.columns = ['position']
-# make sure dtype of column position is int or numeric and not string
-pos_only.position.dtype
-pos_only['position'] = pos_only['position'].astype(int)
-pos_only.position.dtype
-
-# sort by position value
-pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
-
-out_filename_pos = gene.lower() + '_mutational_positons.csv'
-outfile_pos = outdir + '/' + out_filename_pos
-
-print('\n----------------------------------'
-      , '\nWriting file: mutational positions'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_pos
-      , '\nNo. of distinct positions:', len(pos_only_sorted)
-      , '\n=============================================================')
-
-pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
-
-print('Finished writing:', outfile_pos
-      , '\nNo. of rows:', len(pos_only_sorted)
-      , '\nNo. of cols:', len(pos_only_sorted.columns)
-      , '\n============================================================='
-      , '\n\n\n')
-
-del(out_filename_pos)
-#%% quick summary output
-#print('============================================'
-#      , '\nQuick summary output for', drug, 'and' , gene.lower()
-#      , '\n============================================'
-#      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
-#      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
-#      , '\nTotal no. of samples with missense muts:', len(gene_LF1)
-#      , '\n============================================================='
-#      , '\n\n\n')
-
-print('============================================'
-      , '\nQuick summary output for', drug, 'and' , gene.lower()
-      , '\n============================================'
-      , '\nTotal samples:', total_samples
-      , '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
-	  , '\n'
-      , '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
-	  , '\n'
-      , '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col]
-      , '\nTotal no.of unique other muts:', mut_grouped[other_muts_col]     
-      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
-      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
-      , '\nTotal no. of samples with missense muts:', len(gene_LF1)
-      , '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()  
-      , '\n'
-      , '\nTotal no.of samples with common_ids:',  nu_common_ids['id'])
-
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('\nTotal no.of samples with ambiguous muts:', len(inspect)
-          #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
-          , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
-          , '\n============================================================='
-          , '\n\n\n')
-#=======================================================================
-print(u'\u2698' * 50,
-      '\nEnd of script: Data extraction and writing files'
-      '\n' + u'\u2698' * 50 )
-#%% end of script
+########################################################################
+#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc.
+
+# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence
+count_df_other = meta_gene_other[['id', other_muts_col]]
+count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I)
+other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index()
+other_sc_C
+other_sc_C['index_semicolon'] = (other_sc_C['index']+1)*other_sc_C['other_semicolon_count']
+other_sc_C
+expected_otherC = other_sc_C['index_semicolon'].sum()
+expected_otherC
+
+# count no. of nsSNPs and extract those nsSNPs
+count_df_other['other_geneSNP_count'] = meta_gene_other.loc[:, other_muts_col].str.count(nssnp_match, re.I)
+
+# DEL: need to count by find_all
+# other_snps_df = meta_gene_other.loc[:, other_muts_col].str.extract(nssnp_match_captureG, re.I)
+# other_snps_df
+# other_snps_df.columns = [other_muts_col]
+# other_snps_countU = other_snps_df[other_muts_col].nunique()
+# other_snps_countU
+# other_snps_count_dict = other_snps_df[other_muts_col].value_counts().to_dict()
+# other_snps_count_dict
+###############################################################################
\ No newline at end of file
diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py
deleted file mode 100644
index d90c8c4..0000000
--- a/scripts/data_extraction_v2.py
+++ /dev/null
@@ -1,1931 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-
-# FIXME: include error checking to enure you only
-# concentrate on positions that have structural info?
-
-# FIXME: import dirs.py to get the basic dir paths available
-#=======================================================================
-# TASK: extract ALL <gene> matched mutations from GWAS data
-# Input data file has the following format: each row = unique sample id 
-# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col...
-# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_match<wt>POS<mut>; pncA_c.<wt>POS<mut>...
-# where multiple mutations and multiple mutation types are separated by ';'. 
-# We are interested in the protein coding region i.e mutation with the<gene>_'p.' format.
-# This script splits the mutations on the ';' and extracts protein coding muts only
-# where each row is a separate mutation
-# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
-
-
-# output files: all lower case
-# 1) <gene>_gwas.csv
-# 2) <gene>_common_ids.csv
-# 3) <gene>_ambiguous_muts.csv
-# 4) <gene>_mcsm_formatted_snps.csv
-# 5) <gene>_metadata_poscounts.csv
-# 6) <gene>_metadata.csv
-# 7) <gene>_all_muts_msa.csv
-# 8) <gene>_mutational_positons.csv
-
-#------------
-# NOTE
-#-----------
-# drtype is renamed to 'resistance' in the 35k dataset
-# all colnames in the ouput files lowercase
-
-#-------------
-# requires
-#-------------
-#reference_dict.py
-#tidy_split.py
-#=======================================================================
-#%% load libraries
-import os, sys
-import re
-import pandas as pd
-import numpy as np
-import argparse
-from statistics import mean, median, mode
-from statistics import multimode
-#=======================================================================
-#%% dir and local imports
-homedir = os.path.expanduser('~') 
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-os.getcwd()
-#=======================================================================
-#%% command line args
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
-arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
-arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
-arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
-arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
-
-arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
-
-args = arg_parser.parse_args()
-#=======================================================================
-#%% variable assignment: input and output paths & filenames
-drug = args.drug
-gene = args.gene
-datadir = args.datadir
-indir   = args.input_dir
-outdir  = args.output_dir
-make_dirs  = args.make_dirs
-
-#%% input and output dirs and files
-#=======
-# dirs
-#=======
-if not datadir:
-    datadir = homedir + '/' + 'git/Data'
-    
-if not indir:
-    indir = datadir + '/' + drug + '/input_v2'
-    
-if not outdir:
-    outdir = datadir + '/' + drug + '/output_v2'
-
-if make_dirs:
-    print('make_dirs is turned on, creating data dir:', datadir)
-    try: 
-        os.makedirs(datadir, exist_ok = True) 
-        print("Directory '%s' created successfully" %datadir) 
-    except OSError as error: 
-        print("Directory '%s' can not be created") 
-  
-    print('make_dirs is turned on, creating indir:', indir)
-    try: 
-        os.makedirs(indir, exist_ok = True) 
-        print("Directory '%s' created successfully" %indir) 
-    except OSError as error: 
-        print("Directory '%s' can not be created") 
-  
-    print('make_dirs is turned on, creating outdir:', outdir)
-    try: 
-        os.makedirs(outdir, exist_ok = True) 
-        print("Directory '%s' created successfully" %outdir) 
-    except OSError as error: 
-        print("Directory '%s' can not be created") 
-
-# handle missing dirs here
-if not os.path.isdir(datadir):
-    print('ERROR: Data directory does not exist:', datadir
-    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
-    sys.exit()
-if not os.path.isdir(indir):
-    print('ERROR: Input directory does not exist:', indir
-    , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
-    sys.exit()
-if not os.path.isdir(outdir):
-    print('ERROR: Output directory does not exist:', outdir
-    , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
-    sys.exit()
-    
-# Requires
-from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
-from tidy_split import tidy_split
-
-#=======================================================================    
-gene_match = gene + '_p.'
-print('mut pattern for gene', gene, ':',  gene_match)
-
-nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
-print('nsSNP for gene', gene, ':',  nssnp_match)
-
-wt_regex = gene_match.lower()+'([A-Za-z]{3})'
-print('wt regex:', wt_regex)
-
-mut_regex = r'[0-9]+(\w{3})$'
-print('mt regex:', mut_regex)
-
-pos_regex = r'([0-9]+)'
-print('position regex:', pos_regex)
-
-# building cols to extract
-dr_muts_col = 'dr_mutations_' + drug
-other_muts_col = 'other_mutations_' + drug
-resistance_col = 'drtype'
-
-print('Extracting columns based on variables:\n'
-	, drug
-	, '\n'
-	, dr_muts_col
-	, '\n'
-	, other_muts_col
-    , '\n'
-    , resistance_col
-	, '\n===============================================================')
-#=======================================================================
-
-#=======
-# input
-#=======
-#in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
-in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
-infile_master = datadir + '/' + in_filename_master
-print('Input file: ', infile_master
-      , '\n============================================================')
-
-#=======
-# output
-#=======
-# several output files: in respective sections at the time of outputting files
-print('Output filename: in the respective sections'
-      , '\nOutput path: ', outdir
-      , '\n=============================================================')
-
-#%%end of variable assignment for input and output files
-#=======================================================================
-#%% Read input file
-master_data  = pd.read_csv(infile_master, sep = ',')  
-
-# column names
-#list(master_data.columns)
-
-# extract elevant columns to extract from meta data related to the drug
-if in_filename_master == 'original_tanushree_data_v2.csv':
-    meta_data = master_data[['id'
-                             , 'country'
-                             , 'lineage'
-                             , 'sublineage'
-                             , 'drtype'
-                             , drug
-                             , dr_muts_col
-                             , other_muts_col]]
-else:
-    core_cols = ['id'
-                 , 'sample'
-                 #, 'patient_id'
-                 #, 'strain'
-                 , 'lineage'
-                 , 'sublineage' 
-                 , 'country_code'
-                 #, 'geographic_source'
-                 ,  resistance_col]
-    
-    variable_based_cols = [drug
-                           , dr_muts_col
-                           , other_muts_col]
-       
-    cols_to_extract = core_cols + variable_based_cols
-    print('Extracting', len(cols_to_extract), 'columns from master data')
-
-    meta_data = master_data[cols_to_extract]  
-    del(master_data, variable_based_cols, cols_to_extract)
-
-print('Extracted meta data from filename:', in_filename_master
-      , '\nDim:', meta_data.shape)
-
-# checks and results
-total_samples = meta_data['id'].nunique() 
-print('RESULT: Total samples:', total_samples
-		, '\n===========================================================')
-
-# counts NA per column
-meta_data.isna().sum()
-print('No. of NAs/column:' + '\n', meta_data.isna().sum()
-		, '\n===========================================================\n')
-
-#%% Quick checks:
-meta_data['lineage'].value_counts()
-meta_data['lineage'].value_counts().sum()    
-meta_data['lineage'].nunique()
-
-# replace lineage with 'L' in lineage_labels
-#meta_data['lineage_labels'] = meta_data['lineage']
-#meta_data['lineage_labels'].equals(meta_data['lineage'])
-#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts())
-#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L')
-#meta_data['lineage'].value_counts() 
-#meta_data['lineage_labels'].value_counts()
-
-meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L')
-
-meta_data['id'].nunique()
-meta_data['sample'].nunique()
-meta_data['id'].equals(meta_data['sample'])
-foo = meta_data.copy()
-foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0')
-foo['Diff'].value_counts()
-
-meta_data['drtype'].value_counts()
-meta_data['drtype'].value_counts().sum()
-
-#%% Write file: <gene>_gwas.csv
-check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file, index = False)
-print('\n----------------------------------'
-      , '\nWriting subsetted gwas data:', gene
-      , '\n----------------------------------'
-      , '\nFile', check_file
-      , '\nDim:', meta_data.shape)
-
-# equivalent of table in R
-# drug counts: complete samples for OR calcs
-meta_data[drug].value_counts() 
-print('===========================================================\n'
-      , 'RESULT: Total no. of samples tested for', drug, meta_data[drug].value_counts().sum()
-	  , '\n===========================================================\n'
-      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
-	  , '\n===========================================================\n'
-      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
-	  , '\n===========================================================')
-#%% Important sanity checks
-# This is to find out how many samples have 1 and more than 1 mutation,so you
-# can use it to check if your data extraction process for dr_muts 
-# and other_muts has worked correctly AND also to check the dim of the 
-# final formatted data.
-# This will have: unique COMBINATION of sample id and <gene_match> mutations
-#========
-# First: counting <gene_match> mutations in dr_muts_col column
-#========
-print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col)
-# drop na and extract a clean df
-clean_df =  meta_data.dropna(subset=[dr_muts_col])
-
-# sanity check: count na
-na_count = meta_data[dr_muts_col].isna().sum()
-
-if len(clean_df) == (total_samples - na_count):
-   print('PASS: clean_df extracted: length is', len(clean_df)
-         , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples
-         , '\n==========================================================')
-else:
-    sys.exit('FAIL: Could not drop NAs')
-
-dr_gene_count = 0
-wt = 0
-id_dr = []
-id2_dr = []
-#nssnp_match_regex = re.compile(nssnp_match)
-
-for i, id in enumerate(clean_df.id):
-    #print (i, id)
-    #id_dr.append(id)
-    #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts
-    count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE))
-    #print(count_gene_dr)
-    if count_gene_dr > 0:
-        id_dr.append(id)
-    if count_gene_dr > 1:
-        id2_dr.append(id)
-        #print(id, count_gene_dr)    
-    dr_gene_count = dr_gene_count + count_gene_dr
-    count_wt = clean_df[dr_muts_col].iloc[i].count('WT')
-    wt = wt + count_wt
-    
-print('RESULTS:')        
-print('Total WT in dr_muts_col:', wt)
-print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
-print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) )
-print('=================================================================')
-
-del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
-
-#========
-# Second: counting <gene_match> mutations in other_muts_col column
-#========
-print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)    
-# drop na and extract a clean df
-clean_df =  meta_data.dropna(subset=[other_muts_col])
-
-# sanity check: count na
-na_count = meta_data[other_muts_col].isna().sum()
-
-if len(clean_df) == (total_samples - na_count):
-   print('PASS: clean_df extracted: length is', len(clean_df)
-         , '\nNo.of NAs =', na_count, '/', total_samples
-         , '\n=========================================================')
-else:
-    sys.exit('FAIL: Could not drop NAs')
-
-other_gene_count = 0
-wt_other = 0
-id_other = []
-id2_other = []
-        
-for i, id in enumerate(clean_df.id):
-    #print (i, id)
-    #id_other.append(id)
-    #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts
-    count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE))
-    if count_gene_other > 0:
-        id_other.append(id)
-    if count_gene_other > 1:
-        id2_other.append(id)
-        #print(id, count_gene_other)    
-    other_gene_count = other_gene_count + count_gene_other
-    count_wt = clean_df[other_muts_col].iloc[i].count('WT')
-    wt_other = wt_other + count_wt        
-     
-print('RESULTS:')
-print('Total WT in other_muts_col:', wt_other)
-print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
-print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) )
-print('=================================================================')
-
-print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
-      , '\n===================================================================')
-expected_rows = dr_gene_count + other_gene_count
-
-#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
-del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
-
-#%% Extracting dr and other muts separately along with common cols
-print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data')
-print('muts to extract:', nssnp_match )
-
-#===============
-# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col
-#===============
-if in_filename_master == 'original_tanushree_data_v2.csv':
-    meta_data_dr = meta_data[['id'
-                              ,'country'
-                              ,'lineage'
-                              ,'sublineage'
-                              ,'drtype'
-                              , drug
-                              , dr_muts_col]]
-else:
-    dr_based_cols = [drug, dr_muts_col]
-    cols_to_extract = core_cols + dr_based_cols
-    print('Extracting', len(cols_to_extract), 'columns from meta data')
-    meta_data_dr = meta_data[cols_to_extract]
-    del(dr_based_cols, cols_to_extract)
-
-if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
-    print('PASS: Dimensions match'
-          , '\n===============================================================')
-else:
-    print('FAIL: Dimensions mismatch:'
-          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
-          , '\nGot:', meta_data_dr.shape
-	      , '\n===============================================================')
-    sys.exit()
-
-# Extract within this the nsSNPs for gene of interest using string match
-#meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
-meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
-print('gene_snp_match in dr:', len(meta_gene_dr))
-  
-dr_id = meta_gene_dr['id'].unique()
-print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
-
-if len(id_dr) == len(meta_gene_dr):
-    print('PASS: lengths match'
-    , '\n===============================================================')
-else:
-    print('FAIL: length mismatch'
-             , '\nExpected len:', len(id_dr)
-             , '\nGot:', len(meta_gene_dr))
-    sys.exit()
-
-dr_id = pd.Series(dr_id)
-
-#=================
-# other mutations: extract nssnp_match entries from other_muts_col
-#==================
-print('Extracting other_muts from:', other_muts_col,'with other meta_data')
-print('muts to extract:', nssnp_match)
-
-if in_filename_master ==  'original_tanushree_data_v2.csv':
-    meta_data_other = meta_data[['id'
-                                 , 'country'
-                                 , 'lineage'
-                                 , 'sublineage'
-                                 , 'drtype'
-                                 , drug
-                                 , other_muts_col]]
-else:
-    other_based_cols = [drug, other_muts_col]
-    cols_to_extract = core_cols + other_based_cols
-    print('Extracting', len(cols_to_extract), 'columns from meta data')
-    meta_data_other = meta_data[cols_to_extract]
-    del(other_based_cols, cols_to_extract)
-
-if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
-    print('PASS: Dimensions match'
-          , '\n===============================================================')
-else:
-    print('FAIL: Dimensions mismatch:'
-          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
-          , '\nGot:', meta_data_other.shape
-	      , '\n===============================================================')
-    sys.exit()
-
-# Extract within this nSSNP for gene of interest using string match
-#meta_gene_other  = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] 
-meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] 
-print('gene_snp_match in other:', len(meta_gene_other))
-
-other_id = meta_gene_other['id'].unique()
-print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
-
-if len(id_other) == len(meta_gene_other):
-    print('PASS: lengths match'
-    , '\n==============================================================')
-else:
-    print('FAIL: length mismatch'
-          , '\nExpected len:', len(id_other)
-          , '\nGot:', len(meta_gene_other))
-    sys.exit()
-
-other_id = pd.Series(other_id)
-#%% Find common IDs
-#==================
-# Common ids
-#==================
-print('Now extracting common_ids...')
-common_mut_ids = dr_id.isin(other_id).sum() 
-print('RESULT: No. of common ids:', common_mut_ids)
-
-# sanity checks
-# check if True: should be since these are common
-if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
-    print('PASS: Cross check on no. of common ids')
-else:
-    sys.exit('FAIL: Cross check on no. of common ids failed')
-    
-# check if the common are indeed the same!
-# bit of a tautology, but better safe than sorry!
-common_ids = dr_id[dr_id.isin(other_id)]
-common_ids = common_ids.reset_index()
-common_ids.columns = ['index', 'id']
-
-common_ids2 = other_id[other_id.isin(dr_id)]
-common_ids2 = common_ids2.reset_index()
-common_ids2.columns = ['index', 'id2']
-
-# should be True
-if common_ids['id'].equals(common_ids2['id2']):
-    print('PASS: Further cross checks on common ids')
-    nu_common_ids = common_ids.nunique()
-else:
-    sys.exit('FAIL: Further cross checks on common ids')
-
-# good sanity check: use it later to check gene_sample_counts
-expected_gene_samples = (len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids) 
-print('Expected no. of gene samples:', expected_gene_samples
-      , '\n=================================================================')
-#%% Write file: <gene>_common_ids.csv
-#print(outdir)
-out_filename_cid = gene.lower() + '_common_ids.csv'
-outfile_cid =  outdir + '/' + out_filename_cid
-print('\n----------------------------------'
-      '\nWriting file: common_ids'
-      '\n----------------------------------'
-      , '\nFile:', outfile_cid
-      , '\nNo. of rows:', len(common_ids)
-      , '\n=============================================================')
-
-common_ids.to_csv(outfile_cid, index = False)
-
-del(out_filename_cid)
-
-# clear variables
-del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
-#%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match'
-print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
-      , dr_muts_col, 'and', other_muts_col, 'using string match:'
-      , '\n===================================================================')
-#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ]
-meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match
-                                                                  , na = False
-                                                                  , regex = True
-                                                                  , case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
-
-extracted_gene_samples = meta_gene_all['id'].nunique()
-print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
-      , '\n===================================================================')
-
-# sanity check: length of gene samples
-print('Performing sanity check:')
-if extracted_gene_samples == expected_gene_samples:
-    print('PASS: expected & actual no. of nssnp gene samples match'
-          , '\nNo. of gene samples:', len(meta_gene_all)
-          , '\n=========================================================')
-else:
-    sys.exit('FAIL: Length mismatch in gene samples!')
-
-# count NA in drug column
-gene_na = meta_gene_all[drug].isna().sum() 
-print('No. of gene samples without', drug, 'testing:', gene_na)
-
-# use it later to check number of complete samples from LF data
-comp_gene_samples = len(meta_gene_all) - gene_na
-print('Complete gene samples tested for', drug, ':', comp_gene_samples)
-print('=================================================================')
-
-# Comment: This is still dirty data since these
-# are samples that have nsSNP muts, but can have others as well
-# since the format for mutations is mut1; mut2, etc.
-print('This is still dirty data: samples have ', nssnp_match, 'muts but may have others as well'
-      , '\nsince the format for mutations is mut1; mut2, etc.'
-      , '\n=============================================================')
-
-print('Performing tidy_split(): to separate the mutations into individual rows')
-
-#%% tidy_split(): dr_muts_col
-#=========
-# DF1: dr_muts_col
-# and remove leading white spaces
-#=========
-col_to_split1 = dr_muts_col
-print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
-       , '\ncolumn name to apply tidy_split():', col_to_split1
-       , '\n============================================================')
-# apply tidy_split()
-dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') 
-# remove leading white space else these are counted as distinct mutations as well
-dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() 
-
-# extract only the samples/rows with nssnp_match
-#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]
-dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)]
-
-print('Lengths after tidy split and extracting', nssnp_match, 'muts:'
-      , '\nOld length:' , len(meta_gene_dr)
-      , '\nLength after split:', len(dr_WF0)
-      , '\nLength of nssnp df:', len(dr_gene_WF0)
-      , '\nExpected len:', dr_gene_count
-      , '\n=============================================================')
-if len(dr_gene_WF0) == dr_gene_count:
-    print('PASS: length matches expected length'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: lengths mismatch')
-
-# count the freq of 'dr_muts' samples
-dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] 
-print('Dim of dr_muts_df:', dr_muts_df.shape)
-
-# add freq column
-dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') 
-#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count')
-print('Revised dim of dr_muts_df:', dr_muts_df.shape) 
-
-c1 = dr_muts_df.dr_sample_freq.value_counts()
-print('Counting no. of sample frequency:\n', c1
-      , '\n===================================================================')
-
-# sanity check: length of gene samples
-if len(dr_gene_WF0) == c1.sum():
-    print('PASS: WF data has expected length'
-    , '\nLength of dr_gene WFO:', c1.sum()
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: length mismatch!')
-
-# Important: Assign 'column name' on which split was performed as an extra column
-# This is so you can identify if mutations are dr_type or other in the final df
-dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col)
-print('Dim of dr_df:', dr_df.shape
-	, '\n=============================================================='
-	, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
-	, '\n===============================================================')
-#%% tidy_split(): other_muts_col
-#=========
-# DF2: other_muts_col
-# and remove leading white spaces
-#=========
-col_to_split2 = other_muts_col
-print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
-       , '\ncolumn name to apply tidy_split():', col_to_split2
-       , '\n============================================================')
-
-# apply tidy_split()
-other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') 
-# remove the leading white spaces in the column
-other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() 
-
-# extract only the samples/rows with nssnp_match
-#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]
-other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)]
-
-print('Lengths after tidy split and extracting', gene_match, 'muts:',
-      '\nOld length:' , len(meta_gene_other),
-      '\nLength after split:', len(other_WF1),
-      '\nLength of nssnp df:', len(other_gene_WF1),
-      '\nExpected len:', other_gene_count
-      , '\n=============================================================')
-if len(other_gene_WF1) == other_gene_count:
-    print('PASS: length matches expected length'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: lengths mismatch')
-
-# count the freq of 'other muts' samples
-other_muts_df = other_gene_WF1 [['id', other_muts_col]] 
-print('Dim of other_muts_df:', other_muts_df.shape) 
-
-# add freq column
-other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
-print('Revised dim of other_muts_df:', other_muts_df.shape) 
-
-c2 = other_muts_df.other_sample_freq.value_counts()
-print('Counting no. of sample frequency:\n', c2)
-print('=================================================================')
-# sanity check: length of gene samples
-if len(other_gene_WF1) == c2.sum():
-    print('PASS: WF data has expected length'
-    , '\nLength of other_gene WFO:', c2.sum()
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: Length mismatch')
-    
-# Important: Assign 'column name' on which split was performed as an extra column
-# This is so you can identify if mutations are dr_type or other in the final df
-other_df = other_gene_WF1.assign(mutation_info = other_muts_col)
-print('dim of other_df:', other_df.shape
-	, '\n==============================================================='
-	, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
-	, '\n===============================================================')
-#%% NEW check1: dr muts and other muts
-#foo = dr_WF0[dr_muts_col][~dr_WF0[dr_muts_col].isin(dr_df[dr_muts_col])]
-#bar = other_WF1[other_muts_col][~other_WF1[other_muts_col].isin(other_df[other_muts_col])]
-dr_df['id'][~dr_df['id'].isin(other_df['id'])].count()
-other_df['id'][~other_df['id'].isin(dr_df['id'])].count()
-
-#%% Concatenating two dfs: gene_LF0
-# equivalent of rbind in R
-#==========
-# Concatentating the two dfs: equivalent of rbind in R
-#==========
-# Important: Change column names to allow concat:
-# dr_muts.. & other_muts : 'mutation'
-print('Now concatenating the two dfs by row'
-      , '\nFirst assigning a common colname: "mutation" to the col containing muts'
-      , '\nThis is done for both dfs'
-      , '\n===================================================================')
-
-dr_df.columns
-dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True)
-dr_df.columns
-
-other_df.columns
-other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
-other_df.columns
-
-if len(dr_df.columns) == len(other_df.columns):
-    print('Checking dfs for concatening by rows:'
-          , '\nDim of dr_df:', dr_df.shape
-          , '\nDim of other_df:', other_df.shape
-          , '\nExpected nrows:', len(dr_df) +  len(other_df)
-          , '\n=============================================================')
-else:
-    sys.exit('FAIL: No. of cols mismatch for concatenating')
-
-# checking colnames before concat
-print('Checking colnames BEFORE concatenating the two dfs...')
-if (set(dr_df.columns) == set(other_df.columns)):
-    print('PASS: column names match necessary for merging two dfs')
-else:
-    sys.exit('FAIL: Colnames mismatch for concatenating!')
-
-# concatenate (axis = 0): Rbind
-print('Now appending the two dfs: Rbind')
-gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
-
-print('Finding stop mutations in concatenated df')
-stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
-if stop_muts == 0:
-    print('PASS: No stop mutations detected')
-else:
-    print('stop mutations detected'
-          , '\nNo. of stop muts:', stop_muts, '\n'
-          , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
-          , '\nNow removing them')    
-    
-gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')]
-print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp))
-
-gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
-print('snps only by direct regex:', len(gene_LF0))
-
-if gene_LF0_nssnp.equals(gene_LF0):
-    print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed'
-          , '\nUsing the regex extracted df')
-else:
-    sys.exit('FAIL: posssibly regex or no of stop mutations'
-             , 'Regex being used:', nssnp_match)
-    #sys.exit()
-
-# checking colnames and length after concat
-print('Checking colnames AFTER concatenating the two dfs...')
-if (set(dr_df.columns) == set(gene_LF0.columns)):
-    print('PASS: column names match'
-          , '\n=============================================================')
-else:
-    sys.exit('FAIL: Colnames mismatch AFTER concatenating')
-    
-print('Checking concatenated df')
-if len(gene_LF0) == (len(dr_df) +  len(other_df))- stop_muts:
-    print('PASS:length of df after concat match'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: length mismatch')
-
-#%% NEW check2:id, sample, etc.
-gene_LF0['id'].count()
-gene_LF0['id'].nunique()
-gene_LF0['sample'].nunique()
-gene_LF0['mutation_info'].value_counts()
-gene_LF0[drug].isna().sum()
-#%% Concatenating two dfs sanity checks: gene_LF1
-#=========================================================
-# This is hopefully clean data, but just double check
-# Filter LF data so that you only have
-# mutations corresponding to nssnp_match (case insensitive)
-# this will be your list you run OR calcs 
-#=========================================================
-print('Length of gene_LF0:', len(gene_LF0)
-      , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
-      , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
-
-gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]   
-
-if len(gene_LF0) == len(gene_LF1):
-    print('PASS: length of gene_LF0 and gene_LF1 match',
-          '\nConfirming extraction and concatenating worked correctly'
-          , '\n==========================================================')
-else:
-    print('FAIL: BUT NOT FATAL!'
-          , '\ngene_LF0 and gene_LF1 lengths differ'
-          , '\nsuggesting error in extraction process'
-          , ' use gene_LF1 for downstreama analysis'
-          , '\n=========================================================')   
-    
-print('Length of dfs pre and post processing...'
-      , '\ngene WF data (unique samples in each row):', extracted_gene_samples
-      , '\ngene LF data (unique mutation in each row):', len(gene_LF1)
-      , '\n=============================================================')
-  
-# sanity check for extraction
-# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
-print('Verifying whether extraction process worked correctly...')
-if len(gene_LF1) == expected_rows:
-    print('PASS: extraction process performed correctly'
-          , '\nExpected length:', expected_rows
-          , '\nGot:', len(gene_LF1)
-          , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1)
-          , '\n=========================================================')
-else:
-    print('FAIL: extraction process has bugs'
-          , '\nExpected length:', expected_rows
-          , '\nGot:', len(gene_LF1)
-          , '\nDebug please'
-          , '\n=========================================================')
-
-# more sanity checks
-print('Performing some more sanity checks...')
-
-# From LF1: useful for OR counts
-# no. of unique muts
-distinct_muts = gene_LF1.mutation.value_counts()
-print('Distinct genomic mutations:', len(distinct_muts))
-
-# no. of samples contributing the unique muts
-gene_LF1.id.nunique()
-print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique())
-
-# no. of dr and other muts
-mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique()
-print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique())
-
-# sanity check
-if len(distinct_muts) == mut_grouped.sum() :
-    print('PASS:length of LF1 is as expected'
-    , '\n===============================================================')
-else:
-    print('FAIL: Mistmatch in count of muts'
-          , '\nExpected count:', len(distinct_muts)
-          , '\nActual count:', mut_grouped.sum()
-          , '\nMuts should be distinct within dr* and other* type'
-          , '\nInspecting...possibly ambiguous muts'
-          , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts)
-          , '\n=========================================================')
-    
-muts_split = list(gene_LF1.groupby('mutation_info'))
-dr_muts = muts_split[0][1].mutation 
-other_muts =  muts_split[1][1].mutation
-print('splitting muts by mut_info:', muts_split)
-print('no.of dr_muts samples:', len(dr_muts))
-print('no. of other_muts samples', len(other_muts))       
-
-#%% Ambiguous muts
-# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
-          , '\n===============================================================')
-else:
-    print('PASS: NO ambiguous muts detected'
-          , '\nMuts are unique to dr_ and other_ mutation class'
-          , '\n=========================================================')
-
-# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected!
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('Finding ambiguous muts...'
-          , '\n========================================================='
-          , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
-          , '\nThese are:', dr_muts[dr_muts.isin(other_muts)]
-          , '\n========================================================='
-          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
-          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
-          , '\n=========================================================')
-    
-    print('Counting no. of ambiguous muts...'
-          , '\nNo. of ambiguous muts in dr:'
-          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-          , '\nNo. of ambiguous muts in other:'
-          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
-          , '\n=========================================================')
-    
-    if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
-        common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-        print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
-              , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
-        print('\n===========================================================') 
-else:
-    #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
-    print('No: ambiguous muts present')
-    
-#print('Counting no. of ambiguous muts...')
-#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
-#    common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-#    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
-#          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
-#    print('\n===========================================================')
-#else:
-#   print('Error: ambiguous muts detected, but extraction failed. Debug!'
-#         , '\nNo. of ambiguous muts in dr:'
-#         , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-#         , '\nNo. of ambiguous muts in other:'
-#         , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
-#         , '\n=========================================================')       
-
-#%% Ambiguous muts: revised annotation for mutation_info
-ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()    
-ambiguous_muts_value_counts
-
-gene_LF1_orig = gene_LF1.copy()
-gene_LF1_orig.equals(gene_LF1)
-
-# copy the old columns for checking
-gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] 
-gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] 
-gene_LF1['mutation_info'].value_counts()
-
-#=====================================
-# Now create a new df that will have:
-    # ambiguous muts
-    # mutation_info
-    # revised mutation_info
-# The revised label is based on value_counts
-# for mutaiton_info. The corresponding mutation_info:
-# label is chosen that corresponds to the max of value counts
-#=====================================
-ambig_muts_rev_df = pd.DataFrame()
-changes_val = []
-changes_dict = {}
-
-for i in common_muts:
-    #print(i)
-    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
-    
-  # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
-    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
-    
-    revised_label = max_info_table[[0]].index[0][1] # max value_count
-    old_label     = max_info_table[[1]].index[0][1] # min value_count 
-    print('\nAmbiguous mutation labels...'
-          , '\nSetting mutation_info for', i, 'to', revised_label)
-    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
-                      , revised_label
-                      , temp_df['mutation_info_orig'])
-    ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
-    f = max_info_table[[1]]
-    old_label_count = f[[0]][0]
-    changes_val.append(old_label_count) 
-    cc_dict = f.to_dict()
-    changes_dict.update(cc_dict)
-                      
-ambig_muts_rev_df['mutation_info_REV'].value_counts() 
-ambig_muts_rev_df['mutation_info_orig'].value_counts() 
-changes_val
-changes_total = sum(changes_val)
-changes_dict
-
-#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks
-#%% Merging ambiguous muts
-#=================
-# Merge ambig muts
-# with gene_LF1
-#===================
-ambig_muts_rev_df.index
-gene_LF1.index
-all(ambig_muts_rev_df.index.isin(gene_LF1.index))
-
-gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
-gene_LF1['mutation_info_orig'].value_counts()
-gene_LF1['mutation_info_v1'].value_counts()
-foo = gene_LF1.iloc[ambig_muts_rev_df.index]
-
-# Sanity check1: if there are still any ambiguous muts
-muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
-dr_muts_rev = muts_split_rev[0][1].mutation 
-other_muts_rev =  muts_split_rev[1][1].mutation
-print('splitting muts by mut_info:', muts_split_rev)
-print('no.of dr_muts samples:', len(dr_muts_rev))
-print('no. of other_muts samples', len(other_muts_rev))  
-
-if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
-    print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
-else:
-    print('\nAmbiguous muts NOT corrected. Quitting!')
-    sys.exit()
-
-#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
-#===================
-# ambiguous muts
-#=======================
-#dr_muts.to_csv('dr_muts.csv', header = True)
-#other_muts.to_csv('other_muts.csv', header = True)
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
-    outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
-    print('\n----------------------------------'
-          , '\nWriting file: ambiguous muts'
-          , '\n----------------------------------'
-          , '\nFilename:', outfile_ambig_muts)
-    inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-    inspect.to_csv(outfile_ambig_muts, index = True)
-
-    print('Finished writing:', out_filename_ambig_muts
-          , '\nNo. of rows:', len(inspect)
-          , '\nNo. of cols:', len(inspect.columns)
-          , '\nNo. of rows = no. of samples with the ambiguous muts present:'
-          , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
-          , '\n=============================================================')
-    del(out_filename_ambig_muts)
-
-#=======================
-# ambiguous mut counts
-#=======================
-out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
-outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
-print('\n----------------------------------'
-       , '\nWriting file: ambiguous muts'
-       , '\n----------------------------------'
-       , '\nFilename:', outfile_ambig_mut_counts)
-
-ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
-
-# clear variables
-del(id_dr, id_other
-    #, meta_data
-    , meta_gene_dr, meta_gene_other
-    #, mut_grouped
-    , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)  
-
-del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)                     
- 
-#%% Add column: Mutationinformation
-# splitting mutation column to get mCSM style muts
-#=======================
-# Formatting df: read aa dict and pull relevant info
-#=======================
-print('Now some more formatting:'
-      , '\nRead aa dict and pull relevant info'
-      , '\nFormat mutations:'
-      , '\nsplit mutation into mCSM style muts: '
-      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
-      , '\nAssign aa properties: adding 2 cols at a time for each prop'
-      , '\n===================================================================')
-
-# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
-# in each lookup cycle 
-ncol_mutf_add = 3 # mut split into 3 cols
-ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
-
-#===========
-# Split 'mutation' column into three:  wild_type, position and
-# mutant_type separately. Then map three letter code to one using
-# reference_dict imported at the beginning.
-# After importing, convert to mutation to lowercase for compatibility with dict 
-#===========
-gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
-
-print('wt regex being used:', wt_regex
-      , '\nmut regex being used:', mut_regex
-      , '\nposition regex being used:', pos_regex)
-
-mylen0 = len(gene_LF1.columns)
-
-#=======
-# Iterate through the dict, create a lookup dict i.e
-# lookup_dict = {three_letter_code: one_letter_code}.
-# lookup dict should be the key and the value (you want to create a column for)
-# Then use this to perform the mapping separetly for wild type and mutant cols.
-# The three letter code is extracted using a string match match from the dataframe and then converted
-# to 'pandas series'since map only works in pandas series
-#=======
-print('Adding', ncol_mutf_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to 1-letter code
-# adding three more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['one_letter_code']
-    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wild_type'] = wt.map(lookup_dict)   
-    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mutant_type'] = mut.map(lookup_dict)
-
-# extract position info from mutation column separetly using string match
-#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
-gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
-
-mylen1 = len(gene_LF1.columns)
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen1 ==  mylen0 + ncol_mutf_add:
-    print('PASS: successfully added', ncol_mutf_add, 'cols'
-      , '\nold length:', mylen0
-      , '\nnew len:', mylen1)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen0
-          , '\nnew len:', mylen1)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
-########
-# combine the wild_type+poistion+mutant_type columns to generate 
-# mutationinformation (matches mCSM output field)
-# Remember to use .map(str) for int col types to allow string concatenation
-#########
-gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
-print('Created column: mutationinformation'
-	, '\n=====================================================================\n'
-    , gene_LF1.mutationinformation.head(10))
-
-#order by position for convenience
-gene_LF1.dtypes
-
-# converting position to numeric
-gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
-
-# sort by position inplace 
-foo = gene_LF1['position'].value_counts()
-gene_LF1.sort_values(by = ['position'], inplace = True)
-bar = gene_LF1['position'].value_counts()
-
-if (foo == bar).all():
-    print('PASS: df ordered by position')
-    print(gene_LF1['position'].head())
-else:
-    print('FAIL: df could not be ordered. Check source')
-    sys.exit()
-
-#%% OUTFILE 4, write file mCSM style muts
-snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
-snps_only.head()
-# assign column name
-snps_only.columns = ['mutationinformation']
-
-# count how many positions this corresponds to
-pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
-
-print('Checking NA in snps...')# should be 0
-if snps_only.mutationinformation.isna().sum() == 0:
-    print ('PASS: NO NAs/missing entries for SNPs'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
-
-# write file: mCSM muts
-out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
-outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
-
-print('\n----------------------------------'
-      , '\nWriting file: mCSM style muts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_mcsmsnps
-      , '\nmutation format (SNP): {WT}<POS>{MUT}'
-      , '\nNo. of distinct muts:', len(snps_only)
-      , '\nNo. of distinct positions:', len(pos_only)
-      , '\n=============================================================')
-
-snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
-
-print('Finished writing:', outfile_mcsmsnps
-      , '\nNo. of rows:', len(snps_only)
-      , '\nNo. of cols:', len(snps_only.columns)
-      , '\n=============================================================')
-del(out_filename_mcsmsnps)
-#%% OUTFILE 5, write file frequency of position counts
-metadata_pos = pd.DataFrame(gene_LF1['position'])
-z =  gene_LF1['position'].value_counts()
-z1 = z.to_dict()
-metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
-metadata_pos['meta_pos_count'].value_counts()
-
-metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
-
-out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
-outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
-print('\n----------------------------------'
-      , '\nWriting file: Metadata poscounts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_metadata_poscounts
-      , '\n============================================================')
-
-metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
-print('Finished writing:', outfile_metadata_poscounts
-      , '\nNo. of rows:', len(metadata_pos)
-      , '\nNo. of cols:', len(metadata_pos.columns)
-      , '\n=============================================================')
-del(out_filename_metadata_poscounts)
-#%% OUTFILE 6, write file <gene>_metadata 
-# gene_LF1, where each row has UNIQUE mutations NOT unique sample ids
-out_filename_metadata = gene.lower() + '_metadata.csv'
-outfile_metadata = outdir + '/' + out_filename_metadata
-print('\n----------------------------------'
-      , '\nWriting file: LF formatted data'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_metadata
-      , '\n============================================================')
-
-gene_LF1.to_csv(outfile_metadata, header = True, index = False)
-print('Finished writing:', outfile_metadata
-      , '\nNo. of rows:', len(gene_LF1)
-      , '\nNo. of cols:', len(gene_LF1.columns)
-      , '\n=============================================================')
-del(out_filename_metadata)
-#%% OUTFILE 7, write file MSA plots
-# mCSM style muts but with repetitions
-# !!! THINK !!!: Implications if mut is coming from the same sample, etc.
-all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation']) 
-all_muts_msa.head()
-
-# assign column name
-all_muts_msa.columns = ['mutationinformation']
-
-# make sure it is string
-all_muts_msa.columns.dtype
-
-# sort the column
-all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
-
-# create an extra column with protein name
-#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
-#all_muts_msa_sorted.head()
-
-# rearrange columns so the fasta name is the first column (required for mutate.script)
-#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
-all_muts_msa_sorted.head()
-
-print('Checking NA in snps...')# should be 0
-if all_muts_msa.mutationinformation.isna().sum() == 0:
-    print ('PASS: NO NAs/missing entries for SNPs'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
-
-out_filename_msa = gene.lower() +'_all_muts_msa.csv'
-outfile_msa = outdir + '/' + out_filename_msa
-
-print('\n----------------------------------'
-      , '\nWriting file: mCSM style muts for msa'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_msa
-      , '\nmutation format (SNP): {WT}<POS>{MUT}'
-      , '\nNo.of lines of msa:', len(all_muts_msa))
-
-all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
-
-print('Finished writing:', outfile_msa
-      , '\nNo. of rows:', len(all_muts_msa)
-      , '\nNo. of cols:', len(all_muts_msa.columns)
-      , '\n=============================================================')
-
-del(out_filename_msa)
-#%% OUTFILE 8, write file mutational position with count
-# count how many positions this corresponds to
-pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
-# assign column name
-pos_only.columns = ['position']
-# make sure dtype of column position is int or numeric and not string
-pos_only.position.dtype
-pos_only['position'] = pos_only['position'].astype(int)
-pos_only.position.dtype
-
-# sort by position value
-pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
-
-out_filename_pos = gene.lower() + '_mutational_positons.csv'
-outfile_pos = outdir + '/' + out_filename_pos
-
-print('\n----------------------------------'
-      , '\nWriting file: mutational positions'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_pos
-      , '\nNo. of distinct positions:', len(pos_only_sorted)
-      , '\n=============================================================')
-
-pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
-
-print('Finished writing:', outfile_pos
-      , '\nNo. of rows:', len(pos_only_sorted)
-      , '\nNo. of cols:', len(pos_only_sorted.columns)
-      , '\n============================================================='
-      , '\n\n\n')
-
-del(out_filename_pos)
-#%% Add column: aa property_water
-#=========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_prop_water} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-# adding two more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_prop_water']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)   
-    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
-
-mylen2 = len(gene_LF1.columns)    
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen2 == mylen1 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen1
-      , '\nnew len:', mylen2)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen1
-          , '\nnew len:', mylen2)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-#%% Add column: aa_prop_polarity
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_prop_polarity} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-# adding two more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_prop_polarity']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
-   #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
-    
-mylen3 = len(gene_LF1.columns)       
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen3 ==  mylen2 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen1
-      , '\nnew len:', mylen2)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen1
-          , '\nnew len:', mylen2)
-   
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-#%% Add column: aa_calcprop
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_calcprop} 
-# Do this for both wild_type and mutant as above.
-#=========
-print('Adding', ncol_aa_add, 'more cols:\n')
-
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_calcprop']
-    #print(lookup_dict)
-    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
-    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)   
-    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
-    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
- 
-mylen4 = len(gene_LF1.columns)   
-
-# sanity checks
-print('checking if 3-letter wt&mut residue extraction worked correctly')
-if wt.isna().sum() & mut.isna().sum() == 0:
-   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
-         , '\nNo NAs detected:'
-         , '\nwild-type\n', wt
-         , '\nmutant-type\n', mut
-         , '\ndim of df:', gene_LF1.shape)
-else:
-    print('FAIL: 3-letter wt&mut residue extraction failed'
-          , '\nNo NAs detected:'
-          , '\nwild-type\n', wt
-          , '\nmutant-type\n', mut
-          , '\ndim of df:', gene_LF1.shape)
-
-if mylen4 ==  mylen3 + ncol_aa_add:
-    print('PASS: successfully added', ncol_aa_add, 'cols'
-      , '\nold length:', mylen3
-      , '\nnew len:', mylen4)
-else:
-    print('FAIL: failed to add cols:'
-          , '\nold length:', mylen3
-          , '\nnew len:', mylen4)
-
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-#%% NEW mappings: gene_LF2
-# gene_LF2: copy gene_LF1
-gene_LF2 = gene_LF1.copy()
-
-#%% AF for gene
-gene_LF2['id'].nunique()
-gene_LF2['mutationinformation'].nunique()
-total_id_ucount = gene_LF2['id'].nunique()
-total_id_ucount
-total_id_ucount2 = gene_LF2['sample'].nunique()
-total_id_ucount2
-
-if total_id_ucount == total_id_ucount2:
-    print('\nPASS: sample and id unique counts match')
-else:
-    print('\nFAIL: sample and id unique counts DO NOT match!'
-          , '\nGWAS worry!?')
-
-# Add all sample ids in a list for sanity checks
-#gene_LF2['id_list']  = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
-
-#=======================================
-# Add column: total unique id/sample count
-#=======================================
-gene_LF2['total_id_ucount'] = total_id_ucount
-
-#==========================================
-# Add column: mutation count in all samples 
-#==========================================
-gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
-gene_LF2['mut_id_ucount']
-
-#=================
-# Add column: AF
-#=================
-gene_LF2['maf'] = gene_LF2['mut_id_ucount']/gene_LF2['total_id_ucount']
-gene_LF2['maf'].head()
-
-#%% Mapping 1: column '<drug>', mutation_info
-#=======================
-# column name: <drug>
-#=======================
-# mapping 1.1: labels
-dm_om_label_map = {dr_muts_col: 'DM'
-             , other_muts_col: 'OM'}
-dm_om_label_map
-gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
-
-# mapping 1.2: numeric
-dm_om_num_map = {dr_muts_col: 1
-              , other_muts_col: 0}
-
-gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
-gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
-
-gene_LF2['mutation_info'].value_counts()
-gene_LF2['mutation_info_labels'].value_counts()
-gene_LF2['dm_om_numeric'].value_counts()
-gene_LF2['dm_om_numeric_orig'].value_counts()
-
-#%% Mapping 2: column '<drtype>', mutation
-#============================
-# column name: <drtype>
-#============================
-gene_LF2['drtype'].value_counts()
-
-# mapping 2.1: numeric
-drtype_map = {'XDR': 5
-              , 'Pre-XDR': 4
-              , 'MDR': 3
-              , 'Pre-MDR': 2
-              , 'Other': 1
-              , 'Sensitive': 0}
-
-gene_LF2['drtype_numeric']  = gene_LF2['drtype'].map(drtype_map)
-
-gene_LF2['drtype'].value_counts()
-gene_LF2['drtype_numeric'].value_counts()
-
-#%% Recalculations: Multimode
-# copy dst column
-gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
-gene_LF2['dst'].equals(gene_LF2[drug])
-
-gene_LF2['dst_multimode'] = gene_LF2[drug]
-
-# sanity check
-gene_LF2[drug].value_counts()
-gene_LF2['dst_multimode'].value_counts()
-
-gene_LF2[drug].isnull().sum()
-gene_LF2['dst_multimode'].isnull().sum()
-
-gene_LF2['mutationinformation'].value_counts()
-gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum()
-
-# GOAL is to populate na in the dst column from the count of the dm_om_numeric column                          
-gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum()                                  
-gene_LF2.index
-gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
-#%% FIXME: Add sanity check
-#gene_LF2['index_orig'].equals(gene_LF2.index)
-#%% Set index: 'mutationinformation' for adding multimode
-gene_LF2['Mut'] = gene_LF2['mutationinformation']
-#%% Further mappings: gene_LF3
-gene_LF3 = gene_LF2.set_index(['Mut'])
-gene_LF3.index
-#gene_LF3 = gene_LF2.set_index(['mutationinformation'])
-
-gene_LF3['dst_multimode'].value_counts()
-gene_LF3['dst_multimode'].value_counts().sum()
-#%% Multimode: dst
-# For each mutation, generate the revised dst which is the mode of dm_om_numeric
-#=============================
-# Recalculation: Revised dst
-# max(multimode)
-#=============================
-# Get multimode for dm_om_numeric column
-dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
-dm_om_multimode_LF3
-
-# Fill using multimode ONLY where NA in dst_multimode column
-gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
-
-# Now get the max from multimode
-gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
-print(gene_LF3)
-
-#----------------------------
-# Revised dst column: Max
-#----------------------------
-# Finally created a revised dst with the max from the multimode
-gene_LF3['dst_mode']  = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
-
-#%% Multimode: drtype
-#=============================
-# Recalculation: Revised drtype
-# max(multimode)
-#=============================
-#--------------------------------
-# drtype: ALL values:
-# numeric and names in an array 
-#--------------------------------
-gene_LF3['drtype_all_vals']  = gene_LF3['drtype_numeric']
-gene_LF3['drtype_all_names'] = gene_LF3['drtype']
-
-gene_LF3['drtype_all_vals']  = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
-gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
-
-#---------------------------------
-# Revised drtype: max(Multimode)
-#--------------------------------
-gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
-gene_LF3['drtype_multimode']
-
-# Now get the max from multimode
-gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
-gene_LF3.head()
-
-#----------------------
-# Revised drtype: Max
-#----------------------
-gene_LF3.head()
-gene_LF3['drtype_max'] =  gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
-gene_LF3.head()
-
-#%% Revised counts
-gene_LF3['dst_mode'].value_counts()
-gene_LF3[drug].value_counts()
-
-print('\n------------------------------------------------------'
-      , '\nRevised counting: mutation_info i.e dm om column'
-      , '\n-----------------------------------------------------'
-      
-      , '\n----------------------------------'
-      , '\nOriginal drug column count'
-      , '\n----------------------------------'
-      ,  gene_LF3[drug].value_counts()
-      , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
-      
-      , '\n----------------------------------'
-      , '\nRevised drug column count'
-      , '\n----------------------------------'
-      , gene_LF3['dst_mode'].value_counts()
-      , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
-      )
-#%% FIXME: CHECK THIS, run this with mutation_info_REV
-#---------------------------------------
-# Create revised mutation_info_column
-#---------------------------------------
-# Note this is overriding, since downstream depends on it
-# make a copy you if you need to keep that
-# create a copy before running this
-
-gene_LF3['mutation_info_labels_orig'] =  gene_LF3['mutation_info_labels'] 
-
-gene_LF3['mutation_info_labels']  = gene_LF3['dst_mode'].map({1: 'DM'
-                                                    , 0: 'OM'})
-
-gene_LF3['mutation_info_labels_orig'].value_counts()
-gene_LF3['mutation_info_labels_orig'].value_counts().sum()
-
-gene_LF3['mutation_info_labels'].value_counts()
-gene_LF3['mutation_info_labels'].value_counts().sum()
-
-gene_LF3['mutation_info_labels'].value_counts()
-gene_LF3['mutation_info'].value_counts()
-gene_LF3['mutation_info_orig'].value_counts()
-gene_LF3['mutation_info_orig'].value_counts().sum()
-
-gene_LF3['mutation_info_v1'].value_counts()
-gene_LF3['mutation_info_v1'].value_counts().sum()
-
-#%% TEST muts
-test_muts = ['G132A', 'V180F', 'G108R', 'A102P']
-test_muts = ['L4S', 'L4W', 'A102P']
-
-gene_LF3 = gene_LF2[gene_LF2.loc[:,'mutationinformation'].isin(test_muts)]
-gene_LF4 = gene_LF3[['id', 'mutationinformation', drug
-                     , 'mutation_info_orig', 'dm_om_numeric_orig', 'dst', 'dst_multimode']]
-
-# Reset index as it allows the groupby expression to directly map
-gene_LF4.index
-gene_LF4= gene_LF4.set_index(['mutationinformation'])
-gene_LF4.index
-
-# Get multimode for dm_om_numeric column
-dm_om_multimode_LF4 = gene_LF4.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
-dm_om_multimode_LF4
-
-gene_LF4['dst_multimode'] = gene_LF4['dst_multimode'].fillna(dm_om_multimode_LF4)
-
-# LINEAGE 
-foo = gene_LF2.copy()
-foo = foo[foo.loc[:,'mutationinformation'].isin(test_muts)]
-foo = foo[['id', 'mutationinformation','lineage' ]]
-foo['MUT'] = foo['mutationinformation']
-foo['lineage'] = foo['lineage'].str.strip()
-foo['lineage_corrupt'] = foo['lineage']
-
-#foo['lineage_ucount'] = foo['mutationinformation'].map(foo.groupby('mutationinformation')['lineage'].nunique())# seems wrong!
-
-foo2 = tidy_split(foo, 'lineage_corrupt', sep = ';') 
-foo2['lineage_corrupt'] = foo2['lineage_corrupt'].str.strip()
-foo2['lineage_corrupt_list']  = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].apply(list))
-foo2['lineage_corrupt_ucount'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].nunique())
-
-foo2.groupby('mutationinformation')['lineage_corrupt'].value_counts()
-foo2['lineage_corrupt'].value_counts()
-foo2['lineage_corrupt_ucount']
-foo2.index
-foo2 = foo2.set_index(['mutationinformation'])
-
-# now merge 
-foo.index
-foo.index.nunique()
-foo2.index.nunique()
-foo_copy = foo.copy()
-
-foo_copy['lineage_ucount'] = foo_copy['lineage'] 
-foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']
-
-#%%FIXME: do regex for lineage for meta data else the ; messes it up
-# MOVE THIS TO THE RELEVANT section
-#--------------------------
-# lineage multimode mode
-#--------------------------
-foo['lineage_labels'] = foo['lineage_labels'].str.replace('lineage', 'L')
-# foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # doesn't specify a column
-foo['lineage_labels'] = foo['lineage']
-
-#df['team'] =  df['team'].apply(lambda x: re.sub(r'[\n\r]*','', str(x)))
-foo['lineage_labels'] = foo['lineage'].apply(lambda x: re.sub(r'lineage','L', str(x)))
-
-lineage_label_numeric = {'lineage1' : 1
-                    , 'lineage2'   : 2        
-                    , 'lineage3'   : 3             
-                    , 'lineage4'   : 4              
-                    , 'lineage5'   : 5              
-                    , 'lineage6'   : 6              
-                    , 'lineage7'   : 7               
-                    , 'lineageBOV' : 8}
-
-
-lineage_label_numeric
-foo2['lineage_corrupt'].value_counts()
-foo2['lineage_numeric']  = foo2['lineage_corrupt'].map(lineage_label_numeric)
-foo2['lineage_numeric'].value_counts()
-
-foo2['lineage_numeric_list']  = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
-foo2['lineage_numeric_list']
-foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
-
-c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
-c2['lineage_numeric'].value_counts()
-
-
-#%% Lineage counts (including the ones containing multiple entries)[[<<< INCOMPLETE, TB finished]]
-
-# Get information about how many distinct lineages each mutation comes from
-gene_LF3['lineage'].value_counts()
-gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
-gene_LF3['lineage'].value_counts()
-
-# Create a column: lineage_corrupt
-gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
-
-##############################
-# CHECK may be you only need it for multimode merge
-# set index
-# gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
-# gene_LF3['index_orig_copy'].head
-
-# gene_LF3.index
-# gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
-# gene_LF3.index
-################################
-
-# Create df with tidy_split: lineage
-lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';') 
-lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
-
-# Add all lineages for each mutation
-lf_lin_split['lineage_corrupt_list']  = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
-
-# Add lineage unique count
-lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
-
-# Add lineage_set
-lf_lin_split['lineage_set']    = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
-lf_lin_split['lineage_ulist']  = lf_lin_split['lineage_set'].apply(lambda x : list(x))
-
-#--------------
-# Multimode lineage 
-# ONLY after split else the corrupt ones don't get mapped
-#--------------
-# Do this mapping after tidy split else the ones with the 
-lineage_label_numeric = {'L1'    : 1
-                        , 'L2'   : 2        
-                        , 'L3'   : 3             
-                        , 'L4'   : 4              
-                        , 'L5'   : 5              
-                        , 'L6'   : 6              
-                        , 'L7'   : 7               
-                        , 'LBOV' : 8}
-
-# lineage_numeric = {'lineage1'      : 1
-#                     , 'lineage2'   : 2        
-#                     , 'lineage3'   : 3             
-#                     , 'lineage4'   : 4              
-#                     , 'lineage5'   : 5              
-#                     , 'lineage6'   : 6              
-#                     , 'lineage7'   : 7               
-#                     , 'lineageBOV' : 8}
-
-lf_lin_split['lineage_corrupt'].value_counts()
-lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
-lf_lin_split['lineage_corrupt'].value_counts()
-lf_lin_split['lineage_numeric'].value_counts()
-
-lf_lin_split['lineage_numeric_list']  = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_numeric'].apply(list))
-lf_lin_split['lineage_numeric_list']
-
-#-------------------
-# change indices
-# to Mut as Multimode allows direct mapping this way!
-#-------------------
-lf_lin_split['Mut'] = lf_lin_split['mutationinformation']
-lf_lin_split['Mut']
-lf_lin_split = lf_lin_split.set_index(['Mut'])
-lf_lin_split.index
-
-lf_lin_split['lineage_multimode'] = lf_lin_split.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
-lf_lin_split['lineage_multimode'].value_counts()
-
-#-------------------
-# Reset indices
-# to index orig  to allow merge with gene_LF3
-#-------------------
-lf_lin_split.index
-lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
-lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
-lf_lin_split.index
-
-###############################################################################
-#================================
-# Merge with gene_LF3 with
-# lf_lin_split
-#================================
-# set index
-gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
-gene_LF3['index_orig_copy'].head
-
-gene_LF3.index
-gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
-gene_LF3.index
-
-#-------------------------
-# colum lineage_ucount:
-# contribution of each distinct lineage
-#-------------------------
-gene_LF3['lineage_ucount'] = gene_LF3['lineage'] 
-
-#-------------------------
-# colum lineage list:
-#-------------------------
-#gene_LF3['lineage_set'] = gene_LF3['lineage'] 
-gene_LF3['lineage_ulist'] = gene_LF3['lineage'] 
-
-gene_LF3['lineage_list'] = gene_LF3['lineage'] 
-
-#-------------------------
-# colum lineage_list mode:
-#-------------------------
-gene_LF3['lineage_mode'] = gene_LF3['lineage'] 
-
-########################
-
-# merge based on indices
-gene_LF3.index.nunique()
-lf_lin_split.index.nunique()
-all(gene_LF3.index.isin(lf_lin_split.index))
-all(lf_lin_split.index.isin(gene_LF3.index))
-gene_LF3.index
-lf_lin_split.index
-
-if (gene_LF3.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF3.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF3.index)) ):
-    print('\nPass: merging lineage_ucount with gene_LF3')
-else:
-    print('\nFail: Indices mismatch, cannot merge! Quitting!')
-    sys.exit()
-
-###########################
-# magic merge happens here
-###########################
-gene_LF3.loc[lf_lin_split.index, 'lineage_ucount']  = lf_lin_split['lineage_corrupt_ucount']
-gene_LF3['lineage_ucount'].value_counts()
-
-#gene_LF3.loc[lf_lin_split.index, 'lineage_set']  = lf_lin_split['lineage_set'] 
-#gene_LF3['lineage_set'].value_counts()
-gene_LF3.loc[lf_lin_split.index, 'lineage_ulist']  = lf_lin_split['lineage_ulist'] 
-gene_LF3['lineage_ulist'].value_counts()
-
-gene_LF3.loc[lf_lin_split.index, 'lineage_list']  = lf_lin_split['lineage_corrupt_list'] 
-gene_LF3['lineage_list'].value_counts()
-
-gene_LF3.loc[lf_lin_split.index, 'lineage_mode']  = lf_lin_split['lineage_multimode']
-gene_LF3['lineage_mode'].value_counts()
-
-foo = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount'
-                #, 'lineage_set'
-                , 'lineage_ulist'
-                , 'lineage_mode'
-                , 'lineage_list']]
-
-#%% sanity checks
-check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
-check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
-check2.value_counts()
-
-#%% Reset index: original indices [WAS above ]
-#gene_LF3 = gene_LF3.reset_index()
-gene_LF3.index
-gene_LF3['mutationinformation'] = gene_LF3.index
-gene_LF3 = gene_LF3.set_index(['index_orig'])
-
-gene_LF3[['mutationinformation']]
-gene_LF3.index
-#%% Remove MUT column not needed
-# sanity check
-if (all(gene_LF3['Mut'] == gene_LF3['mutationinformation'])):
-    print('\nPass: Mutationinformation check successful')
-else:
-    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
-
-# Drop mutation column
-gene_LF3.drop(['MUT'], axis = 1, inplace = True)
-#%% ADD summary results
-#%% final output file with selected columns
\ No newline at end of file