From 6330a2e716b0a0e5cbff049ca22de64bb603f153 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 14 Apr 2022 10:16:12 +0100
Subject: [PATCH] added v2 for data extraction

---
 scripts/data_extraction.py    |    2 +-
 scripts/data_extraction_v2.py | 1109 +++++++++++++++++++++++++++++++++
 2 files changed, 1110 insertions(+), 1 deletion(-)
 create mode 100644 scripts/data_extraction_v2.py
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index b991490..241ff5b 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -622,7 +622,7 @@ print('Dim of dr_df:', dr_df.shape
 
 #%%
 #=========
-# DF2: other_mutations_pyrazinamdie
+# DF2: other_muts_col
 #=========
 ########
 # tidy_split(): on other_muts_col column and remove leading white spaces
diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py
new file mode 100644
index 0000000..aa7ec93
--- /dev/null
+++ b/scripts/data_extraction_v2.py
@@ -0,0 +1,1109 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK: extract ALL <gene> matched mutations from GWAS data
+# Input data file has the following format: each row = unique sample id 
+# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col...
+# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_match<wt>POS<mut>; pncA_c.<wt>POS<mut>...
+# where multiple mutations and multiple mutation types are separated by ';'. 
+# We are interested in the protein coding region i.e mutation with the<gene>_'p.' format.
+# This script splits the mutations on the ';' and extracts protein coding muts only
+# where each row is a separate mutation
+# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+
+
+# output files: all lower case
+# 1) <gene>_gwas.csv
+# 2) <gene>_common_ids.csv
+# 3) <gene>_ambiguous_muts.csv
+# 4) <gene>_mcsm_formatted_snps.csv
+# 5) <gene>_metadata_poscounts.csv
+# 6) <gene>_metadata.csv
+# 7) <gene>_all_muts_msa.csv
+# 8) <gene>_mutational_positons.csv
+
+#------------
+# NOTE
+#-----------
+# drtype is renamed to 'resistance' in the 35k dataset
+# all colnames in the ouput files lowercase
+
+#-------------
+# requires
+#-------------
+#reference_dict.py
+#tidy_split.py
+#=======================================================================
+#%% load libraries
+import os, sys
+import re
+import pandas as pd
+import numpy as np
+import argparse
+from statistics import mean, median, mode
+from statistics import multimode
+#=======================================================================
+#%% dir and local imports
+homedir = os.path.expanduser('~') 
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+make_dirs  = args.make_dirs
+
+#%% input and output dirs and files
+#=======
+# dirs
+#=======
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input_v2'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output_v2'
+
+if make_dirs:
+    print('make_dirs is turned on, creating data dir:', datadir)
+    try: 
+        os.makedirs(datadir, exist_ok = True) 
+        print("Directory '%s' created successfully" %datadir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+  
+    print('make_dirs is turned on, creating indir:', indir)
+    try: 
+        os.makedirs(indir, exist_ok = True) 
+        print("Directory '%s' created successfully" %indir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+  
+    print('make_dirs is turned on, creating outdir:', outdir)
+    try: 
+        os.makedirs(outdir, exist_ok = True) 
+        print("Directory '%s' created successfully" %outdir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+
+# handle missing dirs here
+if not os.path.isdir(datadir):
+    print('ERROR: Data directory does not exist:', datadir
+    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+if not os.path.isdir(indir):
+    print('ERROR: Input directory does not exist:', indir
+    , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+if not os.path.isdir(outdir):
+    print('ERROR: Output directory does not exist:', outdir
+    , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+    
+# Requires
+from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
+
+#=======================================================================    
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
+
+# building cols to extract
+dr_muts_col = 'dr_mutations_' + drug
+other_muts_col = 'other_mutations_' + drug
+resistance_col = 'drtype'
+
+print('Extracting columns based on variables:\n'
+	, drug
+	, '\n'
+	, dr_muts_col
+	, '\n'
+	, other_muts_col
+    , '\n'
+    , resistance_col
+	, '\n===============================================================')
+#=======================================================================
+
+#=======
+# input
+#=======
+#in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
+in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
+infile_master = datadir + '/' + in_filename_master
+print('Input file: ', infile_master
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+# several output files: in respective sections at the time of outputting files
+print('Output filename: in the respective sections'
+      , '\nOutput path: ', outdir
+      , '\n=============================================================')
+
+#%%end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
+master_data  = pd.read_csv(infile_master, sep = ',')  
+
+# column names
+#list(master_data.columns)
+
+# extract elevant columns to extract from meta data related to the drug
+if in_filename_master == 'original_tanushree_data_v2.csv':
+    meta_data = master_data[['id'
+                             , 'country'
+                             , 'lineage'
+                             , 'sublineage'
+                             , 'drtype'
+                             , drug
+                             , dr_muts_col
+                             , other_muts_col]]
+else:
+    core_cols = ['id'
+                 , 'sample'
+                 #, 'patient_id'
+                 #, 'strain'
+                 , 'lineage'
+                 , 'sublineage' 
+                 , 'country_code'
+                 #, 'geographic_source'
+                 ,  resistance_col]
+    
+    variable_based_cols = [drug
+                           , dr_muts_col
+                           , other_muts_col]
+       
+    cols_to_extract = core_cols + variable_based_cols
+    print('Extracting', len(cols_to_extract), 'columns from master data')
+
+    meta_data = master_data[cols_to_extract]  
+    del(master_data, variable_based_cols, cols_to_extract)
+
+print('Extracted meta data from filename:', in_filename_master
+      , '\nDim:', meta_data.shape)
+
+# checks and results
+total_samples = meta_data['id'].nunique() 
+print('RESULT: Total samples:', total_samples
+		, '\n===========================================================')
+
+# counts NA per column
+meta_data.isna().sum()
+print('No. of NAs/column:' + '\n', meta_data.isna().sum()
+		, '\n===========================================================\n')
+
+#%% Quick checks:
+meta_data['lineage'].value_counts()
+meta_data['lineage'].value_counts().sum()    
+meta_data['lineage'].nunique()
+
+meta_data['id'].nunique()
+meta_data['sample'].nunique()
+meta_data['id'].equals(meta_data['sample'])
+foo = meta_data.copy()
+foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0')
+foo['Diff'].value_counts()
+
+meta_data['drtype'].value_counts()
+meta_data['drtype'].value_counts().sum()
+
+#%% Write file: <gene>_gwas.csv
+check_file = outdir + '/' + gene.lower() + '_gwas.csv'
+meta_data.to_csv(check_file, index = False)
+print('\n----------------------------------'
+      , '\nWriting subsetted gwas data:', gene
+      , '\n----------------------------------'
+      , '\nFile', check_file
+      , '\nDim:', meta_data.shape)
+
+# equivalent of table in R
+# drug counts: complete samples for OR calcs
+meta_data[drug].value_counts() 
+print('===========================================================\n'
+      , 'RESULT: Total no. of samples tested for', drug, meta_data[drug].value_counts().sum()
+	  , '\n===========================================================\n'
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
+		, '\n===========================================================')
+#%% Important sanity checks
+# This is to find out how many samples have 1 and more than 1 mutation,so you
+# can use it to check if your data extraction process for dr_muts 
+# and other_muts has worked correctly AND also to check the dim of the 
+# final formatted data.
+# This will have: unique COMBINATION of sample id and <gene_match> mutations
+#========
+# First: counting <gene_match> mutations in dr_muts_col column
+#========
+print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col)
+# drop na and extract a clean df
+clean_df =  meta_data.dropna(subset=[dr_muts_col])
+
+# sanity check: count na
+na_count = meta_data[dr_muts_col].isna().sum()
+
+if len(clean_df) == (total_samples - na_count):
+   print('PASS: clean_df extracted: length is', len(clean_df)
+         , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples
+         , '\n==========================================================')
+else:
+    sys.exit('FAIL: Could not drop NAs')
+
+dr_gene_count = 0
+wt = 0
+id_dr = []
+id2_dr = []
+#nssnp_match_regex = re.compile(nssnp_match)
+
+for i, id in enumerate(clean_df.id):
+    #print (i, id)
+    #id_dr.append(id)
+    #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts
+    count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE))
+    #print(count_gene_dr)
+    if count_gene_dr > 0:
+        id_dr.append(id)
+    if count_gene_dr > 1:
+        id2_dr.append(id)
+        #print(id, count_gene_dr)    
+    dr_gene_count = dr_gene_count + count_gene_dr
+    count_wt = clean_df[dr_muts_col].iloc[i].count('WT')
+    wt = wt + count_wt
+    
+print('RESULTS:')        
+print('Total WT in dr_muts_col:', wt)
+print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
+print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) )
+print('=================================================================')
+
+del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
+
+#========
+# Second: counting <gene_match> mutations in other_muts_col column
+#========
+print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)    
+# drop na and extract a clean df
+clean_df =  meta_data.dropna(subset=[other_muts_col])
+
+# sanity check: count na
+na_count = meta_data[other_muts_col].isna().sum()
+
+if len(clean_df) == (total_samples - na_count):
+   print('PASS: clean_df extracted: length is', len(clean_df)
+         , '\nNo.of NAs =', na_count, '/', total_samples
+         , '\n=========================================================')
+else:
+    sys.exit('FAIL: Could not drop NAs')
+
+other_gene_count = 0
+wt_other = 0
+id_other = []
+id2_other = []
+        
+for i, id in enumerate(clean_df.id):
+    #print (i, id)
+    #id_other.append(id)
+    #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts
+    count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE))
+    if count_gene_other > 0:
+        id_other.append(id)
+    if count_gene_other > 1:
+        id2_other.append(id)
+        #print(id, count_gene_other)    
+    other_gene_count = other_gene_count + count_gene_other
+    count_wt = clean_df[other_muts_col].iloc[i].count('WT')
+    wt_other = wt_other + count_wt        
+     
+print('RESULTS:')
+print('Total WT in other_muts_col:', wt_other)
+print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
+print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) )
+print('=================================================================')
+
+print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
+      , '\n===================================================================')
+expected_rows = dr_gene_count + other_gene_count
+
+#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
+del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
+
+#%% Extracting dr and other muts separately along with common cols
+print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data')
+print('muts to extract:', nssnp_match )
+
+#===============
+# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col
+#===============
+if in_filename_master == 'original_tanushree_data_v2.csv':
+    meta_data_dr = meta_data[['id'
+                              ,'country'
+                              ,'lineage'
+                              ,'sublineage'
+                              ,'drtype'
+                              , drug
+                              , dr_muts_col]]
+else:
+    dr_based_cols = [drug, dr_muts_col]
+    cols_to_extract = core_cols + dr_based_cols
+    print('Extracting', len(cols_to_extract), 'columns from meta data')
+    meta_data_dr = meta_data[cols_to_extract]
+    del(dr_based_cols, cols_to_extract)
+
+if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
+    print('PASS: Dimensions match'
+          , '\n===============================================================')
+else:
+    print('FAIL: Dimensions mismatch:'
+          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
+          , '\nGot:', meta_data_dr.shape
+	      , '\n===============================================================')
+    sys.exit()
+
+# Extract within this the nsSNPs for gene of interest using string match
+#meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
+meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
+print('gene_snp_match in dr:', len(meta_gene_dr))
+  
+dr_id = meta_gene_dr['id'].unique()
+print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
+
+if len(id_dr) == len(meta_gene_dr):
+    print('PASS: lengths match'
+    , '\n===============================================================')
+else:
+    print('FAIL: length mismatch'
+             , '\nExpected len:', len(id_dr)
+             , '\nGot:', len(meta_gene_dr))
+    sys.exit()
+
+dr_id = pd.Series(dr_id)
+
+#=================
+# other mutations: extract nssnp_match entries from other_muts_col
+#==================
+print('Extracting other_muts from:', other_muts_col,'with other meta_data')
+print('muts to extract:', nssnp_match)
+
+if in_filename_master ==  'original_tanushree_data_v2.csv':
+    meta_data_other = meta_data[['id'
+                                 , 'country'
+                                 , 'lineage'
+                                 , 'sublineage'
+                                 , 'drtype'
+                                 , drug
+                                 , other_muts_col]]
+else:
+    other_based_cols = [drug, other_muts_col]
+    cols_to_extract = core_cols + other_based_cols
+    print('Extracting', len(cols_to_extract), 'columns from meta data')
+    meta_data_other = meta_data[cols_to_extract]
+    del(other_based_cols, cols_to_extract)
+
+if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
+    print('PASS: Dimensions match'
+          , '\n===============================================================')
+else:
+    print('FAIL: Dimensions mismatch:'
+          , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1)
+          , '\nGot:', meta_data_other.shape
+	      , '\n===============================================================')
+    sys.exit()
+
+# Extract within this nSSNP for gene of interest using string match
+#meta_gene_other  = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] 
+meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] 
+print('gene_snp_match in other:', len(meta_gene_other))
+
+other_id = meta_gene_other['id'].unique()
+print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
+
+if len(id_other) == len(meta_gene_other):
+    print('PASS: lengths match'
+    , '\n==============================================================')
+else:
+    print('FAIL: length mismatch'
+          , '\nExpected len:', len(id_other)
+          , '\nGot:', len(meta_gene_other))
+    sys.exit()
+
+other_id = pd.Series(other_id)
+#%% Find common IDs
+#==================
+# Common ids
+#==================
+print('Now extracting common_ids...')
+common_mut_ids = dr_id.isin(other_id).sum() 
+print('RESULT: No. of common ids:', common_mut_ids)
+
+# sanity checks
+# check if True: should be since these are common
+if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
+    print('PASS: Cross check on no. of common ids')
+else:
+    sys.exit('FAIL: Cross check on no. of common ids failed')
+    
+# check if the common are indeed the same!
+# bit of a tautology, but better safe than sorry!
+common_ids = dr_id[dr_id.isin(other_id)]
+common_ids = common_ids.reset_index()
+common_ids.columns = ['index', 'id']
+
+common_ids2 = other_id[other_id.isin(dr_id)]
+common_ids2 = common_ids2.reset_index()
+common_ids2.columns = ['index', 'id2']
+
+# should be True
+if common_ids['id'].equals(common_ids2['id2']):
+    print('PASS: Further cross checks on common ids')
+    nu_common_ids = common_ids.nunique()
+else:
+    sys.exit('FAIL: Further cross checks on common ids')
+
+# good sanity check: use it later to check gene_sample_counts
+expected_gene_samples = (len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids) 
+print('Expected no. of gene samples:', expected_gene_samples
+      , '\n=================================================================')
+#%% Write file: <gene>_common_ids.csv
+#print(outdir)
+out_filename_cid = gene.lower() + '_common_ids.csv'
+outfile_cid =  outdir + '/' + out_filename_cid
+print('\n----------------------------------'
+      '\nWriting file: common_ids'
+      '\n----------------------------------'
+      , '\nFile:', outfile_cid
+      , '\nNo. of rows:', len(common_ids)
+      , '\n=============================================================')
+
+common_ids.to_csv(outfile_cid, index = False)
+
+del(out_filename_cid)
+
+# clear variables
+del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
+
+#%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match'
+print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
+      , dr_muts_col, 'and', other_muts_col, 'using string match:'
+      , '\n===================================================================')
+#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ]
+meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match
+                                                                  , na = False
+                                                                  , regex = True
+                                                                  , case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
+
+extracted_gene_samples = meta_gene_all['id'].nunique()
+print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
+      , '\n===================================================================')
+
+# sanity check: length of gene samples
+print('Performing sanity check:')
+if extracted_gene_samples == expected_gene_samples:
+    print('PASS: expected & actual no. of nssnp gene samples match'
+          , '\nNo. of gene samples:', len(meta_gene_all)
+          , '\n=========================================================')
+else:
+    sys.exit('FAIL: Length mismatch in gene samples!')
+
+# count NA in drug column
+gene_na = meta_gene_all[drug].isna().sum() 
+print('No. of gene samples without', drug, 'testing:', gene_na)
+
+# use it later to check number of complete samples from LF data
+comp_gene_samples = len(meta_gene_all) - gene_na
+print('Complete gene samples tested for', drug, ':', comp_gene_samples)
+print('=================================================================')
+
+# Comment: This is still dirty data since these
+# are samples that have nsSNP muts, but can have others as well
+# since the format for mutations is mut1; mut2, etc.
+print('This is still dirty data: samples have ', nssnp_match, 'muts but may have others as well'
+      , '\nsince the format for mutations is mut1; mut2, etc.'
+      , '\n=============================================================')
+
+print('Performing tidy_split(): to separate the mutations into individual rows')
+
+#%% tidy_split(): dr_muts_col
+#=========
+# DF1: dr_muts_col
+# and remove leading white spaces
+#=========
+col_to_split1 = dr_muts_col
+print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
+       , '\ncolumn name to apply tidy_split():', col_to_split1
+       , '\n============================================================')
+# apply tidy_split()
+dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') 
+# remove leading white space else these are counted as distinct mutations as well
+dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() 
+
+# extract only the samples/rows with nssnp_match
+#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]
+dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)]
+
+print('Lengths after tidy split and extracting', nssnp_match, 'muts:'
+      , '\nOld length:' , len(meta_gene_dr)
+      , '\nLength after split:', len(dr_WF0)
+      , '\nLength of nssnp df:', len(dr_gene_WF0)
+      , '\nExpected len:', dr_gene_count
+      , '\n=============================================================')
+if len(dr_gene_WF0) == dr_gene_count:
+    print('PASS: length matches expected length'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: lengths mismatch')
+
+# count the freq of 'dr_muts' samples
+dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] 
+print('Dim of dr_muts_df:', dr_muts_df.shape)
+
+# add freq column
+dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') 
+#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count')
+print('Revised dim of dr_muts_df:', dr_muts_df.shape) 
+
+c1 = dr_muts_df.dr_sample_freq.value_counts()
+print('Counting no. of sample frequency:\n', c1
+      , '\n===================================================================')
+
+# sanity check: length of gene samples
+if len(dr_gene_WF0) == c1.sum():
+    print('PASS: WF data has expected length'
+    , '\nLength of dr_gene WFO:', c1.sum()
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: length mismatch!')
+
+# Important: Assign 'column name' on which split was performed as an extra column
+# This is so you can identify if mutations are dr_type or other in the final df
+dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col)
+print('Dim of dr_df:', dr_df.shape
+	, '\n=============================================================='
+	, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
+	, '\n===============================================================')
+#%% tidy_split(): other_muts_col
+#=========
+# DF2: other_muts_col
+# and remove leading white spaces
+#=========
+col_to_split2 = other_muts_col
+print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
+       , '\ncolumn name to apply tidy_split():', col_to_split2
+       , '\n============================================================')
+
+# apply tidy_split()
+other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') 
+# remove the leading white spaces in the column
+other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() 
+
+# extract only the samples/rows with nssnp_match
+#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]
+other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)]
+
+print('Lengths after tidy split and extracting', gene_match, 'muts:',
+      '\nOld length:' , len(meta_gene_other),
+      '\nLength after split:', len(other_WF1),
+      '\nLength of nssnp df:', len(other_gene_WF1),
+      '\nExpected len:', other_gene_count
+      , '\n=============================================================')
+if len(other_gene_WF1) == other_gene_count:
+    print('PASS: length matches expected length'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: lengths mismatch')
+
+# count the freq of 'other muts' samples
+other_muts_df = other_gene_WF1 [['id', other_muts_col]] 
+print('Dim of other_muts_df:', other_muts_df.shape) 
+
+# add freq column
+other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
+print('Revised dim of other_muts_df:', other_muts_df.shape) 
+
+c2 = other_muts_df.other_sample_freq.value_counts()
+print('Counting no. of sample frequency:\n', c2)
+print('=================================================================')
+# sanity check: length of gene samples
+if len(other_gene_WF1) == c2.sum():
+    print('PASS: WF data has expected length'
+    , '\nLength of other_gene WFO:', c2.sum()
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: Length mismatch')
+    
+# Important: Assign 'column name' on which split was performed as an extra column
+# This is so you can identify if mutations are dr_type or other in the final df
+other_df = other_gene_WF1.assign(mutation_info = other_muts_col)
+print('dim of other_df:', other_df.shape
+	, '\n==============================================================='
+	, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
+	, '\n===============================================================')
+#%% NEW check1: dr muts and other muts
+#foo = dr_WF0[dr_muts_col][~dr_WF0[dr_muts_col].isin(dr_df[dr_muts_col])]
+#bar = other_WF1[other_muts_col][~other_WF1[other_muts_col].isin(other_df[other_muts_col])]
+dr_df['id'][~dr_df['id'].isin(other_df['id'])].count()
+other_df['id'][~other_df['id'].isin(dr_df['id'])].count()
+
+#%% Concatenating two dfs: gene_LF0
+# equivalent of rbind in R
+#==========
+# Concatentating the two dfs: equivalent of rbind in R
+#==========
+# Important: Change column names to allow concat:
+# dr_muts.. & other_muts : 'mutation'
+print('Now concatenating the two dfs by row'
+      , '\nFirst assigning a common colname: "mutation" to the col containing muts'
+      , '\nThis is done for both dfs'
+      , '\n===================================================================')
+
+dr_df.columns
+dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True)
+dr_df.columns
+
+other_df.columns
+other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
+other_df.columns
+
+if len(dr_df.columns) == len(other_df.columns):
+    print('Checking dfs for concatening by rows:'
+          , '\nDim of dr_df:', dr_df.shape
+          , '\nDim of other_df:', other_df.shape
+          , '\nExpected nrows:', len(dr_df) +  len(other_df)
+          , '\n=============================================================')
+else:
+    sys.exit('FAIL: No. of cols mismatch for concatenating')
+
+# checking colnames before concat
+print('Checking colnames BEFORE concatenating the two dfs...')
+if (set(dr_df.columns) == set(other_df.columns)):
+    print('PASS: column names match necessary for merging two dfs')
+else:
+    sys.exit('FAIL: Colnames mismatch for concatenating!')
+
+# concatenate (axis = 0): Rbind
+print('Now appending the two dfs: Rbind')
+gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
+
+print('Finding stop mutations in concatenated df')
+stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
+if stop_muts == 0:
+    print('PASS: No stop mutations detected')
+else:
+    print('stop mutations detected'
+          , '\nNo. of stop muts:', stop_muts, '\n'
+          , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
+          , '\nNow removing them')    
+    
+gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')]
+print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp))
+
+gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
+print('snps only by direct regex:', len(gene_LF0))
+
+if gene_LF0_nssnp.equals(gene_LF0):
+    print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed'
+          , '\nUsing the regex extracted df')
+else:
+    sys.exit('FAIL: posssibly regex or no of stop mutations'
+             , 'Regex being used:', nssnp_match)
+    #sys.exit()
+
+# checking colnames and length after concat
+print('Checking colnames AFTER concatenating the two dfs...')
+if (set(dr_df.columns) == set(gene_LF0.columns)):
+    print('PASS: column names match'
+          , '\n=============================================================')
+else:
+    sys.exit('FAIL: Colnames mismatch AFTER concatenating')
+    
+print('Checking concatenated df')
+if len(gene_LF0) == (len(dr_df) +  len(other_df))- stop_muts:
+    print('PASS:length of df after concat match'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: length mismatch')
+
+#%% NEW check2:id, sample, etc.
+gene_LF0['id'].count()
+gene_LF0['id'].nunique()
+gene_LF0['sample'].nunique()
+gene_LF0['mutation_info'].value_counts()
+gene_LF0[drug].isna().sum()
+#%% Concatenating two dfs sanity checks: gene_LF1
+#=========================================================
+# This is hopefully clean data, but just double check
+# Filter LF data so that you only have
+# mutations corresponding to nssnp_match (case insensitive)
+# this will be your list you run OR calcs 
+#=========================================================
+print('Length of gene_LF0:', len(gene_LF0)
+      , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
+      , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
+
+gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]   
+
+if len(gene_LF0) == len(gene_LF1):
+    print('PASS: length of gene_LF0 and gene_LF1 match',
+          '\nConfirming extraction and concatenating worked correctly'
+          , '\n==========================================================')
+else:
+    print('FAIL: BUT NOT FATAL!'
+          , '\ngene_LF0 and gene_LF1 lengths differ'
+          , '\nsuggesting error in extraction process'
+          , ' use gene_LF1 for downstreama analysis'
+          , '\n=========================================================')   
+    
+print('Length of dfs pre and post processing...'
+      , '\ngene WF data (unique samples in each row):', extracted_gene_samples
+      , '\ngene LF data (unique mutation in each row):', len(gene_LF1)
+      , '\n=============================================================')
+  
+# sanity check for extraction
+# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
+print('Verifying whether extraction process worked correctly...')
+if len(gene_LF1) == expected_rows:
+    print('PASS: extraction process performed correctly'
+          , '\nExpected length:', expected_rows
+          , '\nGot:', len(gene_LF1)
+          , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1)
+          , '\n=========================================================')
+else:
+    print('FAIL: extraction process has bugs'
+          , '\nExpected length:', expected_rows
+          , '\nGot:', len(gene_LF1)
+          , '\nDebug please'
+          , '\n=========================================================')
+
+# more sanity checks
+print('Performing some more sanity checks...')
+
+# From LF1: useful for OR counts
+# no. of unique muts
+distinct_muts = gene_LF1.mutation.value_counts()
+print('Distinct genomic mutations:', len(distinct_muts))
+
+# no. of samples contributing the unique muts
+gene_LF1.id.nunique()
+print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique())
+
+# no. of dr and other muts
+mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique()
+print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique())
+
+# sanity check
+if len(distinct_muts) == mut_grouped.sum() :
+    print('PASS:length of LF1 is as expected'
+    , '\n===============================================================')
+else:
+    print('FAIL: Mistmatch in count of muts'
+          , '\nExpected count:', len(distinct_muts)
+          , '\nActual count:', mut_grouped.sum()
+          , '\nMuts should be distinct within dr* and other* type'
+          , '\nInspecting...possibly ambiguous muts'
+          , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts)
+          , '\n=========================================================')
+    
+muts_split = list(gene_LF1.groupby('mutation_info'))
+dr_muts = muts_split[0][1].mutation 
+other_muts =  muts_split[1][1].mutation
+print('splitting muts by mut_info:', muts_split)
+print('no.of dr_muts samples:', len(dr_muts))
+print('no. of other_muts samples', len(other_muts))       
+
+#%% Ambiguous muts
+# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
+          , '\n===============================================================')
+else:
+    print('PASS: NO ambiguous muts detected'
+          , '\nMuts are unique to dr_ and other_ mutation class'
+          , '\n=========================================================')
+
+# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected!
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    print('Finding ambiguous muts...'
+          , '\n========================================================='
+          , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
+          , '\nThese are:', dr_muts[dr_muts.isin(other_muts)]
+          , '\n========================================================='
+          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
+          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
+          , '\n=========================================================')
+    
+    print('Counting no. of ambiguous muts...'
+          , '\nNo. of ambiguous muts in dr:'
+          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in other:'
+          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+          , '\n=========================================================')
+    
+    if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
+        common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
+        print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
+              , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+        print('\n===========================================================') 
+else:
+    #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
+    print('No: ambiguous muts present')
+    
+#print('Counting no. of ambiguous muts...')
+#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
+#    common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
+#    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
+#          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+#    print('\n===========================================================')
+#else:
+#   print('Error: ambiguous muts detected, but extraction failed. Debug!'
+#         , '\nNo. of ambiguous muts in dr:'
+#         , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+#         , '\nNo. of ambiguous muts in other:'
+#         , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+#         , '\n=========================================================')       
+
+#%% Ambiguous muts: revised annotation for mutation_info
+ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
+
+ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()    
+ambiguous_muts_value_counts
+
+bar = gene_LF1.copy()
+bar.equals(gene_LF1)
+
+gene_LF1_orig = gene_LF1.copy()
+
+#=====================================
+# Now create a new df that will have:
+    # ambiguous muts
+    # mutation_info
+    # revised mutation_info
+# The revised label is based on value_counts
+# for mutaiton_info. The corresponding mutation_info:
+# label is chosen that corresponds to the max of value counts
+#=====================================
+ambig_muts_rev_df = pd.DataFrame()
+
+for i in common_muts:
+    #print(i)
+    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']]
+    
+  # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
+    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts()
+    
+    revised_label = max_info_table[[0]].index[0][1] # max value_count
+    old_label     = max_info_table[[1]].index[0][1] # min value_count 
+    print('\nAmbiguous mutation labels...'
+          , '\nSetting mutation_info for', i, 'to', revised_label)
+    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label)
+                      , revised_label
+                      , temp_df['mutation_info'])
+    ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
+                      
+ambig_muts_rev_df['mutation_info'].value_counts() 
+ambig_muts_rev_df['mutation_info_REV'].value_counts() 
+
+#=================
+# Merge ambig muts
+# with gene_LF1
+#===================
+bar = gene_LF1.copy()
+bar['mutation_info_old'] = bar['mutation_info']
+
+ambig_muts_rev_df.index
+bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
+bar['mutation_info_old'].value_counts()
+bar['mutation_info'].value_counts()
+foo = bar.iloc[ambig_muts_rev_df.index]
+foo[['mutation', 'mutation_info', 'mutation_info_old']]
+
+# CHECK if there are still any ambiguous muts
+
+#%% ROUND THE HOUSES: DELETE
+foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']]
+fooU = foo.drop_duplicates()
+
+#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/
+fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV']
+fooD
+#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine
+#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2))
+bar['NEW'] = bar['mutation_info_old']
+bar['NEW'].value_counts()
+bar['mutation_info_old'].value_counts()
+
+bar['NEW'].value_counts()
+bar2 = bar[['mutation', 'mutation_info', 'NEW']]
+bar2.isnull().sum()
+
+bar['NEW'].value_counts()
+bar['mutation_info'].value_counts()
+bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD)
+bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW'])
+
+bar['NEW2'].value_counts()
+bar['NEW'].value_counts()
+#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
+
+#print(outdir)
+===================
+# ambiguous muts
+#=======================
+#dr_muts.to_csv('dr_muts.csv', header = True)
+#other_muts.to_csv('other_muts.csv', header = True)
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
+    outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
+    print('\n----------------------------------'
+          , '\nWriting file: ambiguous muts'
+          , '\n----------------------------------'
+          , '\nFilename:', outfile_ambig_muts)
+    inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
+    inspect.to_csv(outfile_ambig_muts, index = True)
+
+    print('Finished writing:', out_filename_ambig_muts
+          , '\nNo. of rows:', len(inspect)
+          , '\nNo. of cols:', len(inspect.columns)
+          , '\nNo. of rows = no. of samples with the ambiguous muts present:'
+          , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
+          , '\n=============================================================')
+    del(out_filename_ambig_muts)
+
+#=======================
+# ambiguous mut counts
+#=======================
+out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
+outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
+print('\n----------------------------------'
+       , '\nWriting file: ambiguous muts'
+       , '\n----------------------------------'
+       , '\nFilename:', outfile_ambig_mut_counts)
+
+ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
+
+#%% clear variables
+del(id_dr, id_other
+    #, meta_data
+    , meta_gene_dr, meta_gene_other
+    #, mut_grouped
+    , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)  
+
+del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)                     
+ 
+#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column
+
+#%% NEW TODO: map mutationinformation
+#%% NEW: mappings
+#=======================
+# column name: <drug>
+#=======================
+# mapping 1: labels
+dm_om_label_map = {dr_muts_col: 'DM'
+             , other_muts_col: 'OM'}
+
+gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map)
+
+# mapping 2: numeric
+dm_om_num_map = {dr_muts_col: 1
+              , other_muts_col: 0}
+
+gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map)
+
+gene_LF0['mutation_info'].value_counts()
+gene_LF0['mutation_info_labels'].value_counts()
+gene_LF0['dm_om_numeric'].value_counts()
+
+#============================
+# column name: <drtype>
+#============================
+gene_LF0['drtype'].value_counts()
+
+# mapping: numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+gene_LF0['drtype_numeric']  = gene_LF0['drtype'].map(drtype_map)
+
+gene_LF0['drtype'].value_counts()
+gene_LF0['drtype_numeric'].value_counts()
+
+#%% multimode
+# COPY dst column
+gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking
+gene_LF0['dst_multimode'] = gene_LF0[drug]
+
+# sanity check
+gene_LF0[drug].value_counts()
+gene_LF0['dst_multimode'].value_counts()
+gene_LF0[drug].isna().sum()
+
+if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0):
+    print('\nPASS:', 'Numbers match')
+else:
+    print('\nFAIL:', 'Numbers mismatch')
+
+gene_LF0['mutation'].value_counts()
+#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
+gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum()
+                            
+# GOAL is to populate na in the dst column from the count of the dm_om_numeric column                          
+gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum()                       
+
+# COPY mutationinformation for sanity check
+#data['mutation'] = data['mutationinformation']
+gene_LF0['mutationinformation'] = gene_LF0['mutation']
+                           
+# Get multimode for dm_om_numeric column
+dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+dm_om_multimode
+
+gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode)
+gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(gene_LF0)
+
+# Finally created a revised dst with the max from the multimode
+gene_LF0['dst_mode']  = gene_LF0.groupby('mutationinformation')['dst_noNA'].max()