From ac0d14e1164fd5224d8023b506c522d5d41c1402 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 26 Apr 2022 17:03:40 +0100
Subject: [PATCH] done but getting an error when running from cmd

---
 scripts/data_extraction.py | 716 +++++++++++++++++++++++++++----------
 1 file changed, 521 insertions(+), 195 deletions(-)
 mode change 100644 => 100755 scripts/data_extraction.py

diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
old mode 100644
new mode 100755
index b6f6411..b064ec1
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -314,7 +314,7 @@ print('===========================================================\n'
 #meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
 # so downstream code doesn't change
 meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
-#%% DF: with dr_muts_col
+#%% DF: with dr_muts_col, meta_gene_dr
 meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
 meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
 
@@ -327,7 +327,7 @@ else:
              , '\nshape of df2:', meta_gene_dr1.shape
              , '\nCheck again!')
 ##############################################################################    
-#%% DF: with other_muts_col
+#%% DF: with other_muts_col, other_gene_dr
 meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
 meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
 
@@ -365,11 +365,11 @@ print('===========================================================\n'
 	  , '\n===========================================================')
 ###############################################################################
 #%% Create a copy of indices for downstream mergeing
-meta_gene_all['index_orig']      = meta_gene_all.index
-meta_gene_all['index_orig_copy'] = meta_gene_all.index
+#meta_gene_all['index_orig']      = meta_gene_all.index
+#meta_gene_all['index_orig_copy'] = meta_gene_all.index
 
-all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
-all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
+#all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
+#all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
 ##############################################################################   
 #%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc.
 # Split based on semi colon dr_muts_col
@@ -456,13 +456,13 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
 # DF1: dr_muts_col
 # and remove leading white spaces
 #=========
-col_to_split1 = dr_muts_col
+#col_to_split1 = dr_muts_col
 print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
-       , '\ncolumn name to apply tidy_split():'    , col_to_split1
+       , '\ncolumn name to apply tidy_split():'    , dr_muts_col
        , '\n============================================================')
 
 # apply tidy_split()
-dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') 
+dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';') 
 # remove leading white space else these are counted as distinct mutations as well
 dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() 
 
@@ -579,13 +579,13 @@ del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
 # DF2: other_muts_col
 # and remove leading white spaces
 #=========
-col_to_split2 = other_muts_col
+#col_to_split2 = other_muts_col
 print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
-       , '\ncolumn name to apply tidy_split():', col_to_split2
+       , '\ncolumn name to apply tidy_split():', other_muts_col
        , '\n============================================================')
 
 # apply tidy_split()
-other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') 
+other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';') 
 # remove the leading white spaces in the column
 other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() 
 
@@ -644,7 +644,7 @@ print('\n==================================================================='
       , '\nPredicting total no. of rows in the curated df:', expected_rows
       , '\n===================================================================')
 
-#%%another way: Add value checks for dict so you can know if its correct for LF data count below
+#%% another way: Add value checks for dict so you can know if its correct for LF data count below
 dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict()
 other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict()
 
@@ -683,7 +683,7 @@ SnpFDict = lower_dict(SnpFDict_orig)
 ###############################################################################
 # USE Vcounts to get expected curated df
 # resolve dm om muts funda
-#%%#%% Concatenating two dfs: gene_LF0
+#%% Concatenating two dfs: gene_LF0
 # equivalent of rbind in R
 #==========
 # Concatentating the two dfs: equivalent of rbind in R
@@ -922,13 +922,20 @@ ambig_muts_rev_df = pd.DataFrame()
 changes_val = []
 changes_dict = {}
 
-for i in common_muts:
-    #print(i)
+##BROKENNNN!!!!
+
+common_muts_lower =  list((map(lambda x: x.lower(), common_muts)))
+common_muts_lower
+##BROKENNNN!!!!
+#for i in common_muts:
+for i in common_muts_lower:
+
+    print(i)
     temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
-    
+    temp_df
   # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
     max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
-    
+    max_info_table
     revised_label = max_info_table[[0]].index[0][1] # max value_count
     old_label     = max_info_table[[1]].index[0][1] # min value_count 
     print('\nAmbiguous mutation labels...'
@@ -948,12 +955,12 @@ ambig_muts_rev_df['mutation_info_orig'].value_counts()
 changes_val
 changes_total = sum(changes_val)
 changes_dict
-#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
-#===================
+#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
+#==================
 # ambiguous muts
-#=======================
-#dr_muts.to_csv('dr_muts.csv', header = True)
-#other_muts.to_csv('other_muts.csv', header = True)
+#==================
+#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
+#other_muts.XXXX_csvXXX('other_muts.csv', header = True)
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
     out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
     outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
@@ -972,19 +979,20 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
           , '\n=============================================================')
     del(out_filename_ambig_muts)
     
-#%% FIXME: ambig mut counts
-#=======================
+#%% OUTFILE 2, write file: ambiguous mut counts    
+#======================
 # ambiguous mut counts
-#=======================
+#======================
 out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
 outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
 print('\n----------------------------------'
-       , '\nWriting file: ambiguous muts'
+       , '\nWriting file: ambiguous mut counts'
        , '\n----------------------------------'
        , '\nFilename:', outfile_ambig_mut_counts)
 
 ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
-#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks
+#%% FIXME: Add sanity check to make sure you can add value_count checks
+print('\nREACHED here...................>>>>>>>')
 #%% Resolving ambiguous muts
 # Merging ambiguous muts
 #=================
@@ -995,6 +1003,12 @@ ambig_muts_rev_df.index
 gene_LF1.index
 all(ambig_muts_rev_df.index.isin(gene_LF1.index))
 
+any(gene_LF1.index.isin(ambig_muts_rev_df.index))
+if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
+    print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
+else:
+    sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
+
 #gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
 gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
 
@@ -1024,9 +1038,9 @@ gene_LF1['mutation_info'].value_counts()
 #%% PHEW! Good to go for downstream stuff
 #%% Add column: Mutationinformation
 # splitting mutation column to get mCSM style muts
-#=======================
+#=====================================================
 # Formatting df: read aa dict and pull relevant info
-#=======================
+#=====================================================
 print('Now some more formatting:'
       , '\nRead aa dict and pull relevant info'
       , '\nFormat mutations:'
@@ -1040,12 +1054,13 @@ print('Now some more formatting:'
 ncol_mutf_add = 3 # mut split into 3 cols
 ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
 
-#===========
+#======================================================================
 # Split 'mutation' column into three:  wild_type, position and
 # mutant_type separately. Then map three letter code to one using
 # reference_dict imported at the beginning.
-# After importing, convert to mutation to lowercase for compatibility with dict 
-#===========
+# After importing, convert to mutation to lowercase for 
+# compatibility with dict 
+#=======================================================================
 gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
 
 print('wt regex being used:', wt_regex
@@ -1054,14 +1069,15 @@ print('wt regex being used:', wt_regex
 
 mylen0 = len(gene_LF1.columns)
 
-#=======
+#=========================================================
 # Iterate through the dict, create a lookup dict i.e
 # lookup_dict = {three_letter_code: one_letter_code}.
 # lookup dict should be the key and the value (you want to create a column for)
 # Then use this to perform the mapping separetly for wild type and mutant cols.
-# The three letter code is extracted using a string match match from the dataframe and then converted
+# The three letter code is extracted using a string match match from the 
+# dataframe and then converted
 # to 'pandas series'since map only works in pandas series
-#=======
+#===========================================================
 print('Adding', ncol_mutf_add, 'more cols:\n')
 
 # initialise a sub dict that is lookup dict for three letter code to 1-letter code
@@ -1076,7 +1092,11 @@ for k, v in my_aa_dict.items():
     mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
     gene_LF1['mutant_type'] = mut.map(lookup_dict)
 
-# extract position info from mutation column separetly using string match
+#-------------------
+# extract position 
+# info from mutation 
+# column separetly using string match
+#-------------------
 #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
 gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
 
@@ -1109,11 +1129,11 @@ else:
 # clear variables
 del(k, v, wt, mut, lookup_dict)
 
-########
+##########################################################################
 # combine the wild_type+poistion+mutant_type columns to generate 
 # mutationinformation (matches mCSM output field)
 # Remember to use .map(str) for int col types to allow string concatenation
-#########
+###########################################################################
 gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
 print('Created column: mutationinformation'
 	, '\n=====================================================================\n'
@@ -1138,7 +1158,6 @@ bar = gene_LF1['position'].value_counts()
 #else:
 #    print('FAIL: df could not be ordered. Check source')
 #    sys.exit()
-
 #%% Create a copy of mutationinformation column for downstream mergeing
 gene_LF1['Mut']      = gene_LF1['mutationinformation'] 
 gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] 
@@ -1150,7 +1169,7 @@ gene_LF1['index_orig_copy'] = gene_LF1.index
 all(gene_LF1.index.values == gene_LF1['index_orig'].values)
 all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
 
-#%% quick sanity check for position freq count
+#%% Quick sanity check for position freq count
 # count the freq of 'other muts' samples
 test_df = gene_LF1.copy()
 test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
@@ -1158,6 +1177,7 @@ test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'positi
 #test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
 #print('Revised dim of other_muts_df:',test_df.shape) 
 test_df['scount'] = test_df['mutation'].map(SnpFDict)
+test_df = test_df.sort_values(['position', 'mutationinformation'])
 #%% Map mutation frequency count as a column
 gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
 #%% Map position frequency count as a column
@@ -1166,75 +1186,151 @@ z1 = z.to_dict()
 gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
 
 #test_df2 = test_df.loc[test_df['position'] == 10] 
-
-#%% OUTFILE 4, write file mCSM style muts
-snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
-snps_only.head()
-# assign column name
-snps_only.columns = ['mutationinformation']
-
-# count how many positions this corresponds to
-pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
-pos_only
-
-print('Checking NA in snps...')# should be 0
-if snps_only.mutationinformation.isna().sum() == 0:
-    print ('PASS: NO NAs/missing entries for SNPs'
-    , '\n===============================================================')
-else:
-    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
-
-# write file: mCSM muts
-out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
-outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
-
-print('\n----------------------------------'
-      , '\nWriting file: mCSM style muts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_mcsmsnps
-      , '\nmutation format (SNP): {WT}<POS>{MUT}'
-      , '\nNo. of distinct muts:', len(snps_only)
-      , '\nNo. of distinct positions:', len(pos_only)
-      , '\n=============================================================')
-
-snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
-
-print('Finished writing:', outfile_mcsmsnps
-      , '\nNo. of rows:', len(snps_only)
-      , '\nNo. of cols:', len(snps_only.columns)
-      , '\n=============================================================')
-del(out_filename_mcsmsnps)
-
-#%% OUTFILE 5, write file frequency of position counts:  MOVE TO THE END
-metadata_pos = pd.DataFrame(gene_LF1['position'])
-metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
-metadata_pos['meta_pos_count'].value_counts()
-
-metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
-
-out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
-outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
-print('\n----------------------------------'
-      , '\nWriting file: Metadata poscounts'
-      , '\n----------------------------------'
-      , '\nFile:', outfile_metadata_poscounts
-      , '\n============================================================')
-
-metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
-print('Finished writing:', outfile_metadata_poscounts
-      , '\nNo. of rows:', len(metadata_pos)
-      , '\nNo. of cols:', len(metadata_pos.columns)
-      , '\n=============================================================')
-del(out_filename_metadata_poscounts)
-#%% OUTFILE 6, write file <gene>_metadata:  MOVE TO THE END
-# ---THINK ---
-#%% OUTFILE 7, write file MSA plots:  MOVE TO THE END
-# -- THINK ---
-#%% OUTFILE 8, write file mutational position with count:  MOVE TO THE END
-# -- THINK ---
+###############################################################################
+###############################################################################
 #%% Add column: aa property_water
+#=========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)   
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+mylen2 = len(gene_LF1.columns)    
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen2 == mylen1 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen1
+      , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
 #%% Add column: aa_prop_polarity
+#========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
+   #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+    
+mylen3 = len(gene_LF1.columns)       
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen3 ==  mylen2 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen1
+      , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+   
+# clear variables
+del(k, v, wt, mut, lookup_dict)
 #%% Add column: aa_calcprop
+#========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_calcprop} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)   
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
+ 
+mylen4 = len(gene_LF1.columns)   
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen4 ==  mylen3 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen3
+      , '\nnew len:', mylen4)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
 #%% NEW mappings: gene_LF2
 # gene_LF2: copy gene_LF1
 gene_LF2 = gene_LF1.copy()
@@ -1267,6 +1363,8 @@ gene_LF2['total_id_ucount'] = total_id_ucount
 # gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
 # gene_LF2['mut_id_ucount']
 
+gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
+
 # gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
 
 #%% AF for gene
@@ -1275,7 +1373,7 @@ gene_LF2['total_id_ucount'] = total_id_ucount
 #=================
 gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
 gene_LF2['maf'].head()
-
+###############################################################################
 #%% Further mappings: gene_LF3, with mutationinformation as INDEX
 gene_LF3 = gene_LF2.copy()
 
@@ -1324,6 +1422,7 @@ gene_LF3['drtype_all_vals'].head()
 
 gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
 gene_LF3['drtype_all_names'].head()
+
 #---------------------------------
 # Revised drtype: max(Multimode)
 #--------------------------------
@@ -1406,6 +1505,7 @@ gene_LF3['dst_multimode_all']
 # Fill using multimode ONLY where NA in dst_multimode column
 gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
 gene_LF3['dst_multimode']
+gene_LF3['dst_multimode'].value_counts()
 
 #----------------------------------
 # Revised dst column: max of mode
@@ -1422,6 +1522,11 @@ gene_LF3[drug].value_counts()
 #gene_LF3['dst_noNA'].value_counts()
 gene_LF3['dst_mode'].value_counts()
 
+if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]:
+    print('\nPASS: revised dst mode created successfully and the counts are more than <drug> col count')
+else:
+    print('\nFAIL:  revised dst mode numbers MISmatch')
+
 foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
 foo2 = foo.sort_values(['position', 'Mut'])
 
@@ -1460,6 +1565,7 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
 # Now overwrite
 gene_LF3['mutation_info_labels_dst']  = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
 gene_LF3['mutation_info_labels']  =  gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
+
 if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
     print('\nPASS: Revised mutation_info column created successfully')
     gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
@@ -1480,6 +1586,7 @@ gene_LF3['mutation_info_labels_orig'].value_counts()
 # %% sanity check for the revised dst
 gene_LF3[drug].value_counts()
 gene_LF3[drug].value_counts().sum()
+
 gene_LF3['mutation_info_labels_orig'].value_counts()
 
 gene_LF3['dst_mode'].value_counts()
@@ -1488,6 +1595,8 @@ gene_LF3['dst_mode'].value_counts().sum()
 # direct comparision
 gene_LF3['dst'].value_counts()
 gene_LF3['mutation_info_labels'].value_counts()
+gene_LF3['mutation_info_labels_v1'].value_counts()
+
 #%% Lineage
 gene_LF3['lineage'].value_counts()
 # lineage_label_numeric = {'lineage1' : 1
@@ -1509,24 +1618,52 @@ lineage_label_numeric = {'L1' : 1
                     , 'LBOV' : 8}
 
 lineage_label_numeric
-# copy column to allow cross checks after stripping white space
-gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
-gene_LF3['lineage_corrupt'] = gene_LF3['lineage'] 
-#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts())
-gene_LF3['lineage_corrupt'].value_counts()
+#%% Lineage cols selected: gene_LF3_ColsSel
+gene_LF3_ColsSel = gene_LF3.copy()
+gene_LF3_ColsSel.index
+gene_LF3_ColsSel.columns
+#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'] # delete
 
-#%%tidy_split(): Lineage
+# copy column to allow cross checks after stripping white space
+gene_LF3_ColsSel['lineage']         = gene_LF3_ColsSel['lineage'].str.strip()
+gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage'] 
+all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts())
+gene_LF3_ColsSel['lineage_corrupt'].value_counts()
+
+gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'position', 'snp_frequency', 'lineage', 'lineage_corrupt']]
+gene_LF3_ColsSel.columns
+
+#%% tidy_split(): Lineage
 # Create df with tidy_split: lineage
-lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';') 
+lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';') 
 lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
 lf_lin_split['lineage_corrupt'].value_counts()
+#%% Important sanity check for tidy_split(): lineage
+# Split based on semi colon dr_muts_col
+search = ";"
+ 
+# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
+count_df_lin = gene_LF3_ColsSel[['lineage']]
+count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
+lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
+lin_sc_C 
 
-# Map lineage labels to numbers to allow metrics
+lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count']
+lin_sc_C 
+expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum()
+expected_linC
+
+if (len(lf_lin_split) == expected_linC):
+    print('\nPASS: tidy_split() length match for lineage')
+else:
+    sys.exit('\nFAIL: tidy_split() length MISMATCH. Check lineage semicolon count')
+###############################################################################
+#%% Map lineage labels to numbers to allow metrics
 lf_lin_split['lineage_numeric']  = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
 lf_lin_split['lineage_numeric'].value_counts()
 
 #--------------------------------
-# Lineage_corrupt ALL values:
+# Add lineage_list: ALL values:
 #--------------------------------
 # Add all lineages for each mutation
 lf_lin_split['lineage_corrupt_list']  = lf_lin_split['lineage_corrupt'] 
@@ -1535,12 +1672,21 @@ lf_lin_split['lineage_corrupt_list'].value_counts()
 lf_lin_split['lineage_corrupt_list']  = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
 lf_lin_split['lineage_corrupt_list'].value_counts()
 
+#--------------------------------
+# Add lineage count: ALL
+#--------------------------------
+lf_lin_split['lineage_corrupt_count']  = lf_lin_split['lineage_corrupt_list'].apply(lambda x : len(list(x)))
+
+#--------------------------------
 # Add lineage unique count
+#--------------------------------
 lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
 #lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
 lf_lin_split['lineage_corrupt_ucount'].value_counts()
 
+#--------------------------------
 # Add lineage_set
+#--------------------------------
 lf_lin_split['lineage_set']    = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
 lf_lin_split['lineage_ulist']  = lf_lin_split['lineage_set'].apply(lambda x : list(x))
 
@@ -1551,95 +1697,275 @@ lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')[
 lf_lin_split['lineage_multimode'].value_counts()
 
 # cant take max as it doesn't mean anyting!
-foo = lf_lin_split[['Mut', 'lineage_ulist']]
 
 ###############################################################################
-#%% Final merge: gene_LF4 with lineage_split_df
-gene_LF4 = gene_LF3.copy()
-gene_LF4.index
-gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
+#%% Select only the columns you want to merge from lf_lin_split 
+lf_lin_split.columns
+lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
+                                    , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']]
+lf_lin_split_ColSel.columns
 
-lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
-#================================
-# Merge with gene_LF3 with
-# lf_lin_split baseed on index
-#================================
-# set index for lf_lin_split
-lf_lin_split.index
-lf_lin_split.reset_index(inplace=True)
-lf_lin_split['mutationinformation']
-lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
-lf_lin_split.index
+lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list'     : 'lineage_list_all'
+                               , 'lineage_corrupt_count'  : 'lineage_count_all'
+                               , 'lineage_corrupt_ucount' : 'lineage_count_unique'
+                               , 'lineage_ulist'          : 'lineage_list_unique'
+                               , 'lineage_multimode'      : 'lineage_multimode'}, inplace = True)
 
-# set index for gene_LF4
-gene_LF4.index
-gene_LF4.reset_index(inplace=True)
-gene_LF4.index
-gene_LF4['mutationinformation']
-gene_LF4['index_orig_copy']
-gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
-gene_LF4.index
+lf_lin_split_ColSel.columns
+ncols_to_merge = len(lf_lin_split_ColSel.columns)
 
-#-------------------------
-# colum lineage_ucount:
-# contribution of each distinct lineage
-#-------------------------
-gene_LF4['lineage_ucount'] = gene_LF4['lineage'] 
+#%% Final merge: gene_LF3 with lf_lin_split_ColSel: gene_LF4
+#===============================
+# merge: inner join by default
+# join: left join default
+# concat: outer join by default
+#df1.join(df2)
+#===============================
+len(lf_lin_split)
+len(gene_LF3)
 
-#-------------------------
-# colum lineage list:
-#-------------------------
-#gene_LF4['lineage_set'] = gene_LF4['lineage'] 
-gene_LF4['lineage_ulist'] = gene_LF4['lineage'] 
-
-gene_LF4['lineage_list'] = gene_LF4['lineage'] 
-
-#-------------------------
-# colum lineage_list mode:
-#-------------------------
-gene_LF4['lineage_mode'] = gene_LF4['lineage'] 
-
-# merge based on indices
-gene_LF4.index.nunique()
-lf_lin_split.index.nunique()
-all(gene_LF4.index.isin(lf_lin_split.index))
-all(lf_lin_split.index.isin(gene_LF4.index))
-gene_LF4.index
-lf_lin_split.index
-
-if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF4.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF4.index)) ):
-    print('\nPass: merging lineage_ucount with gene_LF4')
+# Dropping duplicates from lineage df
+lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()]
+lf_lin_split_U    = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()]
+if len(lf_lin_split_U) == len(snps_only):
+    print('\nPASS: Duplicate entries removed from lf_lin'
+          , '\nReady to start the final merge to generate gene_LF4')
 else:
-    sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
+    print('\nFAIL: DF after duplicate removal numbers MISmatch ')
+    
+gene_LF3.index
+lf_lin_split_U.index
 
-###########################
-# magic merge happens here
-###########################
-# FIXME: add check here to see if the duplicated indices rows are actual duplicates as the cols I need should be summary cols
-lf_lin_split.index.drop_duplicates(keep='first')
-lf_lin_split = lf_lin_split
-lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
-lf_lin_split_U.shape
+#--------------------
+# Creating gene_LF4  
+#-------------------- 
+if all(gene_LF3.index.isin(lf_lin_split_U.index)) :
+    print('\nIndices match, staring final merge...')
+    gene_LF4 = gene_LF3.join(lf_lin_split_U)
+    if len(gene_LF4.columns) == len(gene_LF3.columns) + ncols_to_merge: 
+        print('\nPASS: gene_LF4 created after successfully merging with lineage info'
+              , '\nShape of gene_LF4:', gene_LF4.shape)
+else:
+    sys.exit('\nFAIL: gene_LF4 could not be created. Dfs could not be merged. Check indices or dfs length again!')
 
-gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount']  = lf_lin_split_U['lineage_corrupt_ucount']
-gene_LF4['lineage_ucount'].value_counts()
+#----------------------------------------
+# Dropping redundant cols from  gene_LF4  
+#----------------------------------------
+if all(gene_LF4.index == gene_LF4['Mut']) and gene_LF4['index_orig'].equals(gene_LF4['index_orig_copy']):
+    gene_LF4.reset_index(inplace=True)
+    #gene_LF4 = gene_LF4.set_index(['index_orig'])
+    print('\nPass Mutationinformationa and index columns checked, the duplicates can now be dropped')
+    gene_LF4 = gene_LF4.drop(['Mut_copy', 'index_orig_copy', 'mutation_info_v1' ], axis = 1)
 
-#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set']  = lf_lin_split_U['lineage_set'] 
-#gene_LF4['lineage_set'].value_counts()
-gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist']  = lf_lin_split_U['lineage_ulist'] 
-gene_LF4['lineage_ulist'].value_counts()
+#%% Final output with relevant columns
+print('\nFinal output file: Dim of gene_LF4:', gene_LF4.shape)
 
-gene_LF4.loc[lf_lin_split_U.index, 'lineage_list']  = lf_lin_split_U['lineage_corrupt_list'] 
-gene_LF4['lineage_list'].value_counts()
+# cols_to_output = ['mutationinformation', 'id', 'sample', 'lineage', 'sublineage',
+#        'country_code', 'drtype', 'pyrazinamide', 'mutation', 'drug_name',
+#        'mutation_info', 'mutation_info_orig', 'wild_type', 'mutant_type',
+#        'position', 'Mut', 'index_orig', 'pos_count', 'total_id_ucount',
+#        'snp_frequency', 'maf', 'drtype_numeric', 'drtype_all_vals',
+#        'drtype_all_names', 'drtype_multimode', 'drtype_mode', 'drtype_max',
+#        'mutation_info_labels', 'dm_om_numeric', 'dm_om_numeric_orig', 'dst',
+#        'dst_multimode', 'dst_multimode_all', 'dst_mode', 'lineage_list_all',
+#        'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique',
+#        'lineage_multimode']
+#%% OUTFILE 3, write file mCSM style muts
+snps_only = pd.DataFrame(gene_LF4['mutationinformation'].unique())
+snps_only.head()
+# assign column name
+snps_only.columns = ['mutationinformation']
 
-gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode']  = lf_lin_split_U['lineage_multimode']
-gene_LF4['lineage_mode'].value_counts()
+# count how many positions this corresponds to
+pos_only = pd.DataFrame(gene_LF4['position'].unique()) 
+pos_only
 
-foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
-                #, 'lineage_set'
-                , 'lineage_ulist'
-                , 'lineage_mode'
-                , 'lineage_list']]
-#%%
+print('Checking NA in snps...')# should be 0
+if snps_only.mutationinformation.isna().sum() == 0:
+    print ('PASS: NO NAs/missing entries for SNPs'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
 
-#Subset relevant columns for output and put the rest of the output here
\ No newline at end of file
+# write file: mCSM muts
+out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
+outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
+
+print('\n----------------------------------'
+      , '\nWriting file: mCSM style muts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_mcsmsnps
+      , '\nmutation format (SNP): {WT}<POS>{MUT}'
+      , '\nNo. of distinct muts:', len(snps_only)
+      , '\nNo. of distinct positions:', len(pos_only)
+      , '\n=============================================================')
+
+snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
+
+print('Finished writing:', outfile_mcsmsnps
+      , '\nNo. of rows:', len(snps_only)
+      , '\nNo. of cols:', len(snps_only.columns)
+      , '\n=============================================================')
+del(out_filename_mcsmsnps)
+###############################################################################
+#%% OUTFILE 4, write file frequency of position count
+metadata_pos = pd.DataFrame(gene_LF4['position'])
+metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
+metadata_pos['meta_pos_count'].value_counts()
+
+metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
+
+out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
+outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
+print('\n----------------------------------'
+      , '\nWriting file: Metadata poscounts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_metadata_poscounts
+      , '\n============================================================')
+
+metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
+print('Finished writing:', outfile_metadata_poscounts
+      , '\nNo. of rows:', len(metadata_pos)
+      , '\nNo. of cols:', len(metadata_pos.columns)
+      , '\n=============================================================')
+del(out_filename_metadata_poscounts)
+###############################################################################
+#%% OUTFILE 5, write file <gene>_metadata
+# where each row has UNIQUE mutations NOT unique sample ids
+out_filename_metadata = gene.lower() + '_metadata.csv'
+outfile_metadata = outdir + '/' + out_filename_metadata
+print('\n----------------------------------'
+      , '\nWriting file: LF formatted data'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_metadata
+      , '\n============================================================')
+
+gene_LF4.to_csv(outfile_metadata, header = True, index = False)
+print('Finished writing:', outfile_metadata
+      , '\nNo. of rows:', len(gene_LF4)
+      , '\nNo. of cols:', len(gene_LF4.columns)
+      , '\n=============================================================')
+del(out_filename_metadata)
+###############################################################################
+#%% OUTFILE 6, write file MSA plots
+#write file: mCSM style but with repitions for MSA and logo plots
+all_muts_msa = pd.DataFrame(gene_LF4['mutationinformation']) 
+all_muts_msa.head()
+
+# assign column name
+all_muts_msa.columns = ['mutationinformation']
+
+# make sure it is string
+all_muts_msa.columns.dtype
+
+# sort the column
+all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
+
+# create an extra column with protein name
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
+#all_muts_msa_sorted.head()
+
+# rearrange columns so the fasta name is the first column (required for mutate.script)
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+all_muts_msa_sorted.head()
+
+print('Checking NA in snps...')# should be 0
+if all_muts_msa.mutationinformation.isna().sum() == 0:
+    print ('PASS: NO NAs/missing entries for SNPs'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
+
+out_filename_msa = gene.lower() +'_all_muts_msa.csv'
+outfile_msa = outdir + '/' + out_filename_msa
+
+print('\n----------------------------------'
+      , '\nWriting file: mCSM style muts for msa'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_msa
+      , '\nmutation format (SNP): {WT}<POS>{MUT}'
+      , '\nNo.of lines of msa:', len(all_muts_msa))
+
+all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
+
+print('Finished writing:', outfile_msa
+      , '\nNo. of rows:', len(all_muts_msa)
+      , '\nNo. of cols:', len(all_muts_msa.columns)
+      , '\n=============================================================')
+
+del(out_filename_msa)
+###############################################################################
+#%% OUTFILE 7, write file mutational position with count
+# write file for mutational positions
+# count how many positions this corresponds to
+pos_only = pd.DataFrame(gene_LF4['position'].unique()) 
+# assign column name
+pos_only.columns = ['position']
+# make sure dtype of column position is int or numeric and not string
+pos_only.position.dtype
+pos_only['position'] = pos_only['position'].astype(int)
+pos_only.position.dtype
+
+# sort by position value
+pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
+
+out_filename_pos = gene.lower() + '_mutational_positons.csv'
+outfile_pos = outdir + '/' + out_filename_pos
+
+print('\n----------------------------------'
+      , '\nWriting file: mutational positions'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_pos
+      , '\nNo. of distinct positions:', len(pos_only_sorted)
+      , '\n=============================================================')
+
+pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
+
+print('Finished writing:', outfile_pos
+      , '\nNo. of rows:', len(pos_only_sorted)
+      , '\nNo. of cols:', len(pos_only_sorted.columns)
+      , '\n============================================================='
+      , '\n\n\n')
+
+del(out_filename_pos)
+###############################################################################
+#%% Quick summary output
+print('============================================'
+      , '\nQuick summary output for', drug, 'and' , gene.lower()
+      , '\n============================================'
+      , '\nTotal samples:', total_samples
+      , '\nNo. of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts()
+	  , '\n'
+      , '\nPercentage of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
+	  , '\n'
+      
+      , '\nNo. of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts()
+	  , '\n'
+      , '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
+	  , '\n'
+     
+      , '\nTotal no. of unique snps:', len(snps_only)
+      , '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
+
+      , '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD 
+      , '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD 
+      
+      
+      , '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
+      , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
+      , '\nTotal no. of samples with missense muts:', len(gene_LF4)
+      , '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()  
+      , '\n')
+
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    print('\nTotal no.of samples with ambiguous muts:', len(inspect)
+          #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
+          , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
+          , '\n============================================================='
+          , '\nPost resolving ambiguity\n'
+          , ambig_muts_rev_df['mutation_info_REV'].value_counts())
+#=======================================================================
+print(u'\u2698' * 50,
+      '\nEnd of script: Data extraction and writing files'
+      '\n' + u'\u2698' * 50 )
+#%% end of script
\ No newline at end of file