done but getting an error when running from cmd

This commit is contained in:
Tanushree Tunstall 2022-04-26 17:03:40 +01:00
parent 29e9d10e39
commit ac0d14e116

710
scripts/data_extraction.py Normal file → Executable file
View file

@ -314,7 +314,7 @@ print('===========================================================\n'
#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] #meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
# so downstream code doesn't change # so downstream code doesn't change
meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
#%% DF: with dr_muts_col #%% DF: with dr_muts_col, meta_gene_dr
meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
@ -327,7 +327,7 @@ else:
, '\nshape of df2:', meta_gene_dr1.shape , '\nshape of df2:', meta_gene_dr1.shape
, '\nCheck again!') , '\nCheck again!')
############################################################################## ##############################################################################
#%% DF: with other_muts_col #%% DF: with other_muts_col, other_gene_dr
meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
@ -365,11 +365,11 @@ print('===========================================================\n'
, '\n===========================================================') , '\n===========================================================')
############################################################################### ###############################################################################
#%% Create a copy of indices for downstream mergeing #%% Create a copy of indices for downstream mergeing
meta_gene_all['index_orig'] = meta_gene_all.index #meta_gene_all['index_orig'] = meta_gene_all.index
meta_gene_all['index_orig_copy'] = meta_gene_all.index #meta_gene_all['index_orig_copy'] = meta_gene_all.index
all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) #all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) #all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
############################################################################## ##############################################################################
#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. #%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc.
# Split based on semi colon dr_muts_col # Split based on semi colon dr_muts_col
@ -456,13 +456,13 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
# DF1: dr_muts_col # DF1: dr_muts_col
# and remove leading white spaces # and remove leading white spaces
#========= #=========
col_to_split1 = dr_muts_col #col_to_split1 = dr_muts_col
print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
, '\ncolumn name to apply tidy_split():' , col_to_split1 , '\ncolumn name to apply tidy_split():' , dr_muts_col
, '\n============================================================') , '\n============================================================')
# apply tidy_split() # apply tidy_split()
dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';')
# remove leading white space else these are counted as distinct mutations as well # remove leading white space else these are counted as distinct mutations as well
dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip()
@ -579,13 +579,13 @@ del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
# DF2: other_muts_col # DF2: other_muts_col
# and remove leading white spaces # and remove leading white spaces
#========= #=========
col_to_split2 = other_muts_col #col_to_split2 = other_muts_col
print ('applying second tidy split() separately on other muts df', meta_gene_other.shape print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
, '\ncolumn name to apply tidy_split():', col_to_split2 , '\ncolumn name to apply tidy_split():', other_muts_col
, '\n============================================================') , '\n============================================================')
# apply tidy_split() # apply tidy_split()
other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';')
# remove the leading white spaces in the column # remove the leading white spaces in the column
other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
@ -683,7 +683,7 @@ SnpFDict = lower_dict(SnpFDict_orig)
############################################################################### ###############################################################################
# USE Vcounts to get expected curated df # USE Vcounts to get expected curated df
# resolve dm om muts funda # resolve dm om muts funda
#%%#%% Concatenating two dfs: gene_LF0 #%% Concatenating two dfs: gene_LF0
# equivalent of rbind in R # equivalent of rbind in R
#========== #==========
# Concatentating the two dfs: equivalent of rbind in R # Concatentating the two dfs: equivalent of rbind in R
@ -922,13 +922,20 @@ ambig_muts_rev_df = pd.DataFrame()
changes_val = [] changes_val = []
changes_dict = {} changes_dict = {}
for i in common_muts: ##BROKENNNN!!!!
#print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
common_muts_lower
##BROKENNNN!!!!
#for i in common_muts:
for i in common_muts_lower:
print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
temp_df
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
max_info_table
revised_label = max_info_table[[0]].index[0][1] # max value_count revised_label = max_info_table[[0]].index[0][1] # max value_count
old_label = max_info_table[[1]].index[0][1] # min value_count old_label = max_info_table[[1]].index[0][1] # min value_count
print('\nAmbiguous mutation labels...' print('\nAmbiguous mutation labels...'
@ -948,12 +955,12 @@ ambig_muts_rev_df['mutation_info_orig'].value_counts()
changes_val changes_val
changes_total = sum(changes_val) changes_total = sum(changes_val)
changes_dict changes_dict
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
#=================== #==================
# ambiguous muts # ambiguous muts
#======================= #==================
#dr_muts.to_csv('dr_muts.csv', header = True) #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
#other_muts.to_csv('other_muts.csv', header = True) #other_muts.XXXX_csvXXX('other_muts.csv', header = True)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
@ -972,19 +979,20 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
, '\n=============================================================') , '\n=============================================================')
del(out_filename_ambig_muts) del(out_filename_ambig_muts)
#%% FIXME: ambig mut counts #%% OUTFILE 2, write file: ambiguous mut counts
#======================= #======================
# ambiguous mut counts # ambiguous mut counts
#======================= #======================
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
print('\n----------------------------------' print('\n----------------------------------'
, '\nWriting file: ambiguous muts' , '\nWriting file: ambiguous mut counts'
, '\n----------------------------------' , '\n----------------------------------'
, '\nFilename:', outfile_ambig_mut_counts) , '\nFilename:', outfile_ambig_mut_counts)
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks #%% FIXME: Add sanity check to make sure you can add value_count checks
print('\nREACHED here...................>>>>>>>')
#%% Resolving ambiguous muts #%% Resolving ambiguous muts
# Merging ambiguous muts # Merging ambiguous muts
#================= #=================
@ -995,6 +1003,12 @@ ambig_muts_rev_df.index
gene_LF1.index gene_LF1.index
all(ambig_muts_rev_df.index.isin(gene_LF1.index)) all(ambig_muts_rev_df.index.isin(gene_LF1.index))
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
else:
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] #gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
@ -1024,9 +1038,9 @@ gene_LF1['mutation_info'].value_counts()
#%% PHEW! Good to go for downstream stuff #%% PHEW! Good to go for downstream stuff
#%% Add column: Mutationinformation #%% Add column: Mutationinformation
# splitting mutation column to get mCSM style muts # splitting mutation column to get mCSM style muts
#======================= #=====================================================
# Formatting df: read aa dict and pull relevant info # Formatting df: read aa dict and pull relevant info
#======================= #=====================================================
print('Now some more formatting:' print('Now some more formatting:'
, '\nRead aa dict and pull relevant info' , '\nRead aa dict and pull relevant info'
, '\nFormat mutations:' , '\nFormat mutations:'
@ -1040,12 +1054,13 @@ print('Now some more formatting:'
ncol_mutf_add = 3 # mut split into 3 cols ncol_mutf_add = 3 # mut split into 3 cols
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
#=========== #======================================================================
# Split 'mutation' column into three: wild_type, position and # Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one using # mutant_type separately. Then map three letter code to one using
# reference_dict imported at the beginning. # reference_dict imported at the beginning.
# After importing, convert to mutation to lowercase for compatibility with dict # After importing, convert to mutation to lowercase for
#=========== # compatibility with dict
#=======================================================================
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
print('wt regex being used:', wt_regex print('wt regex being used:', wt_regex
@ -1054,14 +1069,15 @@ print('wt regex being used:', wt_regex
mylen0 = len(gene_LF1.columns) mylen0 = len(gene_LF1.columns)
#======= #=========================================================
# Iterate through the dict, create a lookup dict i.e # Iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}. # lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for) # lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separetly for wild type and mutant cols. # Then use this to perform the mapping separetly for wild type and mutant cols.
# The three letter code is extracted using a string match match from the dataframe and then converted # The three letter code is extracted using a string match match from the
# dataframe and then converted
# to 'pandas series'since map only works in pandas series # to 'pandas series'since map only works in pandas series
#======= #===========================================================
print('Adding', ncol_mutf_add, 'more cols:\n') print('Adding', ncol_mutf_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to 1-letter code # initialise a sub dict that is lookup dict for three letter code to 1-letter code
@ -1076,7 +1092,11 @@ for k, v in my_aa_dict.items():
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mutant_type'] = mut.map(lookup_dict) gene_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separetly using string match #-------------------
# extract position
# info from mutation
# column separetly using string match
#-------------------
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
@ -1109,11 +1129,11 @@ else:
# clear variables # clear variables
del(k, v, wt, mut, lookup_dict) del(k, v, wt, mut, lookup_dict)
######## ##########################################################################
# combine the wild_type+poistion+mutant_type columns to generate # combine the wild_type+poistion+mutant_type columns to generate
# mutationinformation (matches mCSM output field) # mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation # Remember to use .map(str) for int col types to allow string concatenation
######### ###########################################################################
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
print('Created column: mutationinformation' print('Created column: mutationinformation'
, '\n=====================================================================\n' , '\n=====================================================================\n'
@ -1138,7 +1158,6 @@ bar = gene_LF1['position'].value_counts()
#else: #else:
# print('FAIL: df could not be ordered. Check source') # print('FAIL: df could not be ordered. Check source')
# sys.exit() # sys.exit()
#%% Create a copy of mutationinformation column for downstream mergeing #%% Create a copy of mutationinformation column for downstream mergeing
gene_LF1['Mut'] = gene_LF1['mutationinformation'] gene_LF1['Mut'] = gene_LF1['mutationinformation']
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
@ -1150,7 +1169,7 @@ gene_LF1['index_orig_copy'] = gene_LF1.index
all(gene_LF1.index.values == gene_LF1['index_orig'].values) all(gene_LF1.index.values == gene_LF1['index_orig'].values)
all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
#%% quick sanity check for position freq count #%% Quick sanity check for position freq count
# count the freq of 'other muts' samples # count the freq of 'other muts' samples
test_df = gene_LF1.copy() test_df = gene_LF1.copy()
test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
@ -1158,6 +1177,7 @@ test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'positi
#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') #test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
#print('Revised dim of other_muts_df:',test_df.shape) #print('Revised dim of other_muts_df:',test_df.shape)
test_df['scount'] = test_df['mutation'].map(SnpFDict) test_df['scount'] = test_df['mutation'].map(SnpFDict)
test_df = test_df.sort_values(['position', 'mutationinformation'])
#%% Map mutation frequency count as a column #%% Map mutation frequency count as a column
gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
#%% Map position frequency count as a column #%% Map position frequency count as a column
@ -1166,75 +1186,151 @@ z1 = z.to_dict()
gene_LF1['pos_count'] = gene_LF1['position'].map(z1) gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
#test_df2 = test_df.loc[test_df['position'] == 10] #test_df2 = test_df.loc[test_df['position'] == 10]
###############################################################################
#%% OUTFILE 4, write file mCSM style muts ###############################################################################
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
pos_only
print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
, '\nWriting file: mCSM style muts'
, '\n----------------------------------'
, '\nFile:', outfile_mcsmsnps
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
, '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
del(out_filename_mcsmsnps)
#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END
metadata_pos = pd.DataFrame(gene_LF1['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% OUTFILE 6, write file <gene>_metadata: MOVE TO THE END
# ---THINK ---
#%% OUTFILE 7, write file MSA plots: MOVE TO THE END
# -- THINK ---
#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END
# -- THINK ---
#%% Add column: aa property_water #%% Add column: aa property_water
#=========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
mylen2 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen2 == mylen1 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_prop_polarity #%% Add column: aa_prop_polarity
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
mylen3 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen3 == mylen2 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop #%% Add column: aa_calcprop
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_calcprop']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
mylen4 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen4 == mylen3 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% NEW mappings: gene_LF2 #%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1 # gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy() gene_LF2 = gene_LF1.copy()
@ -1267,6 +1363,8 @@ gene_LF2['total_id_ucount'] = total_id_ucount
# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) # gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
# gene_LF2['mut_id_ucount'] # gene_LF2['mut_id_ucount']
gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) # gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
#%% AF for gene #%% AF for gene
@ -1275,7 +1373,7 @@ gene_LF2['total_id_ucount'] = total_id_ucount
#================= #=================
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
gene_LF2['maf'].head() gene_LF2['maf'].head()
###############################################################################
#%% Further mappings: gene_LF3, with mutationinformation as INDEX #%% Further mappings: gene_LF3, with mutationinformation as INDEX
gene_LF3 = gene_LF2.copy() gene_LF3 = gene_LF2.copy()
@ -1324,6 +1422,7 @@ gene_LF3['drtype_all_vals'].head()
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
gene_LF3['drtype_all_names'].head() gene_LF3['drtype_all_names'].head()
#--------------------------------- #---------------------------------
# Revised drtype: max(Multimode) # Revised drtype: max(Multimode)
#-------------------------------- #--------------------------------
@ -1406,6 +1505,7 @@ gene_LF3['dst_multimode_all']
# Fill using multimode ONLY where NA in dst_multimode column # Fill using multimode ONLY where NA in dst_multimode column
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
gene_LF3['dst_multimode'] gene_LF3['dst_multimode']
gene_LF3['dst_multimode'].value_counts()
#---------------------------------- #----------------------------------
# Revised dst column: max of mode # Revised dst column: max of mode
@ -1422,6 +1522,11 @@ gene_LF3[drug].value_counts()
#gene_LF3['dst_noNA'].value_counts() #gene_LF3['dst_noNA'].value_counts()
gene_LF3['dst_mode'].value_counts() gene_LF3['dst_mode'].value_counts()
if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]:
print('\nPASS: revised dst mode created successfully and the counts are more than <drug> col count')
else:
print('\nFAIL: revised dst mode numbers MISmatch')
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
foo2 = foo.sort_values(['position', 'Mut']) foo2 = foo.sort_values(['position', 'Mut'])
@ -1460,6 +1565,7 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
# Now overwrite # Now overwrite
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
print('\nPASS: Revised mutation_info column created successfully') print('\nPASS: Revised mutation_info column created successfully')
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
@ -1480,6 +1586,7 @@ gene_LF3['mutation_info_labels_orig'].value_counts()
# %% sanity check for the revised dst # %% sanity check for the revised dst
gene_LF3[drug].value_counts() gene_LF3[drug].value_counts()
gene_LF3[drug].value_counts().sum() gene_LF3[drug].value_counts().sum()
gene_LF3['mutation_info_labels_orig'].value_counts() gene_LF3['mutation_info_labels_orig'].value_counts()
gene_LF3['dst_mode'].value_counts() gene_LF3['dst_mode'].value_counts()
@ -1488,6 +1595,8 @@ gene_LF3['dst_mode'].value_counts().sum()
# direct comparision # direct comparision
gene_LF3['dst'].value_counts() gene_LF3['dst'].value_counts()
gene_LF3['mutation_info_labels'].value_counts() gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_labels_v1'].value_counts()
#%% Lineage #%% Lineage
gene_LF3['lineage'].value_counts() gene_LF3['lineage'].value_counts()
# lineage_label_numeric = {'lineage1' : 1 # lineage_label_numeric = {'lineage1' : 1
@ -1509,24 +1618,52 @@ lineage_label_numeric = {'L1' : 1
, 'LBOV' : 8} , 'LBOV' : 8}
lineage_label_numeric lineage_label_numeric
#%% Lineage cols selected: gene_LF3_ColsSel
# Work on a copy of gene_LF3 restricted to the columns needed for the
# lineage tidy-split, so the split cannot disturb the main df.
gene_LF3_ColsSel = gene_LF3.copy()
gene_LF3_ColsSel.index
gene_LF3_ColsSel.columns

# FIXME(review): the creation of 'index_orig_copy' was commented out
# (#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'])
# but the column is still selected below — recreate it if absent so the
# selection cannot raise KeyError when run from the command line.
if 'index_orig_copy' not in gene_LF3_ColsSel.columns:
    gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig']

# copy column to allow cross checks after stripping white space
gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip()
gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage']
all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts())
gene_LF3_ColsSel['lineage_corrupt'].value_counts()

# NOTE(review): 'mutationinformation' added to the selection — the later
# groupby('mutationinformation') on the tidy-split df needs it; without it
# that groupby raises KeyError.
gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'mutationinformation'
                                     , 'position', 'snp_frequency', 'lineage', 'lineage_corrupt']]
gene_LF3_ColsSel.columns
#%% tidy_split(): Lineage
# Create df with tidy_split: lineage — one row per semicolon-separated
# lineage value (tidy_split is a helper defined earlier in this file)
lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';')
# strip whitespace left around the separator after the split
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
lf_lin_split['lineage_corrupt'].value_counts()
#%% Important sanity check for tidy_split(): lineage
# Expected row count after splitting on ';': for every row,
# (no. of semicolons + 1) rows are produced; NaN lineages produce one row each.
search = ";"

# count of occurrence of ";" in lineage
# FIX(review): select with .copy() — the original plain slice is a view, so the
# column assignment below triggers SettingWithCopyWarning and may silently
# fail to write, depending on the pandas version.
count_df_lin = gene_LF3_ColsSel[['lineage']].copy()
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)

# value_counts().reset_index() yields: 'index' = semicolon count,
# 'lineage_semicolon_count' = how many rows have that count
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
lin_sc_C
# rows produced per group = (semicolons + 1) * occurrences
lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count']
lin_sc_C
expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum()
expected_linC

if (len(lf_lin_split) == expected_linC):
    print('\nPASS: tidy_split() length match for lineage')
else:
    sys.exit('\nFAIL: tidy_split() length MISMATCH. Check lineage semicolon count')
#%% Map lineage labels to numbers to allow metrics
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
lf_lin_split['lineage_numeric'].value_counts()

#--------------------------------
# Add lineage_list: ALL values:
#--------------------------------
# Add all lineages for each mutation
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
# NOTE(review): groupby key is 'mutationinformation' — this assumes the column
# was carried through gene_LF3_ColsSel into the tidy-split df; if only 'Mut'
# was selected there, this line raises KeyError. TODO confirm.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
lf_lin_split['lineage_corrupt_list'].value_counts()

#--------------------------------
# Add lineage count: ALL
#--------------------------------
lf_lin_split['lineage_corrupt_count'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : len(list(x)))

#--------------------------------
# Add lineage unique count
#--------------------------------
# per-mutation count of distinct lineages, broadcast back via map on 'Mut'
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
lf_lin_split['lineage_corrupt_ucount'].value_counts()

#--------------------------------
# Add lineage_set
#--------------------------------
# set of lineages per row-list, and its list form (unique lineages)
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
@ -1551,95 +1697,275 @@ lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')[
# inspect the per-mutation multimode of the numeric lineage labels
# (built on the preceding line, truncated in this view)
lf_lin_split['lineage_multimode'].value_counts()
# cant take max as it doesn't mean anyting!
foo = lf_lin_split[['Mut', 'lineage_ulist']]
###############################################################################
#%% Select only the columns you want to merge from lf_lin_split
lf_lin_split.columns
# FIX(review): .copy() so the inplace rename below acts on a real frame,
# not a view (avoids SettingWithCopyWarning / silent no-op).
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list', 'lineage_corrupt_count'
                                    , 'lineage_corrupt_ucount', 'lineage_ulist', 'lineage_multimode']].copy()
lf_lin_split_ColSel.columns

lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list'   : 'lineage_list_all'
                                      , 'lineage_corrupt_count'  : 'lineage_count_all'
                                      , 'lineage_corrupt_ucount' : 'lineage_count_unique'
                                      , 'lineage_ulist'          : 'lineage_list_unique'
                                      , 'lineage_multimode'      : 'lineage_multimode'}, inplace = True)
lf_lin_split_ColSel.columns
# no. of lineage summary cols that the final merge should add
ncols_to_merge = len(lf_lin_split_ColSel.columns)

#%% Final merge: gene_LF3 with lf_lin_split_ColSel: gene_LF4
#===============================
# merge: inner join by default
# join: left join default
# concat: outer join by default
#df1.join(df2)
#===============================
len(lf_lin_split)
len(gene_LF3)

# Dropping duplicates from lineage df: the tidy-split df has one row per
# lineage occurrence, but the summary columns are identical within a
# mutation, so keep the first row per index entry.
lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()]
lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()]

# FIX(review): the original compared against len(snps_only), but snps_only is
# only created further down (from gene_LF4), so running the script
# top-to-bottom from the command line raised NameError. Compare against the
# number of unique mutations in gene_LF3 instead, which is what snps_only holds.
if len(lf_lin_split_U) == gene_LF3['mutationinformation'].nunique():
    print('\nPASS: Duplicate entries removed from lf_lin'
          , '\nReady to start the final merge to generate gene_LF4')
else:
    print('\nFAIL: DF after duplicate removal numbers MISmatch ')
    gene_LF3.index
    lf_lin_split_U.index

#--------------------
# Creating gene_LF4
#--------------------
if all(gene_LF3.index.isin(lf_lin_split_U.index)):
    print('\nIndices match, staring final merge...')
    # index-on-index left join adds the lineage summary cols to gene_LF3
    gene_LF4 = gene_LF3.join(lf_lin_split_U)
    if len(gene_LF4.columns) == len(gene_LF3.columns) + ncols_to_merge:
        print('\nPASS: gene_LF4 created after successfully merging with lineage info'
              , '\nShape of gene_LF4:', gene_LF4.shape)
    else:
        sys.exit('\nFAIL: gene_LF4 could not be created. Dfs could not be merged. Check indices or dfs length again!')

#----------------------------------------
# Dropping redundant cols from gene_LF4
#----------------------------------------
# only drop once the index really is the mutation id and the copied index
# column matches the original
if all(gene_LF4.index == gene_LF4['Mut']) and gene_LF4['index_orig'].equals(gene_LF4['index_orig_copy']):
    gene_LF4.reset_index(inplace=True)
    #gene_LF4 = gene_LF4.set_index(['index_orig'])
    print('\nPass Mutationinformationa and index columns checked, the duplicates can now be dropped')
    gene_LF4 = gene_LF4.drop(['Mut_copy', 'index_orig_copy', 'mutation_info_v1' ], axis = 1)

#%% Final output with relevant columns
print('\nFinal output file: Dim of gene_LF4:', gene_LF4.shape)
# cols_to_output = ['mutationinformation', 'id', 'sample', 'lineage', 'sublineage',
#                   'country_code', 'drtype', 'pyrazinamide', 'mutation', 'drug_name',
#                   'mutation_info', 'mutation_info_orig', 'wild_type', 'mutant_type',
#                   'position', 'Mut', 'index_orig', 'pos_count', 'total_id_ucount',
#                   'snp_frequency', 'maf', 'drtype_numeric', 'drtype_all_vals',
#                   'drtype_all_names', 'drtype_multimode', 'drtype_mode', 'drtype_max',
#                   'mutation_info_labels', 'dm_om_numeric', 'dm_om_numeric_orig', 'dst',
#                   'dst_multimode', 'dst_multimode_all', 'dst_mode', 'lineage_list_all',
#                   'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique',
#                   'lineage_multimode']
#%% OUTFILE 3, write file mCSM style muts
# one row per UNIQUE mutation, format {WT}<POS>{MUT}, no header/index
snps_only = pd.DataFrame(gene_LF4['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']

# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
pos_only

print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
    print ('PASS: NO NAs/missing entries for SNPs'
           , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')

# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
      , '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
del(out_filename_mcsmsnps)
###############################################################################
###############################################################################
#%% OUTFILE 4, write file frequency of position count
metadata_pos = pd.DataFrame(gene_LF4['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
###############################################################################
#%% OUTFILE 5, write file <gene>_metadata
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('\n----------------------------------'
, '\nWriting file: LF formatted data'
, '\n----------------------------------'
, '\nFile:', outfile_metadata
, '\n============================================================')
gene_LF4.to_csv(outfile_metadata, header = True, index = False)
print('Finished writing:', outfile_metadata
, '\nNo. of rows:', len(gene_LF4)
, '\nNo. of cols:', len(gene_LF4.columns)
, '\n=============================================================')
del(out_filename_metadata)
###############################################################################
#%% OUTFILE 6, write file MSA plots
#write file: mCSM style but with repitions for MSA and logo plots
all_muts_msa = pd.DataFrame(gene_LF4['mutationinformation'])
all_muts_msa.head()
# assign column name
all_muts_msa.columns = ['mutationinformation']
# make sure it is string
all_muts_msa.columns.dtype
# sort the column
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
if all_muts_msa.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa
print('\n----------------------------------'
, '\nWriting file: mCSM style muts for msa'
, '\n----------------------------------'
, '\nFile:', outfile_msa
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo.of lines of msa:', len(all_muts_msa))
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
print('Finished writing:', outfile_msa
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
del(out_filename_msa)
###############################################################################
#%% OUTFILE 7, write file mutational position with count
# write file for mutational positions
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
# assign column name
pos_only.columns = ['position']
# make sure dtype of column position is int or numeric and not string
pos_only.position.dtype
pos_only['position'] = pos_only['position'].astype(int)
pos_only.position.dtype
# sort by position value
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos
print('\n----------------------------------'
, '\nWriting file: mutational positions'
, '\n----------------------------------'
, '\nFile:', outfile_pos
, '\nNo. of distinct positions:', len(pos_only_sorted)
, '\n=============================================================')
pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
print('Finished writing:', outfile_pos
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n============================================================='
, '\n\n\n')
del(out_filename_pos)
###############################################################################
#%% Quick summary output
print('============================================'
, '\nQuick summary output for', drug, 'and' , gene.lower()
, '\n============================================'
, '\nTotal samples:', total_samples
, '\nNo. of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n'
, '\nPercentage of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n'
, '\nNo. of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts()
, '\n'
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
, '\n'
, '\nTotal no. of unique snps:', len(snps_only)
, '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
, '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
, '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD
, '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
, '\nTotal no. of samples with missense muts:', len(gene_LF4)
, '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
, '\n')
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
, '\n============================================================='
, '\nPost resolving ambiguity\n'
, ambig_muts_rev_df['mutation_info_REV'].value_counts())
#=======================================================================
print(u'\u2698' * 50,
'\nEnd of script: Data extraction and writing files'
'\n' + u'\u2698' * 50 )
#%% end of script