added sections and slotted relevant bits from data_extraction to v2

2022-04-14 12:21:16 +01:00 · 2022-04-14 12:21:16 +01:00 · f05cb96346
commit f05cb96346
parent e6faf80c20
2 changed files with 446 additions and 52 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -1210,7 +1210,7 @@ print('Finished writing:', outfile_mcsmsnps
      , '\n=============================================================')
 del(out_filename_mcsmsnps)

-#%%# write frequency of position counts
+#%% write frequency of position counts
 metadata_pos = pd.DataFrame(gene_LF1['position'])
 z =  gene_LF1['position'].value_counts()
 z1 = z.to_dict()
@ -1219,8 +1219,6 @@ metadata_pos['meta_pos_count'].value_counts()

 metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)

-# Write file: gene_metadata (i.e gene_LF1)
-# where each row has UNIQUE mutations NOT unique sample ids
 out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
 outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
 print('\n----------------------------------'
@ -1236,7 +1234,6 @@ print('Finished writing:', outfile_metadata_poscounts
      , '\n=============================================================')
 del(out_filename_metadata_poscounts)

-
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 #out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
--- a/scripts/data_extraction_v2.py
+++ b/scripts/data_extraction_v2.py
@ -905,14 +905,16 @@ else:

 #%% Ambiguous muts: revised annotation for mutation_info
 ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-
 ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()    
 ambiguous_muts_value_counts

-bar = gene_LF1.copy()
-bar.equals(gene_LF1)
-
 gene_LF1_orig = gene_LF1.copy()
+gene_LF1_orig.equals(gene_LF1)
+
+# copy the old column for checking
+gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
+gene_LF1['mutation_info_orig'].value_counts()
+gene_LF1['mutation_info'].value_counts()

 #=====================================
 # Now create a new df that will have:
@ -924,42 +926,52 @@ gene_LF1_orig = gene_LF1.copy()
 # label is chosen that corresponds to the max of value counts
 #=====================================
 ambig_muts_rev_df = pd.DataFrame()
+changes_val = []
+changes_dict = {}

 for i in common_muts:
    #print(i)
-    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']]
+    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
    
  # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
-    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts()
+    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
    
    revised_label = max_info_table[[0]].index[0][1] # max value_count
    old_label     = max_info_table[[1]].index[0][1] # min value_count 
    print('\nAmbiguous mutation labels...'
          , '\nSetting mutation_info for', i, 'to', revised_label)
-    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label)
+    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
                      , revised_label
-                      , temp_df['mutation_info'])
+                      , temp_df['mutation_info_orig'])
    ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
+    f = max_info_table[[1]]
+    old_label_count = f[[0]][0]
+    changes_val.append(old_label_count) 
+    cc_dict = f.to_dict()
+    changes_dict.update(cc_dict)
                      
-ambig_muts_rev_df['mutation_info'].value_counts() 
 ambig_muts_rev_df['mutation_info_REV'].value_counts() 
+ambig_muts_rev_df['mutation_info_orig'].value_counts() 
+changes_val
+changes_total = sum(changes_val)
+changes_dict
+
+# TODO: Add sanity check to make sure you can add value_count checks

 #=================
 # Merge ambig muts
 # with gene_LF1
 #===================
-bar = gene_LF1.copy()
-bar['mutation_info_old'] = bar['mutation_info']
-
 ambig_muts_rev_df.index
-bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
-bar['mutation_info_old'].value_counts()
-bar['mutation_info'].value_counts()
-foo = bar.iloc[ambig_muts_rev_df.index]
-foo[['mutation', 'mutation_info', 'mutation_info_old']]

-# CHECK if there are still any ambiguous muts
-muts_split_rev = list(bar.groupby('mutation_info'))
+gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
+gene_LF1['mutation_info_orig'].value_counts()
+gene_LF1['mutation_info'].value_counts()
+foo = gene_LF1.iloc[ambig_muts_rev_df.index]
+foo[['mutation', 'mutation_info', 'mutation_info_orig']]
+
+# Sanity check1: if there are still any ambiguous muts
+muts_split_rev = list(gene_LF1.groupby('mutation_info'))
 dr_muts_rev = muts_split_rev[0][1].mutation 
 other_muts_rev =  muts_split_rev[1][1].mutation
 print('splitting muts by mut_info:', muts_split_rev)
@ -972,34 +984,8 @@ else:
    print('\nAmbiguous muts corrected. Quitting!')
    sys.exit()

-#%% ROUND THE HOUSES: DELETE
-foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']]
-fooU = foo.drop_duplicates()
-
-#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/
-fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV']
-fooD
-#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine
-#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2))
-bar['NEW'] = bar['mutation_info_old']
-bar['NEW'].value_counts()
-bar['mutation_info_old'].value_counts()
-
-bar['NEW'].value_counts()
-bar2 = bar[['mutation', 'mutation_info', 'NEW']]
-bar2.isnull().sum()
-
-bar['NEW'].value_counts()
-bar['mutation_info'].value_counts()
-bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD)
-bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW'])
-
-bar['NEW2'].value_counts()
-bar['NEW'].value_counts()
 #%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
-
-#print(outdir)
-===================
+#===================
 # ambiguous muts
 #=======================
 #dr_muts.to_csv('dr_muts.csv', header = True)
@ -1034,7 +1020,7 @@ print('\n----------------------------------'

 ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)

-#%% clear variables
+# clear variables
 del(id_dr, id_other
    #, meta_data
    , meta_gene_dr, meta_gene_other
@ -1043,9 +1029,420 @@ del(id_dr, id_other

 del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)                     
 
-#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column
+#%% Add column: Mutationinformation
+# splitting mutation column to get mCSM style muts
+#=======================
+# Formatting df: read aa dict and pull relevant info
+#=======================
+print('Now some more formatting:'
+      , '\nRead aa dict and pull relevant info'
+      , '\nFormat mutations:'
+      , '\nsplit mutation into mCSM style muts: '
+      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
+      , '\nAssign aa properties: adding 2 cols at a time for each prop'
+      , '\n===================================================================')
+
+# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
+# in each lookup cycle 
+ncol_mutf_add = 3 # mut split into 3 cols
+ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
+
+#===========
+# Split 'mutation' column into three:  wild_type, position and
+# mutant_type separately. Then map three letter code to one using
+# reference_dict imported at the beginning.
+# After importing, convert to mutation to lowercase for compatibility with dict 
+#===========
+gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
+
+print('wt regex being used:', wt_regex
+      , '\nmut regex being used:', mut_regex
+      , '\nposition regex being used:', pos_regex)
+
+mylen0 = len(gene_LF1.columns)
+
+#=======
+# Iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: one_letter_code}.
+# lookup dict should be the key and the value (you want to create a column for)
+# Then use this to perform the mapping separetly for wild type and mutant cols.
+# The three letter code is extracted using a string match match from the dataframe and then converted
+# to 'pandas series'since map only works in pandas series
+#=======
+print('Adding', ncol_mutf_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to 1-letter code
+# adding three more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wild_type'] = wt.map(lookup_dict)   
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separetly using string match
+#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
+gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
+
+mylen1 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen1 ==  mylen0 + ncol_mutf_add:
+    print('PASS: successfully added', ncol_mutf_add, 'cols'
+      , '\nold length:', mylen0
+      , '\nnew len:', mylen1)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# combine the wild_type+poistion+mutant_type columns to generate 
+# mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
+print('Created column: mutationinformation'
+	, '\n=====================================================================\n'
+    , gene_LF1.mutationinformation.head(10))
+
+#order by position for convenience
+gene_LF1.dtypes
+
+# converting position to numeric
+gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
+
+# sort by position inplace 
+foo = gene_LF1['position'].value_counts()
+gene_LF1.sort_values(by = ['position'], inplace = True)
+bar = gene_LF1['position'].value_counts()
+
+if (foo == bar).all():
+    print('PASS: df ordered by position')
+    print(gene_LF1['position'].head())
+else:
+    print('FAIL: df could not be ordered. Check source')
+    sys.exit()
+
+#%% OUTFILE 4, write file mCSM style muts
+snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
+snps_only.head()
+# assign column name
+snps_only.columns = ['mutationinformation']
+
+# count how many positions this corresponds to
+pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
+
+print('Checking NA in snps...')# should be 0
+if snps_only.mutationinformation.isna().sum() == 0:
+    print ('PASS: NO NAs/missing entries for SNPs'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
+
+# write file: mCSM muts
+out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
+outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
+
+print('\n----------------------------------'
+      , '\nWriting file: mCSM style muts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_mcsmsnps
+      , '\nmutation format (SNP): {WT}<POS>{MUT}'
+      , '\nNo. of distinct muts:', len(snps_only)
+      , '\nNo. of distinct positions:', len(pos_only)
+      , '\n=============================================================')
+
+snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
+
+print('Finished writing:', outfile_mcsmsnps
+      , '\nNo. of rows:', len(snps_only)
+      , '\nNo. of cols:', len(snps_only.columns)
+      , '\n=============================================================')
+del(out_filename_mcsmsnps)
+#%% OUTFILE 5, write file frequency of position counts
+metadata_pos = pd.DataFrame(gene_LF1['position'])
+z =  gene_LF1['position'].value_counts()
+z1 = z.to_dict()
+metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
+metadata_pos['meta_pos_count'].value_counts()
+
+metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
+
+out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
+outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
+print('\n----------------------------------'
+      , '\nWriting file: Metadata poscounts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_metadata_poscounts
+      , '\n============================================================')
+
+metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
+print('Finished writing:', outfile_metadata_poscounts
+      , '\nNo. of rows:', len(metadata_pos)
+      , '\nNo. of cols:', len(metadata_pos.columns)
+      , '\n=============================================================')
+del(out_filename_metadata_poscounts)
+#%% OUTFILE 6, write file <gene>_metadata 
+# gene_LF1, where each row has UNIQUE mutations NOT unique sample ids
+out_filename_metadata = gene.lower() + '_metadata.csv'
+outfile_metadata = outdir + '/' + out_filename_metadata
+print('\n----------------------------------'
+      , '\nWriting file: LF formatted data'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_metadata
+      , '\n============================================================')
+
+gene_LF1.to_csv(outfile_metadata, header = True, index = False)
+print('Finished writing:', outfile_metadata
+      , '\nNo. of rows:', len(gene_LF1)
+      , '\nNo. of cols:', len(gene_LF1.columns)
+      , '\n=============================================================')
+del(out_filename_metadata)
+#%% OUTFILE 7, write file MSA plots
+# mCSM style muts but with repetitions
+# !!! THINK !!!: Implications if mut is coming from the same sample, etc.
+all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation']) 
+all_muts_msa.head()
+
+# assign column name
+all_muts_msa.columns = ['mutationinformation']
+
+# make sure it is string
+all_muts_msa.columns.dtype
+
+# sort the column
+all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
+
+# create an extra column with protein name
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
+#all_muts_msa_sorted.head()
+
+# rearrange columns so the fasta name is the first column (required for mutate.script)
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+all_muts_msa_sorted.head()
+
+print('Checking NA in snps...')# should be 0
+if all_muts_msa.mutationinformation.isna().sum() == 0:
+    print ('PASS: NO NAs/missing entries for SNPs'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
+
+out_filename_msa = gene.lower() +'_all_muts_msa.csv'
+outfile_msa = outdir + '/' + out_filename_msa
+
+print('\n----------------------------------'
+      , '\nWriting file: mCSM style muts for msa'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_msa
+      , '\nmutation format (SNP): {WT}<POS>{MUT}'
+      , '\nNo.of lines of msa:', len(all_muts_msa))
+
+all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
+
+print('Finished writing:', outfile_msa
+      , '\nNo. of rows:', len(all_muts_msa)
+      , '\nNo. of cols:', len(all_muts_msa.columns)
+      , '\n=============================================================')
+
+del(out_filename_msa)
+#%% OUTFILE 8, write file mutational position with count
+# count how many positions this corresponds to
+pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
+# assign column name
+pos_only.columns = ['position']
+# make sure dtype of column position is int or numeric and not string
+pos_only.position.dtype
+pos_only['position'] = pos_only['position'].astype(int)
+pos_only.position.dtype
+
+# sort by position value
+pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
+
+out_filename_pos = gene.lower() + '_mutational_positons.csv'
+outfile_pos = outdir + '/' + out_filename_pos
+
+print('\n----------------------------------'
+      , '\nWriting file: mutational positions'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_pos
+      , '\nNo. of distinct positions:', len(pos_only_sorted)
+      , '\n=============================================================')
+
+pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
+
+print('Finished writing:', outfile_pos
+      , '\nNo. of rows:', len(pos_only_sorted)
+      , '\nNo. of cols:', len(pos_only_sorted.columns)
+      , '\n============================================================='
+      , '\n\n\n')
+
+del(out_filename_pos)
+#%% Add column: aa property_water
+#=========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)   
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+mylen2 = len(gene_LF1.columns)    
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen2 == mylen1 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen1
+      , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+#%% Add column: aa_prop_polarity
+#========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
+   #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+    
+mylen3 = len(gene_LF1.columns)       
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen3 ==  mylen2 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen1
+      , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+   
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+#%% Add column: aa_calcprop
+#========
+# iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_calcprop} 
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)   
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
+ 
+mylen4 = len(gene_LF1.columns)   
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen4 ==  mylen3 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+      , '\nold length:', mylen3
+      , '\nnew len:', mylen4)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)

 #%% NEW TODO: map mutationinformation
+
 #%% NEW: mappings
 #=======================
 # column name: <drug>