did all other mappings until dst column

2022-04-23 11:14:34 +01:00 · 2022-04-23 11:14:34 +01:00 · cb93cef3c7
commit cb93cef3c7
parent 7a10b4f223
1 changed files with 414 additions and 5 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -672,6 +672,15 @@ print(dict(Cdict))
 for k, v in Cdict.items():
    if k in common_snps_dr_other:
        print(k,v)   
+        
+# convert defaultdict to dict
+SnpFDict_orig = dict(Cdict)
+
+def lower_dict(d):
+   new_dict = dict((k.lower(), v) for k, v in d.items())
+   return new_dict
+
+SnpFDict = lower_dict(SnpFDict_orig)
 ###############################################################################
 # USE Vcounts to get expected curated df
 # resolve dm om muts funda
@ -987,13 +996,16 @@ ambig_muts_rev_df.index
 gene_LF1.index
 all(ambig_muts_rev_df.index.isin(gene_LF1.index))

-gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
+#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
+gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
+
 gene_LF1['mutation_info_orig'].value_counts()
-gene_LF1['mutation_info_v1'].value_counts()
+#gene_LF1['mutation_info_v1'].value_counts()
 foo = gene_LF1.iloc[ambig_muts_rev_df.index]

 # Sanity check1: if there are still any ambiguous muts
-muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
+#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
+muts_split_rev = list(gene_LF1.groupby('mutation_info'))
 dr_muts_rev = muts_split_rev[0][1].mutation 
 other_muts_rev =  muts_split_rev[1][1].mutation
 print('splitting muts by mut_info:', muts_split_rev)
@ -1006,5 +1018,402 @@ else:
    print('\nAmbiguous muts NOT corrected. Quitting!')
    sys.exit()

-gene_LF1['mutation_info_v1'].value_counts()
+#gene_LF1['mutation_info_v1'].value_counts()
+gene_LF1['mutation_info'].value_counts()
+
+# reassign
 #%% PHEW! Good to go for downstream stuff
+#%% Add column: Mutationinformation
+# splitting mutation column to get mCSM style muts
+#=======================
+# Formatting df: read aa dict and pull relevant info
+#=======================
+print('Now some more formatting:'
+      , '\nRead aa dict and pull relevant info'
+      , '\nFormat mutations:'
+      , '\nsplit mutation into mCSM style muts: '
+      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
+      , '\nAssign aa properties: adding 2 cols at a time for each prop'
+      , '\n===================================================================')
+
+# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
+# in each lookup cycle 
+ncol_mutf_add = 3 # mut split into 3 cols
+ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
+
+#===========
+# Split 'mutation' column into three:  wild_type, position and
+# mutant_type separately. Then map three letter code to one using
+# reference_dict imported at the beginning.
+# After importing, convert mutation to lowercase for compatibility with dict
+#===========
+gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
+
+print('wt regex being used:', wt_regex
+      , '\nmut regex being used:', mut_regex
+      , '\nposition regex being used:', pos_regex)
+
+mylen0 = len(gene_LF1.columns)
+
+#=======
+# Iterate through the dict, create a lookup dict i.e.
+# lookup_dict = {three_letter_code: one_letter_code}.
+# The lookup dict should hold the key and the value (you want to create a column for).
+# Then use this to perform the mapping separately for wild type and mutant cols.
+# The three letter code is extracted from the dataframe using a string match and then converted
+# to a 'pandas series', since map only works on a pandas series
+#=======
+print('Adding', ncol_mutf_add, 'more cols:\n')
+
+# initialise a sub dict that is lookup dict for three letter code to 1-letter code
+# adding three more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() # squeeze() converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wild_type'] = wt.map(lookup_dict)   
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separately using string match
+#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
+gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
+
+mylen1 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() & mut.isna().sum() == 0:
+   print('PASS: 3-letter wt & mut residue extraction worked correctly:'
+         , '\nNo NAs detected:'
+         , '\nwild-type\n', wt
+         , '\nmutant-type\n', mut
+         , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen1 ==  mylen0 + ncol_mutf_add:
+    print('PASS: successfully added', ncol_mutf_add, 'cols'
+      , '\nold length:', mylen0
+      , '\nnew len:', mylen1)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# combine the wild_type+position+mutant_type columns to generate
+# mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
+print('Created column: mutationinformation'
+	, '\n=====================================================================\n'
+    , gene_LF1.mutationinformation.head(10))
+
+#order by position for convenience
+gene_LF1.dtypes
+
+# converting position to numeric
+gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
+
+# sort by position inplace 
+foo = gene_LF1['position'].value_counts()
+foo
+gene_LF1.sort_values(by = ['position'], inplace = True)
+bar = gene_LF1['position'].value_counts()
+
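+# FIXME: the foo == bar comparison below can raise
+# "Can only compare identically-labeled Series objects", e.g. when the two
+# value_counts() results list the positions in a different order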
+if (foo == bar).all():
+    print('PASS: df ordered by position')
+    print(gene_LF1['position'].head())
+else:
+    print('FAIL: df could not be ordered. Check source')
+    sys.exit()
+
+#%% Create a copy of mutationinformation column for downstream merging
+gene_LF1['Mut']      = gene_LF1['mutationinformation'] 
+gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] 
+
+#%% Create a copy of indices for downstream merging
+gene_LF1['index_orig']      = gene_LF1.index
+gene_LF1['index_orig_copy'] = gene_LF1.index
+
+all(gene_LF1.index.values == gene_LF1['index_orig'].values)
+all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
+
+#%% quick sanity check for position freq count
+# count the freq of 'other muts' samples
+test_df = gene_LF1.copy()
+test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
+# add freq column
+#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
+#print('Revised dim of other_muts_df:',test_df.shape) 
+test_df['scount'] = test_df['mutation'].map(SnpFDict)
+#%% Map mutation frequency count as a column
+gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
+#%% Map position frequency count as a column
+z =  gene_LF1['position'].value_counts()
+z1 = z.to_dict()
+gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
+
+#test_df2 = test_df.loc[test_df['position'] == 10] 
+
+#%% OUTFILE 4, write file mCSM style muts
+snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
+snps_only.head()
+# assign column name
+snps_only.columns = ['mutationinformation']
+
+# count how many positions this corresponds to
+pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
+pos_only
+
+print('Checking NA in snps...')# should be 0
+if snps_only.mutationinformation.isna().sum() == 0:
+    print ('PASS: NO NAs/missing entries for SNPs'
+    , '\n===============================================================')
+else:
+    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
+
+# write file: mCSM muts
+out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
+outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
+
+print('\n----------------------------------'
+      , '\nWriting file: mCSM style muts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_mcsmsnps
+      , '\nmutation format (SNP): {WT}<POS>{MUT}'
+      , '\nNo. of distinct muts:', len(snps_only)
+      , '\nNo. of distinct positions:', len(pos_only)
+      , '\n=============================================================')
+
+snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
+
+print('Finished writing:', outfile_mcsmsnps
+      , '\nNo. of rows:', len(snps_only)
+      , '\nNo. of cols:', len(snps_only.columns)
+      , '\n=============================================================')
+del(out_filename_mcsmsnps)
+
+#%% OUTFILE 5, write file frequency of position counts:  MOVE TO THE END
+metadata_pos = pd.DataFrame(gene_LF1['position'])
+metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
+metadata_pos['meta_pos_count'].value_counts()
+
+metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
+
+out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
+outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
+print('\n----------------------------------'
+      , '\nWriting file: Metadata poscounts'
+      , '\n----------------------------------'
+      , '\nFile:', outfile_metadata_poscounts
+      , '\n============================================================')
+
+metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
+print('Finished writing:', outfile_metadata_poscounts
+      , '\nNo. of rows:', len(metadata_pos)
+      , '\nNo. of cols:', len(metadata_pos.columns)
+      , '\n=============================================================')
+del(out_filename_metadata_poscounts)
+#%% OUTFILE 6, write file <gene>_metadata:  MOVE TO THE END
+# ---THINK ---
+#%% OUTFILE 7, write file MSA plots:  MOVE TO THE END
+# -- THINK ---
+#%% OUTFILE 8, write file mutational position with count:  MOVE TO THE END
+# -- THINK ---
+#%% Add column: aa property_water
+#%% Add column: aa_prop_polarity
+#%% Add column: aa_calcprop
+#%% NEW mappings: gene_LF2
+# gene_LF2: copy gene_LF1
+gene_LF2 = gene_LF1.copy()
+#%% Add total unique id count
+gene_LF2['id'].nunique()
+gene_LF2['mutationinformation'].nunique()
+total_id_ucount = gene_LF2['id'].nunique()
+total_id_ucount
+total_id_ucount2 = gene_LF2['sample'].nunique()
+total_id_ucount2
+
+if total_id_ucount == total_id_ucount2:
+    print('\nPASS: sample and id unique counts match')
+else:
+    print('\nFAIL: sample and id unique counts DO NOT match!'
+          , '\nGWAS worry!?')
+
+# Add all sample ids in a list for sanity checks
+#gene_LF2['id_list']  = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
+#==========================================
+# Add column: total unique id/sample count
+#==========================================
+gene_LF2['total_id_ucount'] = total_id_ucount
+
+#==========================================
+# DELETE as already mapped: Add column: mutation count in all samples 
+#==========================================
+# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
+# gene_LF2['mut_id_ucount']
+
+# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
+
+#%% AF for gene
+#=================
+# Add column: AF
+#=================
+gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
+gene_LF2['maf'].head()
+
+#%% Mapping 1: column '<drug>', mutation_info
+gene_LF2['mutation_info'].value_counts()
+gene_LF2['mutation_info_v1'].value_counts()
+gene_LF2['mutation_info_orig'].value_counts()
+
+#=======================
+# column name: <drug>
+#=======================
+# mapping 1.1: labels
+dm_om_label_map = {dr_muts_col: 'DM'
+             , other_muts_col: 'OM'}
+dm_om_label_map
+gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
+
+# mapping 1.2: numeric
+dm_om_num_map = {dr_muts_col: 1
+              , other_muts_col: 0}
+
+gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
+gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
+
+gene_LF2['mutation_info'].value_counts()
+gene_LF2['mutation_info_labels'].value_counts()
+gene_LF2['mutation_info_orig'].value_counts()
+gene_LF2['dm_om_numeric'].value_counts()
+gene_LF2['dm_om_numeric_orig'].value_counts()
+#%% Mapping 2: column '<drtype>'
+#============================
+# column name: <drtype>
+#============================
+gene_LF2['drtype'].value_counts()
+
+# mapping 2.1: numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+
+gene_LF2['drtype_numeric']  = gene_LF2['drtype'].map(drtype_map)
+
+gene_LF2['drtype'].value_counts()
+gene_LF2['drtype_numeric'].value_counts()
+#%% Mapping 3: column '<dst>', drug
+#============================
+# column name: <dst>
+#============================
+# copy dst column
+gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
+gene_LF2['dst'].equals(gene_LF2[drug])
+
+gene_LF2['dst_multimode'] = gene_LF2[drug]
+
+gene_LF2[drug].isnull().sum()
+gene_LF2['dst_multimode'].isnull().sum()
+#%% Further mappings: gene_LF3
+gene_LF3 = gene_LF2.copy()
+gene_LF3.index
+gene_LF3 = gene_LF3.set_index(['Mut'])
+gene_LF3.index
+
+gene_LF3['dst_multimode'].value_counts()
+gene_LF3['dst_multimode'].value_counts().sum()
+#%% Multimode: dst
+# For each mutation, generate the revised dst which is the mode of dm_om_numeric
+#=============================
+# Recalculation: Revised dst
+# max(multimode)
+#=============================
+# Get multimode for dm_om_numeric column
+#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
+dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+
+dm_om_multimode_LF3
+
+# Fill using multimode ONLY where NA in dst_multimode column
+gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
+
+# Now get the max from multimode
+gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(gene_LF3)
+#%% Revised Columns:IMPORTANT
+#%% Multimode: dst column
+#----------------------------
+# Revised dst column: Max
+#----------------------------
+# Finally, create a revised dst with the max from the multimode
+gene_LF3['dst_mode']  = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
+
+#%% Multimode: drtype
+#=============================
+# Recalculation: Revised drtype
+# max(multimode)
+#=============================
+#--------------------------------
+# drtype: ALL values:
+# numeric and names in an array 
+#--------------------------------
+gene_LF3['drtype_all_vals']  = gene_LF3['drtype_numeric']
+gene_LF3['drtype_all_names'] = gene_LF3['drtype']
+
+gene_LF3['drtype_all_vals']  = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
+gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
+
+#---------------------------------
+# Revised drtype: max(Multimode)
+#--------------------------------
+gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
+gene_LF3['drtype_multimode']
+
+# Now get the max from multimode
+gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
+gene_LF3.head()
+
+#----------------------
+# Revised drtype: Max
+#----------------------
+gene_LF3.head()
+gene_LF3['drtype_max'] =  gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
+gene_LF3.head()
+
+#%% Revised counts checks
+gene_LF3['dst_mode'].value_counts()
+gene_LF3[drug].value_counts()
+
+print('\n------------------------------------------------------'
+      , '\nRevised counting: mutation_info i.e dm om column'
+      , '\n-----------------------------------------------------'
+      
+      , '\n----------------------------------'
+      , '\nOriginal drug column count'
+      , '\n----------------------------------'
+      ,  gene_LF3[drug].value_counts()
+      , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
+      
+      , '\n----------------------------------'
+      , '\nRevised drug column count'
+      , '\n----------------------------------'
+      , gene_LF3['dst_mode'].value_counts()
+      , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
+      )