did all other mappings until dst column

This commit is contained in:
Tanushree Tunstall 2022-04-23 11:14:34 +01:00
parent 7a10b4f223
commit cb93cef3c7

View file

@ -672,6 +672,15 @@ print(dict(Cdict))
for k, v in Cdict.items(): for k, v in Cdict.items():
if k in common_snps_dr_other: if k in common_snps_dr_other:
print(k,v) print(k,v)
# convert defaultDict to dict
SnpFDict_orig = dict(Cdict)
def lower_dict(d):
new_dict = dict((k.lower(), v) for k, v in d.items())
return new_dict
SnpFDict = lower_dict(SnpFDict_orig)
############################################################################### ###############################################################################
# USE Vcounts to get expected curated df # USE Vcounts to get expected curated df
# resolve dm om muts funda # resolve dm om muts funda
@ -987,13 +996,16 @@ ambig_muts_rev_df.index
gene_LF1.index gene_LF1.index
all(ambig_muts_rev_df.index.isin(gene_LF1.index)) all(ambig_muts_rev_df.index.isin(gene_LF1.index))
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] #gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1['mutation_info_orig'].value_counts() gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info_v1'].value_counts() #gene_LF1['mutation_info_v1'].value_counts()
foo = gene_LF1.iloc[ambig_muts_rev_df.index] foo = gene_LF1.iloc[ambig_muts_rev_df.index]
# Sanity check1: if there are still any ambiguous muts # Sanity check1: if there are still any ambiguous muts
muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
dr_muts_rev = muts_split_rev[0][1].mutation dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev) print('splitting muts by mut_info:', muts_split_rev)
@ -1006,5 +1018,402 @@ else:
print('\nAmbiguous muts NOT corrected. Quitting!') print('\nAmbiguous muts NOT corrected. Quitting!')
sys.exit() sys.exit()
gene_LF1['mutation_info_v1'].value_counts() #gene_LF1['mutation_info_v1'].value_counts()
gene_LF1['mutation_info'].value_counts()
# reassign
#%% PHEW! Good to go for downstream stuff #%% PHEW! Good to go for downstream stuff
#%% Add column: Mutationinformation
# splitting mutation column to get mCSM style muts
#=======================
# Formatting df: read aa dict and pull relevant info
#=======================
print('Now some more formatting:'
, '\nRead aa dict and pull relevant info'
, '\nFormat mutations:'
, '\nsplit mutation into mCSM style muts: '
, '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
, '\nAssign aa properties: adding 2 cols at a time for each prop'
, '\n===================================================================')
# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
# in each lookup cycle
ncol_mutf_add = 3 # mut split into 3 cols
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
#===========
# Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one using
# reference_dict imported at the beginning.
# After importing, convert to mutation to lowercase for compatibility with dict
#===========
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
print('wt regex being used:', wt_regex
, '\nmut regex being used:', mut_regex
, '\nposition regex being used:', pos_regex)
mylen0 = len(gene_LF1.columns)
#=======
# Iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separetly for wild type and mutant cols.
# The three letter code is extracted using a string match match from the dataframe and then converted
# to 'pandas series'since map only works in pandas series
#=======
print('Adding', ncol_mutf_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
# adding three more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
#wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wild_type'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separetly using string match
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
mylen1 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt & mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen1 == mylen0 + ncol_mutf_add:
print('PASS: successfully added', ncol_mutf_add, 'cols'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
# clear variables
del(k, v, wt, mut, lookup_dict)
########
# combine the wild_type+poistion+mutant_type columns to generate
# mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
print('Created column: mutationinformation'
, '\n=====================================================================\n'
, gene_LF1.mutationinformation.head(10))
#order by position for convenience
gene_LF1.dtypes
# converting position to numeric
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
# sort by position inplace
foo = gene_LF1['position'].value_counts()
foo
gene_LF1.sort_values(by = ['position'], inplace = True)
bar = gene_LF1['position'].value_counts()
# FIXME:Can only compare identically-labeled Series objects
if (foo == bar).all():
print('PASS: df ordered by position')
print(gene_LF1['position'].head())
else:
print('FAIL: df could not be ordered. Check source')
sys.exit()
#%% Create a copy of mutationinformation column for downstream mergeing
gene_LF1['Mut'] = gene_LF1['mutationinformation']
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
#%% Create a copy of indices for downstream mergeing
gene_LF1['index_orig'] = gene_LF1.index
gene_LF1['index_orig_copy'] = gene_LF1.index
all(gene_LF1.index.values == gene_LF1['index_orig'].values)
all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
#%% quick sanity check for position freq count
# count the freq of 'other muts' samples
test_df = gene_LF1.copy()
test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
# add freq column
#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
#print('Revised dim of other_muts_df:',test_df.shape)
test_df['scount'] = test_df['mutation'].map(SnpFDict)
#%% Map mutation frequency count as a column
gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
#%% Map position frequency count as a column
z = gene_LF1['position'].value_counts()
z1 = z.to_dict()
gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
#test_df2 = test_df.loc[test_df['position'] == 10]
#%% OUTFILE 4, write file mCSM style muts
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
pos_only
print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
, '\nWriting file: mCSM style muts'
, '\n----------------------------------'
, '\nFile:', outfile_mcsmsnps
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
, '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
del(out_filename_mcsmsnps)
#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END
metadata_pos = pd.DataFrame(gene_LF1['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% OUTFILE 6, write file <gene>_metadata: MOVE TO THE END
# ---THINK ---
#%% OUTFILE 7, write file MSA plots: MOVE TO THE END
# -- THINK ---
#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END
# -- THINK ---
#%% Add column: aa property_water
#%% Add column: aa_prop_polarity
#%% Add column: aa_calcprop
#%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy()
#%% Add total unique id count
gene_LF2['id'].nunique()
gene_LF2['mutationinformation'].nunique()
total_id_ucount = gene_LF2['id'].nunique()
total_id_ucount
total_id_ucount2 = gene_LF2['sample'].nunique()
total_id_ucount2
if total_id_ucount == total_id_ucount2:
print('\nPASS: sample and id unique counts match')
else:
print('\nFAIL: sample and id unique counts DO NOT match!'
, '\nGWAS worry!?')
# Add all sample ids in a list for sanity checks
#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
#==========================================
# Add column: total unique id/sample count
#==========================================
gene_LF2['total_id_ucount'] = total_id_ucount
#==========================================
# DELETE as already mapped: Add column: mutation count in all samples
#==========================================
# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
# gene_LF2['mut_id_ucount']
# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
#%% AF for gene
#=================
# Add column: AF
#=================
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
gene_LF2['maf'].head()
#%% Mapping 1: column '<drug>', mutation_info
gene_LF2['mutation_info'].value_counts()
gene_LF2['mutation_info_v1'].value_counts()
gene_LF2['mutation_info_orig'].value_counts()
#=======================
# column name: <drug>
#=======================
# mapping 1.1: labels
dm_om_label_map = {dr_muts_col: 'DM'
, other_muts_col: 'OM'}
dm_om_label_map
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
# mapping 1.2: numeric
dm_om_num_map = {dr_muts_col: 1
, other_muts_col: 0}
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
gene_LF2['mutation_info'].value_counts()
gene_LF2['mutation_info_labels'].value_counts()
gene_LF2['mutation_info_orig'].value_counts()
gene_LF2['dm_om_numeric'].value_counts()
gene_LF2['dm_om_numeric_orig'].value_counts()
#%% Mapping 2: column '<drtype>'
#============================
# column name: <drtype>
#============================
gene_LF2['drtype'].value_counts()
# mapping 2.1: numeric
drtype_map = {'XDR': 5
, 'Pre-XDR': 4
, 'MDR': 3
, 'Pre-MDR': 2
, 'Other': 1
, 'Sensitive': 0}
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
gene_LF2['drtype'].value_counts()
gene_LF2['drtype_numeric'].value_counts()
#%% Mapping 3: column '<dst>', drug
#============================
# column name: <dst>
#============================
# copy dst column
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
gene_LF2['dst'].equals(gene_LF2[drug])
gene_LF2['dst_multimode'] = gene_LF2[drug]
gene_LF2[drug].isnull().sum()
gene_LF2['dst_multimode'].isnull().sum()
#%% Further mappings: gene_LF3
gene_LF3 = gene_LF2.copy()
gene_LF3.index
gene_LF3 = gene_LF3.set_index(['Mut'])
gene_LF3.index
gene_LF3['dst_multimode'].value_counts()
gene_LF3['dst_multimode'].value_counts().sum()
#%% Multimode: dst
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
#=============================
# Recalculation: Revised dst
# max(multimode)
#=============================
# Get multimode for dm_om_numeric column
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
dm_om_multimode_LF3
# Fill using multimode ONLY where NA in dst_multimode column
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
# Now get the max from multimode
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
print(gene_LF3)
#%% Revised Columns:IMPORTANT
#%% Multimode: dst column
#----------------------------
# Revised dst column: Max
#----------------------------
# Finally created a revised dst with the max from the multimode
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
#%% Multimode: drtype
#=============================
# Recalculation: Revised drtype
# max(multimode)
#=============================
#--------------------------------
# drtype: ALL values:
# numeric and names in an array
#--------------------------------
gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
#---------------------------------
# Revised drtype: max(Multimode)
#--------------------------------
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
gene_LF3['drtype_multimode']
# Now get the max from multimode
gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
gene_LF3.head()
#----------------------
# Revised drtype: Max
#----------------------
gene_LF3.head()
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
gene_LF3.head()
#%% Revised counts checks
gene_LF3['dst_mode'].value_counts()
gene_LF3[drug].value_counts()
print('\n------------------------------------------------------'
, '\nRevised counting: mutation_info i.e dm om column'
, '\n-----------------------------------------------------'
, '\n----------------------------------'
, '\nOriginal drug column count'
, '\n----------------------------------'
, gene_LF3[drug].value_counts()
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
, '\n----------------------------------'
, '\nRevised drug column count'
, '\n----------------------------------'
, gene_LF3['dst_mode'].value_counts()
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
)