did all other mappings until dst column
This commit is contained in:
parent
7a10b4f223
commit
cb93cef3c7
1 changed files with 414 additions and 5 deletions
|
@ -672,6 +672,15 @@ print(dict(Cdict))
|
|||
for k, v in Cdict.items():
|
||||
if k in common_snps_dr_other:
|
||||
print(k,v)
|
||||
|
||||
# convert defaultDict to dict
|
||||
SnpFDict_orig = dict(Cdict)
|
||||
|
||||
def lower_dict(d):
|
||||
new_dict = dict((k.lower(), v) for k, v in d.items())
|
||||
return new_dict
|
||||
|
||||
SnpFDict = lower_dict(SnpFDict_orig)
|
||||
###############################################################################
|
||||
# USE Vcounts to get expected curated df
|
||||
# resolve dm om muts funda
|
||||
|
@ -987,13 +996,16 @@ ambig_muts_rev_df.index
|
|||
gene_LF1.index
|
||||
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
|
||||
|
||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
|
||||
gene_LF1['mutation_info_orig'].value_counts()
|
||||
gene_LF1['mutation_info_v1'].value_counts()
|
||||
#gene_LF1['mutation_info_v1'].value_counts()
|
||||
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
|
||||
|
||||
# Sanity check1: if there are still any ambiguous muts
|
||||
muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
|
||||
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
|
||||
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
|
||||
dr_muts_rev = muts_split_rev[0][1].mutation
|
||||
other_muts_rev = muts_split_rev[1][1].mutation
|
||||
print('splitting muts by mut_info:', muts_split_rev)
|
||||
|
@ -1006,5 +1018,402 @@ else:
|
|||
print('\nAmbiguous muts NOT corrected. Quitting!')
|
||||
sys.exit()
|
||||
|
||||
gene_LF1['mutation_info_v1'].value_counts()
|
||||
#gene_LF1['mutation_info_v1'].value_counts()
|
||||
gene_LF1['mutation_info'].value_counts()
|
||||
|
||||
# reassign
|
||||
#%% PHEW! Good to go for downstream stuff
|
||||
#%% Add column: Mutationinformation
|
||||
# splitting mutation column to get mCSM style muts
|
||||
#=======================
|
||||
# Formatting df: read aa dict and pull relevant info
|
||||
#=======================
|
||||
print('Now some more formatting:'
|
||||
, '\nRead aa dict and pull relevant info'
|
||||
, '\nFormat mutations:'
|
||||
, '\nsplit mutation into mCSM style muts: '
|
||||
, '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
|
||||
, '\nAssign aa properties: adding 2 cols at a time for each prop'
|
||||
, '\n===================================================================')
|
||||
|
||||
# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
|
||||
# in each lookup cycle
|
||||
ncol_mutf_add = 3 # mut split into 3 cols
|
||||
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
|
||||
|
||||
#===========
|
||||
# Split 'mutation' column into three: wild_type, position and
|
||||
# mutant_type separately. Then map three letter code to one using
|
||||
# reference_dict imported at the beginning.
|
||||
# After importing, convert to mutation to lowercase for compatibility with dict
|
||||
#===========
|
||||
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
|
||||
|
||||
print('wt regex being used:', wt_regex
|
||||
, '\nmut regex being used:', mut_regex
|
||||
, '\nposition regex being used:', pos_regex)
|
||||
|
||||
mylen0 = len(gene_LF1.columns)
|
||||
|
||||
#=======
|
||||
# Iterate through the dict, create a lookup dict i.e
|
||||
# lookup_dict = {three_letter_code: one_letter_code}.
|
||||
# lookup dict should be the key and the value (you want to create a column for)
|
||||
# Then use this to perform the mapping separetly for wild type and mutant cols.
|
||||
# The three letter code is extracted using a string match match from the dataframe and then converted
|
||||
# to 'pandas series'since map only works in pandas series
|
||||
#=======
|
||||
print('Adding', ncol_mutf_add, 'more cols:\n')
|
||||
|
||||
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
|
||||
# adding three more cols
|
||||
lookup_dict = dict()
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['one_letter_code']
|
||||
#wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
|
||||
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
|
||||
gene_LF1['wild_type'] = wt.map(lookup_dict)
|
||||
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
||||
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
|
||||
gene_LF1['mutant_type'] = mut.map(lookup_dict)
|
||||
|
||||
# extract position info from mutation column separetly using string match
|
||||
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
|
||||
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
|
||||
|
||||
mylen1 = len(gene_LF1.columns)
|
||||
|
||||
# sanity checks
|
||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||
if wt.isna().sum() & mut.isna().sum() == 0:
|
||||
print('PASS: 3-letter wt & mut residue extraction worked correctly:'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
else:
|
||||
print('FAIL: 3-letter wt&mut residue extraction failed'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
|
||||
if mylen1 == mylen0 + ncol_mutf_add:
|
||||
print('PASS: successfully added', ncol_mutf_add, 'cols'
|
||||
, '\nold length:', mylen0
|
||||
, '\nnew len:', mylen1)
|
||||
else:
|
||||
print('FAIL: failed to add cols:'
|
||||
, '\nold length:', mylen0
|
||||
, '\nnew len:', mylen1)
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
########
|
||||
# combine the wild_type+poistion+mutant_type columns to generate
|
||||
# mutationinformation (matches mCSM output field)
|
||||
# Remember to use .map(str) for int col types to allow string concatenation
|
||||
#########
|
||||
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
|
||||
print('Created column: mutationinformation'
|
||||
, '\n=====================================================================\n'
|
||||
, gene_LF1.mutationinformation.head(10))
|
||||
|
||||
#order by position for convenience
|
||||
gene_LF1.dtypes
|
||||
|
||||
# converting position to numeric
|
||||
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
|
||||
|
||||
# sort by position inplace
|
||||
foo = gene_LF1['position'].value_counts()
|
||||
foo
|
||||
gene_LF1.sort_values(by = ['position'], inplace = True)
|
||||
bar = gene_LF1['position'].value_counts()
|
||||
|
||||
# FIXME:Can only compare identically-labeled Series objects
|
||||
if (foo == bar).all():
|
||||
print('PASS: df ordered by position')
|
||||
print(gene_LF1['position'].head())
|
||||
else:
|
||||
print('FAIL: df could not be ordered. Check source')
|
||||
sys.exit()
|
||||
|
||||
#%% Create a copy of mutationinformation column for downstream mergeing
|
||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
|
||||
|
||||
#%% Create a copy of indices for downstream mergeing
|
||||
gene_LF1['index_orig'] = gene_LF1.index
|
||||
gene_LF1['index_orig_copy'] = gene_LF1.index
|
||||
|
||||
all(gene_LF1.index.values == gene_LF1['index_orig'].values)
|
||||
all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
|
||||
|
||||
#%% quick sanity check for position freq count
|
||||
# count the freq of 'other muts' samples
|
||||
test_df = gene_LF1.copy()
|
||||
test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
|
||||
# add freq column
|
||||
#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
|
||||
#print('Revised dim of other_muts_df:',test_df.shape)
|
||||
test_df['scount'] = test_df['mutation'].map(SnpFDict)
|
||||
#%% Map mutation frequency count as a column
|
||||
gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
|
||||
#%% Map position frequency count as a column
|
||||
z = gene_LF1['position'].value_counts()
|
||||
z1 = z.to_dict()
|
||||
gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
|
||||
|
||||
#test_df2 = test_df.loc[test_df['position'] == 10]
|
||||
|
||||
#%% OUTFILE 4, write file mCSM style muts
|
||||
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
|
||||
snps_only.head()
|
||||
# assign column name
|
||||
snps_only.columns = ['mutationinformation']
|
||||
|
||||
# count how many positions this corresponds to
|
||||
pos_only = pd.DataFrame(gene_LF1['position'].unique())
|
||||
pos_only
|
||||
|
||||
print('Checking NA in snps...')# should be 0
|
||||
if snps_only.mutationinformation.isna().sum() == 0:
|
||||
print ('PASS: NO NAs/missing entries for SNPs'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
|
||||
|
||||
# write file: mCSM muts
|
||||
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
||||
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: mCSM style muts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_mcsmsnps
|
||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||
, '\nNo. of distinct muts:', len(snps_only)
|
||||
, '\nNo. of distinct positions:', len(pos_only)
|
||||
, '\n=============================================================')
|
||||
|
||||
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
|
||||
|
||||
print('Finished writing:', outfile_mcsmsnps
|
||||
, '\nNo. of rows:', len(snps_only)
|
||||
, '\nNo. of cols:', len(snps_only.columns)
|
||||
, '\n=============================================================')
|
||||
del(out_filename_mcsmsnps)
|
||||
|
||||
#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END
|
||||
metadata_pos = pd.DataFrame(gene_LF1['position'])
|
||||
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
|
||||
metadata_pos['meta_pos_count'].value_counts()
|
||||
|
||||
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
|
||||
|
||||
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
||||
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: Metadata poscounts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_metadata_poscounts
|
||||
, '\n============================================================')
|
||||
|
||||
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
|
||||
print('Finished writing:', outfile_metadata_poscounts
|
||||
, '\nNo. of rows:', len(metadata_pos)
|
||||
, '\nNo. of cols:', len(metadata_pos.columns)
|
||||
, '\n=============================================================')
|
||||
del(out_filename_metadata_poscounts)
|
||||
#%% OUTFILE 6, write file <gene>_metadata: MOVE TO THE END
|
||||
# ---THINK ---
|
||||
#%% OUTFILE 7, write file MSA plots: MOVE TO THE END
|
||||
# -- THINK ---
|
||||
#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END
|
||||
# -- THINK ---
|
||||
#%% Add column: aa property_water
|
||||
#%% Add column: aa_prop_polarity
|
||||
#%% Add column: aa_calcprop
|
||||
#%% NEW mappings: gene_LF2
|
||||
# gene_LF2: copy gene_LF1
|
||||
gene_LF2 = gene_LF1.copy()
|
||||
#%% Add total unique id count
|
||||
gene_LF2['id'].nunique()
|
||||
gene_LF2['mutationinformation'].nunique()
|
||||
total_id_ucount = gene_LF2['id'].nunique()
|
||||
total_id_ucount
|
||||
total_id_ucount2 = gene_LF2['sample'].nunique()
|
||||
total_id_ucount2
|
||||
|
||||
if total_id_ucount == total_id_ucount2:
|
||||
print('\nPASS: sample and id unique counts match')
|
||||
else:
|
||||
print('\nFAIL: sample and id unique counts DO NOT match!'
|
||||
, '\nGWAS worry!?')
|
||||
|
||||
# Add all sample ids in a list for sanity checks
|
||||
#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
|
||||
#==========================================
|
||||
# Add column: total unique id/sample count
|
||||
#==========================================
|
||||
gene_LF2['total_id_ucount'] = total_id_ucount
|
||||
|
||||
#==========================================
|
||||
# DELETE as already mapped: Add column: mutation count in all samples
|
||||
#==========================================
|
||||
# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
|
||||
# gene_LF2['mut_id_ucount']
|
||||
|
||||
# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
|
||||
|
||||
#%% AF for gene
|
||||
#=================
|
||||
# Add column: AF
|
||||
#=================
|
||||
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
|
||||
gene_LF2['maf'].head()
|
||||
|
||||
#%% Mapping 1: column '<drug>', mutation_info
|
||||
gene_LF2['mutation_info'].value_counts()
|
||||
gene_LF2['mutation_info_v1'].value_counts()
|
||||
gene_LF2['mutation_info_orig'].value_counts()
|
||||
|
||||
#=======================
|
||||
# column name: <drug>
|
||||
#=======================
|
||||
# mapping 1.1: labels
|
||||
dm_om_label_map = {dr_muts_col: 'DM'
|
||||
, other_muts_col: 'OM'}
|
||||
dm_om_label_map
|
||||
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
|
||||
|
||||
# mapping 1.2: numeric
|
||||
dm_om_num_map = {dr_muts_col: 1
|
||||
, other_muts_col: 0}
|
||||
|
||||
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
|
||||
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
|
||||
|
||||
gene_LF2['mutation_info'].value_counts()
|
||||
gene_LF2['mutation_info_labels'].value_counts()
|
||||
gene_LF2['mutation_info_orig'].value_counts()
|
||||
gene_LF2['dm_om_numeric'].value_counts()
|
||||
gene_LF2['dm_om_numeric_orig'].value_counts()
|
||||
#%% Mapping 2: column '<drtype>'
|
||||
#============================
|
||||
# column name: <drtype>
|
||||
#============================
|
||||
gene_LF2['drtype'].value_counts()
|
||||
|
||||
# mapping 2.1: numeric
|
||||
drtype_map = {'XDR': 5
|
||||
, 'Pre-XDR': 4
|
||||
, 'MDR': 3
|
||||
, 'Pre-MDR': 2
|
||||
, 'Other': 1
|
||||
, 'Sensitive': 0}
|
||||
|
||||
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
|
||||
|
||||
gene_LF2['drtype'].value_counts()
|
||||
gene_LF2['drtype_numeric'].value_counts()
|
||||
#%% Mapping 3: column '<dst>', drug
|
||||
#============================
|
||||
# column name: <dst>
|
||||
#============================
|
||||
# copy dst column
|
||||
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
|
||||
gene_LF2['dst'].equals(gene_LF2[drug])
|
||||
|
||||
gene_LF2['dst_multimode'] = gene_LF2[drug]
|
||||
|
||||
gene_LF2[drug].isnull().sum()
|
||||
gene_LF2['dst_multimode'].isnull().sum()
|
||||
#%% Further mappings: gene_LF3
|
||||
gene_LF3 = gene_LF2.copy()
|
||||
gene_LF3.index
|
||||
gene_LF3 = gene_LF3.set_index(['Mut'])
|
||||
gene_LF3.index
|
||||
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
gene_LF3['dst_multimode'].value_counts().sum()
|
||||
#%% Multimode: dst
|
||||
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||
#=============================
|
||||
# Recalculation: Revised dst
|
||||
# max(multimode)
|
||||
#=============================
|
||||
# Get multimode for dm_om_numeric column
|
||||
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||
|
||||
dm_om_multimode_LF3
|
||||
|
||||
# Fill using multimode ONLY where NA in dst_multimode column
|
||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||
|
||||
# Now get the max from multimode
|
||||
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
print(gene_LF3)
|
||||
#%% Revised Columns:IMPORTANT
|
||||
#%% Multimode: dst column
|
||||
#----------------------------
|
||||
# Revised dst column: Max
|
||||
#----------------------------
|
||||
# Finally created a revised dst with the max from the multimode
|
||||
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
|
||||
|
||||
#%% Multimode: drtype
|
||||
#=============================
|
||||
# Recalculation: Revised drtype
|
||||
# max(multimode)
|
||||
#=============================
|
||||
#--------------------------------
|
||||
# drtype: ALL values:
|
||||
# numeric and names in an array
|
||||
#--------------------------------
|
||||
gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
|
||||
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
||||
|
||||
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||
|
||||
#---------------------------------
|
||||
# Revised drtype: max(Multimode)
|
||||
#--------------------------------
|
||||
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
||||
gene_LF3['drtype_multimode']
|
||||
|
||||
# Now get the max from multimode
|
||||
gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
|
||||
gene_LF3.head()
|
||||
|
||||
#----------------------
|
||||
# Revised drtype: Max
|
||||
#----------------------
|
||||
gene_LF3.head()
|
||||
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
|
||||
gene_LF3.head()
|
||||
|
||||
#%% Revised counts checks
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
gene_LF3[drug].value_counts()
|
||||
|
||||
print('\n------------------------------------------------------'
|
||||
, '\nRevised counting: mutation_info i.e dm om column'
|
||||
, '\n-----------------------------------------------------'
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nOriginal drug column count'
|
||||
, '\n----------------------------------'
|
||||
, gene_LF3[drug].value_counts()
|
||||
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count'
|
||||
, '\n----------------------------------'
|
||||
, gene_LF3['dst_mode'].value_counts()
|
||||
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue