added sections and slotted relevant bits from data_extraction to v2

This commit is contained in:
Tanushree Tunstall 2022-04-14 12:21:16 +01:00
parent e6faf80c20
commit f05cb96346
2 changed files with 446 additions and 52 deletions

View file

@ -1210,7 +1210,7 @@ print('Finished writing:', outfile_mcsmsnps
, '\n=============================================================')
del(out_filename_mcsmsnps)
#%%# write frequency of position counts
#%% write frequency of position counts
metadata_pos = pd.DataFrame(gene_LF1['position'])
z = gene_LF1['position'].value_counts()
z1 = z.to_dict()
@ -1219,8 +1219,6 @@ metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
# Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
@ -1236,7 +1234,6 @@ print('Finished writing:', outfile_metadata_poscounts
, '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?

View file

@ -905,14 +905,16 @@ else:
#%% Ambiguous muts: revised annotation for mutation_info
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
ambiguous_muts_value_counts
bar = gene_LF1.copy()
bar.equals(gene_LF1)
gene_LF1_orig = gene_LF1.copy()
gene_LF1_orig.equals(gene_LF1)
# copy the old column for checking
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info'].value_counts()
#=====================================
# Now create a new df that will have:
@ -924,42 +926,52 @@ gene_LF1_orig = gene_LF1.copy()
# label is chosen that corresponds to the max of value counts
#=====================================
ambig_muts_rev_df = pd.DataFrame()
changes_val = []
changes_dict = {}
for i in common_muts:
#print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']]
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts()
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
revised_label = max_info_table[[0]].index[0][1] # max value_count
old_label = max_info_table[[1]].index[0][1] # min value_count
print('\nAmbiguous mutation labels...'
, '\nSetting mutation_info for', i, 'to', revised_label)
temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label)
temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
, revised_label
, temp_df['mutation_info'])
, temp_df['mutation_info_orig'])
ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
f = max_info_table[[1]]
old_label_count = f[[0]][0]
changes_val.append(old_label_count)
cc_dict = f.to_dict()
changes_dict.update(cc_dict)
ambig_muts_rev_df['mutation_info'].value_counts()
ambig_muts_rev_df['mutation_info_REV'].value_counts()
ambig_muts_rev_df['mutation_info_orig'].value_counts()
changes_val
changes_total = sum(changes_val)
changes_dict
# TODO: Add sanity check to make sure you can add value_count checks
#=================
# Merge ambig muts
# with gene_LF1
#===================
bar = gene_LF1.copy()
bar['mutation_info_old'] = bar['mutation_info']
ambig_muts_rev_df.index
bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
bar['mutation_info_old'].value_counts()
bar['mutation_info'].value_counts()
foo = bar.iloc[ambig_muts_rev_df.index]
foo[['mutation', 'mutation_info', 'mutation_info_old']]
# CHECK if there are still any ambiguous muts
muts_split_rev = list(bar.groupby('mutation_info'))
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info'].value_counts()
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
foo[['mutation', 'mutation_info', 'mutation_info_orig']]
# Sanity check1: if there are still any ambiguous muts
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev)
@ -972,34 +984,8 @@ else:
print('\nAmbiguous muts corrected. Quitting!')
sys.exit()
#%% ROUND THE HOUSES: DELETE
foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']]
fooU = foo.drop_duplicates()
#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/
fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV']
fooD
#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine
#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2))
bar['NEW'] = bar['mutation_info_old']
bar['NEW'].value_counts()
bar['mutation_info_old'].value_counts()
bar['NEW'].value_counts()
bar2 = bar[['mutation', 'mutation_info', 'NEW']]
bar2.isnull().sum()
bar['NEW'].value_counts()
bar['mutation_info'].value_counts()
bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD)
bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW'])
bar['NEW2'].value_counts()
bar['NEW'].value_counts()
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
#print(outdir)
===================
#===================
# ambiguous muts
#=======================
#dr_muts.to_csv('dr_muts.csv', header = True)
@ -1034,7 +1020,7 @@ print('\n----------------------------------'
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%% clear variables
# clear variables
del(id_dr, id_other
#, meta_data
, meta_gene_dr, meta_gene_other
@ -1043,9 +1029,420 @@ del(id_dr, id_other
del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column
#%% Add column: Mutationinformation
# splitting mutation column to get mCSM style muts
#=======================
# Formatting df: read aa dict and pull relevant info
#=======================
print('Now some more formatting:'
, '\nRead aa dict and pull relevant info'
, '\nFormat mutations:'
, '\nsplit mutation into mCSM style muts: '
, '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
, '\nAssign aa properties: adding 2 cols at a time for each prop'
, '\n===================================================================')
# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
# in each lookup cycle
ncol_mutf_add = 3 # mut split into 3 cols
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
#===========
# Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one using
# reference_dict imported at the beginning.
# After importing, convert to mutation to lowercase for compatibility with dict
#===========
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
print('wt regex being used:', wt_regex
, '\nmut regex being used:', mut_regex
, '\nposition regex being used:', pos_regex)
mylen0 = len(gene_LF1.columns)
#=======
# Iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separetly for wild type and mutant cols.
# The three letter code is extracted using a string match match from the dataframe and then converted
# to 'pandas series'since map only works in pandas series
#=======
print('Adding', ncol_mutf_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
# adding three more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
#wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wild_type'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separetly using string match
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
mylen1 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen1 == mylen0 + ncol_mutf_add:
print('PASS: successfully added', ncol_mutf_add, 'cols'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
# clear variables
del(k, v, wt, mut, lookup_dict)
########
# combine the wild_type+poistion+mutant_type columns to generate
# mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
print('Created column: mutationinformation'
, '\n=====================================================================\n'
, gene_LF1.mutationinformation.head(10))
#order by position for convenience
gene_LF1.dtypes
# converting position to numeric
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
# sort by position inplace
foo = gene_LF1['position'].value_counts()
gene_LF1.sort_values(by = ['position'], inplace = True)
bar = gene_LF1['position'].value_counts()
if (foo == bar).all():
print('PASS: df ordered by position')
print(gene_LF1['position'].head())
else:
print('FAIL: df could not be ordered. Check source')
sys.exit()
#%% OUTFILE 4, write file mCSM style muts
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
, '\nWriting file: mCSM style muts'
, '\n----------------------------------'
, '\nFile:', outfile_mcsmsnps
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
, '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
del(out_filename_mcsmsnps)
#%% OUTFILE 5, write file frequency of position counts
metadata_pos = pd.DataFrame(gene_LF1['position'])
z = gene_LF1['position'].value_counts()
z1 = z.to_dict()
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% OUTFILE 6, write file <gene>_metadata
# gene_LF1, where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('\n----------------------------------'
, '\nWriting file: LF formatted data'
, '\n----------------------------------'
, '\nFile:', outfile_metadata
, '\n============================================================')
gene_LF1.to_csv(outfile_metadata, header = True, index = False)
print('Finished writing:', outfile_metadata
, '\nNo. of rows:', len(gene_LF1)
, '\nNo. of cols:', len(gene_LF1.columns)
, '\n=============================================================')
del(out_filename_metadata)
#%% OUTFILE 7, write file MSA plots
# mCSM style muts but with repetitions
# !!! THINK !!!: Implications if mut is coming from the same sample, etc.
all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation'])
all_muts_msa.head()
# assign column name
all_muts_msa.columns = ['mutationinformation']
# make sure it is string
all_muts_msa.columns.dtype
# sort the column
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
if all_muts_msa.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa
print('\n----------------------------------'
, '\nWriting file: mCSM style muts for msa'
, '\n----------------------------------'
, '\nFile:', outfile_msa
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo.of lines of msa:', len(all_muts_msa))
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
print('Finished writing:', outfile_msa
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
del(out_filename_msa)
#%% OUTFILE 8, write file mutational position with count
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
# assign column name
pos_only.columns = ['position']
# make sure dtype of column position is int or numeric and not string
pos_only.position.dtype
pos_only['position'] = pos_only['position'].astype(int)
pos_only.position.dtype
# sort by position value
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos
print('\n----------------------------------'
, '\nWriting file: mutational positions'
, '\n----------------------------------'
, '\nFile:', outfile_pos
, '\nNo. of distinct positions:', len(pos_only_sorted)
, '\n=============================================================')
pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
print('Finished writing:', outfile_pos
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n============================================================='
, '\n\n\n')
del(out_filename_pos)
#%% Add column: aa property_water
#=========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
mylen2 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen2 == mylen1 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_prop_polarity
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
mylen3 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen3 == mylen2 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_calcprop']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
mylen4 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen4 == mylen3 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% NEW TODO: map mutationinformation
#%% NEW: mappings
#=======================
# column name: <drug>