Added sections and slotted relevant bits from data_extraction into v2
parent e6faf80c20
commit f05cb96346
2 changed files with 446 additions and 52 deletions
@@ -1210,7 +1210,7 @@ print('Finished writing:', outfile_mcsmsnps
, '\n=============================================================')
del(out_filename_mcsmsnps)

#%%# write frequency of position counts
#%% write frequency of position counts
metadata_pos = pd.DataFrame(gene_LF1['position'])
z = gene_LF1['position'].value_counts()
z1 = z.to_dict()
@@ -1219,8 +1219,6 @@ metadata_pos['meta_pos_count'].value_counts()

metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)

# Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
@@ -1236,7 +1234,6 @@ print('Finished writing:', outfile_metadata_poscounts
, '\n=============================================================')
del(out_filename_metadata_poscounts)


#%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
@@ -905,14 +905,16 @@ else:

#%% Ambiguous muts: revised annotation for mutation_info
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]

ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
ambiguous_muts_value_counts
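#==================================================================
# Illustrative sketch (not part of the original script): common_muts
# is defined earlier in the script, outside this diff. If it is meant
# to capture mutations that carry more than one mutation_info label,
# it could be derived as below (helper name is an assumption).
#==================================================================
def find_ambiguous_muts(df):
    """Mutations whose mutation_info column holds more than one distinct label."""
    label_counts = df.groupby('mutation')['mutation_info'].nunique()
    return label_counts[label_counts > 1].index.tolist()

# e.g. common_muts = find_ambiguous_muts(gene_LF1)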

bar = gene_LF1.copy()
bar.equals(gene_LF1)

gene_LF1_orig = gene_LF1.copy()
gene_LF1_orig.equals(gene_LF1)

# copy the old column for checking
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info'].value_counts()

#=====================================
# Now create a new df that will have:
@@ -924,42 +926,52 @@ gene_LF1_orig = gene_LF1.copy()
# label is chosen that corresponds to the max of value counts
#=====================================
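#==================================================================
# Illustrative sketch (not part of the original script): the loop
# below picks the majority label by comparing the top two value
# counts per mutation which, as its DANGER note says, assumes exactly
# two states sorted in descending order. A hedged alternative that
# drops that assumption (function name and usage are illustrative):
#==================================================================
def majority_labels(df, muts):
    """Return {mutation: most frequent mutation_info label} for the given muts."""
    sub = df[df['mutation'].isin(muts)]
    return (sub.groupby('mutation')['mutation_info']
               .agg(lambda s: s.value_counts().idxmax())
               .to_dict())

# e.g.
# rev_map = majority_labels(gene_LF1, common_muts)
# gene_LF1.loc[gene_LF1['mutation'].isin(rev_map), 'mutation_info'] = \
#     gene_LF1['mutation'].map(rev_map)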
ambig_muts_rev_df = pd.DataFrame()
changes_val = []
changes_dict = {}

for i in common_muts:
    #print(i)
    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']]
    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]

    # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts()
    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()

    revised_label = max_info_table[[0]].index[0][1] # max value_count
    old_label = max_info_table[[1]].index[0][1] # min value_count
    print('\nAmbiguous mutation labels...'
          , '\nSetting mutation_info for', i, 'to', revised_label)
    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label)
    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
                                           , revised_label
                                           , temp_df['mutation_info'])
                                           , temp_df['mutation_info_orig'])
    ambig_muts_rev_df = pd.concat([ambig_muts_rev_df, temp_df])
    f = max_info_table[[1]]
    old_label_count = f[[0]][0]
    changes_val.append(old_label_count)
    cc_dict = f.to_dict()
    changes_dict.update(cc_dict)

ambig_muts_rev_df['mutation_info'].value_counts()
ambig_muts_rev_df['mutation_info_REV'].value_counts()
ambig_muts_rev_df['mutation_info_orig'].value_counts()
changes_val
changes_total = sum(changes_val)
changes_dict

# TODO: Add a sanity check that the relabelled value counts still add up (see sketch below)
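#==================================================================
# Illustrative sketch (not part of the original script): one way to
# do that check, assuming the *_orig/*_REV columns and changes_val
# built above. The number of rows whose label actually changed should
# equal the summed minority counts collected in changes_val.
#==================================================================
n_relabelled = (ambig_muts_rev_df['mutation_info_REV']
                != ambig_muts_rev_df['mutation_info_orig']).sum()
if n_relabelled == changes_total:
    print('PASS: rows relabelled (', n_relabelled, ') match changes_total')
else:
    print('FAIL: rows relabelled =', n_relabelled
          , '\nbut changes_total =', changes_total)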

#=================
# Merge ambig muts
# with gene_LF1
#===================
bar = gene_LF1.copy()
bar['mutation_info_old'] = bar['mutation_info']

ambig_muts_rev_df.index
bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
bar['mutation_info_old'].value_counts()
bar['mutation_info'].value_counts()
foo = bar.iloc[ambig_muts_rev_df.index]
foo[['mutation', 'mutation_info', 'mutation_info_old']]

# CHECK if there are still any ambiguous muts
muts_split_rev = list(bar.groupby('mutation_info'))
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info'].value_counts()
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
foo[['mutation', 'mutation_info', 'mutation_info_orig']]

# Sanity check1: if there are still any ambiguous muts
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev)
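#==================================================================
# Illustrative sketch (not part of the original script): a direct way
# to confirm that no mutation still carries both labels, using the
# two series built above, is a set intersection:
#==================================================================
still_ambiguous = set(dr_muts_rev) & set(other_muts_rev)
if still_ambiguous:
    print('FAIL: mutations still carrying both labels:', still_ambiguous)
else:
    print('PASS: no mutation carries more than one mutation_info label')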

@@ -972,34 +984,8 @@ else:
    print('\nAmbiguous muts corrected. Quitting!')
    sys.exit()

#%% ROUND THE HOUSES: DELETE
foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']]
fooU = foo.drop_duplicates()

#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/
fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV']
fooD
#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine
#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2))
bar['NEW'] = bar['mutation_info_old']
bar['NEW'].value_counts()
bar['mutation_info_old'].value_counts()

bar['NEW'].value_counts()
bar2 = bar[['mutation', 'mutation_info', 'NEW']]
bar2.isnull().sum()

bar['NEW'].value_counts()
bar['mutation_info'].value_counts()
bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD)
bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW'])

bar['NEW2'].value_counts()
bar['NEW'].value_counts()
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts

#print(outdir)
===================
#===================
# ambiguous muts
#=======================
#dr_muts.to_csv('dr_muts.csv', header = True)
@@ -1034,7 +1020,7 @@ print('\n----------------------------------'

ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)

#%% clear variables
# clear variables
del(id_dr, id_other
#, meta_data
, meta_gene_dr, meta_gene_other
@@ -1043,9 +1029,420 @@ del(id_dr, id_other

del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)

#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column
#%% Add column: Mutationinformation
# splitting mutation column to get mCSM style muts
#=======================
# Formatting df: read aa dict and pull relevant info
#=======================
print('Now some more formatting:'
      , '\nRead aa dict and pull relevant info'
      , '\nFormat mutations:'
      , '\nsplit mutation into mCSM style muts: '
      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
      , '\nAssign aa properties: adding 2 cols at a time for each prop'
      , '\n===================================================================')

# BEWARE hardcoding: only works because we add each aa prop once for wt and once for mut
# in each lookup cycle
ncol_mutf_add = 3 # mut split into 3 cols
ncol_aa_add = 2   # 2 aa props added (wt & mut) in each mapping

#===========
# Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one using
# reference_dict imported at the beginning.
# After importing, convert mutation to lowercase for compatibility with the dict
#===========
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()

print('wt regex being used:', wt_regex
      , '\nmut regex being used:', mut_regex
      , '\nposition regex being used:', pos_regex)
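#==================================================================
# Illustrative sketch (not part of the original script): wt_regex,
# mut_regex and pos_regex are defined upstream and not shown in this
# diff. For orientation only, patterns of roughly this shape would
# match the lowercased three-letter-code format handled below; treat
# these as assumptions, not the script's actual values.
#==================================================================
# wt_regex  = r'^([a-z]{3})\d+'    # leading 3-letter wild-type code
# mut_regex = r'\d+([a-z]{3})$'    # trailing 3-letter mutant code
# pos_regex = r'(\d+)'             # numeric position in between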

mylen0 = len(gene_LF1.columns)

#=======
# Iterate through the dict, create a lookup dict, i.e.
# lookup_dict = {three_letter_code: one_letter_code}.
# The lookup dict maps the key (three-letter code) to the value you want the new column to hold.
# Then use this to perform the mapping separately for wild-type and mutant cols.
# The three-letter code is extracted using a string match from the dataframe and then converted
# to a pandas Series, since map only works on a pandas Series.
#=======
print('Adding', ncol_mutf_add, 'more cols:\n')

# initialise a sub dict that is a lookup dict for three-letter code to 1-letter code
# adding three more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']
#wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()  # converts to a series that map works on
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wild_type'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mutant_type'] = mut.map(lookup_dict)

# extract position info from mutation column separately using string match
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)

mylen1 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen1 == mylen0 + ncol_mutf_add:
    print('PASS: successfully added', ncol_mutf_add, 'cols'
          , '\nold length:', mylen0
          , '\nnew len:', mylen1)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen0
          , '\nnew len:', mylen1)

# clear variables
del(k, v, wt, mut, lookup_dict)

########
# combine the wild_type+position+mutant_type columns to generate
# mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
print('Created column: mutationinformation'
      , '\n=====================================================================\n'
      , gene_LF1.mutationinformation.head(10))

# order by position for convenience
gene_LF1.dtypes

# converting position to numeric
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])

# sort by position inplace
foo = gene_LF1['position'].value_counts()
gene_LF1.sort_values(by = ['position'], inplace = True)
bar = gene_LF1['position'].value_counts()

if (foo == bar).all():
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
else:
    print('FAIL: df could not be ordered. Check source')
    sys.exit()
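#==================================================================
# Illustrative sketch (not part of the original script): comparing
# value_counts before and after the sort only shows that the counts
# survived, not that the rows are ordered; a direct check on the
# sorted column:
#==================================================================
if gene_LF1['position'].is_monotonic_increasing:
    print('PASS: position column is non-decreasing after sort')
else:
    print('FAIL: position column is not sorted')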

#%% OUTFILE 4, write file mCSM style muts
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']

# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())

print('Checking NA in snps...') # should be 0
if snps_only.mutationinformation.isna().sum() == 0:
    print('PASS: NO NAs/missing entries for SNPs'
          , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')

# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps

print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
      , '\n=============================================================')

snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)

print('Finished writing:', outfile_mcsmsnps
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
del(out_filename_mcsmsnps)
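#==================================================================
# Illustrative sketch (not part of the original script): OUTFILEs 4-8
# repeat the same build-path / banner / to_csv / summary / del
# pattern; a small helper (name and signature are illustrative only)
# could keep them consistent:
#==================================================================
def write_output_csv(df, suffix, header = True, index = False):
    """Write <gene>_<suffix>.csv to outdir and print a short summary."""
    out_path = outdir + '/' + gene.lower() + '_' + suffix + '.csv'
    print('\n----------------------------------'
          , '\nWriting file:', out_path
          , '\n----------------------------------')
    df.to_csv(out_path, header = header, index = index)
    print('Finished writing:', out_path
          , '\nNo. of rows:', len(df)
          , '\nNo. of cols:', len(df.columns))
    return out_path

# e.g. write_output_csv(snps_only, 'mcsm_formatted_snps', header = False)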
#%% OUTFILE 5, write file frequency of position counts
metadata_pos = pd.DataFrame(gene_LF1['position'])
z = gene_LF1['position'].value_counts()
z1 = z.to_dict()
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()

metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
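#==================================================================
# Illustrative sketch (not part of the original script): the
# value_counts -> dict -> map round trip above can be written in one
# step with groupby/transform; behaviour should be equivalent.
#==================================================================
# metadata_pos['meta_pos_count'] = (gene_LF1.groupby('position')['position']
#                                           .transform('count'))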

out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
      , '\nWriting file: Metadata poscounts'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')

metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
      , '\nNo. of rows:', len(metadata_pos)
      , '\nNo. of cols:', len(metadata_pos.columns)
      , '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% OUTFILE 6, write file <gene>_metadata
# gene_LF1, where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('\n----------------------------------'
      , '\nWriting file: LF formatted data'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata
      , '\n============================================================')

gene_LF1.to_csv(outfile_metadata, header = True, index = False)
print('Finished writing:', outfile_metadata
      , '\nNo. of rows:', len(gene_LF1)
      , '\nNo. of cols:', len(gene_LF1.columns)
      , '\n=============================================================')
del(out_filename_metadata)
#%% OUTFILE 7, write file MSA plots
# mCSM style muts but with repetitions
# !!! THINK !!!: Implications if mut is coming from the same sample, etc.
all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation'])
all_muts_msa.head()

# assign column name
all_muts_msa.columns = ['mutationinformation']

# make sure it is string
all_muts_msa.columns.dtype

# sort the column
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')

# create an extra column with protein name
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()

# rearrange columns so the fasta name is the first column (required for mutate.script)
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()

print('Checking NA in snps...') # should be 0
if all_muts_msa.mutationinformation.isna().sum() == 0:
    print('PASS: NO NAs/missing entries for SNPs'
          , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')

out_filename_msa = gene.lower() + '_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa

print('\n----------------------------------'
      , '\nWriting file: mCSM style muts for msa'
      , '\n----------------------------------'
      , '\nFile:', outfile_msa
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of lines of msa:', len(all_muts_msa))

all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)

print('Finished writing:', outfile_msa
      , '\nNo. of rows:', len(all_muts_msa)
      , '\nNo. of cols:', len(all_muts_msa.columns)
      , '\n=============================================================')

del(out_filename_msa)
#%% OUTFILE 8, write file mutational position with count
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
# assign column name
pos_only.columns = ['position']
# make sure dtype of column position is int or numeric and not string
pos_only.position.dtype
pos_only['position'] = pos_only['position'].astype(int)
pos_only.position.dtype

# sort by position value
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)

out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos

print('\n----------------------------------'
      , '\nWriting file: mutational positions'
      , '\n----------------------------------'
      , '\nFile:', outfile_pos
      , '\nNo. of distinct positions:', len(pos_only_sorted)
      , '\n=============================================================')

pos_only_sorted.to_csv(outfile_pos, header = True, index = False)

print('Finished writing:', outfile_pos
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
      , '\n============================================================='
      , '\n\n\n')

del(out_filename_pos)
#%% Add column: aa property_water
#=========
# iterate through the dict, create a lookup dict, i.e.
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# initialise a sub dict that is a lookup dict for three-letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)

mylen2 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen2 == mylen1 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)

# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_prop_polarity
#========
# iterate through the dict, create a lookup dict, i.e.
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# initialise a sub dict that is a lookup dict for three-letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)

mylen3 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen3 == mylen2 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)

# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop
#========
# iterate through the dict, create a lookup dict, i.e.
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_calcprop']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_calcprop'] = mut.map(lookup_dict)

mylen4 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen4 == mylen3 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen3
          , '\nnew len:', mylen4)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen3
          , '\nnew len:', mylen4)

# clear variables
del(k, v, wt, mut, lookup_dict)
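#==================================================================
# Illustrative sketch (not part of the original script): the water,
# polarity and calcprop blocks above repeat the same lookup-and-map
# pattern; a helper along these lines (name and signature are
# illustrative only) could replace the three near-identical blocks:
#==================================================================
def add_aa_property(df, aa_dict, prop_key, wt_col, mut_col):
    """Map a 3-letter-code property from aa_dict onto wt and mut columns."""
    lookup = {code: props[prop_key] for code, props in aa_dict.items()}
    wt = df['mutation'].str.extract(wt_regex).squeeze()
    mut = df['mutation'].str.extract(mut_regex).squeeze()
    df[wt_col] = wt.map(lookup)
    df[mut_col] = mut.map(lookup)
    return df

# e.g. add_aa_property(gene_LF1, my_aa_dict, 'aa_prop_water',
#                      'wt_prop_water', 'mut_prop_water')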

#%% NEW TODO: map mutationinformation

#%% NEW: mappings
#=======================
# column name: <drug>