diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 241ff5b..dc89813 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -1210,7 +1210,7 @@ print('Finished writing:', outfile_mcsmsnps , '\n=============================================================') del(out_filename_mcsmsnps) -#%%# write frequency of position counts +#%% write frequency of position counts metadata_pos = pd.DataFrame(gene_LF1['position']) z = gene_LF1['position'].value_counts() z1 = z.to_dict() @@ -1219,8 +1219,6 @@ metadata_pos['meta_pos_count'].value_counts() metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True) -# Write file: gene_metadata (i.e gene_LF1) -# where each row has UNIQUE mutations NOT unique sample ids out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts print('\n----------------------------------' @@ -1236,7 +1234,6 @@ print('Finished writing:', outfile_metadata_poscounts , '\n=============================================================') del(out_filename_metadata_poscounts) - #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids #out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale? diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py index c041db5..e3e404f 100644 --- a/scripts/data_extraction_v2.py +++ b/scripts/data_extraction_v2.py @@ -905,14 +905,16 @@ else: #%% Ambiguous muts: revised annotation for mutation_info ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] - ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() ambiguous_muts_value_counts -bar = gene_LF1.copy() -bar.equals(gene_LF1) - gene_LF1_orig = gene_LF1.copy() +gene_LF1_orig.equals(gene_LF1) + +# copy the old column for checking +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_orig'].value_counts() +gene_LF1['mutation_info'].value_counts() #===================================== # Now create a new df that will have: @@ -924,42 +926,52 @@ gene_LF1_orig = gene_LF1.copy() # label is chosen that corresponds to the max of value counts #===================================== ambig_muts_rev_df = pd.DataFrame() +changes_val = [] +changes_dict = {} for i in common_muts: #print(i) - temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']] + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending - max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts() + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() revised_label = max_info_table[[0]].index[0][1] # max value_count old_label = max_info_table[[1]].index[0][1] # min value_count print('\nAmbiguous mutation labels...' 
, '\nSetting mutation_info for', i, 'to', revised_label) - temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) , revised_label - , temp_df['mutation_info']) + , temp_df['mutation_info_orig']) ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + f = max_info_table[[1]] + old_label_count = f[[0]][0] + changes_val.append(old_label_count) + cc_dict = f.to_dict() + changes_dict.update(cc_dict) -ambig_muts_rev_df['mutation_info'].value_counts() ambig_muts_rev_df['mutation_info_REV'].value_counts() +ambig_muts_rev_df['mutation_info_orig'].value_counts() +changes_val +changes_total = sum(changes_val) +changes_dict + +# TODO: Add sanity check to make sure you can add value_count checks #================= # Merge ambig muts # with gene_LF1 #=================== -bar = gene_LF1.copy() -bar['mutation_info_old'] = bar['mutation_info'] - ambig_muts_rev_df.index -bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] -bar['mutation_info_old'].value_counts() -bar['mutation_info'].value_counts() -foo = bar.iloc[ambig_muts_rev_df.index] -foo[['mutation', 'mutation_info', 'mutation_info_old']] -# CHECK if there are still any ambiguous muts -muts_split_rev = list(bar.groupby('mutation_info')) +gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] +gene_LF1['mutation_info_orig'].value_counts() +gene_LF1['mutation_info'].value_counts() +foo = gene_LF1.iloc[ambig_muts_rev_df.index] +foo[['mutation', 'mutation_info', 'mutation_info_orig']] + +# Sanity check1: if there are still any ambiguous muts +muts_split_rev = list(gene_LF1.groupby('mutation_info')) dr_muts_rev = muts_split_rev[0][1].mutation other_muts_rev = muts_split_rev[1][1].mutation print('splitting muts by mut_info:', muts_split_rev) @@ -972,34 +984,8 @@ else: print('\nAmbiguous muts corrected. 
Quitting!') sys.exit() -#%% ROUND THE HOUSES: DELETE -foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']] -fooU = foo.drop_duplicates() - -#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/ -fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV'] -fooD -#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine -#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2)) -bar['NEW'] = bar['mutation_info_old'] -bar['NEW'].value_counts() -bar['mutation_info_old'].value_counts() - -bar['NEW'].value_counts() -bar2 = bar[['mutation', 'mutation_info', 'NEW']] -bar2.isnull().sum() - -bar['NEW'].value_counts() -bar['mutation_info'].value_counts() -bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD) -bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW']) - -bar['NEW2'].value_counts() -bar['NEW'].value_counts() #%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts - -#print(outdir) -=================== +#=================== # ambiguous muts #======================= #dr_muts.to_csv('dr_muts.csv', header = True) @@ -1034,7 +1020,7 @@ print('\n----------------------------------' ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) -#%% clear variables +# clear variables del(id_dr, id_other #, meta_data , meta_gene_dr, meta_gene_other @@ -1043,9 +1029,420 @@ del(id_dr, id_other del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) -#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column +#%% Add column: Mutationinformation +# splitting mutation column to get mCSM style muts +#======================= +# Formatting df: read aa dict and pull relevant info +#======================= +print('Now some more formatting:' + , '\nRead aa dict and pull relevant info' + , '\nFormat mutations:' + , '\nsplit mutation into mCSM style muts: ' + , '\nFormatting mutation in mCSM style format: {WT}{MUT}' + , '\nAssign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + +#=========== +# Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one using +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for compatibility with dict +#=========== +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() + +print('wt regex being used:', wt_regex + , '\nmut regex being used:', mut_regex + , '\nposition regex being used:', pos_regex) + +mylen0 = len(gene_LF1.columns) + +#======= +# Iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. 
+# The three letter code is extracted using a string match from the dataframe and then converted
+# to a pandas Series, since map() only works on a Series
+#=======
+print('Adding', ncol_mutf_add, 'more cols:\n')
+
+# initialise a sub dict that is a lookup dict for three letter code to 1-letter code
+# adding three more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()  # converts to a Series that map works on
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wild_type'] = wt.map(lookup_dict)
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from the mutation column separately using a string match
+#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
+gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
+
+mylen1 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen1 == mylen0 + ncol_mutf_add:
+    print('PASS: successfully added', ncol_mutf_add, 'cols'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# combine the wild_type + position + mutant_type columns to generate
+# mutationinformation (matches the mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
+print('Created column: mutationinformation'
+      , '\n=====================================================================\n'
+      , gene_LF1.mutationinformation.head(10))
+
+# order by position for convenience
+gene_LF1.dtypes
+
+# converting position to numeric
+gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
+
+# sort by position inplace
+foo = gene_LF1['position'].value_counts()
+gene_LF1.sort_values(by = ['position'], inplace = True)
+bar = gene_LF1['position'].value_counts()
+
+if (foo == bar).all():
+    print('PASS: df ordered by position')
+    print(gene_LF1['position'].head())
+else:
+    print('FAIL: df could not be ordered. 
Check source') + sys.exit() + +#%% OUTFILE 4, write file mCSM style muts +snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique()) +snps_only.head() +# assign column name +snps_only.columns = ['mutationinformation'] + +# count how many positions this corresponds to +pos_only = pd.DataFrame(gene_LF1['position'].unique()) + +print('Checking NA in snps...')# should be 0 +if snps_only.mutationinformation.isna().sum() == 0: + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') +else: + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') + +# write file: mCSM muts +out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv' +outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps + +print('\n----------------------------------' + , '\nWriting file: mCSM style muts' + , '\n----------------------------------' + , '\nFile:', outfile_mcsmsnps + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo. of distinct muts:', len(snps_only) + , '\nNo. of distinct positions:', len(pos_only) + , '\n=============================================================') + +snps_only.to_csv(outfile_mcsmsnps, header = False, index = False) + +print('Finished writing:', outfile_mcsmsnps + , '\nNo. of rows:', len(snps_only) + , '\nNo. of cols:', len(snps_only.columns) + , '\n=============================================================') +del(out_filename_mcsmsnps) +#%% OUTFILE 5, write file frequency of position counts +metadata_pos = pd.DataFrame(gene_LF1['position']) +z = gene_LF1['position'].value_counts() +z1 = z.to_dict() +metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1) +metadata_pos['meta_pos_count'].value_counts() + +metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True) + +out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' +outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts +print('\n----------------------------------' + , '\nWriting file: Metadata poscounts' + , '\n----------------------------------' + , '\nFile:', outfile_metadata_poscounts + , '\n============================================================') + +metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False) +print('Finished writing:', outfile_metadata_poscounts + , '\nNo. of rows:', len(metadata_pos) + , '\nNo. of cols:', len(metadata_pos.columns) + , '\n=============================================================') +del(out_filename_metadata_poscounts) +#%% OUTFILE 6, write file _metadata +# gene_LF1, where each row has UNIQUE mutations NOT unique sample ids +out_filename_metadata = gene.lower() + '_metadata.csv' +outfile_metadata = outdir + '/' + out_filename_metadata +print('\n----------------------------------' + , '\nWriting file: LF formatted data' + , '\n----------------------------------' + , '\nFile:', outfile_metadata + , '\n============================================================') + +gene_LF1.to_csv(outfile_metadata, header = True, index = False) +print('Finished writing:', outfile_metadata + , '\nNo. of rows:', len(gene_LF1) + , '\nNo. of cols:', len(gene_LF1.columns) + , '\n=============================================================') +del(out_filename_metadata) +#%% OUTFILE 7, write file MSA plots +# mCSM style muts but with repetitions +# !!! THINK !!!: Implications if mut is coming from the same sample, etc. 
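+# A minimal diagnostic sketch for the THINK note above (an illustrative aside,
+# not part of the original pipeline; the variable name msa_dup_counts is new):
+# tabulate how often each mutationinformation value recurs across rows, so the
+# amount of repetition being fed into the MSA step is visible before writing.
+msa_dup_counts = gene_LF1['mutationinformation'].value_counts()
+print('Muts occurring in more than one row:', (msa_dup_counts > 1).sum()
+      , '\nMax occurrences of a single mut:', msa_dup_counts.max())
+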
+all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation']) +all_muts_msa.head() + +# assign column name +all_muts_msa.columns = ['mutationinformation'] + +# make sure it is string +all_muts_msa.columns.dtype + +# sort the column +all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation') + +# create an extra column with protein name +#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') +#all_muts_msa_sorted.head() + +# rearrange columns so the fasta name is the first column (required for mutate.script) +#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']] +all_muts_msa_sorted.head() + +print('Checking NA in snps...')# should be 0 +if all_muts_msa.mutationinformation.isna().sum() == 0: + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') +else: + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') + +out_filename_msa = gene.lower() +'_all_muts_msa.csv' +outfile_msa = outdir + '/' + out_filename_msa + +print('\n----------------------------------' + , '\nWriting file: mCSM style muts for msa' + , '\n----------------------------------' + , '\nFile:', outfile_msa + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo.of lines of msa:', len(all_muts_msa)) + +all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) + +print('Finished writing:', outfile_msa + , '\nNo. of rows:', len(all_muts_msa) + , '\nNo. of cols:', len(all_muts_msa.columns) + , '\n=============================================================') + +del(out_filename_msa) +#%% OUTFILE 8, write file mutational position with count +# count how many positions this corresponds to +pos_only = pd.DataFrame(gene_LF1['position'].unique()) +# assign column name +pos_only.columns = ['position'] +# make sure dtype of column position is int or numeric and not string +pos_only.position.dtype +pos_only['position'] = pos_only['position'].astype(int) +pos_only.position.dtype + +# sort by position value +pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) + +out_filename_pos = gene.lower() + '_mutational_positons.csv' +outfile_pos = outdir + '/' + out_filename_pos + +print('\n----------------------------------' + , '\nWriting file: mutational positions' + , '\n----------------------------------' + , '\nFile:', outfile_pos + , '\nNo. of distinct positions:', len(pos_only_sorted) + , '\n=============================================================') + +pos_only_sorted.to_csv(outfile_pos, header = True, index = False) + +print('Finished writing:', outfile_pos + , '\nNo. of rows:', len(pos_only_sorted) + , '\nNo. of cols:', len(pos_only_sorted.columns) + , '\n=============================================================' + , '\n\n\n') + +del(out_filename_pos) +#%% Add column: aa property_water +#========= +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. 
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is a lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
+    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+mylen2 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen2 == mylen1 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+#%% Add column: aa_prop_polarity
+#========
+# iterate through the dict, create a lookup dict, i.e.
+# lookup_dict = {three_letter_code: aa_prop_polarity}
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+# initialise a sub dict that is a lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
+    #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+
+mylen3 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen3 == mylen2 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
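+#%% OPTIONAL refactor sketch (illustrative only, not wired into the pipeline):
+# the three property blocks above/below all repeat the same lookup-dict pattern
+# (build {three_letter_code: property}, extract wt/mut 3-letter codes, map them).
+# A helper like this could replace the copy-pasted blocks; the function name
+# add_aa_prop_cols is hypothetical and assumes my_aa_dict, wt_regex and mut_regex
+# are in scope, as they are in this script.
+def add_aa_prop_cols(df, prop_key, wt_col, mut_col):
+    """Map an aa property from my_aa_dict onto new wt/mut columns of df."""
+    lookup = {k: v[prop_key] for k, v in my_aa_dict.items()}
+    wt3 = df['mutation'].str.extract(wt_regex).squeeze()
+    mut3 = df['mutation'].str.extract(mut_regex).squeeze()
+    df[wt_col] = wt3.map(lookup)
+    df[mut_col] = mut3.map(lookup)
+    return df
+
+# Usage would mirror the aa_calcprop block below, e.g.:
+# gene_LF1 = add_aa_prop_cols(gene_LF1, 'aa_calcprop', 'wt_calcprop', 'mut_calcprop')
+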
+#%% Add column: aa_calcprop
+#========
+# iterate through the dict, create a lookup dict, i.e.
+# lookup_dict = {three_letter_code: aa_calcprop}
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
+    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
+    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
+
+mylen4 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen4 == mylen3 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
 #%% NEW TODO: map mutationinformation
+
 #%% NEW: mappings
 #=======================
 # column name: