diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index ceab5d0..fb9449d 100644 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -672,6 +672,15 @@ print(dict(Cdict)) for k, v in Cdict.items(): if k in common_snps_dr_other: print(k,v) + +# convert defaultDict to dict +SnpFDict_orig = dict(Cdict) + +def lower_dict(d): + new_dict = dict((k.lower(), v) for k, v in d.items()) + return new_dict + +SnpFDict = lower_dict(SnpFDict_orig) ############################################################################### # USE Vcounts to get expected curated df # resolve dm om muts funda @@ -987,13 +996,16 @@ ambig_muts_rev_df.index gene_LF1.index all(ambig_muts_rev_df.index.isin(gene_LF1.index)) -gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] +#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] +gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] + gene_LF1['mutation_info_orig'].value_counts() -gene_LF1['mutation_info_v1'].value_counts() +#gene_LF1['mutation_info_v1'].value_counts() foo = gene_LF1.iloc[ambig_muts_rev_df.index] # Sanity check1: if there are still any ambiguous muts -muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) +#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) +muts_split_rev = list(gene_LF1.groupby('mutation_info')) dr_muts_rev = muts_split_rev[0][1].mutation other_muts_rev = muts_split_rev[1][1].mutation print('splitting muts by mut_info:', muts_split_rev) @@ -1006,5 +1018,402 @@ else: print('\nAmbiguous muts NOT corrected. Quitting!') sys.exit() -gene_LF1['mutation_info_v1'].value_counts() -#%% PHEW! Good to go for downstream stuff \ No newline at end of file +#gene_LF1['mutation_info_v1'].value_counts() +gene_LF1['mutation_info'].value_counts() + +# reassign +#%% PHEW! Good to go for downstream stuff +#%% Add column: Mutationinformation +# splitting mutation column to get mCSM style muts +#======================= +# Formatting df: read aa dict and pull relevant info +#======================= +print('Now some more formatting:' + , '\nRead aa dict and pull relevant info' + , '\nFormat mutations:' + , '\nsplit mutation into mCSM style muts: ' + , '\nFormatting mutation in mCSM style format: {WT}{MUT}' + , '\nAssign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + +#=========== +# Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one using +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for compatibility with dict +#=========== +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() + +print('wt regex being used:', wt_regex + , '\nmut regex being used:', mut_regex + , '\nposition regex being used:', pos_regex) + +mylen0 = len(gene_LF1.columns) + +#======= +# Iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. +# The three letter code is extracted using a string match match from the dataframe and then converted +# to 'pandas series'since map only works in pandas series +#======= +print('Adding', ncol_mutf_add, 'more cols:\n') + +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wild_type'] = wt.map(lookup_dict) + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mutant_type'] = mut.map(lookup_dict) + +# extract position info from mutation column separetly using string match +#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) + +mylen1 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt & mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen1 == mylen0 + ncol_mutf_add: + print('PASS: successfully added', ncol_mutf_add, 'cols' + , '\nold length:', mylen0 + , '\nnew len:', mylen1) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen0 + , '\nnew len:', mylen1) + +# clear variables +del(k, v, wt, mut, lookup_dict) + +######## +# combine the wild_type+poistion+mutant_type columns to generate +# mutationinformation (matches mCSM output field) +# Remember to use .map(str) for int col types to allow string concatenation +######### +gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] +print('Created column: mutationinformation' + , '\n=====================================================================\n' + , gene_LF1.mutationinformation.head(10)) + +#order by position for convenience +gene_LF1.dtypes + +# converting position to numeric +gene_LF1['position'] = pd.to_numeric(gene_LF1['position']) + +# sort by position inplace +foo = gene_LF1['position'].value_counts() +foo +gene_LF1.sort_values(by = ['position'], inplace = True) +bar = gene_LF1['position'].value_counts() + +# FIXME:Can only compare identically-labeled Series objects +if (foo == bar).all(): + print('PASS: df ordered by position') + print(gene_LF1['position'].head()) +else: + print('FAIL: df could not be ordered. Check source') + sys.exit() + +#%% Create a copy of mutationinformation column for downstream mergeing +gene_LF1['Mut'] = gene_LF1['mutationinformation'] +gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] + +#%% Create a copy of indices for downstream mergeing +gene_LF1['index_orig'] = gene_LF1.index +gene_LF1['index_orig_copy'] = gene_LF1.index + +all(gene_LF1.index.values == gene_LF1['index_orig'].values) +all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) + +#%% quick sanity check for position freq count +# count the freq of 'other muts' samples +test_df = gene_LF1.copy() +test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] +# add freq column +#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') +#print('Revised dim of other_muts_df:',test_df.shape) +test_df['scount'] = test_df['mutation'].map(SnpFDict) +#%% Map mutation frequency count as a column +gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) +#%% Map position frequency count as a column +z = gene_LF1['position'].value_counts() +z1 = z.to_dict() +gene_LF1['pos_count'] = gene_LF1['position'].map(z1) + +#test_df2 = test_df.loc[test_df['position'] == 10] + +#%% OUTFILE 4, write file mCSM style muts +snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique()) +snps_only.head() +# assign column name +snps_only.columns = ['mutationinformation'] + +# count how many positions this corresponds to +pos_only = pd.DataFrame(gene_LF1['position'].unique()) +pos_only + +print('Checking NA in snps...')# should be 0 +if snps_only.mutationinformation.isna().sum() == 0: + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') +else: + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') + +# write file: mCSM muts +out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv' +outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps + +print('\n----------------------------------' + , '\nWriting file: mCSM style muts' + , '\n----------------------------------' + , '\nFile:', outfile_mcsmsnps + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo. of distinct muts:', len(snps_only) + , '\nNo. of distinct positions:', len(pos_only) + , '\n=============================================================') + +snps_only.to_csv(outfile_mcsmsnps, header = False, index = False) + +print('Finished writing:', outfile_mcsmsnps + , '\nNo. of rows:', len(snps_only) + , '\nNo. of cols:', len(snps_only.columns) + , '\n=============================================================') +del(out_filename_mcsmsnps) + +#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END +metadata_pos = pd.DataFrame(gene_LF1['position']) +metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1) +metadata_pos['meta_pos_count'].value_counts() + +metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True) + +out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' +outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts +print('\n----------------------------------' + , '\nWriting file: Metadata poscounts' + , '\n----------------------------------' + , '\nFile:', outfile_metadata_poscounts + , '\n============================================================') + +metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False) +print('Finished writing:', outfile_metadata_poscounts + , '\nNo. of rows:', len(metadata_pos) + , '\nNo. of cols:', len(metadata_pos.columns) + , '\n=============================================================') +del(out_filename_metadata_poscounts) +#%% OUTFILE 6, write file _metadata: MOVE TO THE END +# ---THINK --- +#%% OUTFILE 7, write file MSA plots: MOVE TO THE END +# -- THINK --- +#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END +# -- THINK --- +#%% Add column: aa property_water +#%% Add column: aa_prop_polarity +#%% Add column: aa_calcprop +#%% NEW mappings: gene_LF2 +# gene_LF2: copy gene_LF1 +gene_LF2 = gene_LF1.copy() +#%% Add total unique id count +gene_LF2['id'].nunique() +gene_LF2['mutationinformation'].nunique() +total_id_ucount = gene_LF2['id'].nunique() +total_id_ucount +total_id_ucount2 = gene_LF2['sample'].nunique() +total_id_ucount2 + +if total_id_ucount == total_id_ucount2: + print('\nPASS: sample and id unique counts match') +else: + print('\nFAIL: sample and id unique counts DO NOT match!' + , '\nGWAS worry!?') + +# Add all sample ids in a list for sanity checks +#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list)) +#========================================== +# Add column: total unique id/sample count +#========================================== +gene_LF2['total_id_ucount'] = total_id_ucount + +#========================================== +# DELETE as already mapped: Add column: mutation count in all samples +#========================================== +# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) +# gene_LF2['mut_id_ucount'] + +# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) + +#%% AF for gene +#================= +# Add column: AF +#================= +gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] +gene_LF2['maf'].head() + +#%% Mapping 1: column '', mutation_info +gene_LF2['mutation_info'].value_counts() +gene_LF2['mutation_info_v1'].value_counts() +gene_LF2['mutation_info_orig'].value_counts() + +#======================= +# column name: +#======================= +# mapping 1.1: labels +dm_om_label_map = {dr_muts_col: 'DM' + , other_muts_col: 'OM'} +dm_om_label_map +gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map) + +# mapping 1.2: numeric +dm_om_num_map = {dr_muts_col: 1 + , other_muts_col: 0} + +gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map) +gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map) + +gene_LF2['mutation_info'].value_counts() +gene_LF2['mutation_info_labels'].value_counts() +gene_LF2['mutation_info_orig'].value_counts() +gene_LF2['dm_om_numeric'].value_counts() +gene_LF2['dm_om_numeric_orig'].value_counts() +#%% Mapping 2: column '' +#============================ +# column name: +#============================ +gene_LF2['drtype'].value_counts() + +# mapping 2.1: numeric +drtype_map = {'XDR': 5 + , 'Pre-XDR': 4 + , 'MDR': 3 + , 'Pre-MDR': 2 + , 'Other': 1 + , 'Sensitive': 0} + +gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map) + +gene_LF2['drtype'].value_counts() +gene_LF2['drtype_numeric'].value_counts() +#%% Mapping 3: column '', drug +#============================ +# column name: +#============================ +# copy dst column +gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking +gene_LF2['dst'].equals(gene_LF2[drug]) + +gene_LF2['dst_multimode'] = gene_LF2[drug] + +gene_LF2[drug].isnull().sum() +gene_LF2['dst_multimode'].isnull().sum() +#%% Further mappings: gene_LF3 +gene_LF3 = gene_LF2.copy() +gene_LF3.index +gene_LF3 = gene_LF3.set_index(['Mut']) +gene_LF3.index + +gene_LF3['dst_multimode'].value_counts() +gene_LF3['dst_multimode'].value_counts().sum() +#%% Multimode: dst +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +#============================= +# Recalculation: Revised dst +# max(multimode) +#============================= +# Get multimode for dm_om_numeric column +#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) + +dm_om_multimode_LF3 + +# Fill using multimode ONLY where NA in dst_multimode column +gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) + +# Now get the max from multimode +gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) +print(gene_LF3) +#%% Revised Columns:IMPORTANT +#%% Multimode: dst column +#---------------------------- +# Revised dst column: Max +#---------------------------- +# Finally created a revised dst with the max from the multimode +gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() + +#%% Multimode: drtype +#============================= +# Recalculation: Revised drtype +# max(multimode) +#============================= +#-------------------------------- +# drtype: ALL values: +# numeric and names in an array +#-------------------------------- +gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric'] +gene_LF3['drtype_all_names'] = gene_LF3['drtype'] + +gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) +gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) + +#--------------------------------- +# Revised drtype: max(Multimode) +#-------------------------------- +gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode) +gene_LF3['drtype_multimode'] + +# Now get the max from multimode +gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3.head() + +#---------------------- +# Revised drtype: Max +#---------------------- +gene_LF3.head() +gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max() +gene_LF3.head() + +#%% Revised counts checks +gene_LF3['dst_mode'].value_counts() +gene_LF3[drug].value_counts() + +print('\n------------------------------------------------------' + , '\nRevised counting: mutation_info i.e dm om column' + , '\n-----------------------------------------------------' + + , '\n----------------------------------' + , '\nOriginal drug column count' + , '\n----------------------------------' + , gene_LF3[drug].value_counts() + , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum() + + , '\n----------------------------------' + , '\nRevised drug column count' + , '\n----------------------------------' + , gene_LF3['dst_mode'].value_counts() + , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum() + )