did all other mappings until dst column

2022-04-23 11:14:34 +01:00 · 2022-04-23 11:14:34 +01:00 · cb93cef3c7
commit cb93cef3c7
parent 7a10b4f223
1 changed files with 414 additions and 5 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -672,6 +672,15 @@ print(dict(Cdict))
 for k, v in Cdict.items():
    if k in common_snps_dr_other:
        print(k,v)   
 # convert defaultDict to dict
 SnpFDict_orig = dict(Cdict)
 def lower_dict(d):
   new_dict = dict((k.lower(), v) for k, v in d.items())
   return new_dict
 SnpFDict = lower_dict(SnpFDict_orig)
 ###############################################################################
 # USE Vcounts to get expected curated df
 # resolve dm om muts funda
@ -987,13 +996,16 @@ ambig_muts_rev_df.index
 gene_LF1.index
 all(ambig_muts_rev_df.index.isin(gene_LF1.index))
-gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
+#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
 gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
 gene_LF1['mutation_info_orig'].value_counts()
-gene_LF1['mutation_info_v1'].value_counts()
+#gene_LF1['mutation_info_v1'].value_counts()
 foo = gene_LF1.iloc[ambig_muts_rev_df.index]
 # Sanity check1: if there are still any ambiguous muts
-muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
+#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
 muts_split_rev = list(gene_LF1.groupby('mutation_info'))
 dr_muts_rev = muts_split_rev[0][1].mutation 
 other_muts_rev =  muts_split_rev[1][1].mutation
 print('splitting muts by mut_info:', muts_split_rev)
@ -1006,5 +1018,402 @@ else:
    print('\nAmbiguous muts NOT corrected. Quitting!')
    sys.exit()
-gene_LF1['mutation_info_v1'].value_counts()
+#gene_LF1['mutation_info_v1'].value_counts()
 gene_LF1['mutation_info'].value_counts()
 # reassign
 #%% PHEW! Good to go for downstream stuff
 #%% Add column: Mutationinformation
 # splitting mutation column to get mCSM style muts
 #=======================
 # Formatting df: read aa dict and pull relevant info
 #=======================
 print('Now some more formatting:'
      , '\nRead aa dict and pull relevant info'
      , '\nFormat mutations:'
      , '\nsplit mutation into mCSM style muts: '
      , '\nFormatting mutation in mCSM style format: {WT}<POS>{MUT}'
      , '\nAssign aa properties: adding 2 cols at a time for each prop'
      , '\n===================================================================')
 # BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
 # in each lookup cycle 
 ncol_mutf_add = 3 # mut split into 3 cols
 ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
 #===========
 # Split 'mutation' column into three:  wild_type, position and
 # mutant_type separately. Then map three letter code to one using
 # reference_dict imported at the beginning.
 # After importing, convert to mutation to lowercase for compatibility with dict 
 #===========
 gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
 print('wt regex being used:', wt_regex
      , '\nmut regex being used:', mut_regex
      , '\nposition regex being used:', pos_regex)
 mylen0 = len(gene_LF1.columns)
 #=======
 # Iterate through the dict, create a lookup dict i.e
 # lookup_dict = {three_letter_code: one_letter_code}.
 # lookup dict should be the key and the value (you want to create a column for)
 # Then use this to perform the mapping separetly for wild type and mutant cols.
 # The three letter code is extracted using a string match match from the dataframe and then converted
 # to 'pandas series'since map only works in pandas series
 #=======
 print('Adding', ncol_mutf_add, 'more cols:\n')
 # initialise a sub dict that is lookup dict for three letter code to 1-letter code
 # adding three more cols
 lookup_dict = dict()
 for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']
    #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on
    wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
    gene_LF1['wild_type'] = wt.map(lookup_dict)   
    #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
    mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
    gene_LF1['mutant_type'] = mut.map(lookup_dict)
 # extract position info from mutation column separetly using string match
 #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
 gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
 mylen1 = len(gene_LF1.columns)
 # sanity checks
 print('checking if 3-letter wt&mut residue extraction worked correctly')
 if wt.isna().sum() & mut.isna().sum() == 0:
   print('PASS: 3-letter wt & mut residue extraction worked correctly:'
         , '\nNo NAs detected:'
         , '\nwild-type\n', wt
         , '\nmutant-type\n', mut
         , '\ndim of df:', gene_LF1.shape)
 else:
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
 if mylen1 ==  mylen0 + ncol_mutf_add:
    print('PASS: successfully added', ncol_mutf_add, 'cols'
      , '\nold length:', mylen0
      , '\nnew len:', mylen1)
 else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen0
          , '\nnew len:', mylen1)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
 ########
 # combine the wild_type+poistion+mutant_type columns to generate 
 # mutationinformation (matches mCSM output field)
 # Remember to use .map(str) for int col types to allow string concatenation
 #########
 gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
 print('Created column: mutationinformation'
 	, '\n=====================================================================\n'
    , gene_LF1.mutationinformation.head(10))
 #order by position for convenience
 gene_LF1.dtypes
 # converting position to numeric
 gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
 # sort by position inplace 
 foo = gene_LF1['position'].value_counts()
 foo
 gene_LF1.sort_values(by = ['position'], inplace = True)
 bar = gene_LF1['position'].value_counts()
 # FIXME:Can only compare identically-labeled Series objects
 if (foo == bar).all():
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
 else:
    print('FAIL: df could not be ordered. Check source')
    sys.exit()
 #%% Create a copy of mutationinformation column for downstream mergeing
 gene_LF1['Mut']      = gene_LF1['mutationinformation'] 
 gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] 
 #%% Create a copy of indices for downstream mergeing
 gene_LF1['index_orig']      = gene_LF1.index
 gene_LF1['index_orig_copy'] = gene_LF1.index
 all(gene_LF1.index.values == gene_LF1['index_orig'].values)
 all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
 #%% quick sanity check for position freq count
 # count the freq of 'other muts' samples
 test_df = gene_LF1.copy()
 test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
 # add freq column
 #test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
 #print('Revised dim of other_muts_df:',test_df.shape) 
 test_df['scount'] = test_df['mutation'].map(SnpFDict)
 #%% Map mutation frequency count as a column
 gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
 #%% Map position frequency count as a column
 z =  gene_LF1['position'].value_counts()
 z1 = z.to_dict()
 gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
 #test_df2 = test_df.loc[test_df['position'] == 10] 
 #%% OUTFILE 4, write file mCSM style muts
 snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
 snps_only.head()
 # assign column name
 snps_only.columns = ['mutationinformation']
 # count how many positions this corresponds to
 pos_only = pd.DataFrame(gene_LF1['position'].unique()) 
 pos_only
 print('Checking NA in snps...')# should be 0
 if snps_only.mutationinformation.isna().sum() == 0:
    print ('PASS: NO NAs/missing entries for SNPs'
    , '\n===============================================================')
 else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
 # write file: mCSM muts
 out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
 outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
 print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
      , '\n=============================================================')
 snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
 print('Finished writing:', outfile_mcsmsnps
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
 del(out_filename_mcsmsnps)
 #%% OUTFILE 5, write file frequency of position counts:  MOVE TO THE END
 metadata_pos = pd.DataFrame(gene_LF1['position'])
 metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
 metadata_pos['meta_pos_count'].value_counts()
 metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
 out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
 outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
 print('\n----------------------------------'
      , '\nWriting file: Metadata poscounts'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')
 metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
 print('Finished writing:', outfile_metadata_poscounts
      , '\nNo. of rows:', len(metadata_pos)
      , '\nNo. of cols:', len(metadata_pos.columns)
      , '\n=============================================================')
 del(out_filename_metadata_poscounts)
 #%% OUTFILE 6, write file <gene>_metadata:  MOVE TO THE END
 # ---THINK ---
 #%% OUTFILE 7, write file MSA plots:  MOVE TO THE END
 # -- THINK ---
 #%% OUTFILE 8, write file mutational position with count:  MOVE TO THE END
 # -- THINK ---
 #%% Add column: aa property_water
 #%% Add column: aa_prop_polarity
 #%% Add column: aa_calcprop
 #%% NEW mappings: gene_LF2
 # gene_LF2: copy gene_LF1
 gene_LF2 = gene_LF1.copy()
 #%% Add total unique id count
 gene_LF2['id'].nunique()
 gene_LF2['mutationinformation'].nunique()
 total_id_ucount = gene_LF2['id'].nunique()
 total_id_ucount
 total_id_ucount2 = gene_LF2['sample'].nunique()
 total_id_ucount2
 if total_id_ucount == total_id_ucount2:
    print('\nPASS: sample and id unique counts match')
 else:
    print('\nFAIL: sample and id unique counts DO NOT match!'
          , '\nGWAS worry!?')
 # Add all sample ids in a list for sanity checks
 #gene_LF2['id_list']  = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
 #==========================================
 # Add column: total unique id/sample count
 #==========================================
 gene_LF2['total_id_ucount'] = total_id_ucount
 #==========================================
 # DELETE as already mapped: Add column: mutation count in all samples 
 #==========================================
 # gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
 # gene_LF2['mut_id_ucount']
 # gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
 #%% AF for gene
 #=================
 # Add column: AF
 #=================
 gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
 gene_LF2['maf'].head()
 #%% Mapping 1: column '<drug>', mutation_info
 gene_LF2['mutation_info'].value_counts()
 gene_LF2['mutation_info_v1'].value_counts()
 gene_LF2['mutation_info_orig'].value_counts()
 #=======================
 # column name: <drug>
 #=======================
 # mapping 1.1: labels
 dm_om_label_map = {dr_muts_col: 'DM'
             , other_muts_col: 'OM'}
 dm_om_label_map
 gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
 # mapping 1.2: numeric
 dm_om_num_map = {dr_muts_col: 1
              , other_muts_col: 0}
 gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
 gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
 gene_LF2['mutation_info'].value_counts()
 gene_LF2['mutation_info_labels'].value_counts()
 gene_LF2['mutation_info_orig'].value_counts()
 gene_LF2['dm_om_numeric'].value_counts()
 gene_LF2['dm_om_numeric_orig'].value_counts()
 #%% Mapping 2: column '<drtype>'
 #============================
 # column name: <drtype>
 #============================
 gene_LF2['drtype'].value_counts()
 # mapping 2.1: numeric
 drtype_map = {'XDR': 5
              , 'Pre-XDR': 4
              , 'MDR': 3
              , 'Pre-MDR': 2
              , 'Other': 1
              , 'Sensitive': 0}
 gene_LF2['drtype_numeric']  = gene_LF2['drtype'].map(drtype_map)
 gene_LF2['drtype'].value_counts()
 gene_LF2['drtype_numeric'].value_counts()
 #%% Mapping 3: column '<dst>', drug
 #============================
 # column name: <dst>
 #============================
 # copy dst column
 gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
 gene_LF2['dst'].equals(gene_LF2[drug])
 gene_LF2['dst_multimode'] = gene_LF2[drug]
 gene_LF2[drug].isnull().sum()
 gene_LF2['dst_multimode'].isnull().sum()
 #%% Further mappings: gene_LF3
 gene_LF3 = gene_LF2.copy()
 gene_LF3.index
 gene_LF3 = gene_LF3.set_index(['Mut'])
 gene_LF3.index
 gene_LF3['dst_multimode'].value_counts()
 gene_LF3['dst_multimode'].value_counts().sum()
 #%% Multimode: dst
 # For each mutation, generate the revised dst which is the mode of dm_om_numeric
 #=============================
 # Recalculation: Revised dst
 # max(multimode)
 #=============================
 # Get multimode for dm_om_numeric column
 #dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
 dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
 dm_om_multimode_LF3
 # Fill using multimode ONLY where NA in dst_multimode column
 gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
 # Now get the max from multimode
 gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
 print(gene_LF3)
 #%% Revised Columns:IMPORTANT
 #%% Multimode: dst column
 #----------------------------
 # Revised dst column: Max
 #----------------------------
 # Finally created a revised dst with the max from the multimode
 gene_LF3['dst_mode']  = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
 #%% Multimode: drtype
 #=============================
 # Recalculation: Revised drtype
 # max(multimode)
 #=============================
 #--------------------------------
 # drtype: ALL values:
 # numeric and names in an array 
 #--------------------------------
 gene_LF3['drtype_all_vals']  = gene_LF3['drtype_numeric']
 gene_LF3['drtype_all_names'] = gene_LF3['drtype']
 gene_LF3['drtype_all_vals']  = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
 gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
 #---------------------------------
 # Revised drtype: max(Multimode)
 #--------------------------------
 gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
 gene_LF3['drtype_multimode']
 # Now get the max from multimode
 gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
 gene_LF3.head()
 #----------------------
 # Revised drtype: Max
 #----------------------
 gene_LF3.head()
 gene_LF3['drtype_max'] =  gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
 gene_LF3.head()
 #%% Revised counts checks
 gene_LF3['dst_mode'].value_counts()
 gene_LF3[drug].value_counts()
 print('\n------------------------------------------------------'
      , '\nRevised counting: mutation_info i.e dm om column'
      , '\n-----------------------------------------------------'
      , '\n----------------------------------'
      , '\nOriginal drug column count'
      , '\n----------------------------------'
      ,  gene_LF3[drug].value_counts()
      , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
      , '\n----------------------------------'
      , '\nRevised drug column count'
      , '\n----------------------------------'
      , gene_LF3['dst_mode'].value_counts()
      , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
      )