updated data extraction to ensure genes without common mutations and duplicate indices can run from the cmd

2022-05-05 13:35:24 +01:00 · 2022-05-05 13:35:24 +01:00 · e54ae877a8
commit e54ae877a8
parent 5429b8fed7
1 changed files with 170 additions and 142 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -376,7 +376,7 @@ print('===========================================================\n'
 search = ";"
 
 # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
-count_df_dr = meta_gene_dr[['id', dr_muts_col]]
+count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy()
 count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I)
 dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index()
 dr_sc_C 
@ -502,7 +502,7 @@ if other_muts_col in dr_df.columns:
 #%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc.
 # Split based on semi colon on other_muts_col
 # count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence
-count_df_other = meta_gene_other[['id', other_muts_col]]
+count_df_other = meta_gene_other[['id', other_muts_col]].copy()
 count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I)
 other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index()
 other_sc_C
@ -790,7 +790,8 @@ print('Length of gene_LF0:', len(gene_LF0)
      , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
      , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)

-gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]   
+#gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]
+gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy()

 if len(gene_LF0) == len(gene_LF1):
    print('PASS: length of gene_LF0 and gene_LF1 match',
@ -859,8 +860,7 @@ other_muts =  muts_split[1][1].mutation
 print('splitting muts by mut_info:', muts_split)
 print('no.of dr_muts samples:', len(dr_muts))
 print('no. of other_muts samples', len(other_muts))
-#%% Ambiguous muts
-# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
+#%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others'
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
          , '\n===============================================================')
@ -894,139 +894,166 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
        print('\n===========================================================') 
 else:
    #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
-    print('No: ambiguous muts present')
-         
-#%% Ambiguous muts: revised annotation for mutation_info
-ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()    
-ambiguous_muts_value_counts
+    print('Ambiguous muts are NOT present')

+#%% DOES NOT depend on common_muts
 gene_LF1_orig = gene_LF1.copy()
 gene_LF1_orig.equals(gene_LF1)

 # copy the old columns for checking
-gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] 
+gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
 gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] 
 gene_LF1['mutation_info'].value_counts()
-#%% Inspect ambiguous muts
-#=====================================
-# Now create a new df that will have:
-    # ambiguous muts
-    # mutation_info
-    # revised mutation_info
-# The revised label is based on value_counts
-# for mutaiton_info. The corresponding mutation_info:
-# label is chosen that corresponds to the max of value counts
-#=====================================
-ambig_muts_rev_df = pd.DataFrame()
-changes_val = []
-changes_dict = {}

-##BROKENNNN!!!!
-common_muts
-gene_LF1['mutation'].head()
-#common_muts_lower =  list((map(lambda x: x.lower(), common_muts)))
-#common_muts_lower
-
-for i in common_muts:
-#for i in common_muts_lower:
-    #print(i)
-    temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
-    temp_df
-  # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
-    max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
-    max_info_table
-    revised_label = max_info_table[[0]].index[0][1] # max value_count
-    old_label     = max_info_table[[1]].index[0][1] # min value_count 
-    print('\nAmbiguous mutation labels...'
-          , '\nSetting mutation_info for', i, 'to', revised_label)
-    temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
-                      , revised_label
-                      , temp_df['mutation_info_orig'])
-    ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
-    f = max_info_table[[1]]
-    old_label_count = f[[0]][0]
-    changes_val.append(old_label_count) 
-    cc_dict = f.to_dict()
-    changes_dict.update(cc_dict)
-                      
-ambig_muts_rev_df['mutation_info_REV'].value_counts() 
-ambig_muts_rev_df['mutation_info_orig'].value_counts() 
-changes_val
-changes_total = sum(changes_val)
-changes_dict
-#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
-#==================
-# ambiguous muts
-#==================
-#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
-#other_muts.XXXX_csvXXX('other_muts.csv', header = True)
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
-    outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
-    print('\n----------------------------------'
-          , '\nWriting file: ambiguous muts'
-          , '\n----------------------------------'
-          , '\nFilename:', outfile_ambig_muts)
-    inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-    inspect.to_csv(outfile_ambig_muts, index = True)
-
-    print('Finished writing:', out_filename_ambig_muts
-          , '\nNo. of rows:', len(inspect)
-          , '\nNo. of cols:', len(inspect.columns)
-          , '\nNo. of rows = no. of samples with the ambiguous muts present:'
-          , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
-          , '\n=============================================================')
-    del(out_filename_ambig_muts)
+#%% Ambiguous muts: revised annotation for mutation_info
+if 'common_muts' in globals():
+    ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
+    ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()    
+    ambiguous_muts_value_counts
    
-#%% OUTFILE 2, write file: ambiguous mut counts    
-#======================
-# ambiguous mut counts
-#======================
-out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
-outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
-print('\n----------------------------------'
-       , '\nWriting file: ambiguous mut counts'
-       , '\n----------------------------------'
-       , '\nFilename:', outfile_ambig_mut_counts)
+    # gene_LF1_orig = gene_LF1.copy()
+    # gene_LF1_orig.equals(gene_LF1)
+    
+    # # copy the old columns for checking
+    # gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] 
+    # gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] 
+    # gene_LF1['mutation_info'].value_counts()
+    #%% Inspect ambiguous muts
+    #=====================================
+    # Now create a new df that will have:
+        # ambiguous muts
+        # mutation_info
+        # revised mutation_info
+    # The revised label is based on value_counts
+    # for mutaiton_info. The corresponding mutation_info:
+    # label is chosen that corresponds to the max of value counts
+    #=====================================
+    ambig_muts_rev_df = pd.DataFrame()
+    changes_val = []
+    changes_dict = {}
+    
+    common_muts
+    gene_LF1['mutation'].head()
+    #common_muts_lower =  list((map(lambda x: x.lower(), common_muts)))
+    #common_muts_lower
+    
+    for i in common_muts:
+    #for i in common_muts_lower:
+        #print(i)
+        temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
+        temp_df
+      # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
+        max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
+        max_info_table
+        revised_label = max_info_table[[0]].index[0][1] # max value_count
+        old_label     = max_info_table[[1]].index[0][1] # min value_count 
+        print('\nAmbiguous mutation labels...'
+              , '\nSetting mutation_info for', i, 'to', revised_label)
+        temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
+                          , revised_label
+                          , temp_df['mutation_info_orig'])
+        ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
+        f = max_info_table[[1]]
+        old_label_count = f[[0]][0]
+        changes_val.append(old_label_count) 
+        cc_dict = f.to_dict()
+        changes_dict.update(cc_dict)
+                          
+    ambig_muts_rev_df['mutation_info_REV'].value_counts() 
+    ambig_muts_rev_df['mutation_info_orig'].value_counts() 
+    changes_val
+    changes_total = sum(changes_val)
+    changes_dict
+    n_changes = sum(changes_dict.values())
+    #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
+    #==================
+    # ambiguous muts
+    #==================
+    #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
+    #other_muts.XXXX_csvXXX('other_muts.csv', header = True)
+    if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+        out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
+        outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
+        print('\n----------------------------------'
+              , '\nWriting file: ambiguous muts'
+              , '\n----------------------------------'
+              , '\nFilename:', outfile_ambig_muts)
+        inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
+        inspect.to_csv(outfile_ambig_muts, index = True)
+    
+        print('Finished writing:', out_filename_ambig_muts
+              , '\nNo. of rows:', len(inspect)
+              , '\nNo. of cols:', len(inspect.columns)
+              , '\nNo. of rows = no. of samples with the ambiguous muts present:'
+              , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
+              , '\n=============================================================')
+        del(out_filename_ambig_muts)
+        
+    #%% OUTFILE 2, write file: ambiguous mut counts    
+    #======================
+    # ambiguous mut counts
+    #======================
+    out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
+    outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
+    print('\n----------------------------------'
+           , '\nWriting file: ambiguous mut counts'
+           , '\n----------------------------------'
+           , '\nFilename:', outfile_ambig_mut_counts)
+    
+    ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
+    #%% FIXME: Add sanity check to make sure you can add value_count checks
+    #%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
+    #=================
+    # Merge ambig muts
+    # with gene_LF1
+    #===================
+    ambig_muts_rev_df.index
+    gene_LF1.index
+    all(ambig_muts_rev_df.index.isin(gene_LF1.index))
+    
+    any(gene_LF1.index.isin(ambig_muts_rev_df.index))
+   # if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
+    if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()):

-ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
-#%% FIXME: Add sanity check to make sure you can add value_count checks
-#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
-#=================
-# Merge ambig muts
-# with gene_LF1
-#===================
-ambig_muts_rev_df.index
-gene_LF1.index
-all(ambig_muts_rev_df.index.isin(gene_LF1.index))
+        print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
+    else:
+        sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
+    
+    ##########################################################################
+    for index, row in ambig_muts_rev_df.iterrows():
+        curr_mut = row['mutation']
+        curr_rev = row['mutation_info_REV']
+        print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' )
+        print('\n-----\nReplacing original: index:', index, '\nmutation: '
+              , gene_LF1.loc[index,'mutation']
+              , '\nmutation_info to replace:'
+              , gene_LF1.loc[index,'mutation_info']
+              , '\nwith:', curr_rev, '\n-----')
+        replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut)
+        gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev 
+        
+    ###########################################################################
+     
+    gene_LF1['mutation_info_orig'].value_counts()
+    gene_LF1['mutation_info_v1'].value_counts()
+    gene_LF1['mutation_info'].value_counts()

-any(gene_LF1.index.isin(ambig_muts_rev_df.index))
-if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
-    print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
+    # Sanity check1: if there are still any ambiguous muts
+    #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
+    muts_split_rev = list(gene_LF1.groupby('mutation_info'))
+    dr_muts_rev = muts_split_rev[0][1].mutation 
+    other_muts_rev =  muts_split_rev[1][1].mutation
+    print('splitting muts by mut_info:', muts_split_rev)
+    print('no.of dr_muts samples:', len(dr_muts_rev))
+    print('no. of other_muts samples', len(other_muts_rev))  
+    
+    if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
+        print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
+    else:
+        print('\nAmbiguous muts NOT corrected. Quitting!')
+        sys.exit()
 else:
-    sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
-
-#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
-gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
-
-gene_LF1['mutation_info_orig'].value_counts()
-gene_LF1['mutation_info_v1'].value_counts()
-
-# Sanity check1: if there are still any ambiguous muts
-#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
-muts_split_rev = list(gene_LF1.groupby('mutation_info'))
-dr_muts_rev = muts_split_rev[0][1].mutation 
-other_muts_rev =  muts_split_rev[1][1].mutation
-print('splitting muts by mut_info:', muts_split_rev)
-print('no.of dr_muts samples:', len(dr_muts_rev))
-print('no. of other_muts samples', len(other_muts_rev))  
-
-if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
-    print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
-else:
-    print('\nAmbiguous muts NOT corrected. Quitting!')
-    sys.exit()
+    print('Mutations ARE NOT ambiguous, proceeding to downstream analyses')

 #gene_LF1['mutation_info_v1'].value_counts()
 gene_LF1['mutation_info'].value_counts()
@ -1649,7 +1676,7 @@ lf_lin_split['lineage_corrupt'].value_counts()
 search = ";"
 
 # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
-count_df_lin = gene_LF3_ColsSel[['lineage']]
+count_df_lin = gene_LF3_ColsSel[['lineage']].copy()
 count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
 lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
 lin_sc_C 
@ -1708,7 +1735,7 @@ lf_lin_split['lineage_multimode'].value_counts()
 #%% Select only the columns you want to merge from lf_lin_split 
 lf_lin_split.columns
 lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
-                                    , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']]
+                                    , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']].copy()
 lf_lin_split_ColSel.columns

 lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list'     : 'lineage_list_all'
@ -1951,28 +1978,29 @@ print('\n============================================'
 	  , '\n'
     
      , '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
-      
-      , '\nTotal no.of unique dr muts:'    , dr_muts_rev.nunique() 
-      , '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
-      , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
-     
      , '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
      , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
      , '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
      , '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
      )
-
-if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('\nTotal no.of samples with ambiguous muts:', len(inspect)
-          #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
-          , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
-          , '\n============================================================='
-          , '\nPost resolving ambiguity\n'
-          , ambig_muts_rev_df['mutation_info_REV'].value_counts())
+if 'common_muts' in globals():
+    print('\nTotal no.of unique dr muts:'    , dr_muts_rev.nunique() 
+      , '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
+      , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
+      )
+    if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+        print('\nTotal no.of samples with ambiguous muts:', len(inspect)
+              #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
+              , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
+              , '\n============================================================='
+              , '\nPost resolving ambiguity\n'
+              , ambig_muts_rev_df['mutation_info_REV'].value_counts())
+else:
+    print('No ambiguous muts present, hence no summary')
    
 print('\n============================================================='
      , '\n============================================================='
-      , '\###############################\n'
+      , '\n###############################\n'
      , '\nNumbers for ML workflows...'
      , '\n###############################\n'