various changes

2020-09-08 17:13:02 +01:00 · 2020-09-08 17:13:02 +01:00 · e4608342a4
commit e4608342a4
parent c72269dcd1
3 changed files with 199 additions and 95 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -55,6 +55,7 @@ os.getcwd()
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
+from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@ -79,6 +80,21 @@ gene    = args.gene
 datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
+
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df)

 # OR merges: TEDIOUSSSS!!!!

-
+del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
+del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
+del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
 #%%
 print('==================================='
      , '\nFifth merge: afor_df + afor_kin_df'
@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape
 # finding if ALL afor_kin_df muts are present in afor_df
 # i.e all kinship muts should be PRESENT in mycalcs_present
 if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
-    print('PASS: ALL or_kinship muts are present in my or list')
+    print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
 else:
    nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
    nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]    
@ -246,10 +264,10 @@ else:

 # Now checking how many afor_df muts are NOT present in afor_kin_df    
 common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])  
-extra_muts_myor =   afor_kin_df.shape[0] - common_muts
+extra_muts_myor = afor_kin_df.shape[0] - common_muts

 print('=========================================='
-      , '\nmy or calcs has', extra_muts_myor, 'extra mutations'
+      , '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
      , '\n==========================================')

 print('Expected cals for merging with outer_join...')
@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...')
 expected_rows = afor_df.shape[0] + extra_muts_myor
 expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)

+
+afor_df['mutation']
+afor_kin_df['mutation']
+
 ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)

 if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
-    print('PASS: OR dfs successfully combined! PHEWWWW!')
+    print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
+          , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
 else:
    print('FAIL: could not combine OR dfs'
          , '\nCheck expected rows and cols calculation and join type')
@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape)

 ors_df[merging_cols_m5].apply(len)
 ors_df[merging_cols_m5].apply(len) == len(ors_df)
+
 #%%============================================================================
 # formatting ors_df
 ors_df.columns

 # Dropping unnecessary columns: already removed in earlier preprocessing
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
 cols_to_drop = ['n_miss']
 print('Dropping', len(cols_to_drop), 'columns:\n'
      , cols_to_drop)
@ -327,7 +350,6 @@ column_order = ['mutation'
                #, 'wt_3let' # old
                #, 'mt_3let' # old
                #, 'symbol'
-                #, 'n_miss'
                ]

 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
@ -343,6 +365,61 @@ else:

 print('\nResult of Sixth merge:', ors_df_ordered.shape
      , '\n===================================================================')
+#%%
+ors_df_ordered.shape
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populating 'nan' info
+lookup_dict = dict()
+for k, v in low_3letter_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #print(lookup_dict)
+    
+    wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
+    #print(wt)
+    ors_df_ordered['wild_type'] = wt.map(lookup_dict)
+    
+    ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) 
+    
+    mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
+    ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
+
+ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populate mut_info_f1
+ors_df_ordered['mut_info_f1'].isna().sum()
+ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
+ors_df_ordered['mut_info_f1'].isna().sum()  
+
+# populate mut_info_f2
+ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+
+# populate mut_type
+ors_df_ordered['mut_type'].isna().sum()
+#mut_type_word = ors_df_ordered['mut_type'].value_counts()
+mut_type_word  = 'missense' # FIXME, should be derived
+ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
+ors_df_ordered['mut_type'].isna().sum()
+
+# populate gene_id
+ors_df_ordered['gene_id'].isna().sum()
+#gene_id_word = ors_df_ordered['gene_id'].value_counts()
+gene_id_word  = 'Rv2043c' # FIXME, should be derived
+ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
+ors_df_ordered['gene_id'].isna().sum()
+
+# populate gene_name
+ors_df_ordered['gene_name'].isna().sum()
+ors_df_ordered['gene_name'].value_counts()
+ors_df_ordered['gene_name'].fillna(gene, inplace = True)
+ors_df_ordered['gene_name'].isna().sum()
+
+# check numbers
+ors_df_ordered['or_kin'].isna().sum()
+# should be 0
+ors_df_ordered['or_mychisq'].isna().sum()
+
 #%%============================================================================ 
 print('==================================='
      , '\nSixth merge: Fourth + Fifth merge'
@ -350,79 +427,94 @@ print('==================================='
      , '\n===================================')

 #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
-merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
+merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) 
+
+# dtype problems: remove 'position' from the merge keys so the key dtypes stay consistent
+if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6:
+    print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
+          , '\norig length of merging_cols_m6:', len(merging_cols_m6))
+    merging_cols_m6.remove('position')
+    print('\nlength after removing:', len(merging_cols_m6))
+
 print('Dim of df1:', combined_df.shape
      , '\nDim of df2:', ors_df_ordered.shape
      , '\nNo. of merging_cols:', len(merging_cols_m6))

 print('Checking mutations in the two dfs:'
-      , '\nmuts in df1 but NOT in df2:'
+      , '\nmuts in df1 present in df2:'
      , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
-      , '\nmuts in df2 but NOT in df1:'
+      , '\nmuts in df2 present in df1:'
      , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())

-#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
-
-combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
+#----------
+# merge 6
+#----------
+combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
 combined_df_all.shape

-# populate mut_info_f1
-combined_df_all['mut_info_f1'].isna().sum()
-combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type']
-combined_df_all['mut_info_f1'].isna().sum()  
-
-# populate mut_type
-combined_df_all['mut_type'].isna().sum()
-#mut_type_word = combined_df_all['mut_type'].value_counts()
-mut_type_word  = 'missense' # FIXME, should be derived
-combined_df_all['mut_type'].fillna(mut_type_word, inplace = True)
-combined_df_all['mut_type'].isna().sum()
-
-# populate gene_id
-combined_df_all['gene_id'].isna().sum()
-#gene_id_word = combined_df_all['gene_id'].value_counts()
-gene_id_word  = 'Rv2043c' # FIXME, should be derived
-combined_df_all['gene_id'].fillna(gene_id_word, inplace = True)
-combined_df_all['gene_id'].isna().sum()
-
-# populate gene_name
-combined_df_all['gene_name'].isna().sum()
-combined_df_all['gene_name'].value_counts()
-combined_df_all['gene_name'].fillna(gene, inplace = True)
-combined_df_all['gene_name'].isna().sum()
-
-
-# FIXME: DIM
-# only with left join!
-outdf_expected_rows = len(combined_df)
+# sanity check for merge 6
+outdf_expected_rows = len(combined_df) + extra_muts_myor
+unique_muts = len(combined_df)
 outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)

-#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:  
-if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
+if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
    print('PASS: Df dimension match'
-          , '\nDim of combined_df_all with join type:', l_join
+          , '\ncombined_df_all with join type:', o_join
          , '\n', combined_df_all.shape
          , '\n===============================================================')
 else:
    print('FAIL: Df dimension mismatch'
             , 'Cannot generate expected dim. See details of merge performed'
             , '\ndf1 dim:', combined_df.shape
-             , '\ndf2 dim:', ors_df.shape
+             , '\ndf2 dim:', ors_df_ordered.shape
             , '\nGot:', combined_df_all.shape
             , '\nmuts in df1 but NOT in df2:'
-             , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
+             , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
             , '\nmuts in df2 but NOT in df1:'
             , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
    sys.exit()
-    
+ 
+# drop extra cols: any of position_x/position_y that contains NAs, plus 'wild_type_kd'
+all_cols = combined_df_all.columns
+
+#pos_cols_check = combined_df_all[['position_x','position_y']]
+c = combined_df_all[['position_x','position_y']].isna().sum()
+pos_col_to_drop = c.index[c>0].to_list()
+cols_to_drop = pos_col_to_drop +  ['wild_type_kd']   
+
+print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
+combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) 
+ 
+# rename whichever of position_x/position_y has no NAs to 'position'
+pos_col_to_rename = c.index[c==0].to_list()
+combined_df_all.shape
+combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
+combined_df_all.shape
+
+all_cols = combined_df_all.columns
+
+#%% reorder cols for convenience
+first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
+last_cols = [col for col in combined_df_all.columns if col not in first_cols]
+
+df = combined_df_all[first_cols+last_cols]

 #%% IMPORTANT: check if mutation related info is all populated after this merge  
-# FIXME: should get fixed with JP's resolved dataset!?
-check_nan = combined_df_all.isna().sum(axis = 0)
+# select string column names to ensure no NAs exist there
+string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
+
+if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
+    print('PASS: All string cols are populated with no NAs')
+else:
+    print('FAIL: NAs detected in string cols')
+    print(combined_df_all[string_cols].isna().sum(axis = 0))
+    sys.exit()
+
 # relevant mut cols
 check_mut_cols = merging_cols_m5 + merging_cols_m6

 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
+print(check_mut_cols)

 if (count_na_mut_cols['na_count'].sum() > 0).any():
    # FIXME: static override, generate 'mutation' from variable
@ -434,31 +526,29 @@ else:
    print('No missing \'mutation\' has been detected!')
    
    
-lookup_dict = dict()
-for k, v in oneletter_aa_dict.items():
-    lookup_dict[k] = v['three_letter_code_lower']
-    print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
-    #print(wt_3let)
-    pos = combined_df_all['position'].astype(str)
-    #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
-    #print(mt_3let)
-    # override the 'mutation' column
-    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
-    print(combined_df_all['mutation'])    

-# populate mut_info_f2
-combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+col_to_drop = ['wild_type_kd']
+print('Dropping', len(col_to_drop), 'columns:\n'
+      , col_to_drop)
+combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
+
+

 #%% check
 #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
 #foo = combined_df_all[cols_check]
-
+foo = combined_df_all.drop_duplicates('mutationinformation')
+foo2 = combined_df_all.drop_duplicates('mutation')
+poo = combined_df_all[combined_df_all['mutation'].isna()]
 #%%============================================================================
 output_cols = combined_df_all.columns
 #print('Output cols:', output_cols)
-
+#%% report duplicates (informational only; no rows are dropped here)
+if combined_df_all.shape[0] != outdf_expected_rows:
+    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+else:
+    print('combined_df_all has no duplicate muts present')
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')