hopefully finally sorted data merges!

2020-09-08 17:46:52 +01:00 · 2020-09-08 17:46:52 +01:00 · f7ab799f74
commit f7ab799f74
parent e4608342a4
1 changed file with 46 additions and 32 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -73,7 +73,6 @@ args = arg_parser.parse_args()
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
-gene_match = gene + '_p.'

 drug    = args.drug
 gene    = args.gene
@@ -312,18 +311,13 @@ column_order = ['mutation'
                , 'wild_type'               
                , 'position'
                , 'mutant_type'                              
-                #, 'chr_num_allele' #old
                , 'ref_allele'
                , 'alt_allele'
                , 'mut_info_f1'
                , 'mut_info_f2'
                , 'mut_type'
                , 'gene_id'
-                #, 'gene_number' #old
                , 'gene_name'
-                #, 'mut_region'                
-                #, 'reference_allele'
-                #, 'alternate_allele'
                , 'chromosome_number'                
                , 'af'
                , 'af_kin'               
@@ -346,11 +340,7 @@ column_order = ['mutation'
                , 'se_kin'                            
                , 'zval_logistic'
                , 'logl_h1_kin'
-                , 'l_remle_kin'
-                #, 'wt_3let' # old
-                #, 'mt_3let' # old
-                #, 'symbol'
-                ]
+                , 'l_remle_kin']

 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
    print('PASS: Column order generated for all:', len(column_order), 'columns'
@@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6
 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
 print(check_mut_cols)

-if (count_na_mut_cols['na_count'].sum() > 0).any():
-    # FIXME: static override, generate 'mutation' from variable
-    na_muts_n = combined_df_all['mutation'].isna().sum() 
-    #baz = combined_df_all[combined_df_all['mutation'].isna()]
-    print(na_muts_n, 'mutations have missing \'mutation\' info.'
+c2 = combined_df_all[check_mut_cols].isna().sum()
+missing_info_cols = c2.index[c2>0].to_list()
+
+if c2.sum()>0:
+    #na_muts_n = combined_df_all['mutation'].isna().sum() 
+    na_muts_n = combined_df_all[missing_info_cols].isna().sum() 
+    print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
          , '\nFetching these from reference dict...')
 else:
    print('No missing \'mutation\' has been detected!')
    
+lookup_dict = dict()
+for k, v in oneletter_aa_dict.items():
+    lookup_dict[k] = v['three_letter_code_lower']
+    print(lookup_dict)
+    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
+    #print(wt_3let)
+    pos = combined_df_all['position'].astype(str)
+    #print(pos)
+    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
+    #print(mt_3let)
+    # override the 'mutation' column
+    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+    print(combined_df_all['mutation'])    

-
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
-col_to_drop = ['wild_type_kd']
-print('Dropping', len(col_to_drop), 'columns:\n'
-      , col_to_drop)
-combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
-
-
+# check again
+if combined_df_all[missing_info_cols].isna().sum().all() == 0: 
+    print('PASS: No mutations have missing \'mutation\' info.')
+else:
+    print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
+          , '\nmutations have missing info STILL...')
+    sys.exit()

 #%% check
-#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
-#foo = combined_df_all[cols_check]
 foo = combined_df_all.drop_duplicates('mutationinformation')
 foo2 = combined_df_all.drop_duplicates('mutation')
-poo = combined_df_all[combined_df_all['mutation'].isna()]
+if foo.equals(foo2):
+    print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
+else:
+    print('FAIL: Still problems in merged data')
+    sys.exit()
+
 #%%============================================================================
 output_cols = combined_df_all.columns
-#print('Output cols:', output_cols)
-#%% drop duplicates
-if combined_df_all.shape[0] != outdf_expected_rows:
-    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+
+#%% IMPORTANT result info
+if combined_df_all.shape[0] == outdf_expected_rows:
+    print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
+          , '\n=============================================================')
 else:
-    print('combined_df_all has no duplicate muts present')
+    print('combined_df_all has no duplicate muts present'
+          ,'\n===============================================================')
+    
+print('\nDim of combined_data:', combined_df_all.shape
+      , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
+    
+
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')