adding missing mutation col in combining_dfs

2020-09-04 21:04:18 +01:00 · 2020-09-04 21:04:18 +01:00 · 5d0e2d94ce
commit 5d0e2d94ce
parent c48c5177ca
1 changed file with 12 additions and 17 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -427,42 +427,37 @@ count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().r
 if (count_na_mut_cols['na_count'].sum() > 0).any():
    # FIXME: static override, generate 'mutation' from variable
    na_muts_n = combined_df_all['mutation'].isna().sum() 
-    baz = combined_df_all[combined_df_all['mutation'].isna()]
+    #baz = combined_df_all[combined_df_all['mutation'].isna()]
    baz = baz[check_mut_cols]
    print(na_muts_n, 'mutations have missing \'mutation\' info.'
          , '\nFetching these from reference dict...')
 else:
    print('No missing \'mutation\' has been detected!')
 lookup_dict = dict()
 for k, v in oneletter_aa_dict.items():
    lookup_dict[k] = v['three_letter_code_lower']
    print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize()
+    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
    #print(wt_3let)
    pos = combined_df_all['position'].astype(str)
    #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize()
+    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
    #print(mt_3let)
-    baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+    # override the 'mutation' column
    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
    print(combined_df_all['mutation'])    
 # populate mut_info_f2
 combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
-#%% merge
+#%% check
-#merging_cols_m7 = detect_common_cols(combined_df_all, baz)
+#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
-
+#foo = combined_df_all[cols_check]
 baz2 = baz[['mutationinformation', 'mut_info_f2']]
 baz2 = baz2.drop_duplicates()
 merging_cols_m7 = detect_common_cols(combined_df_all, baz2)
 combined_df_all2 = pd.merge(combined_df_all, baz2
                           #, on = merging_cols_m7
                            , on = 'mutationinformation'
                            , how = o_join)
 #%%============================================================================
 output_cols = combined_df_all.columns
-print('Output cols:', output_cols)
+#print('Output cols:', output_cols)
 #%%============================================================================ 
 # write csv