NOTE(review): this patch was recovered from a copy in which all line breaks
and leading whitespace had been collapsed. The diff markers and the text of
every line are unchanged, but indentation and the position of the blank
context lines are reconstructed (placed so the hunk satisfies its declared
42 old / 37 new line counts). Confirm both against the repository before
applying.

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 50ff6ee..622e8e5 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -427,42 +427,37 @@ count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().r
 if (count_na_mut_cols['na_count'].sum() > 0).any():
     # FIXME: static override, generate 'mutation' from variable
     na_muts_n = combined_df_all['mutation'].isna().sum()
-    baz = combined_df_all[combined_df_all['mutation'].isna()]
-    baz = baz[check_mut_cols]
+    #baz = combined_df_all[combined_df_all['mutation'].isna()]
     print(na_muts_n, 'mutations have missing \'mutation\' info.'
           , '\nFetching these from reference dict...')
+else:
+    print('No missing \'mutation\' has been detected!')
+
 
 lookup_dict = dict()
 for k, v in oneletter_aa_dict.items():
     lookup_dict[k] = v['three_letter_code_lower']
 print(lookup_dict)
-wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize()
+wt_3let = combined_df_all['wild_type'].map(lookup_dict)
 #print(wt_3let)
 pos = combined_df_all['position'].astype(str)
 #print(pos)
-mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize()
+mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
 #print(mt_3let)
-baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+# override the 'mutation' column
+combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
 print(combined_df_all['mutation'])
 
 # populate mut_info_f2
 combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
 
-#%% merge
-#merging_cols_m7 = detect_common_cols(combined_df_all, baz)
-
-baz2 = baz[['mutationinformation', 'mut_info_f2']]
-baz2 = baz2.drop_duplicates()
-merging_cols_m7 = detect_common_cols(combined_df_all, baz2)
-
-combined_df_all2 = pd.merge(combined_df_all, baz2
-                            #, on = merging_cols_m7
-                            , on = 'mutationinformation'
-                            , how = o_join)
+#%% check
+#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
+#foo = combined_df_all[cols_check]
 
 #%%============================================================================
 output_cols = combined_df_all.columns
-print('Output cols:', output_cols)
+#print('Output cols:', output_cols)
 
 #%%============================================================================
 # write csv