adding missing mutation col in combining_dfs

This commit is contained in:
Tanushree Tunstall 2020-09-04 21:04:18 +01:00
parent c48c5177ca
commit 5d0e2d94ce

View file

@@ -427,42 +427,37 @@ count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().r
if (count_na_mut_cols['na_count'].sum() > 0).any(): if (count_na_mut_cols['na_count'].sum() > 0).any():
# FIXME: static override, generate 'mutation' from variable # FIXME: static override, generate 'mutation' from variable
na_muts_n = combined_df_all['mutation'].isna().sum() na_muts_n = combined_df_all['mutation'].isna().sum()
baz = combined_df_all[combined_df_all['mutation'].isna()] #baz = combined_df_all[combined_df_all['mutation'].isna()]
baz = baz[check_mut_cols]
print(na_muts_n, 'mutations have missing \'mutation\' info.' print(na_muts_n, 'mutations have missing \'mutation\' info.'
, '\nFetching these from reference dict...') , '\nFetching these from reference dict...')
else:
print('No missing \'mutation\' has been detected!')
lookup_dict = dict() lookup_dict = dict()
for k, v in oneletter_aa_dict.items(): for k, v in oneletter_aa_dict.items():
lookup_dict[k] = v['three_letter_code_lower'] lookup_dict[k] = v['three_letter_code_lower']
print(lookup_dict) print(lookup_dict)
wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize() wt_3let = combined_df_all['wild_type'].map(lookup_dict)
#print(wt_3let) #print(wt_3let)
pos = combined_df_all['position'].astype(str) pos = combined_df_all['position'].astype(str)
#print(pos) #print(pos)
mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize() mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
#print(mt_3let) #print(mt_3let)
baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let # override the 'mutation' column
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
print(combined_df_all['mutation']) print(combined_df_all['mutation'])
# populate mut_info_f2 # populate mut_info_f2
combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
#%% merge #%% check
#merging_cols_m7 = detect_common_cols(combined_df_all, baz) #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
#foo = combined_df_all[cols_check]
baz2 = baz[['mutationinformation', 'mut_info_f2']]
baz2 = baz2.drop_duplicates()
merging_cols_m7 = detect_common_cols(combined_df_all, baz2)
combined_df_all2 = pd.merge(combined_df_all, baz2
#, on = merging_cols_m7
, on = 'mutationinformation'
, how = o_join)
#%%============================================================================ #%%============================================================================
output_cols = combined_df_all.columns output_cols = combined_df_all.columns
print('Output cols:', output_cols) #print('Output cols:', output_cols)
#%%============================================================================ #%%============================================================================
# write csv # write csv