From 42986bb119da58c74f67c4175c2164502ecaa3ac Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 17:46:52 +0100 Subject: [PATCH] hopefully finally sorted data merges! --- scripts/combining_dfs.py | 78 +++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index be62177..eab2808 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -73,7 +73,6 @@ args = arg_parser.parse_args() #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' -gene_match = gene + '_p.' drug = args.drug gene = args.gene @@ -312,18 +311,13 @@ column_order = ['mutation' , 'wild_type' , 'position' , 'mutant_type' - #, 'chr_num_allele' #old , 'ref_allele' , 'alt_allele' , 'mut_info_f1' , 'mut_info_f2' , 'mut_type' , 'gene_id' - #, 'gene_number' #old , 'gene_name' - #, 'mut_region' - #, 'reference_allele' - #, 'alternate_allele' , 'chromosome_number' , 'af' , 'af_kin' @@ -346,11 +340,7 @@ column_order = ['mutation' , 'se_kin' , 'zval_logistic' , 'logl_h1_kin' - , 'l_remle_kin' - #, 'wt_3let' # old - #, 'mt_3let' # old - #, 'symbol' - ] + , 'l_remle_kin'] if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): print('PASS: Column order generated for all:', len(column_order), 'columns' @@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) print(check_mut_cols) -if (count_na_mut_cols['na_count'].sum() > 0).any(): - # FIXME: static override, generate 'mutation' from variable - na_muts_n = combined_df_all['mutation'].isna().sum() - #baz = combined_df_all[combined_df_all['mutation'].isna()] - print(na_muts_n, 'mutations have missing \'mutation\' info.' +c2 = combined_df_all[check_mut_cols].isna().sum() +missing_info_cols = c2.index[c2>0].to_list() + +if c2.sum()>0: + #na_muts_n = combined_df_all['mutation'].isna().sum() + na_muts_n = combined_df_all[missing_info_cols].isna().sum() + print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.' , '\nFetching these from reference dict...') else: print('No missing \'mutation\' has been detected!') - - -#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] -col_to_drop = ['wild_type_kd'] -print('Dropping', len(col_to_drop), 'columns:\n' - , col_to_drop) -combined_df_all.drop(col_to_drop, axis = 1, inplace = True) - +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + print(lookup_dict) + wt_3let = combined_df_all['wild_type'].map(lookup_dict) + #print(wt_3let) + pos = combined_df_all['position'].astype(str) + #print(pos) + mt_3let = combined_df_all['mutant_type'].map(lookup_dict) + #print(mt_3let) + # override the 'mutation' column + combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let + print(combined_df_all['mutation']) +# check again +if combined_df_all[missing_info_cols].isna().sum().all() == 0: + print('PASS: No mutations have missing \'mutation\' info.') +else: + print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0] + , '\nmutations have missing info STILL...') + sys.exit() #%% check -#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2'] -#foo = combined_df_all[cols_check] foo = combined_df_all.drop_duplicates('mutationinformation') foo2 = combined_df_all.drop_duplicates('mutation') -poo = combined_df_all[combined_df_all['mutation'].isna()] +if foo.equals(foo2): + print('PASS: Dropping mutation or mutatationinformation has the same effect\n') +else: + print('FAIL: Still problems in merged data') + sys.exit() + #%%============================================================================ output_cols = combined_df_all.columns -#print('Output cols:', output_cols) -#%% drop duplicates -if combined_df_all.shape[0] != outdf_expected_rows: - print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele') + +#%% IMPORTANT result info +if combined_df_all.shape[0] == outdf_expected_rows: + print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele' + , '\n=============================================================') else: - print('combined_df_all has no duplicate muts present') + print('combined_df_all has no duplicate muts present' + ,'\n===============================================================') + +print('\nDim of combined_data:', combined_df_all.shape + , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique()) + + #%%============================================================================ # write csv print('Writing file: combined output of all params needed for plotting and ML')