hopefully finally sorted data merges!

This commit is contained in:
Tanushree Tunstall 2020-09-08 17:46:52 +01:00
parent e4608342a4
commit f7ab799f74

View file

@@ -73,7 +73,6 @@ args = arg_parser.parse_args()
#%% variable assignment: input and output #%% variable assignment: input and output
#drug = 'pyrazinamide' #drug = 'pyrazinamide'
#gene = 'pncA' #gene = 'pncA'
gene_match = gene + '_p.'
drug = args.drug drug = args.drug
gene = args.gene gene = args.gene
@@ -312,18 +311,13 @@ column_order = ['mutation'
, 'wild_type' , 'wild_type'
, 'position' , 'position'
, 'mutant_type' , 'mutant_type'
#, 'chr_num_allele' #old
, 'ref_allele' , 'ref_allele'
, 'alt_allele' , 'alt_allele'
, 'mut_info_f1' , 'mut_info_f1'
, 'mut_info_f2' , 'mut_info_f2'
, 'mut_type' , 'mut_type'
, 'gene_id' , 'gene_id'
#, 'gene_number' #old
, 'gene_name' , 'gene_name'
#, 'mut_region'
#, 'reference_allele'
#, 'alternate_allele'
, 'chromosome_number' , 'chromosome_number'
, 'af' , 'af'
, 'af_kin' , 'af_kin'
@@ -346,11 +340,7 @@ column_order = ['mutation'
, 'se_kin' , 'se_kin'
, 'zval_logistic' , 'zval_logistic'
, 'logl_h1_kin' , 'logl_h1_kin'
, 'l_remle_kin' , 'l_remle_kin']
#, 'wt_3let' # old
#, 'mt_3let' # old
#, 'symbol'
]
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
print('PASS: Column order generated for all:', len(column_order), 'columns' print('PASS: Column order generated for all:', len(column_order), 'columns'
@@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
print(check_mut_cols) print(check_mut_cols)
if (count_na_mut_cols['na_count'].sum() > 0).any(): c2 = combined_df_all[check_mut_cols].isna().sum()
# FIXME: static override, generate 'mutation' from variable missing_info_cols = c2.index[c2>0].to_list()
na_muts_n = combined_df_all['mutation'].isna().sum()
#baz = combined_df_all[combined_df_all['mutation'].isna()] if c2.sum()>0:
print(na_muts_n, 'mutations have missing \'mutation\' info.' #na_muts_n = combined_df_all['mutation'].isna().sum()
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
, '\nFetching these from reference dict...') , '\nFetching these from reference dict...')
else: else:
print('No missing \'mutation\' has been detected!') print('No missing \'mutation\' has been detected!')
lookup_dict = dict()
for k, v in oneletter_aa_dict.items():
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] lookup_dict[k] = v['three_letter_code_lower']
col_to_drop = ['wild_type_kd'] print(lookup_dict)
print('Dropping', len(col_to_drop), 'columns:\n' wt_3let = combined_df_all['wild_type'].map(lookup_dict)
, col_to_drop) #print(wt_3let)
combined_df_all.drop(col_to_drop, axis = 1, inplace = True) pos = combined_df_all['position'].astype(str)
#print(pos)
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
#print(mt_3let)
# override the 'mutation' column
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
print(combined_df_all['mutation'])
# check again
if combined_df_all[missing_info_cols].isna().sum().all() == 0:
print('PASS: No mutations have missing \'mutation\' info.')
else:
print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
, '\nmutations have missing info STILL...')
sys.exit()
#%% check #%% check
#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
#foo = combined_df_all[cols_check]
foo = combined_df_all.drop_duplicates('mutationinformation') foo = combined_df_all.drop_duplicates('mutationinformation')
foo2 = combined_df_all.drop_duplicates('mutation') foo2 = combined_df_all.drop_duplicates('mutation')
poo = combined_df_all[combined_df_all['mutation'].isna()] if foo.equals(foo2):
print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
else:
print('FAIL: Still problems in merged data')
sys.exit()
#%%============================================================================ #%%============================================================================
output_cols = combined_df_all.columns output_cols = combined_df_all.columns
#print('Output cols:', output_cols)
#%% drop duplicates #%% IMPORTANT result info
if combined_df_all.shape[0] != outdf_expected_rows: if combined_df_all.shape[0] == outdf_expected_rows:
print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele') print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
, '\n=============================================================')
else: else:
print('combined_df_all has no duplicate muts present') print('combined_df_all has no duplicate muts present'
,'\n===============================================================')
print('\nDim of combined_data:', combined_df_all.shape
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
#%%============================================================================ #%%============================================================================
# write csv # write csv
print('Writing file: combined output of all params needed for plotting and ML') print('Writing file: combined output of all params needed for plotting and ML')