hopefully finally sorted data merges!

This commit is contained in:
Tanushree Tunstall 2020-09-08 17:46:52 +01:00
parent e4608342a4
commit f7ab799f74

View file

@ -73,7 +73,6 @@ args = arg_parser.parse_args()
#%% variable assignment: input and output
# drug and gene are read from the parsed command-line arguments (args);
# the commented-out literals below are left as they were.
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
# gene_match is derived from gene, so it is built only after gene is bound
# (building it earlier reads gene before its assignment in this section).
gene_match = gene + '_p.'
@ -312,18 +311,13 @@ column_order = ['mutation'
, 'wild_type'
, 'position'
, 'mutant_type'
#, 'chr_num_allele' #old
, 'ref_allele'
, 'alt_allele'
, 'mut_info_f1'
, 'mut_info_f2'
, 'mut_type'
, 'gene_id'
#, 'gene_number' #old
, 'gene_name'
#, 'mut_region'
#, 'reference_allele'
#, 'alternate_allele'
, 'chromosome_number'
, 'af'
, 'af_kin'
@ -346,11 +340,7 @@ column_order = ['mutation'
, 'se_kin'
, 'zval_logistic'
, 'logl_h1_kin'
, 'l_remle_kin'
#, 'wt_3let' # old
#, 'mt_3let' # old
#, 'symbol'
]
, 'l_remle_kin']
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
print('PASS: Column order generated for all:', len(column_order), 'columns'
@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6
# Count missing values per column of check_mut_cols and reshape the counts
# into a frame with the columns 'col_name' and 'na_count'.
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
print(check_mut_cols)
# NOTE(review): two versions of the missing-value report follow back to back
# (one keyed on count_na_mut_cols, one on c2) and the first print( below is
# never closed. This reads like an old and a new revision interleaved with
# indentation lost -- confirm which lines are live before relying on it.
if (count_na_mut_cols['na_count'].sum() > 0).any():
# FIXME: static override, generate 'mutation' from variable
na_muts_n = combined_df_all['mutation'].isna().sum()
#baz = combined_df_all[combined_df_all['mutation'].isna()]
print(na_muts_n, 'mutations have missing \'mutation\' info.'
# c2 holds the NA count of every column in check_mut_cols;
# missing_info_cols keeps the names of the columns whose count is above zero.
c2 = combined_df_all[check_mut_cols].isna().sum()
missing_info_cols = c2.index[c2>0].to_list()
if c2.sum()>0:
#na_muts_n = combined_df_all['mutation'].isna().sum()
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
# Only the first affected column's NA count (.values[0]) is printed.
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
, '\nFetching these from reference dict...')
else:
print('No missing \'mutation\' has been detected!')
# Map each key of oneletter_aa_dict to its 'three_letter_code_lower' entry
# (presumably one-letter -> three-letter amino-acid codes; verify against
# the definition of oneletter_aa_dict).
lookup_dict = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict[k] = v['three_letter_code_lower']
print(lookup_dict)
# Translate 'wild_type' and 'mutant_type' through lookup_dict; 'position' is
# cast to str so the three parts can be concatenated below.
wt_3let = combined_df_all['wild_type'].map(lookup_dict)
#print(wt_3let)
pos = combined_df_all['position'].astype(str)
#print(pos)
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
#print(mt_3let)
# override the 'mutation' column
# NOTE(review): the 'pnca_p.' prefix is a hard-coded literal; the FIXME above
# asks for 'mutation' to be generated from a variable -- TODO confirm the
# intended source of the prefix.
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
print(combined_df_all['mutation'])
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
# Drop the listed column(s) from combined_df_all in place.
col_to_drop = ['wild_type_kd']
print('Dropping', len(col_to_drop), 'columns:\n'
, col_to_drop)
combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
# check again
# Re-verify the columns that previously had missing values. The check passes
# only when no row has an NA in ANY of missing_info_cols. Counting affected
# rows (instead of applying .all() to the per-column NA counts) means a single
# clean column can no longer mask gaps in the others, and an empty
# missing_info_cols list passes instead of reaching the FAIL branch.
n_rows_still_missing = combined_df_all[missing_info_cols].isna().any(axis = 1).sum()
if n_rows_still_missing == 0:
    print('PASS: No mutations have missing \'mutation\' info.')
else:
    # Report how many rows are still incomplete, then stop the script.
    print('FAIL:', n_rows_still_missing
          , '\nmutations have missing info STILL...')
    sys.exit()
#%% check
# Sanity check on the merged data: de-duplicating on 'mutationinformation'
# must yield exactly the same frame as de-duplicating on 'mutation'.
foo = combined_df_all.drop_duplicates(subset = 'mutationinformation')
foo2 = combined_df_all.drop_duplicates(subset = 'mutation')
# Rows whose 'mutation' value is missing (not used further in this section).
poo = combined_df_all[combined_df_all['mutation'].isna()]
if not foo.equals(foo2):
    # The two de-duplicated frames differ: report and stop the script.
    print('FAIL: Still problems in merged data')
    sys.exit()
print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
#%%============================================================================
# Keep the column labels of the combined frame.
output_cols = combined_df_all.columns
#print('Output cols:', output_cols)
#%% drop duplicates
# NOTE(review): the two `if` tests below compare the row count with
# outdf_expected_rows using opposite operators (!= and ==) yet print the same
# "duplicate muts present" message, and two "no duplicate" prints follow the
# single else. This reads like an old and a new revision interleaved --
# confirm which condition is the intended one.
if combined_df_all.shape[0] != outdf_expected_rows:
print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
#%% IMPORTANT result info
if combined_df_all.shape[0] == outdf_expected_rows:
print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
, '\n=============================================================')
else:
print('combined_df_all has no duplicate muts present')
print('combined_df_all has no duplicate muts present'
,'\n===============================================================')
# Report the shape of the combined frame and the number of distinct
# 'mutationinformation' values.
print('\nDim of combined_data:', combined_df_all.shape
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
#%%============================================================================
# write csv
# Banner printed before the write step (the write itself is not in this view).
print('Writing file: combined output of all params needed for plotting and ML')