hopefully finally sorted data merges!
This commit is contained in:
parent
fe49a45447
commit
42986bb119
1 changed files with 46 additions and 32 deletions
|
@ -73,7 +73,6 @@ args = arg_parser.parse_args()
|
|||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
gene_match = gene + '_p.'
|
||||
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
@ -312,18 +311,13 @@ column_order = ['mutation'
|
|||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
#, 'chr_num_allele' #old
|
||||
, 'ref_allele'
|
||||
, 'alt_allele'
|
||||
, 'mut_info_f1'
|
||||
, 'mut_info_f2'
|
||||
, 'mut_type'
|
||||
, 'gene_id'
|
||||
#, 'gene_number' #old
|
||||
, 'gene_name'
|
||||
#, 'mut_region'
|
||||
#, 'reference_allele'
|
||||
#, 'alternate_allele'
|
||||
, 'chromosome_number'
|
||||
, 'af'
|
||||
, 'af_kin'
|
||||
|
@ -346,11 +340,7 @@ column_order = ['mutation'
|
|||
, 'se_kin'
|
||||
, 'zval_logistic'
|
||||
, 'logl_h1_kin'
|
||||
, 'l_remle_kin'
|
||||
#, 'wt_3let' # old
|
||||
#, 'mt_3let' # old
|
||||
#, 'symbol'
|
||||
]
|
||||
, 'l_remle_kin']
|
||||
|
||||
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
|
||||
print('PASS: Column order generated for all:', len(column_order), 'columns'
|
||||
|
@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6
|
|||
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
|
||||
print(check_mut_cols)
|
||||
|
||||
if (count_na_mut_cols['na_count'].sum() > 0).any():
|
||||
# FIXME: the 'mutation' column name is hard-coded here; derive it from a variable instead
|
||||
na_muts_n = combined_df_all['mutation'].isna().sum()
|
||||
#baz = combined_df_all[combined_df_all['mutation'].isna()]
|
||||
print(na_muts_n, 'mutations have missing \'mutation\' info.'
|
||||
c2 = combined_df_all[check_mut_cols].isna().sum()
|
||||
missing_info_cols = c2.index[c2>0].to_list()
|
||||
|
||||
if c2.sum()>0:
|
||||
#na_muts_n = combined_df_all['mutation'].isna().sum()
|
||||
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
|
||||
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
|
||||
, '\nFetching these from reference dict...')
|
||||
else:
|
||||
print('No missing \'mutation\' has been detected!')
|
||||
|
||||
|
||||
|
||||
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
||||
col_to_drop = ['wild_type_kd']
|
||||
print('Dropping', len(col_to_drop), 'columns:\n'
|
||||
, col_to_drop)
|
||||
combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
|
||||
|
||||
lookup_dict = dict()
|
||||
for k, v in oneletter_aa_dict.items():
|
||||
lookup_dict[k] = v['three_letter_code_lower']
|
||||
print(lookup_dict)
|
||||
wt_3let = combined_df_all['wild_type'].map(lookup_dict)
|
||||
#print(wt_3let)
|
||||
pos = combined_df_all['position'].astype(str)
|
||||
#print(pos)
|
||||
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
|
||||
#print(mt_3let)
|
||||
# override the 'mutation' column
|
||||
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
|
||||
print(combined_df_all['mutation'])
|
||||
|
||||
# check again
|
||||
if combined_df_all[missing_info_cols].isna().sum().all() == 0:
|
||||
print('PASS: No mutations have missing \'mutation\' info.')
|
||||
else:
|
||||
print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
|
||||
, '\nmutations have missing info STILL...')
|
||||
sys.exit()
|
||||
|
||||
#%% check
|
||||
#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
|
||||
#foo = combined_df_all[cols_check]
|
||||
foo = combined_df_all.drop_duplicates('mutationinformation')
|
||||
foo2 = combined_df_all.drop_duplicates('mutation')
|
||||
poo = combined_df_all[combined_df_all['mutation'].isna()]
|
||||
if foo.equals(foo2):
|
||||
print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
|
||||
else:
|
||||
print('FAIL: Still problems in merged data')
|
||||
sys.exit()
|
||||
|
||||
#%%============================================================================
|
||||
output_cols = combined_df_all.columns
|
||||
#print('Output cols:', output_cols)
|
||||
#%% drop duplicates
|
||||
if combined_df_all.shape[0] != outdf_expected_rows:
|
||||
print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
|
||||
|
||||
#%% IMPORTANT result info
|
||||
if combined_df_all.shape[0] == outdf_expected_rows:
|
||||
print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
|
||||
, '\n=============================================================')
|
||||
else:
|
||||
print('combined_df_all has no duplicate muts present')
|
||||
print('combined_df_all has no duplicate muts present'
|
||||
,'\n===============================================================')
|
||||
|
||||
print('\nDim of combined_data:', combined_df_all.shape
|
||||
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
|
||||
|
||||
|
||||
#%%============================================================================
|
||||
# write csv
|
||||
print('Writing file: combined output of all params needed for plotting and ML')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue