From e5aca5e24f2b2a84721fc62fcaaa83e8e3b32809 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Wed, 24 Nov 2021 07:57:20 +0000
Subject: [PATCH] fixed the duplicate column problem by removing them from
 combining_dfs.py

---
 scripts/combining_dfs.py | 86 ++++++++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 26 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index e6ea6cc..d6cb2fd 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -59,7 +59,7 @@ os.getcwd()
 # FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
-from reference_dict import oneletter_aa_dict 
+from reference_dict import oneletter_aa_dict
 from reference_dict import low_3letter_dict
 from aa_code import get_aa_3lower
 
@@ -114,16 +114,16 @@ if not outdir:
 #=======
 # input
 #=======
-gene_list_normal = ["pnca", "katg", "rpob", "alr"]
+gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
 
-#FIXME: for gid, this should be SRY as this is the drug...please check!!!!
 if gene.lower() == "gid":
     print("\nReading mCSM file for gene:", gene)
     in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
 
 if gene.lower() == "embb":
     print("\nReading mCSM file for gene:", gene)
     #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
-    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
+    #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
+    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #851
 if gene.lower() in gene_list_normal:
     print("\nReading mCSM file for gene:", gene)
@@ -172,17 +172,29 @@ mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mu
 
 #------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
-geneL_dy_na = ['gid']
-if gene.lower() in geneL_dy_na :
+geneL_na = ['gid', 'rpob']
+if gene.lower() in geneL_na:
+    print("\nGene:", gene.lower()
+          , "\nReading mCSM_na files")
+    # infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
+    # infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
+    # dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
+
+    infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
+    infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
+    mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
+
+geneL_dy = ['gid']
+if gene.lower() in geneL_dy:
     print("\nGene:", gene.lower()
           , "\nReading Dynamut and mCSM_na files")
     infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
     infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
     dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
 
-    infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
-    infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
-    mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
+    # infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
+    # infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
+    # mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
 
 # ONLY:for gene embb and alr: End logic should pick this up!
 geneL_ppi2 = ['embb', 'alr']
@@ -192,7 +204,6 @@ if gene.lower() in geneL_ppi2:
     infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
     mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
 
-
 if gene.lower() == "embb":
     sel_chain = "B"
 else:
@@ -227,7 +238,7 @@ foldx_df['ddg_foldx']
 
 # Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
-foldx_min = foldx_df['ddg_foldx'].min() 
+foldx_min = foldx_df['ddg_foldx'].min()
 foldx_max = foldx_df['ddg_foldx'].max()
 foldx_min
 foldx_max
@@ -299,7 +310,13 @@ if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
     print('\nSelecting chain:', sel_chain, 'for gene:', gene)
     deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
-
+
+#--------------------------
+# Drop chain id col as other targets don't have it. Check for duplicates
+#--------------------------
+col_to_drop = ['chain_id']
+deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
+
 
 #--------------------------
 # Check for duplicates
 #--------------------------
@@ -312,12 +329,6 @@ if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
 else:
     print("\nPASS: No duplicates detected in DeepDDG infile")
 
-#--------------------------
-# Drop chain id col as other targets don't have it.Check for duplicates
-#--------------------------
-col_to_drop = ['chain_id']
-deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
-
 #-------------------------
 # scale Deepddg values
 #-------------------------
@@ -366,8 +377,7 @@ else:
           , '\nGot:', doc[0]
           , '\n======================================================')
     sys.exit()
-
-
+
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
     print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
@@ -571,7 +581,7 @@ foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower
 # Drop cols
 cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
 combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
-
+combined_df_clean.columns
 del(foo)
 #%%============================================================================
 # Output columns
@@ -611,7 +621,7 @@ get_aa_1upper(df = afor_df
 afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
 afor_cols = afor_df.columns
 
-merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df) 
+merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df)
 
 # remove position so that merging can take place without dtype conflicts
 merging_cols_m5.remove('position')
@@ -683,14 +693,24 @@ if gene.lower() == "embb":
 if gene.lower() == "katg":
     dfs_list = [dynamut2_df]
 if gene.lower() == "rpob":
-    dfs_list = [dynamut2_df]
+    dfs_list = [dynamut2_df, mcsm_na_df]
 if gene.lower() == "alr":
     dfs_list = [dynamut2_df, mcsm_ppi2_df]
 
+# noticed that with the revised rpoB data, mCSM-NA had one less position;
+# hence this condition, else the last check fails with a discrepancy in expected_nrows
+if len(dfs_list) > 1:
+    join_type = 'outer'
+else:
+    join_type = 'inner'
+
+print('\nUsing join type: "', join_type, '" for the last but one merge')
+
 dfs_merged = reduce(lambda left,right: pd.merge(left
                                                 , right
                                                 , on = ['mutationinformation']
-                                                , how = 'inner')
+                                                #, how = 'inner')
+                                                , how = join_type)
                     , dfs_list)
 # drop excess columns
 drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
@@ -718,7 +738,21 @@ else:
           , '\nGot:', len(dfs_merged_clean.columns)
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(dfs_merged_clean) )
-
+
+# FIXME: need to extract 'cols_to_drop' programmatically
+# Drop cols
+if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
+    print('\nDuplicate column names detected...'
+          , '\nDropping these before writing file')
+    extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0])
+    print('\nTotal cols:', len(combined_all_params.columns)
+          ,'\nDropping:', len(extra_cols_to_drop), 'columns')
+    #extra_cols_to_drop = ['chain_x', 'chain_y']
+    combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1)
+else:
+    print('\nNo duplicate column names detected, just writing file'
+          , '\nTotal cols:', len(combined_all_params.columns) )
+#del(foo)
 #%% Done for gid on 10/09/2021
 # write csv
 print('Writing file: all params')
@@ -727,4 +761,4 @@ combined_all_params.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_all_params.shape[0]
       , '\nNo. of cols:', combined_all_params.shape[1])
-#%% end of script
+#%% end of script
\ No newline at end of file
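
Notes on the techniques in this patch (illustrative Python sketches, not taken verbatim from combining_dfs.py):

1) Sign-preserving rescaling. The foldx and deepddg hunks work with per-column
min/max values so that ddG scores land in [-1, 1] with negative (destabilising)
values staying negative and positive (stabilising) values staying positive,
which matches the later check that deepddg_scaled spans exactly -1 to 1. A
minimal sketch of that idea; the helper name scale_signed is illustrative, and
it assumes the column contains both negative and positive values:

    import pandas as pd

    def scale_signed(s: pd.Series) -> pd.Series:
        # Divide negatives by |min| and positives by max, so the sign is
        # preserved and the extremes land exactly on -1 and 1.
        smin, smax = s.min(), s.max()
        return s.apply(lambda x: x / abs(smin) if x < 0 else x / smax)

    # e.g. foldx_df['foldx_scaled'] = scale_signed(foldx_df['ddg_foldx'])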
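
2) Outer vs. inner join for the last-but-one merge. With more than one frame in
dfs_list, an inner join on mutationinformation silently drops any mutation that
is missing from one source (the revised rpoB mCSM-NA file has one less
position), so the patch switches to an outer join in that case. A
self-contained toy version of the reduce-based merge; the frame contents and
value columns are made up for illustration:

    from functools import reduce
    import pandas as pd

    # Stand-ins for dynamut2_df and mcsm_na_df; the second frame is missing
    # one mutation, as with the revised rpoB mCSM-NA results.
    dynamut2_df = pd.DataFrame({'mutationinformation': ['S450L', 'D435V'],
                                'ddg_dynamut2': [-0.3, 0.1]})
    mcsm_na_df = pd.DataFrame({'mutationinformation': ['S450L'],
                               'mcsm_na_affinity': [0.8]})

    dfs_list = [dynamut2_df, mcsm_na_df]
    join_type = 'outer' if len(dfs_list) > 1 else 'inner'
    dfs_merged = reduce(lambda left, right: pd.merge(left, right,
                                                     on=['mutationinformation'],
                                                     how=join_type),
                        dfs_list)
    print(dfs_merged)  # D435V kept with NaN affinity; 'inner' would drop it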
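
3) Dropping pandas merge-suffix duplicates. The new block before the final
write looks for columns ending in _x or _y (pandas' default suffixes when a
merge meets an overlapping non-key column) and drops them all, addressing the
FIXME about deriving cols_to_drop programmatically. A boolean mask gives the
same effect as the str.extract call in the patch; the frame below is made up
to show the behaviour:

    import pandas as pd

    combined_all_params = pd.DataFrame({'mutationinformation': ['S450L'],
                                        'chain_x': ['A'],   # merge-suffix
                                        'chain_y': ['A'],   # duplicates
                                        'ddg_foldx': [0.5]})

    dup_mask = combined_all_params.columns.str.contains(r'_x$|_y$', regex=True)
    if dup_mask.any():
        extra_cols_to_drop = list(combined_all_params.columns[dup_mask])
        combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis=1)
    print(list(combined_all_params.columns))  # ['mutationinformation', 'ddg_foldx']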