From e5aca5e24f2b2a84721fc62fcaaa83e8e3b32809 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Wed, 24 Nov 2021 07:57:20 +0000
Subject: [PATCH] fixed the duplicate column problem by removing them from
 combining_dfs.py

---
 scripts/combining_dfs.py | 86 ++++++++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 26 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index e6ea6cc..d6cb2fd 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -59,7 +59,7 @@ os.getcwd()
 # FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
-from reference_dict import oneletter_aa_dict 
+from reference_dict import oneletter_aa_dict
 from reference_dict import low_3letter_dict
 from aa_code import get_aa_3lower
 
@@ -114,16 +114,16 @@ if not outdir:
 #=======
 # input
 #=======
-gene_list_normal = ["pnca", "katg", "rpob", "alr"]
+gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
 
-#FIXME: for gid, this should be SRY as this is the drug...please check!!!!
 if gene.lower() == "gid":
     print("\nReading mCSM file for gene:", gene)
     in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
 
 if gene.lower() == "embb":
     print("\nReading mCSM file for gene:", gene)
     #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
-    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
+    #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
+    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #851
 if gene.lower() in gene_list_normal:
     print("\nReading mCSM file for gene:", gene)
@@ -172,17 +172,29 @@ mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mu
 
 #------------------------------------------------------------------------------
 # ONLY:for gene pnca and gid: End logic should pick this up!
-geneL_dy_na = ['gid']
-if gene.lower() in geneL_dy_na :
+geneL_na = ['gid', 'rpob']
+if gene.lower() in geneL_na:
+    print("\nGene:", gene.lower()
+          , "\nReading mCSM_na files")
+    # infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
+    # infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
+    # dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
+
+    infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
+    infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
+    mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
+
+geneL_dy = ['gid']
+if gene.lower() in geneL_dy:
     print("\nGene:", gene.lower()
           , "\nReading Dynamut and mCSM_na files")
     infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
     infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
     dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
 
-    infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
-    infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
-    mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
+    # infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
+    # infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
+    # mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
 
 # ONLY:for gene embb and alr: End logic should pick this up!
 geneL_ppi2 = ['embb', 'alr']
@@ -192,7 +204,6 @@ if gene.lower() in geneL_ppi2:
     infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
     mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
 
-
 if gene.lower() == "embb":
     sel_chain = "B"
 else:
@@ -227,7 +238,7 @@ foldx_df['ddg_foldx']
 
 # Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
-foldx_min = foldx_df['ddg_foldx'].min() 
+foldx_min = foldx_df['ddg_foldx'].min()
 foldx_max = foldx_df['ddg_foldx'].max()
 foldx_min
 foldx_max
@@ -299,7 +310,13 @@ if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
     print('\nSelecting chain:', sel_chain, 'for gene:', gene)
     deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
-
+
+#--------------------------
+# Drop chain id col as other targets don't have it. Check for duplicates
+#--------------------------
+col_to_drop = ['chain_id']
+deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
+
 
 #--------------------------
 # Check for duplicates
 #--------------------------
@@ -312,12 +329,6 @@ if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
 else:
     print("\nPASS: No duplicates detected in DeepDDG infile")
 
-#--------------------------
-# Drop chain id col as other targets don't have it.Check for duplicates
-#--------------------------
-col_to_drop = ['chain_id']
-deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
-
 #-------------------------
 # scale Deepddg values
 #-------------------------
@@ -366,8 +377,7 @@ else:
           , '\nGot:', doc[0]
           , '\n======================================================')
     sys.exit()
-
-
+
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
     print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
@@ -571,7 +581,7 @@ foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower
 # Drop cols
 cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
 combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
-
+combined_df_clean.columns
 del(foo)
 #%%============================================================================
 # Output columns
@@ -611,7 +621,7 @@ get_aa_1upper(df = afor_df
 afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
 afor_cols = afor_df.columns
 
-merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df) 
+merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df)
 
 # remove position so that merging can take place without dtype conflicts
 merging_cols_m5.remove('position')
@@ -683,14 +693,24 @@ if gene.lower() == "embb":
 if gene.lower() == "katg":
     dfs_list = [dynamut2_df]
 if gene.lower() == "rpob":
-    dfs_list = [dynamut2_df]
+    dfs_list = [dynamut2_df, mcsm_na_df]
 if gene.lower() == "alr":
     dfs_list = [dynamut2_df, mcsm_ppi2_df]
 
+# noticed that with the revised rpoB data, mCSM-NA had one less position;
+# hence this condition, else the last check fails with a discrepancy in expected_nrows
+if len(dfs_list) > 1:
+    join_type = 'outer'
+else:
+    join_type = 'inner'
+
+print('\nUsing join type: "', join_type, '" for the last but one merge')
+
 dfs_merged = reduce(lambda left,right: pd.merge(left
                                                 , right
                                                 , on = ['mutationinformation']
-                                                , how = 'inner')
+                                                #, how = 'inner')
+                                                , how = join_type)
                     , dfs_list)
 # drop excess columns
 drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
@@ -718,7 +738,21 @@ else:
           , '\nGot:', len(dfs_merged_clean.columns)
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(dfs_merged_clean) )
-
+
+# FIXME: need to extract 'cols_to_drop' programmatically
+# Drop cols
+if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
+    print('\nDuplicate column names detected...'
+          , '\nDropping these before writing file')
+    extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0])
+    print('\nTotal cols:', len(combined_all_params.columns)
+          ,'\nDropping:', len(extra_cols_to_drop), 'columns')
+    #extra_cols_to_drop = ['chain_x', 'chain_y']
+    combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1)
+else:
+    print('\nNo duplicate column names detected, just writing file'
+          , '\nTotal cols:', len(combined_all_params.columns) )
+#del(foo)
 #%% Done for gid on 10/09/2021
 # write csv
 print('Writing file: all params')
@@ -727,4 +761,4 @@ combined_all_params.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_all_params.shape[0]
       , '\nNo. of cols:', combined_all_params.shape[1])
-#%% end of script
+#%% end of script
\ No newline at end of file
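
Notes on the techniques in this patch (illustrative Python sketches, not taken verbatim from combining_dfs.py):

1) Sign-preserving rescaling. The foldx and deepddg hunks work with per-column
min/max values so that ddG scores land in [-1, 1] with negative (destabilising)
values staying negative and positive (stabilising) values staying positive,
which matches the later check that deepddg_scaled spans exactly -1 to 1. A
minimal sketch of that idea; the helper name scale_signed is illustrative, and
it assumes the column contains both negative and positive values:

    import pandas as pd

    def scale_signed(s: pd.Series) -> pd.Series:
        # Divide negatives by |min| and positives by max, so the sign is
        # preserved and the extremes land exactly on -1 and 1.
        smin, smax = s.min(), s.max()
        return s.apply(lambda x: x / abs(smin) if x < 0 else x / smax)

    # e.g. foldx_df['foldx_scaled'] = scale_signed(foldx_df['ddg_foldx'])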
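
2) Outer vs. inner join for the last-but-one merge. With more than one frame in
dfs_list, an inner join on mutationinformation silently drops any mutation that
is missing from one source (the revised rpoB mCSM-NA file has one less
position), so the patch switches to an outer join in that case. A
self-contained toy version of the reduce-based merge; the frame contents and
value columns are made up for illustration:

    from functools import reduce
    import pandas as pd

    # Stand-ins for dynamut2_df and mcsm_na_df; the second frame is missing
    # one mutation, as with the revised rpoB mCSM-NA results.
    dynamut2_df = pd.DataFrame({'mutationinformation': ['S450L', 'D435V'],
                                'ddg_dynamut2': [-0.3, 0.1]})
    mcsm_na_df = pd.DataFrame({'mutationinformation': ['S450L'],
                               'mcsm_na_affinity': [0.8]})

    dfs_list = [dynamut2_df, mcsm_na_df]
    join_type = 'outer' if len(dfs_list) > 1 else 'inner'
    dfs_merged = reduce(lambda left, right: pd.merge(left, right,
                                                     on=['mutationinformation'],
                                                     how=join_type),
                        dfs_list)
    print(dfs_merged)  # D435V kept with NaN affinity; 'inner' would drop it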
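
3) Dropping pandas merge-suffix duplicates. The new block before the final
write looks for columns ending in _x or _y (pandas' default suffixes when a
merge meets an overlapping non-key column) and drops them all, addressing the
FIXME about deriving cols_to_drop programmatically. A boolean mask gives the
same effect as the str.extract call in the patch; the frame below is made up
to show the behaviour:

    import pandas as pd

    combined_all_params = pd.DataFrame({'mutationinformation': ['S450L'],
                                        'chain_x': ['A'],   # merge-suffix
                                        'chain_y': ['A'],   # duplicates
                                        'ddg_foldx': [0.5]})

    dup_mask = combined_all_params.columns.str.contains(r'_x$|_y$', regex=True)
    if dup_mask.any():
        extra_cols_to_drop = list(combined_all_params.columns[dup_mask])
        combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis=1)
    print(list(combined_all_params.columns))  # ['mutationinformation', 'ddg_foldx']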