From 920007cc836e1e2ddf0a2947e2af6be1b980dd07 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 21 Jun 2021 14:53:04 +0100
Subject: [PATCH] added af_or to add to combining_dfs.py

---
 scripts/combining_dfs.py | 87 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 9 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 44a1fad..10b10db 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -40,7 +40,6 @@ import sys, os
 import pandas as pd
 from pandas import DataFrame
 import numpy as np
-#from varname import nameof
 import argparse
 #=======================================================================
 #%% specify input and curr dir
@@ -132,7 +131,7 @@ in_filename_kd = gene.lower() + '_kd.csv'
 in_filename_rd = gene.lower() + '_rd.csv'
 
 #in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
-#in_filename_afor = gene.lower() + '_af_or.csv'
+in_filename_afor = gene.lower() + '_af_or.csv'
 #in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
 
 infile_mcsm = outdir + in_filename_mcsm
@@ -144,7 +143,7 @@ infile_kd = outdir + in_filename_kd
 infile_rd = outdir + in_filename_rd
 
 #infile_snpinfo = outdir + '/' + in_filename_snpinfo 
-#infile_afor = outdir + '/' + in_filename_afor
+infile_afor = outdir + '/' + in_filename_afor
 #infile_afor_kin = outdir + '/' + in_filename_afor_kin
 
 print('\nInput path:', indir
@@ -157,7 +156,7 @@ print('\nInput path:', indir
       , '\nInput filename rd', infile_rd
      
       #, '\nInput filename snp info:', infile_snpinfo, '\n'
-      #, '\nInput filename af or:', infile_afor
+      , '\nInput filename af or:', infile_afor
       #, '\nInput filename afor kinship:', infile_afor_kin
       , '\n============================================================')
 
@@ -216,7 +215,7 @@ mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
 ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
 #%%============================================================================
 print('==================================='
-      , '\nSecond merge: dssp + kd'
+      , '\nThird merge: dssp + kd'
       , '\n===================================')
 
 dssp_df = pd.read_csv(infile_dssp, sep = ',')
@@ -227,11 +226,11 @@ rd_df = pd.read_csv(infile_rd, sep = ',')
 merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
 dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2,  how = o_join)
 
-print('\n\nResult of second merge:', dssp_kd_dfs.shape
+print('\n\nResult of third merge:', dssp_kd_dfs.shape
       , '\n===================================================================')
 #%%============================================================================
 print('==================================='
-      , '\nThird merge: second merge + rd_df' 
+      , '\nFourth merge: third merge + rd_df' 
       , '\ndssp_kd_dfs + rd_df'
       , '\n===================================')
 #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
@@ -247,7 +246,7 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len)
 dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
 #%%============================================================================
 print('======================================='
-      , '\nFourth merge: First merge + Third merge'
+      , '\nFifth merge: Second merge + fourth merge'
       , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
       , '\n=======================================')
 #combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
@@ -270,6 +269,7 @@ else:
 
 print('\nResult of Fourth merge:', combined_df.shape
       , '\n===================================================================')
+
 combined_df[merging_cols_m4].apply(len)
 combined_df[merging_cols_m4].apply(len) == len(combined_df)
 #%%============================================================================
@@ -280,6 +280,7 @@ combined_df_colnames = combined_df.columns
 combined_df['chain'].equals(combined_df['chain_id'])
 combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
 combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
+
 #sanity check
 foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] 
 
@@ -301,6 +302,74 @@ combined_df_clean.to_csv(outfile_stab_struc, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_df_clean.shape[0]
       , '\nNo. of cols:', combined_df_clean.shape[1])
+#%%=====================================================================
+print('======================================='
+      , '\nSixth merge:'
+      , '\ncombined_df_clean + afor_df '
+      , '\n=======================================')
 
+afor_df = pd.read_csv(infile_afor, sep = ',') 
+afor_cols = afor_df.columns
 
-#%% end of script
+# create a mapping from the gwas mutation column i.e <gene_match>_abcXXXrst
+#----------------------
+# call get_aa_1upper():
+# adds 3 more cols with one letter aa code
+#----------------------
+get_aa_1upper(df = afor_df
+               , gwas_mut_colname = 'mutation'
+               , wt_colname = 'wild_type'
+               , pos_colname = 'position'
+               , mut_colname = 'mutant_type')
+
+afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
+afor_cols = afor_df.columns
+
+merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df) 
+
+# remove position so that merging can take place without dtype conflicts
+merging_cols_m5.remove('position')
+
+# drop position column from afor_df
+afor_df = afor_df.drop(['position'], axis = 1)
+afor_cols = afor_df.columns
+
+# merge 
+combined_stab_afor = pd.merge(combined_df_clean, afor_df, on = merging_cols_m5, how  = l_join)
+comb_afor_df_cols = combined_stab_afor.columns
+
+comb_afor_expected_cols = len(combined_df_clean.columns) + len(afor_df.columns) - len(merging_cols_m5)
+
+if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.columns) == comb_afor_expected_cols:
+    print('\nPASS: successfully combined 6 dfs'
+          , '\nNo. of rows combined_stab_afor:', len(combined_stab_afor)
+          , '\nNo. of cols combined_stab_afor:', len(combined_stab_afor.columns))   
+else:
+    sys.exit('\nFAIL: check individual df merges')
+
+print('\n\nResult of Sixth merge:', combined_stab_afor.shape
+      , '\n===================================================================')
+
+combined_stab_afor[merging_cols_m5].apply(len)
+combined_stab_afor[merging_cols_m5].apply(len) == len(combined_stab_afor)
+
+if len(combined_stab_afor) -  combined_stab_afor['mutation'].isna().sum() == len(afor_df):
+     print('\nPASS: Merge successful for af and or'
+          , '\nNo. of nsSNPs with valid ORs: ', len(afor_df))
+else:
+    sys.exit('\nFAIL: merge unsuccessful for af and or')
+
+#%%============================================================================
+# Output columns
+out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
+outfile_comb_afor =  outdir + '/' + out_filename_comb_afor
+print('Output filename:', outfile_comb_afor
+      , '\n===================================================================')
+
+# write csv
+print('Writing file: combined stability and afor')
+combined_stab_afor.to_csv(outfile_comb_afor, index = False)
+print('\nFinished writing file:'
+      , '\nNo. of rows:', combined_stab_afor.shape[0]
+      , '\nNo. of cols:', combined_stab_afor.shape[1])
+#%% end of script
\ No newline at end of file