From 920007cc836e1e2ddf0a2947e2af6be1b980dd07 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 21 Jun 2021 14:53:04 +0100 Subject: [PATCH] added af_or to add to combining_dfs.py --- scripts/combining_dfs.py | 87 +++++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 9 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 44a1fad..10b10db 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -40,7 +40,6 @@ import sys, os import pandas as pd from pandas import DataFrame import numpy as np -#from varname import nameof import argparse #======================================================================= #%% specify input and curr dir @@ -132,7 +131,7 @@ in_filename_kd = gene.lower() + '_kd.csv' in_filename_rd = gene.lower() + '_rd.csv' #in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info -#in_filename_afor = gene.lower() + '_af_or.csv' +in_filename_afor = gene.lower() + '_af_or.csv' #in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' infile_mcsm = outdir + in_filename_mcsm @@ -144,7 +143,7 @@ infile_kd = outdir + in_filename_kd infile_rd = outdir + in_filename_rd #infile_snpinfo = outdir + '/' + in_filename_snpinfo -#infile_afor = outdir + '/' + in_filename_afor +infile_afor = outdir + '/' + in_filename_afor #infile_afor_kin = outdir + '/' + in_filename_afor_kin print('\nInput path:', indir @@ -157,7 +156,7 @@ print('\nInput path:', indir , '\nInput filename rd', infile_rd #, '\nInput filename snp info:', infile_snpinfo, '\n' - #, '\nInput filename af or:', infile_afor + , '\nInput filename af or:', infile_afor #, '\nInput filename afor kinship:', infile_afor_kin , '\n============================================================') @@ -216,7 +215,7 @@ mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) #%%============================================================================ 
print('===================================' - , '\nSecond merge: dssp + kd' + , '\nThird merge: dssp + kd' , '\n===================================') dssp_df = pd.read_csv(infile_dssp, sep = ',') @@ -227,11 +226,11 @@ rd_df = pd.read_csv(infile_rd, sep = ',') merging_cols_m2 = detect_common_cols(dssp_df, kd_df) dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join) -print('\n\nResult of second merge:', dssp_kd_dfs.shape +print('\n\nResult of third merge:', dssp_kd_dfs.shape , '\n===================================================================') #%%============================================================================ print('===================================' - , '\nThird merge: second merge + rd_df' + , '\nFourth merge: third merge + rd_df' , '\ndssp_kd_dfs + rd_df' , '\n===================================') #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join) @@ -247,7 +246,7 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len) dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) #%%============================================================================ print('=======================================' - , '\nFourth merge: First merge + Third merge' + , '\nFifth merge: Second merge + fourth merge' , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs' , '\n=======================================') #combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join) @@ -270,6 +269,7 @@ else: print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') + combined_df[merging_cols_m4].apply(len) combined_df[merging_cols_m4].apply(len) == len(combined_df) #%%============================================================================ @@ -280,6 +280,7 @@ combined_df_colnames = combined_df.columns combined_df['chain'].equals(combined_df['chain_id']) combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan 
combined_df['wild_type'].equals(combined_df['wild_type_dssp']) + #sanity check foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] @@ -301,6 +302,74 @@ combined_df_clean.to_csv(outfile_stab_struc, index = False) print('\nFinished writing file:' , '\nNo. of rows:', combined_df_clean.shape[0] , '\nNo. of cols:', combined_df_clean.shape[1]) +#%%===================================================================== +print('=======================================' + , '\nFifth merge:' + , '\ncombined_df_clean + afor_df ' + , '\n=======================================') +afor_df = pd.read_csv(infile_afor, sep = ',') +afor_cols = afor_df.columns -#%% end of script +# create a mapping from the gwas mutation column i.e _abcXXXrst +#---------------------- +# call get_aa_1upper(): +# adds 3 more cols with one letter aa code +#---------------------- +get_aa_1upper(df = afor_df + , gwas_mut_colname = 'mutation' + , wt_colname = 'wild_type' + , pos_colname = 'position' + , mut_colname = 'mutant_type') + +afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type'] +afor_cols = afor_df.columns + +merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df) + +# remove position so that merging can take place without dtype conflicts +merging_cols_m5.remove('position') + +# drop position column from afor_df +afor_df = afor_df.drop(['position'], axis = 1) +afor_cols = afor_df.columns + +# merge +combined_stab_afor = pd.merge(combined_df_clean, afor_df, on = merging_cols_m5, how = l_join) +comb_afor_df_cols = combined_stab_afor.columns + +comb_afor_expected_cols = len(combined_df_clean.columns) + len(afor_df.columns) - len(merging_cols_m5) + +if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.columns) == comb_afor_expected_cols: + print('\nPASS: successfully combined 6 dfs' + , '\nNo. of rows combined_stab_afor:', len(combined_stab_afor) + , '\nNo. 
of cols combined_stab_afor:', len(combined_stab_afor.columns)) +else: + sys.exit('\nFAIL: check individual df merges') + +print('\n\nResult of Fourth merge:', combined_stab_afor.shape + , '\n===================================================================') + +combined_stab_afor[merging_cols_m5].apply(len) +combined_stab_afor[merging_cols_m5].apply(len) == len(combined_stab_afor) + +if len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum() == len(afor_df): + print('\nPASS: Merge successful for af and or' + , '\nNo. of nsSNPs with valid ORs: ', len(afor_df)) +else: + sys.exit('\nFAIL: merge unsuccessful for af and or') + +#%%============================================================================ +# Output columns +out_filename_comb_afor = gene.lower() + '_comb_afor.csv' +outfile_comb_afor = outdir + '/' + out_filename_comb_afor +print('Output filename:', outfile_comb_afor + , '\n===================================================================') + +# write csv +print('Writing file: combined stability and afor') +combined_stab_afor.to_csv(outfile_comb_afor, index = False) +print('\nFinished writing file:' + , '\nNo. of rows:', combined_stab_afor.shape[0] + , '\nNo. of cols:', combined_stab_afor.shape[1]) +#%% end of script \ No newline at end of file