added af_or to add to combining_dfs.py
This commit is contained in:
parent
8a301e8bb1
commit
920007cc83
1 changed files with 78 additions and 9 deletions
|
@ -40,7 +40,6 @@ import sys, os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas import DataFrame
|
from pandas import DataFrame
|
||||||
import numpy as np
|
import numpy as np
|
||||||
#from varname import nameof
|
|
||||||
import argparse
|
import argparse
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% specify input and curr dir
|
#%% specify input and curr dir
|
||||||
|
@ -132,7 +131,7 @@ in_filename_kd = gene.lower() + '_kd.csv'
|
||||||
in_filename_rd = gene.lower() + '_rd.csv'
|
in_filename_rd = gene.lower() + '_rd.csv'
|
||||||
|
|
||||||
#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
|
#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
|
||||||
#in_filename_afor = gene.lower() + '_af_or.csv'
|
in_filename_afor = gene.lower() + '_af_or.csv'
|
||||||
#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
|
#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
|
||||||
|
|
||||||
infile_mcsm = outdir + in_filename_mcsm
|
infile_mcsm = outdir + in_filename_mcsm
|
||||||
|
@ -144,7 +143,7 @@ infile_kd = outdir + in_filename_kd
|
||||||
infile_rd = outdir + in_filename_rd
|
infile_rd = outdir + in_filename_rd
|
||||||
|
|
||||||
#infile_snpinfo = outdir + '/' + in_filename_snpinfo
|
#infile_snpinfo = outdir + '/' + in_filename_snpinfo
|
||||||
#infile_afor = outdir + '/' + in_filename_afor
|
infile_afor = outdir + '/' + in_filename_afor
|
||||||
#infile_afor_kin = outdir + '/' + in_filename_afor_kin
|
#infile_afor_kin = outdir + '/' + in_filename_afor_kin
|
||||||
|
|
||||||
print('\nInput path:', indir
|
print('\nInput path:', indir
|
||||||
|
@ -157,7 +156,7 @@ print('\nInput path:', indir
|
||||||
, '\nInput filename rd', infile_rd
|
, '\nInput filename rd', infile_rd
|
||||||
|
|
||||||
#, '\nInput filename snp info:', infile_snpinfo, '\n'
|
#, '\nInput filename snp info:', infile_snpinfo, '\n'
|
||||||
#, '\nInput filename af or:', infile_afor
|
, '\nInput filename af or:', infile_afor
|
||||||
#, '\nInput filename afor kinship:', infile_afor_kin
|
#, '\nInput filename afor kinship:', infile_afor_kin
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
|
||||||
|
@ -216,7 +215,7 @@ mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
|
||||||
ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
|
ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nSecond merge: dssp + kd'
|
, '\nThird merge: dssp + kd'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
|
|
||||||
dssp_df = pd.read_csv(infile_dssp, sep = ',')
|
dssp_df = pd.read_csv(infile_dssp, sep = ',')
|
||||||
|
@ -227,11 +226,11 @@ rd_df = pd.read_csv(infile_rd, sep = ',')
|
||||||
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
|
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
|
||||||
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join)
|
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join)
|
||||||
|
|
||||||
print('\n\nResult of second merge:', dssp_kd_dfs.shape
|
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nThird merge: second merge + rd_df'
|
, '\nFourth merge: third merge + rd_df'
|
||||||
, '\ndssp_kd_dfs + rd_df'
|
, '\ndssp_kd_dfs + rd_df'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
|
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
|
||||||
|
@ -247,7 +246,7 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len)
|
||||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
print('======================================='
|
print('======================================='
|
||||||
, '\nFourth merge: First merge + Third merge'
|
, '\nFifth merge: Second merge + fourth merge'
|
||||||
, '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
|
, '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
|
||||||
, '\n=======================================')
|
, '\n=======================================')
|
||||||
#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
|
#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
|
||||||
|
@ -270,6 +269,7 @@ else:
|
||||||
|
|
||||||
print('\nResult of Fourth merge:', combined_df.shape
|
print('\nResult of Fourth merge:', combined_df.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
|
||||||
combined_df[merging_cols_m4].apply(len)
|
combined_df[merging_cols_m4].apply(len)
|
||||||
combined_df[merging_cols_m4].apply(len) == len(combined_df)
|
combined_df[merging_cols_m4].apply(len) == len(combined_df)
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
|
@ -280,6 +280,7 @@ combined_df_colnames = combined_df.columns
|
||||||
combined_df['chain'].equals(combined_df['chain_id'])
|
combined_df['chain'].equals(combined_df['chain_id'])
|
||||||
combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
|
combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
|
||||||
combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
|
combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
|
||||||
|
|
||||||
#sanity check
|
#sanity check
|
||||||
foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]
|
foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]
|
||||||
|
|
||||||
|
@ -301,6 +302,74 @@ combined_df_clean.to_csv(outfile_stab_struc, index = False)
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_df_clean.shape[0]
|
, '\nNo. of rows:', combined_df_clean.shape[0]
|
||||||
, '\nNo. of cols:', combined_df_clean.shape[1])
|
, '\nNo. of cols:', combined_df_clean.shape[1])
|
||||||
|
#%%=====================================================================
|
||||||
|
print('======================================='
|
||||||
|
, '\nFifth merge:'
|
||||||
|
, '\ncombined_df_clean + afor_df '
|
||||||
|
, '\n=======================================')
|
||||||
|
|
||||||
|
afor_df = pd.read_csv(infile_afor, sep = ',')
|
||||||
|
afor_cols = afor_df.columns
|
||||||
|
|
||||||
#%% end of script
|
# create a mapping from the gwas mutation column i.e <gene_match>_abcXXXrst
|
||||||
|
#----------------------
|
||||||
|
# call get_aa_1upper():
|
||||||
|
# adds 3 more cols with one letter aa code
|
||||||
|
#----------------------
|
||||||
|
get_aa_1upper(df = afor_df
|
||||||
|
, gwas_mut_colname = 'mutation'
|
||||||
|
, wt_colname = 'wild_type'
|
||||||
|
, pos_colname = 'position'
|
||||||
|
, mut_colname = 'mutant_type')
|
||||||
|
|
||||||
|
afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
|
||||||
|
afor_cols = afor_df.columns
|
||||||
|
|
||||||
|
merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df)
|
||||||
|
|
||||||
|
# remove position so that merging can take place without dtype conflicts
|
||||||
|
merging_cols_m5.remove('position')
|
||||||
|
|
||||||
|
# drop position column from afor_df
|
||||||
|
afor_df = afor_df.drop(['position'], axis = 1)
|
||||||
|
afor_cols = afor_df.columns
|
||||||
|
|
||||||
|
# merge
|
||||||
|
combined_stab_afor = pd.merge(combined_df_clean, afor_df, on = merging_cols_m5, how = l_join)
|
||||||
|
comb_afor_df_cols = combined_stab_afor.columns
|
||||||
|
|
||||||
|
comb_afor_expected_cols = len(combined_df_clean.columns) + len(afor_df.columns) - len(merging_cols_m5)
|
||||||
|
|
||||||
|
if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.columns) == comb_afor_expected_cols:
|
||||||
|
print('\nPASS: successfully combined 6 dfs'
|
||||||
|
, '\nNo. of rows combined_stab_afor:', len(combined_stab_afor)
|
||||||
|
, '\nNo. of cols combined_stab_afor:', len(combined_stab_afor.columns))
|
||||||
|
else:
|
||||||
|
sys.exit('\nFAIL: check individual df merges')
|
||||||
|
|
||||||
|
print('\n\nResult of Fifth merge:', combined_stab_afor.shape
|
||||||
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
combined_stab_afor[merging_cols_m5].apply(len)
|
||||||
|
combined_stab_afor[merging_cols_m5].apply(len) == len(combined_stab_afor)
|
||||||
|
|
||||||
|
if len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum() == len(afor_df):
|
||||||
|
print('\nPASS: Merge successful for af and or'
|
||||||
|
, '\nNo. of nsSNPs with valid ORs: ', len(afor_df))
|
||||||
|
else:
|
||||||
|
sys.exit('\nFAIL: merge unsuccessful for af and or')
|
||||||
|
|
||||||
|
#%%============================================================================
|
||||||
|
# Output filename and path
|
||||||
|
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
|
||||||
|
outfile_comb_afor = outdir + '/' + out_filename_comb_afor
|
||||||
|
print('Output filename:', outfile_comb_afor
|
||||||
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
# write csv
|
||||||
|
print('Writing file: combined stability and afor')
|
||||||
|
combined_stab_afor.to_csv(outfile_comb_afor, index = False)
|
||||||
|
print('\nFinished writing file:'
|
||||||
|
, '\nNo. of rows:', combined_stab_afor.shape[0]
|
||||||
|
, '\nNo. of cols:', combined_stab_afor.shape[1])
|
||||||
|
#%% end of script
|
Loading…
Add table
Add a link
Reference in a new issue