added deep ddg formatted data to combining_dfs.py

This commit is contained in:
Tanushree Tunstall 2021-06-21 12:56:06 +01:00
parent 3ff9604002
commit 1155959e67

View file

@ -84,9 +84,6 @@ arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
args = arg_parser.parse_args() args = arg_parser.parse_args()
#======================================================================= #=======================================================================
#%% variable assignment: input and output #%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug drug = args.drug
gene = args.gene gene = args.gene
datadir = args.datadir datadir = args.datadir
@ -128,39 +125,40 @@ if not outdir:
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb
in_filename_foldx = gene.lower() + '_foldx.csv' in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir
in_filename_dssp = gene.lower() + '_dssp.csv' in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv' in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv' in_filename_rd = gene.lower() + '_rd.csv'
in_filename_deepddg = gene.lower() + '_complex_ddg_results.txt' # change to decent filename and put it in the correct dir
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
#in_filename_afor = gene.lower() + '_af_or.csv'
#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
infile_mcsm = outdir + in_filename_mcsm infile_mcsm = outdir + in_filename_mcsm
infile_foldx = outdir + in_filename_foldx infile_foldx = outdir + in_filename_foldx
infile_deepddg = outdir + in_filename_deepddg
infile_dssp = outdir + in_filename_dssp infile_dssp = outdir + in_filename_dssp
infile_kd = outdir + in_filename_kd infile_kd = outdir + in_filename_kd
infile_rd = outdir + in_filename_rd infile_rd = outdir + in_filename_rd
infile_deepddg = outdir + 'deep_ddg/' + in_filename_deepddg
infile_snpinfo = outdir + '/' + in_filename_snpinfo #infile_snpinfo = outdir + '/' + in_filename_snpinfo
infile_afor = outdir + '/' + in_filename_afor #infile_afor = outdir + '/' + in_filename_afor
infile_afor_kin = outdir + '/' + in_filename_afor_kin #infile_afor_kin = outdir + '/' + in_filename_afor_kin
print('\nInput path:', indir print('\nInput path:', indir
, '\nOutput path:', outdir, '\n' , '\nOutput path:', outdir, '\n'
, '\nInput filename mcsm:', infile_mcsm , '\nInput filename mcsm:', infile_mcsm
, '\nInput filename foldx:', infile_foldx, '\n' , '\nInput filename foldx:', infile_foldx, '\n'
, '\nInput filename deepddg', infile_deepddg , '\n'
, '\nInput filename dssp:', infile_dssp , '\nInput filename dssp:', infile_dssp
, '\nInput filename kd:', infile_kd , '\nInput filename kd:', infile_kd
, '\nInput filename rd', infile_rd , '\nInput filename rd', infile_rd
# , '\nInput filename rd', infile_deepddg , '\n'
#, '\nInput filename snp info:', infile_snpinfo, '\n'
, '\nInput filename snp info:', infile_snpinfo, '\n' #, '\nInput filename af or:', infile_afor
, '\nInput filename af or:', infile_afor #, '\nInput filename afor kinship:', infile_afor_kin
, '\nInput filename afor kinship:', infile_afor_kin
, '\n============================================================') , '\n============================================================')
#======= #=======
@ -208,29 +206,11 @@ print('==================================='
, '\nSecond merge: mcsm_foldx_dfs + deepddg' , '\nSecond merge: mcsm_foldx_dfs + deepddg'
, '\n===================================') , '\n===================================')
deepddg_df = pd.read_csv(infile_deepddg, sep = ' ') deepddg_df = pd.read_csv(infile_deepddg, sep = ',')
deepddg_df.columns deepddg_df.columns
deepddg_df.rename(columns = {'#chain' : 'chain_id' # merge with mcsm_foldx_dfs and deepddg_df
, 'WT' : 'wild_type_deepddg' mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation', how = l_join)
, 'ResID' : 'position'
, 'Mut' : 'mutant_type_deepddg'}
, inplace = True)
deepddg_df['mutationinformation'] = deepddg_df['wild_type_deepddg'] + deepddg_df['position'].map(str) + deepddg_df['mutant_type_deepddg']
# add deepddg outcome column: <0--> Destabilising, >0 --> Stabilising
deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising')
deepddg_df['deepddg_outcome'].value_counts()
# drop extra columns to allow clean merging
deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)
# rearrange columns
deepddg_short_df.columns
deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]]
mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_short_df, on = 'mutationinformation', how = l_join)
mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
@ -317,10 +297,10 @@ print('Output filename:', outfile_stab_struc
# write csv # write csv
print('Writing file: combined stability and structural parameters') print('Writing file: combined stability and structural parameters')
combined_df.to_csv(outfile_stab_struc, index = False) combined_df_clean.to_csv(outfile_stab_struc, index = False)
print('\nFinished writing file:' print('\nFinished writing file:'
, '\nNo. of rows:', combined_df.shape[0] , '\nNo. of rows:', combined_df_clean.shape[0]
, '\nNo. of cols:', combined_df.shape[1]) , '\nNo. of cols:', combined_df_clean.shape[1])
#%% end of script #%% end of script