diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index ccb3ff5..44a1fad 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -84,9 +84,6 @@ arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' - drug = args.drug gene = args.gene datadir = args.datadir @@ -128,39 +125,40 @@ if not outdir: #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb in_filename_foldx = gene.lower() + '_foldx.csv' +in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir + in_filename_dssp = gene.lower() + '_dssp.csv' in_filename_kd = gene.lower() + '_kd.csv' in_filename_rd = gene.lower() + '_rd.csv' -in_filename_deepddg = gene.lower() + '_complex_ddg_results.txt' # change to decent filename and put it in the correct dir - -in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info -in_filename_afor = gene.lower() + '_af_or.csv' -in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' +#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info +#in_filename_afor = gene.lower() + '_af_or.csv' +#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' infile_mcsm = outdir + in_filename_mcsm infile_foldx = outdir + in_filename_foldx +infile_deepddg = outdir + in_filename_deepddg + infile_dssp = outdir + in_filename_dssp infile_kd = outdir + in_filename_kd infile_rd = outdir + in_filename_rd -infile_deepddg = outdir + 'deep_ddg/' + in_filename_deepddg -infile_snpinfo = outdir + '/' + in_filename_snpinfo -infile_afor = outdir + '/' + in_filename_afor -infile_afor_kin = outdir + '/' + in_filename_afor_kin +#infile_snpinfo = outdir + '/' + in_filename_snpinfo +#infile_afor = outdir + '/' + in_filename_afor +#infile_afor_kin = outdir + '/' + in_filename_afor_kin print('\nInput path:', indir , '\nOutput path:', outdir, '\n' , '\nInput filename mcsm:', infile_mcsm , '\nInput filename foldx:', infile_foldx, '\n' + , '\nInput filename deepddg', infile_deepddg , '\n' , '\nInput filename dssp:', infile_dssp , '\nInput filename kd:', infile_kd , '\nInput filename rd', infile_rd -# , '\nInput filename rd', infile_deepddg , '\n' - - , '\nInput filename snp info:', infile_snpinfo, '\n' - , '\nInput filename af or:', infile_afor - , '\nInput filename afor kinship:', infile_afor_kin + + #, '\nInput filename snp info:', infile_snpinfo, '\n' + #, '\nInput filename af or:', infile_afor + #, '\nInput filename afor kinship:', infile_afor_kin , '\n============================================================') #======= @@ -208,29 +206,11 @@ print('===================================' , '\nSecond merge: mcsm_foldx_dfs + deepddg' , '\n===================================') -deepddg_df = pd.read_csv(infile_deepddg, sep = ' ') +deepddg_df = pd.read_csv(infile_deepddg, sep = ',') deepddg_df.columns -deepddg_df.rename(columns = {'#chain' : 'chain_id' - , 'WT' : 'wild_type_deepddg' - , 'ResID' : 'position' - , 'Mut' : 'mutant_type_deepddg'} - , inplace = True) - -deepddg_df['mutationinformation'] = deepddg_df['wild_type_deepddg'] + deepddg_df['position'].map(str) + deepddg_df['mutant_type_deepddg'] - -# add deepddg outcome column: <0--> Destabilising, >0 --> Stabilising -deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising') -deepddg_df['deepddg_outcome'].value_counts() - -# drop extra columns to allow clean merging -deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) - -# rearrange columns -deepddg_short_df.columns -deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]] - -mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_short_df, on = 'mutationinformation', how = l_join) +# merge with mcsm_foldx_dfs and deepddg_df +mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation', how = l_join) mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) @@ -317,10 +297,10 @@ print('Output filename:', outfile_stab_struc # write csv print('Writing file: combined stability and structural parameters') -combined_df.to_csv(outfile_stab_struc, index = False) +combined_df_clean.to_csv(outfile_stab_struc, index = False) print('\nFinished writing file:' - , '\nNo. of rows:', combined_df.shape[0] - , '\nNo. of cols:', combined_df.shape[1]) + , '\nNo. of rows:', combined_df_clean.shape[0] + , '\nNo. of cols:', combined_df_clean.shape[1]) #%% end of script