#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: format deep ddg df to allow easy merging
#
# Input: 2 dfs (both are read from the output dir)
#1) <gene>.lower() + '_mcsm_formatted_snps.csv'
#2) 'deep_ddg/' + <gene>.lower() + '_complex_ddg_results.csv'
#
# Output: <gene>.lower() + '_complex_ni_deepddg.csv' (written to the output dir)
#
# NOTE: this file was recovered from a git patch that adds
# scripts/deepddg_format.py as a new file.
#=======================================================================
#%% load packages
import argparse
import os
import sys

import numpy as np
import pandas as pd
from pandas import DataFrame
#from varname import nameof
#=======================================================================
#%% constants
# Working directory (relative to the user's home) the script switches to.
SCRIPTS_SUBDIR = ('git', 'LSHTM_analysis', 'scripts')

# Raw deepddg column names --> names used downstream.
DEEPDDG_COLNAMES = {'#chain': 'chain_id'
                    , 'WT': 'wild_type_deepddg'
                    , 'ResID': 'position'
                    , 'Mut': 'mutant_type_deepddg'}

# Column order of the formatted deepddg df.
DEEPDDG_OUT_COLS = ['mutationinformation', 'deepddg', 'deepddg_outcome']
#=======================================================================
#%% command line args: case sensitive
def parse_args(argv=None):
    '''
    Parse the command line.

    argv: list of argument strings; None makes argparse read sys.argv[1:].
    Returns the argparse.Namespace with drug, gene, datadir, input_dir,
    output_dir and debug.
    '''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
    arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')

    arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
    arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes datadir + <drug> + input')
    arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes datadir + <drug> + output')

    arg_parser.add_argument('--debug', action = 'store_true', help = 'Debug Mode')

    return arg_parser.parse_args(argv)
#=======================================================================
#%% variable assignment: input and output
def resolve_paths(gene, drug, homedir, datadir = None, indir = None, outdir = None):
    '''
    Work out the directories and file names the script reads and writes.

    gene, drug: names from the command line (gene is lower-cased for file names).
    homedir: the user's home directory, used only for the datadir default.
    datadir, indir, outdir: optional overrides; an empty or None value falls
        back to homedir/git/Data, datadir/<drug>/input and
        datadir/<drug>/output respectively.

    Returns a dict with the keys indir, outdir, infile_mcsm_snps,
    infile_deepddg and outfile_deepddg_f.

    Paths are combined with os.path.join, so a directory given without a
    trailing separator is no longer glued directly onto the file name.
    '''
    datadir = datadir or os.path.join(homedir, 'git', 'Data')
    indir = indir or os.path.join(datadir, drug, 'input')
    outdir = outdir or os.path.join(datadir, drug, 'output')

    gene_prefix = gene.lower()

    return {'indir': indir
            , 'outdir': outdir
            # both inputs live under the output dir, not under indir
            , 'infile_mcsm_snps': os.path.join(outdir, gene_prefix + '_mcsm_formatted_snps.csv')
            , 'infile_deepddg': os.path.join(outdir, 'deep_ddg', gene_prefix + '_complex_ddg_results.csv')
            , 'outfile_deepddg_f': os.path.join(outdir, gene_prefix + '_complex_ni_deepddg.csv')}
#=======================================================================
#%% deep ddg formatting
def format_deepddg(deepddg_df: DataFrame) -> DataFrame:
    '''
    Reduce a raw deepddg results df to the columns needed for merging.

    deepddg_df: df with the columns '#chain', 'WT', 'ResID', 'Mut' and
        'deepddg'. It is not modified.

    Returns a new df with the columns (in this order)
        mutationinformation: WT + ResID + Mut, e.g. 'M1A'
        deepddg: unchanged
        deepddg_outcome: 'Destabilising' when deepddg < 0, 'Stabilising'
            when deepddg >= 0, missing when deepddg is missing
    '''
    # rename() without inplace works on a copy, so the caller's df is untouched
    df = deepddg_df.rename(columns = DEEPDDG_COLNAMES)

    df['mutationinformation'] = (df['wild_type_deepddg']
                                 + df['position'].map(str)
                                 + df['mutant_type_deepddg'])

    # add deepddg outcome column: <0 --> Destabilising, >=0 --> Stabilising
    ddg = df['deepddg']
    outcome = pd.Series(np.where(ddg < 0, 'Destabilising', 'Stabilising')
                        , index = df.index
                        , dtype = object)
    # NaN < 0 is False, which would otherwise label a missing ddg as
    # 'Stabilising'; keep those rows missing instead
    df['deepddg_outcome'] = outcome.where(ddg.notna())

    # drop extra columns and rearrange to allow clean merging
    return df[DEEPDDG_OUT_COLS]


def combine_with_mcsm(deepddg_short_df: DataFrame, mcsm_muts_df: DataFrame) -> DataFrame:
    '''
    Attach deepddg values to the mcsm SNP list.

    Right merge on 'mutationinformation': every row of mcsm_muts_df is kept,
    and SNPs with no deepddg entry get missing deepddg and deepddg_outcome.
    '''
    return pd.merge(deepddg_short_df
                    , mcsm_muts_df
                    , on = 'mutationinformation'
                    , how = 'right')
#=======================================================================
#%% script entry point
def main(argv = None):
    '''
    Read the mcsm SNP list and the deepddg results, format and merge them,
    and write the merged df as csv to the output dir.

    argv: optional list of command line arguments (None --> sys.argv[1:]).
    '''
    args = parse_args(argv)

    #%% specify input and curr dir
    homedir = os.path.expanduser('~')

    # set working dir; skipped with a warning when the checkout is not at
    # the expected location, so the script can still run
    scripts_dir = os.path.join(homedir, *SCRIPTS_SUBDIR)
    if os.path.isdir(scripts_dir):
        os.chdir(scripts_dir)
    else:
        print('Warning: working dir not found, staying in', os.getcwd()
              , '\nExpected:', scripts_dir, file = sys.stderr)

    paths = resolve_paths(gene = args.gene
                          , drug = args.drug
                          , homedir = homedir
                          , datadir = args.datadir
                          , indir = args.input_dir
                          , outdir = args.output_dir)
    indir = paths['indir']
    outdir = paths['outdir']
    infile_mcsm_snps = paths['infile_mcsm_snps']
    infile_deepddg = paths['infile_deepddg']
    outfile_deepddg_f = paths['outfile_deepddg_f']

    print('\nInput path:', indir
          , '\nOutput path:', outdir, '\n'
          , '\nInput filename mcsm snps', infile_mcsm_snps , '\n'
          , '\nInput filename deepddg', infile_deepddg , '\n'
          , '\n============================================================')

    print('Output filename:', outfile_deepddg_f
          , '\n===================================================================')
    # end of variable assignment for input and output files
    #%%============================================================================
    print('==================================='
          , '\nmcsm muts'
          , '\n===================================')

    # the SNP file has no header row: a single column of mutation strings
    mcsm_muts_df = pd.read_csv(infile_mcsm_snps, header = None, sep = ',', names = ['mutationinformation'])

    #%%============================================================================
    print('==================================='
          , '\nDeep ddg'
          , '\n===================================')

    deepddg_df = pd.read_csv(infile_deepddg, sep = ',')
    deepddg_short_df = format_deepddg(deepddg_df)

    #%% combine with mcsm snps
    deepddg_mcsm_muts_dfs = combine_with_mcsm(deepddg_short_df, mcsm_muts_df)

    if args.debug:
        # outcome counts, including SNPs that have no deepddg value
        print('\ndeepddg outcome counts:\n'
              , deepddg_mcsm_muts_dfs['deepddg_outcome'].value_counts(dropna = False))

    #%%============================================================================
    # write csv
    print('Writing file: formatted deepddg and only mcsm muts')
    deepddg_mcsm_muts_dfs.to_csv(outfile_deepddg_f, index = False)
    print('\nFinished writing file:'
          , '\nNo. of rows:', deepddg_mcsm_muts_dfs.shape[0]
          , '\nNo. of cols:', deepddg_mcsm_muts_dfs.shape[1])


if __name__ == '__main__':
    main()
#%% end of script