#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' #======================================================================= # Task: format deep ddg df to allow easy merging # Input: 2 dfs #1) .lower()'_mcsm_formatted_snps.csv' #2) .lower()_complex_ddg_results.csv' #======================================================================= #%% load packages import sys, os import pandas as pd from pandas import DataFrame import numpy as np #from varname import nameof import argparse #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() #=======================================================================#%% command line args: case sensitive arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '') arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output drug = args.drug gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir #%%======================================================================= #============== # directories #============== if not datadir: datadir = homedir + '/git/Data/' if not indir: indir = datadir + drug + '/input/' if not outdir: outdir = datadir + drug + '/output/' #======= # input #======= in_filename_mcsm_snps = gene.lower() + '_mcsm_formatted_snps.csv' infile_mcsm_snps = outdir + in_filename_mcsm_snps in_filename_deepddg = gene.lower() + '_complex_ddg_results.csv' infile_deepddg = outdir + 'deep_ddg/' + in_filename_deepddg print('\nInput path:', indir , '\nOutput path:', outdir, '\n' , '\nInput filename mcsm snps', infile_mcsm_snps , '\n' , '\nInput filename deepddg', infile_deepddg , '\n' , '\n============================================================') #======= # output #======= #out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.txt' out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.csv' outfile_deepddg_f = outdir + out_filename_deepddg print('Output filename:', outfile_deepddg_f , '\n===================================================================') # end of variable assignment for input and output files #%%============================================================================ print('===================================' , '\nmcsm muts' , '\n===================================') mcsm_muts_df = pd.read_csv(infile_mcsm_snps , header = None, sep = ',', names = ['mutationinformation']) mcsm_muts_df.columns #%%============================================================================ print('===================================' , '\nDeep ddg' , '\n===================================') deepddg_df = pd.read_csv(infile_deepddg, sep = ',') deepddg_df.columns deepddg_df.rename(columns = {'#chain' : 'chain_id' , 'WT' : 'wild_type_deepddg' , 'ResID' : 'position' , 'Mut' : 'mutant_type_deepddg'} , inplace = True) deepddg_df.columns deepddg_df['mutationinformation'] = deepddg_df['wild_type_deepddg'] + deepddg_df['position'].map(str) + deepddg_df['mutant_type_deepddg'] deepddg_df.columns # add deepddg outcome column: <0--> Destabilising, >0 --> Stabilising deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising') deepddg_df['deepddg_outcome'].value_counts() # should be identical in count ot Destabilising and stabilising respectively len(deepddg_df.loc[deepddg_df['deepddg'] < 0]) len(deepddg_df.loc[deepddg_df['deepddg'] >= 0]) # drop extra columns to allow clean merging deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) # rearrange columns deepddg_short_df.columns deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]] #%% combine with mcsm snps deepddg_mcsm_muts_dfs = pd.merge(deepddg_short_df , mcsm_muts_df , on = 'mutationinformation' , how = 'right') deepddg_mcsm_muts_dfs ['deepddg_outcome'].value_counts() #%%============================================================================ # write csv print('Writing file: formatted deepddg and only mcsm muts') deepddg_mcsm_muts_dfs.to_csv(outfile_deepddg_f, index = False) print('\nFinished writing file:' , '\nNo. of rows:', deepddg_mcsm_muts_dfs.shape[0] , '\nNo. of cols:', deepddg_mcsm_muts_dfs.shape[1]) #%% end of script