500 lines
18 KiB
Python
Executable file
500 lines
18 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
import subprocess
|
|
import os
|
|
import sys
|
|
import numpy as np
|
|
import pandas as pd
|
|
from contextlib import suppress
|
|
from pathlib import Path
|
|
import re
|
|
import csv
|
|
import argparse
|
|
import shutil
|
|
import time
|
|
#https://realpython.com/python-pathlib/
|
|
|
|
# FIXME
|
|
#strong dependency of file and path names
|
|
#cannot pass file with path. Need to pass them separately
|
|
#assumptions made for dir struc as standard
|
|
#datadir + drug + input
|
|
|
|
#=======================================================================
|
|
#%% specify input and curr dir
|
|
homedir = os.path.expanduser('~')
|
|
|
|
# set working dir
|
|
os.getcwd()
|
|
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
|
|
#os.getcwd()
|
|
|
|
#=======================================================================
|
|
#%% command line args
|
|
arg_parser = argparse.ArgumentParser()
|
|
|
|
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
|
|
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
|
|
|
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
|
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
|
|
|
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
|
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
|
|
|
|
# FIXME: Doesn't work with 2 chains yet!
|
|
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
|
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
|
|
|
|
args = arg_parser.parse_args()
|
|
#=======================================================================
|
|
#%% variable assignment: input and output
|
|
#drug = 'pyrazinamide'
|
|
#gene = 'pncA'
|
|
#gene_match = gene + '_p.'
|
|
#%%=====================================================================
|
|
# Command line options
|
|
drug = args.drug
|
|
gene = args.gene
|
|
|
|
datadir = args.datadir
|
|
indir = args.input_dir
|
|
outdir = args.output_dir
|
|
process_dir = args.process_dir
|
|
|
|
mut_filename = args.mutation_file
|
|
chainA = args.chain1
|
|
chainB = args.chain2
|
|
pdb_filename = args.pdb_file
|
|
|
|
|
|
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
|
#pdb_name = os.path.splitext(pdb_file)[0]
|
|
# Just the filename, thanks
|
|
#pdb_name = Path(in_filename_pdb).stem
|
|
|
|
|
|
# Handle the case where neither 'drug'
|
|
# nor (indir,outdir,process_dir) are defined
|
|
if not drug:
|
|
if not indir or not outdir or not process_dir:
|
|
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
|
|
sys.exit()
|
|
|
|
#==============
|
|
# directories
|
|
#==============
|
|
if not datadir:
|
|
datadir = homedir + '/' + 'git/Data'
|
|
|
|
if not indir:
|
|
indir = datadir + '/' + drug + '/input'
|
|
|
|
if not outdir:
|
|
outdir = datadir + '/' + drug + '/output'
|
|
|
|
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
|
|
if not process_dir:
|
|
process_dir = datadir + '/' + drug + '/processing'
|
|
|
|
# Make all paths absolute in case the user forgot
|
|
indir = os.path.abspath(indir)
|
|
process_dir = os.path.abspath(process_dir)
|
|
outdir = os.path.abspath(outdir)
|
|
datadir = os.path.abspath(datadir)
|
|
|
|
#=======
|
|
# input
|
|
#=======
|
|
# FIXME
|
|
if pdb_filename:
|
|
pdb_filename = os.path.abspath(pdb_filename)
|
|
pdb_name = Path(pdb_filename).stem
|
|
infile_pdb = pdb_filename
|
|
else:
|
|
pdb_filename = gene.lower() + '_complex.pdb'
|
|
pdb_name = Path(pdb_filename).stem
|
|
infile_pdb = indir + '/' + pdb_filename
|
|
|
|
actual_pdb_filename = Path(infile_pdb).name
|
|
|
|
if mut_filename:
|
|
mutation_file = os.path.abspath(mut_filename)
|
|
infile_muts = mutation_file
|
|
print('User-provided mutation file in use:', infile_muts)
|
|
else:
|
|
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
|
|
infile_muts = outdir + '/' + mutation_file
|
|
print('WARNING: Assuming default mutation file:', infile_muts)
|
|
|
|
#=======
|
|
# output
|
|
#=======
|
|
out_filename = gene.lower() + '_foldx.csv'
|
|
outfile_foldx = outdir + '/' + out_filename
|
|
|
|
print('Arguments being passed:'
|
|
, '\nDrug:', args.drug
|
|
, '\ngene:', args.gene
|
|
, '\ninput dir:', indir
|
|
, '\nprocess dir:', process_dir
|
|
, '\noutput dir:', outdir
|
|
, '\npdb file:', infile_pdb
|
|
, '\npdb name:', pdb_name
|
|
, '\nactual pdb name:', actual_pdb_filename
|
|
, '\nmutation file:', infile_muts
|
|
, '\nchain1:', args.chain1
|
|
, '\noutput file:', outfile_foldx
|
|
, '\n=============================================================')
|
|
|
|
|
|
# make sure rotabase.txt exists in the process_dir
|
|
rotabase_file = process_dir + '/' + 'rotabase.txt'
|
|
|
|
if Path(rotabase_file).is_file():
|
|
print(f'rotabase file: {rotabase_file} exists')
|
|
else:
|
|
print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
|
|
sys.exit()
|
|
|
|
#### Delay for 10 seconds to check the params ####
|
|
print('Sleeping for 10 seconds to give you time to cancel')
|
|
time.sleep(10)
|
|
#=======================================================================
|
|
|
|
def getInteractionEnergy(filename):
|
|
data = pd.read_csv(filename,sep = '\t')
|
|
return data['Interaction Energy'].loc[0]
|
|
|
|
def getInteractions(filename):
|
|
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
|
|
contactList = getIndexes(data,1)
|
|
number = len(contactList)
|
|
return number
|
|
|
|
def formatMuts(mut_file,pdbname):
|
|
with open(mut_file) as csvfile:
|
|
readCSV = csv.reader(csvfile)
|
|
muts = []
|
|
for row in readCSV:
|
|
mut = row[0]
|
|
muts.append(mut)
|
|
|
|
mut_list = []
|
|
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
|
|
with open(outfile, 'w') as output:
|
|
for m in muts:
|
|
print(m)
|
|
mut = m[:1] + chainA+ m[1:]
|
|
mut_list.append(mut)
|
|
mut = mut + ';'
|
|
print(mut)
|
|
output.write(mut)
|
|
output.write('\n')
|
|
return mut_list
|
|
|
|
def getIndexes(data, value):
|
|
colnames = data.columns.values
|
|
listOfPos = list()
|
|
result = data.isin([value])
|
|
result.columns = colnames
|
|
seriesdata = result.any()
|
|
columnNames = list(seriesdata[seriesdata==True].index)
|
|
for col in columnNames:
|
|
rows = list(result[col][result[col]==True].index)
|
|
|
|
for row in rows:
|
|
listOfPos.append((row,col))
|
|
|
|
return listOfPos
|
|
|
|
def loadFiles(df):
|
|
# load a text file in to np matrix
|
|
resultList = []
|
|
f = open(df,'r')
|
|
for line in f:
|
|
line = line.rstrip('\n')
|
|
aVals = line.split('\t')
|
|
fVals = list(map(np.float32, sVals))
|
|
resultList.append(fVals)
|
|
f.close()
|
|
return np.asarray(resultList, dtype=np.float32)
|
|
|
|
# TODO: put the subprocess call in a 'def'
|
|
#def repairPDB():
|
|
# subprocess.call(['foldx'
|
|
# , '--command=RepairPDB'
|
|
# , '--pdb-dir=' + indir
|
|
# , '--pdb=' + actual_pdb_filename
|
|
# , '--ionStrength=0.05'#
|
|
# , '--pH=7'
|
|
# , '--water=PREDICT'
|
|
# , '--vdwDesign=1'
|
|
# , 'outPDB=true'
|
|
# , '--output-dir=' + process_dir])
|
|
|
|
#=======================================================================
|
|
def main():
|
|
pdbname = pdb_name
|
|
comp = '' # for complex only
|
|
mut_filename = infile_muts #pnca_mcsm_snps.csv
|
|
mutlist = formatMuts(mut_filename, pdbname)
|
|
|
|
print(mutlist)
|
|
nmuts = len(mutlist)
|
|
print(nmuts)
|
|
print(mutlist)
|
|
print('start')
|
|
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
|
|
print('\033[95mSTAGE: repair PDB\033[0m')
|
|
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
|
|
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
|
|
# once you decide to use the function
|
|
# repairPDB(pdbname)
|
|
|
|
print('start')
|
|
# some common parameters for foldX
|
|
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
|
|
|
|
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
|
|
print('Running foldx RepairPDB for WT')
|
|
|
|
fold_RepairDB = ['foldx'
|
|
, '--command=RepairPDB'
|
|
, foldx_common
|
|
# , '--pdb-dir=' + os.path.dirname(pdb_filename)
|
|
, '--pdb-dir=' + indir
|
|
, '--pdb=' + actual_pdb_filename
|
|
, 'outPDB=true'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', fold_RepairDB)
|
|
subprocess.call(fold_RepairDB)
|
|
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
|
|
print('\n==========================================================')
|
|
|
|
|
|
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
|
|
print('Running foldx BuildModel for WT')
|
|
|
|
foldx_BuildModel = ['foldx'
|
|
, '--command=BuildModel'
|
|
, foldx_common
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair.pdb'
|
|
, '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
|
|
, 'outPDB=true'
|
|
, '--numberOfRuns=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', foldx_BuildModel)
|
|
subprocess.call( foldx_BuildModel, cwd=process_dir)
|
|
|
|
print('Running foldx PrintNetworks for WT')
|
|
foldx_PrintNetworks = ['foldx'
|
|
, '--command=PrintNetworks'
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair.pdb'
|
|
, '--water=PREDICT'
|
|
, '--vdwDesign=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', foldx_PrintNetworks)
|
|
subprocess.call(foldx_PrintNetworks, cwd=process_dir)
|
|
|
|
print('Running foldx SequenceDetail for WT')
|
|
foldx_SequenceDetail = ['foldx'
|
|
, '--command=SequenceDetail'
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair.pdb'
|
|
, '--water=PREDICT'
|
|
, '--vdwDesign=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', foldx_SequenceDetail)
|
|
subprocess.call(foldx_SequenceDetail , cwd=process_dir)
|
|
|
|
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
|
|
print('\n==========================================================')
|
|
|
|
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
|
for n in range(1,nmuts+1):
|
|
print('\033[95mNETWORK:\033[0m', n)
|
|
print('Running foldx PrintNetworks for mutation', n)
|
|
foldx_PrintNetworksMT = ['foldx'
|
|
, '--command=PrintNetworks'
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
|
, '--water=PREDICT'
|
|
, '--vdwDesign=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', foldx_PrintNetworksMT)
|
|
subprocess.call( foldx_PrintNetworksMT , cwd=process_dir)
|
|
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
|
print('\n==========================================================')
|
|
|
|
|
|
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
|
|
for n in range(1,nmuts+1):
|
|
print('\033[95mMUTATION:\033[0m', n)
|
|
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
|
|
#FIXME: bad design and needs to be done in a pythonic way
|
|
with suppress(Exception):
|
|
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
|
|
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
|
|
print('\n==========================================================')
|
|
|
|
|
|
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
|
|
# FIXME: this is bad design and needs to be done in a pythonic way
|
|
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
|
|
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
|
|
print('\n==========================================================')
|
|
|
|
|
|
if comp=='y':
|
|
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
|
|
chain1=chainA
|
|
chain2=chainB
|
|
foldx_AnalyseComplex = ['foldx'
|
|
, '--command=AnalyseComplex'
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair.pdb'
|
|
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
|
, '--water=PREDICT'
|
|
, '--vdwDesign=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:',foldx_AnalyseComplex)
|
|
subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
|
|
|
|
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
|
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
|
|
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
|
shutil.copyfile(ac_source, ac_dest)
|
|
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT:\033[0m', n)
|
|
|
|
for n in range(1,nmuts+1):
|
|
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
|
|
foldx_AnalyseComplex = ['foldx'
|
|
, '--command=AnalyseComplex'
|
|
, '--pdb-dir=' + process_dir
|
|
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
|
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
|
, '--water=PREDICT'
|
|
, '--vdwDesign=1'
|
|
, '--output-dir=' + process_dir]
|
|
print('CMD:', foldx_AnalyseComplex)
|
|
subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
|
|
|
|
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
|
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
|
|
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair)' + str(n) +'_AC.txt'
|
|
shutil.copyfile(ac_mut_source, ac_mut_dest)
|
|
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
|
|
print('\n==========================================================')
|
|
|
|
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
|
|
|
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
|
|
dGdata = pd.read_csv(dGdatafile, sep = '\t')
|
|
|
|
ddG=[]
|
|
print('ddG')
|
|
print(len(dGdata))
|
|
for i in range(0,len(dGdata)):
|
|
ddG.append(dGdata['total energy'].loc[i])
|
|
|
|
|
|
nint = len(interactions)
|
|
wt_int = []
|
|
|
|
for i in interactions:
|
|
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
|
|
wt_int.append(getInteractions(filename))
|
|
print('wt')
|
|
print(wt_int)
|
|
|
|
ntotal = nint+1
|
|
print(ntotal)
|
|
print(nmuts)
|
|
data = np.empty((ntotal,nmuts))
|
|
data[0] = ddG
|
|
print(data)
|
|
for i in range(0,len(interactions)):
|
|
d=[]
|
|
p=0
|
|
for n in range(1, nmuts+1):
|
|
print(i)
|
|
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
|
|
mut = getInteractions(filename)
|
|
diff = wt_int[i] - mut
|
|
print(diff)
|
|
print(wt_int[i])
|
|
print(mut)
|
|
d.append(diff)
|
|
print(d)
|
|
data[i+1] = d
|
|
|
|
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
|
|
|
print(interactions)
|
|
|
|
IE = []
|
|
if comp=='y':
|
|
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
|
wtE = getInteractionEnergy(wtfilename)
|
|
print(wtE)
|
|
for n in range(1,nmuts+1):
|
|
print(n)
|
|
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
|
mutE = getInteractionEnergy(filename)
|
|
print(mutE)
|
|
diff = wtE - mutE
|
|
print(diff)
|
|
IE.append(diff)
|
|
print(IE)
|
|
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
|
|
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
|
|
IEresults.to_csv(IEfilename)
|
|
print(len(IE))
|
|
data = np.append(data,[IE], axis = 0)
|
|
print(data)
|
|
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
|
|
|
|
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
|
|
with open(mut_file) as csvfile:
|
|
readCSV = csv.reader(csvfile)
|
|
mutlist = []
|
|
for row in readCSV:
|
|
mut = row[0]
|
|
mutlist.append(mut)
|
|
print(mutlist)
|
|
print(len(mutlist))
|
|
print(data)
|
|
results = pd.DataFrame(data, columns = mutlist, index = interactions)
|
|
results.append(ddG)
|
|
#print(results.head())
|
|
|
|
# my style formatted results
|
|
results2 = results.T # transpose df
|
|
results2.index.name = 'mutationinformation' # assign name to index
|
|
results2 = results2.reset_index() # turn it into a columns
|
|
|
|
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
|
|
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
|
|
|
|
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
|
|
|
|
# lower case columns
|
|
results2.columns = results2.columns.str.lower()
|
|
|
|
print('Writing file in the format below:\n'
|
|
, results2.head()
|
|
, '\nNo. of rows:', len(results2)
|
|
, '\nNo. of cols:', len(results2.columns))
|
|
|
|
outputfilename = outfile_foldx
|
|
#outputfilename = 'foldx_results_' + pdbname + '.csv'
|
|
#results.to_csv(outputfilename)
|
|
results2.to_csv(outputfilename, index = False)
|
|
print ('end')
|
|
|
|
if __name__ == '__main__':
|
|
main()
|