diff --git a/foldx/runFoldx.py b/foldx/runFoldx.py index d54d507..8d9358b 100755 --- a/foldx/runFoldx.py +++ b/foldx/runFoldx.py @@ -40,7 +40,7 @@ arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb fi arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME -arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') +arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') # FIXME: Doesn't work with 2 chains yet! @@ -55,18 +55,19 @@ args = arg_parser.parse_args() #gene_match = gene + '_p.' #%%===================================================================== # Command line options -drug = args.drug -gene = args.gene +drug = args.drug +gene = args.gene -datadir = args.datadir -indir = args.input_dir -outdir = args.output_dir -process_dir = args.process_dir +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +process_dir = args.process_dir + +mut_filename = args.mutation_file +chainA = args.chain1 +chainB = args.chain2 +pdb_filename = args.pdb_file -mut_filename = args.mutation_file -chainA = args.chain1 -chainB = args.chain2 -pdb_filename = args.pdb_file # os.path.splitext will fail interestingly with file.pdb.txt.zip #pdb_name = os.path.splitext(pdb_file)[0] diff --git a/foldx/runFoldx5.py b/foldx/runFoldx5.py new file mode 100755 index 0000000..8a07a4f --- /dev/null +++ b/foldx/runFoldx5.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +import subprocess +import os +import sys +import numpy as np +import pandas as pd +from contextlib import suppress +from pathlib import Path +import re +import csv +import argparse +import shutil +import time +#https://realpython.com/python-pathlib/ + +# FIXME +#strong dependency of file and path names +#cannot pass file with path. Need to pass them separately +#assumptions made for dir struc as standard +#datadir + drug + input + +#======================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +#os.chdir(homedir + '/git/LSHTM_analysis/foldx/') +#os.getcwd() + +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() + +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) +arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) + +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME + +arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') +arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') + +# FIXME: Doesn't work with 2 chains yet! +arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive +arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' +#gene_match = gene + '_p.' +#%%===================================================================== +# Command line options +drug = args.drug +gene = args.gene + +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +process_dir = args.process_dir + +mut_filename = args.mutation_file +chainA = args.chain1 +chainB = args.chain2 +pdb_filename = args.pdb_file + + +# os.path.splitext will fail interestingly with file.pdb.txt.zip +#pdb_name = os.path.splitext(pdb_file)[0] +# Just the filename, thanks +#pdb_name = Path(in_filename_pdb).stem + + +# Handle the case where neither 'drug' +# nor (indir,outdir,process_dir) are defined +if not drug: + if not indir or not outdir or not process_dir: + print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories') + sys.exit() + +#============== +# directories +#============== +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +#TODO: perhaps better handled by refactoring code to prevent generating lots of output files! +if not process_dir: + process_dir = datadir + '/' + drug + '/processing' + +# Make all paths absolute in case the user forgot +indir = os.path.abspath(indir) +process_dir = os.path.abspath(process_dir) +outdir = os.path.abspath(outdir) +datadir = os.path.abspath(datadir) + +#======= +# input +#======= +# FIXME +if pdb_filename: + pdb_filename = os.path.abspath(pdb_filename) + pdb_name = Path(pdb_filename).stem + infile_pdb = pdb_filename +else: + pdb_filename = gene.lower() + '_complex.pdb' + pdb_name = Path(pdb_filename).stem + infile_pdb = indir + '/' + pdb_filename + +actual_pdb_filename = Path(infile_pdb).name + +if mut_filename: + mutation_file = os.path.abspath(mut_filename) + infile_muts = mutation_file + print('User-provided mutation file in use:', infile_muts) +else: + mutation_file = gene.lower() + '_mcsm_formatted_snps.csv' + infile_muts = outdir + '/' + mutation_file + print('WARNING: Assuming default mutation file:', infile_muts) + +#======= +# output +#======= +out_filename = gene.lower() + '_foldx.csv' +outfile_foldx = outdir + '/' + out_filename + +print('Arguments being passed:' +, '\nDrug:', args.drug +, '\ngene:', args.gene +, '\ninput dir:', indir +, '\nprocess dir:', process_dir +, '\noutput dir:', outdir +, '\npdb file:', infile_pdb +, '\npdb name:', pdb_name +, '\nactual pdb name:', actual_pdb_filename +, '\nmutation file:', infile_muts +, '\nchain1:', args.chain1 +, '\noutput file:', outfile_foldx +, '\n=============================================================') + +#### Delay for 10 seconds to check the params #### +print('Sleeping for 10 seconds to give you time to cancel') +time.sleep(10) +#======================================================================= + +def getInteractionEnergy(filename): + data = pd.read_csv(filename,sep = '\t') + return data['Interaction Energy'].loc[0] + +def getInteractions(filename): + data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t') + contactList = getIndexes(data,1) + number = len(contactList) + return number + +def formatMuts(mut_file,pdbname): + with open(mut_file) as csvfile: + readCSV = csv.reader(csvfile) + muts = [] + for row in readCSV: + mut = row[0] + muts.append(mut) + + mut_list = [] + outfile = process_dir + '/individual_list_' + pdbname + '.txt' + with open(outfile, 'w') as output: + for m in muts: + print(m) + mut = m[:1] + chainA+ m[1:] + mut_list.append(mut) + mut = mut + ';' + print(mut) + output.write(mut) + output.write('\n') + return mut_list + +def getIndexes(data, value): + colnames = data.columns.values + listOfPos = list() + result = data.isin([value]) + result.columns = colnames + seriesdata = result.any() + columnNames = list(seriesdata[seriesdata==True].index) + for col in columnNames: + rows = list(result[col][result[col]==True].index) + + for row in rows: + listOfPos.append((row,col)) + + return listOfPos + +def loadFiles(df): + # load a text file in to np matrix + resultList = [] + f = open(df,'r') + for line in f: + line = line.rstrip('\n') + aVals = line.split('\t') + fVals = list(map(np.float32, sVals)) + resultList.append(fVals) + f.close() + return np.asarray(resultList, dtype=np.float32) + +# TODO: put the subprocess call in a 'def' +#def repairPDB(): +# subprocess.call(['foldx' +# , '--command=RepairPDB' +# , '--pdb-dir=' + indir +# , '--pdb=' + actual_pdb_filename +# , '--ionStrength=0.05'# +# , '--pH=7' +# , '--water=PREDICT' +# , '--vdwDesign=1' +# , 'outPDB=true' +# , '--output-dir=' + process_dir]) + +#======================================================================= +def main(): + pdbname = pdb_name + comp = '' # for complex only + mut_filename = infile_muts #pnca_mcsm_snps.csv + mutlist = formatMuts(mut_filename, pdbname) + + print(mutlist) + nmuts = len(mutlist) + print(nmuts) + print(mutlist) + + print('start') + # some common parameters for foldX + foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 ' + + print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m') + print('Running foldx RepairPDB for WT') + subprocess.call(['foldx5' + , '--command=RepairPDB' + , foldx_common + , '--pdb-dir=' + os.path.dirname(pdb_filename) + , '--pdb=' + actual_pdb_filename + , 'outPDB=true' + , '--output-dir=' + process_dir]) + print('\033[95mCOMPLETED STAGE: repair PDB\033[0m') + print('\n==========================================================') + + + print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m') + print('Running foldx BuildModel for WT') + subprocess.call(['foldx5' + , '--command=BuildModel' + , foldx_common + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair.pdb' + , '--mutant-file="individual_list_' + pdbname +'.txt"' + , 'outPDB=true' + , '--numberOfRuns=1' + , '--output-dir=' + process_dir], cwd=process_dir) + + print('Running foldx PrintNetworks for WT') + subprocess.call(['foldx5' + , '--command=PrintNetworks' + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair.pdb' + , '--water=PREDICT' + , '--vdwDesign=1' + , '--output-dir=' + process_dir], cwd=process_dir) + + print('Running foldx SequenceDetail for WT') + subprocess.call(['foldx5' + , '--command=SequenceDetail' + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair.pdb' + , '--water=PREDICT' + , '--vdwDesign=1' + , '--output-dir=' + process_dir], cwd=process_dir) + print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m') + print('\n==========================================================') + + + print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m') + for n in range(1,nmuts+1): + print('\033[95mNETWORK:\033[0m', n) + print('Running foldx PrintNetworks for mutation', n) + subprocess.call(['foldx5' + , '--command=PrintNetworks' + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' + , '--water=PREDICT' + , '--vdwDesign=1' + , '--output-dir=' + process_dir], cwd=process_dir) + print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m') + print('\n==========================================================') + + + print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m') + for n in range(1,nmuts+1): + print('\033[95mMUTATION:\033[0m', n) + print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir )) + #FIXME: bad design and needs to be done in a pythonic way + with suppress(Exception): + subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir]) + print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m') + print('\n==========================================================') + + + print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m') + # FIXME: this is bad design and needs to be done in a pythonic way + out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir]) + print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m') + print('\n==========================================================') + + + if comp=='y': + print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m') + chain1=chainA + chain2=chainB + subprocess.call(['foldx5' + , '--command=AnalyseComplex' + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair.pdb' + , '--analyseComplexChains=' + chain1 + ',' + chain2 + , '--water=PREDICT' + , '--vdwDesign=1' + , '--output-dir=' + process_dir], cwd=process_dir) + + # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh + ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout' + ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt' + shutil.copyfile(ac_source, ac_dest) + print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT:\033[0m', n) + + for n in range(1,nmuts+1): + print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n) + subprocess.call(['foldx5' + , '--command=AnalyseComplex' + , '--pdb-dir=' + process_dir + , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' + , '--analyseComplexChains=' + chain1 + ',' + chain2 + , '--water=PREDICT' + , '--vdwDesign=1' + , '--output-dir=' + process_dir], cwd=process_dir) + + # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh + ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout' + ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair)' + str(n) +'_AC.txt' + shutil.copyfile(ac_mut_source, ac_mut_dest) + print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n) + print('\n==========================================================') + + interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] + + dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt' + dGdata = pd.read_csv(dGdatafile, sep = '\t') + + ddG=[] + print('ddG') + print(len(dGdata)) + for i in range(0,len(dGdata)): + ddG.append(dGdata['total energy'].loc[i]) + + + nint = len(interactions) + wt_int = [] + + for i in interactions: + filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt' + wt_int.append(getInteractions(filename)) + print('wt') + print(wt_int) + + ntotal = nint+1 + print(ntotal) + print(nmuts) + data = np.empty((ntotal,nmuts)) + data[0] = ddG + print(data) + for i in range(0,len(interactions)): + d=[] + p=0 + for n in range(1, nmuts+1): + print(i) + filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt' + mut = getInteractions(filename) + diff = wt_int[i] - mut + print(diff) + print(wt_int[i]) + print(mut) + d.append(diff) + print(d) + data[i+1] = d + + interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] + + print(interactions) + + IE = [] + if comp=='y': + wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt' + wtE = getInteractionEnergy(wtfilename) + print(wtE) + for n in range(1,nmuts+1): + print(n) + filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt' + mutE = getInteractionEnergy(filename) + print(mutE) + diff = wtE - mutE + print(diff) + IE.append(diff) + print(IE) + IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist) + IEfilename = 'foldx_complexresults_'+pdbname+'.csv' + IEresults.to_csv(IEfilename) + print(len(IE)) + data = np.append(data,[IE], axis = 0) + print(data) + interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy'] + + mut_file = process_dir + '/individual_list_' + pdbname + '.txt' + with open(mut_file) as csvfile: + readCSV = csv.reader(csvfile) + mutlist = [] + for row in readCSV: + mut = row[0] + mutlist.append(mut) + print(mutlist) + print(len(mutlist)) + print(data) + results = pd.DataFrame(data, columns = mutlist, index = interactions) + results.append(ddG) + #print(results.head()) + + # my style formatted results + results2 = results.T # transpose df + results2.index.name = 'mutationinformation' # assign name to index + results2 = results2.reset_index() # turn it into a columns + + results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id) + results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space + + results2.rename(columns = {'Distances': 'Contacts'}, inplace = True) + + # lower case columns + results2.columns = results2.columns.str.lower() + + print('Writing file in the format below:\n' + , results2.head() + , '\nNo. of rows:', len(results2) + , '\nNo. of cols:', len(results2.columns)) + + outputfilename = outfile_foldx + #outputfilename = 'foldx_results_' + pdbname + '.csv' + #results.to_csv(outputfilename) + results2.to_csv(outputfilename, index = False) + print ('end') + +if __name__ == '__main__': + main()