LSHTM_analysis/foldx/runFoldx.py

#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
#https://realpython.com/python-pathlib/

# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input

#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()

#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()

arg_parser.add_argument('-d', '--drug',     help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene',     help = 'gene name (case sensitive)', default = None)

arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME

arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')

# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1',    help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2',    help = 'Chain2 ID', default = 'B') # case sensitive

args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug          = args.drug
gene          = args.gene

datadir       = args.datadir
indir         = args.input_dir
outdir        = args.output_dir
process_dir   = args.process_dir

mut_filename  = args.mutation_file
chainA        = args.chain1
chainB        = args.chain2
pdb_filename  = args.pdb_file


# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem


# Handle the case where neither 'drug'
# nor (indir,outdir,process_dir) are defined
if not drug:
    if not indir or not outdir or not process_dir:
        print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
        sys.exit()

#==============
# directories
#==============
if not datadir:
    datadir = homedir + '/' + 'git/Data'

if not indir:
    indir = datadir + '/' + drug + '/input'

if not outdir:
    outdir = datadir + '/' + drug + '/output'

#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
    process_dir = datadir + '/' + drug + '/processing'

# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)

#=======
# input
#=======
# FIXME
if pdb_filename:
    pdb_filename = os.path.abspath(pdb_filename)
    pdb_name = Path(pdb_filename).stem
    infile_pdb = pdb_filename
else:
    pdb_filename = gene.lower() + '_complex.pdb'
    pdb_name = Path(pdb_filename).stem
    infile_pdb = indir + '/' + pdb_filename

actual_pdb_filename = Path(infile_pdb).name

if mut_filename:
    mutation_file = os.path.abspath(mut_filename)
    infile_muts = mutation_file
    print('User-provided mutation file in use:', infile_muts)
else:
    mutation_file =  gene.lower() + '_mcsm_formatted_snps.csv'
    infile_muts = outdir + '/' + mutation_file
    print('WARNING: Assuming default mutation file:', infile_muts)

#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx =  outdir + '/' + out_filename

print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')


# make sure rotabase.txt exists in the process_dir
rotabase_file = process_dir + '/' + 'rotabase.txt'

if Path(rotabase_file).is_file():
    print(f'rotabase file: {rotabase_file} exists')
else:
    print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
    sys.exit()

#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================

def getInteractionEnergy(filename):
    data = pd.read_csv(filename,sep = '\t')
    return data['Interaction Energy'].loc[0]

def getInteractions(filename):
    data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
    contactList = getIndexes(data,1)
    number = len(contactList)
    return number

def formatMuts(mut_file,pdbname):
    with open(mut_file) as csvfile:
        readCSV = csv.reader(csvfile)
        muts = []
        for row in readCSV:
                mut = row[0]
                muts.append(mut)

    mut_list = []
    outfile = process_dir + '/individual_list_' + pdbname + '.txt'
    with open(outfile, 'w') as output:
        for m in muts:
                print(m)
                mut = m[:1] + chainA+ m[1:]
                mut_list.append(mut)
                mut = mut + ';'
                print(mut)
                output.write(mut)
                output.write('\n')
    return mut_list

def getIndexes(data, value):
    colnames = data.columns.values
    listOfPos = list()
    result = data.isin([value])
    result.columns = colnames
    seriesdata = result.any()
    columnNames = list(seriesdata[seriesdata==True].index)
    for col in columnNames:
        rows = list(result[col][result[col]==True].index)

        for row in rows:
            listOfPos.append((row,col))

    return listOfPos

def loadFiles(df):
    # load a text file in to np matrix
    resultList = []
    f = open(df,'r')
    for line in f:
        line = line.rstrip('\n')
        aVals = line.split('\t')
        fVals = list(map(np.float32, sVals))
        resultList.append(fVals)
    f.close()
    return np.asarray(resultList, dtype=np.float32)

# TODO: put the subprocess call in a 'def'
#def repairPDB():
#    subprocess.call(['foldx'
#    , '--command=RepairPDB'
#    , '--pdb-dir=' + indir
#    ,  '--pdb=' + actual_pdb_filename
#    , '--ionStrength=0.05'#
#   , '--pH=7'
#    , '--water=PREDICT'
#    , '--vdwDesign=1'
#    , 'outPDB=true'
#    , '--output-dir=' + process_dir])

#=======================================================================
def main():
    pdbname = pdb_name
    comp = '' # for complex only
    mut_filename = infile_muts #pnca_mcsm_snps.csv
    mutlist = formatMuts(mut_filename, pdbname)

    print(mutlist)
    nmuts = len(mutlist)
    print(nmuts)
    print(mutlist)
    print('start')
    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
    print('\033[95mSTAGE: repair PDB\033[0m')
    print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
    #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
    # once you decide to use the function
    # repairPDB(pdbname)

    print('start')
    # some common parameters for foldX
    foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '

    print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
    print('Running foldx RepairPDB for WT')

    fold_RepairDB = ['foldx'
    , '--command=RepairPDB'
    , foldx_common
#    , '--pdb-dir=' + os.path.dirname(pdb_filename)
    , '--pdb-dir=' + indir
    ,  '--pdb=' + actual_pdb_filename
    , 'outPDB=true'
    , '--output-dir=' + process_dir]
    print('CMD:', fold_RepairDB)
    subprocess.call(fold_RepairDB)
    print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
    print('\n==========================================================')


    print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
    print('Running foldx BuildModel for WT')

    foldx_BuildModel = ['foldx'
    , '--command=BuildModel'
    , foldx_common
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
    , 'outPDB=true'
    , '--numberOfRuns=1'
    , '--output-dir=' + process_dir]
    print('CMD:', foldx_BuildModel)
    subprocess.call( foldx_BuildModel, cwd=process_dir)

    print('Running foldx PrintNetworks for WT')
    foldx_PrintNetworks = ['foldx'
    , '--command=PrintNetworks'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
    , '--output-dir=' + process_dir]
    print('CMD:', foldx_PrintNetworks)
    subprocess.call(foldx_PrintNetworks, cwd=process_dir)

    print('Running foldx SequenceDetail for WT')
    foldx_SequenceDetail = ['foldx'
    , '--command=SequenceDetail'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
    , '--output-dir=' + process_dir]
    print('CMD:', foldx_SequenceDetail)
    subprocess.call(foldx_SequenceDetail , cwd=process_dir)

    print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
    print('\n==========================================================')

    print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
    for n in range(1,nmuts+1):
        print('\033[95mNETWORK:\033[0m', n)
        print('Running foldx PrintNetworks for mutation', n)
        foldx_PrintNetworksMT = ['foldx'
        , '--command=PrintNetworks'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
        , '--water=PREDICT'
        , '--vdwDesign=1'
        , '--output-dir=' + process_dir]
        print('CMD:', foldx_PrintNetworksMT)
        subprocess.call( foldx_PrintNetworksMT , cwd=process_dir)
    print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
    print('\n==========================================================')


    print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
    for n in range(1,nmuts+1):
        print('\033[95mMUTATION:\033[0m', n)
        print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
       #FIXME: bad design and needs to be done in a pythonic way
        with suppress(Exception):
            subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
    print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
    print('\n==========================================================')


    print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
    # FIXME: this is bad design and needs to be done in a pythonic way
    out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
    print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
    print('\n==========================================================')


    if comp=='y':
        print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
        chain1=chainA
        chain2=chainB
        foldx_AnalyseComplex = ['foldx'
        , '--command=AnalyseComplex'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair.pdb'
        , '--analyseComplexChains=' + chain1 + ',' + chain2
        , '--water=PREDICT'
        , '--vdwDesign=1'
        , '--output-dir=' + process_dir]
        print('CMD:',foldx_AnalyseComplex)
        subprocess.call(foldx_AnalyseComplex, cwd=process_dir)

        # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
        ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
        ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
        shutil.copyfile(ac_source, ac_dest)
        print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT:\033[0m', n)

        for n in range(1,nmuts+1):
            print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
            foldx_AnalyseComplex = ['foldx'
            , '--command=AnalyseComplex'
            , '--pdb-dir=' + process_dir
            ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
            , '--analyseComplexChains=' + chain1 + ',' + chain2
            , '--water=PREDICT'
            , '--vdwDesign=1'
            , '--output-dir=' + process_dir]
            print('CMD:', foldx_AnalyseComplex)
            subprocess.call( foldx_AnalyseComplex , cwd=process_dir)

            # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
            ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
            ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair)' + str(n) +'_AC.txt'
            shutil.copyfile(ac_mut_source, ac_mut_dest)
        print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
        print('\n==========================================================')

    interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']

    dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
    dGdata = pd.read_csv(dGdatafile, sep = '\t')

    ddG=[]
    print('ddG')
    print(len(dGdata))
    for i in range(0,len(dGdata)):
        ddG.append(dGdata['total energy'].loc[i])


    nint = len(interactions)
    wt_int = []

    for i in interactions:
        filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
        wt_int.append(getInteractions(filename))
    print('wt')
    print(wt_int)

    ntotal = nint+1
    print(ntotal)
    print(nmuts)
    data = np.empty((ntotal,nmuts))
    data[0] = ddG
    print(data)
    for i in range(0,len(interactions)):
        d=[]
        p=0
        for n in range(1, nmuts+1):
            print(i)
            filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
            mut = getInteractions(filename)
            diff = wt_int[i] - mut
            print(diff)
            print(wt_int[i])
            print(mut)
            d.append(diff)
        print(d)
        data[i+1] = d

    interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']

    print(interactions)

    IE = []
    if comp=='y':
        wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
        wtE = getInteractionEnergy(wtfilename)
        print(wtE)
        for n in range(1,nmuts+1):
            print(n)
            filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
            mutE = getInteractionEnergy(filename)
            print(mutE)
            diff = wtE - mutE
            print(diff)
            IE.append(diff)
        print(IE)
        IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
        IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
        IEresults.to_csv(IEfilename)
        print(len(IE))
        data = np.append(data,[IE], axis = 0)
        print(data)
        interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']

    mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
    with open(mut_file) as csvfile:
        readCSV = csv.reader(csvfile)
        mutlist = []
        for row in readCSV:
                mut = row[0]
                mutlist.append(mut)
    print(mutlist)
    print(len(mutlist))
    print(data)
    results = pd.DataFrame(data, columns = mutlist, index = interactions)
    results.append(ddG)
    #print(results.head())

    # my style formatted results
    results2 = results.T # transpose df
    results2.index.name = 'mutationinformation' # assign name to index
    results2 = results2.reset_index() # turn it into a columns

    results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
    results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space

    results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)

    # lower case columns
    results2.columns = results2.columns.str.lower()

    print('Writing file in the format below:\n'
        , results2.head()
        , '\nNo. of rows:', len(results2)
        , '\nNo. of cols:', len(results2.columns))

    outputfilename = outfile_foldx
    #outputfilename = 'foldx_results_' + pdbname + '.csv'
    #results.to_csv(outputfilename)
    results2.to_csv(outputfilename, index = False)
    print ('end')

if __name__ == '__main__':
    main()