#!/usr/bin/env python3 import subprocess import os import sys import numpy as np import pandas as pd from contextlib import suppress from pathlib import Path import re import csv import argparse import shutil #https://realpython.com/python-pathlib/ # FIXME #strong dependency of file and path names #cannot pass file with path. Need to pass them separately #assumptions made for dir struc as standard #datadir + drug + input #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() #os.chdir(homedir + '/git/LSHTM_analysis/foldx/') #os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') # FIXME: Doesn't work with 2 chains yet! arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' #gene_match = gene + '_p.' #%%===================================================================== # Command line options drug = args.drug gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir process_dir = args.process_dir mut_filename = args.mutation_file chainA = args.chain1 chainB = args.chain2 pdb_filename = args.pdb_file # os.path.splitext will fail interestingly with file.pdb.txt.zip #pdb_name = os.path.splitext(pdb_file)[0] # Just the filename, thanks #pdb_name = Path(in_filename_pdb).stem #============== # directories #============== if not datadir: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/input' if not outdir: outdir = datadir + '/' + drug + '/output' #TODO: perhaps better handled by refactoring code to prevent generating lots of output files! #if not process_dir: # process_dir = datadir + '/' + drug + '/processing' # Make all paths absolute in case the user forgot indir = os.path.abspath(indir) process_dir = os.path.abspath(process_dir) outdir = os.path.abspath(outdir) datadir = os.path.abspath(datadir) #======= # input #======= # FIXME if pdb_filename: pdb_name = Path(pdb_filename).stem else: pdb_filename = gene.lower() + '_complex.pdb' pdb_name = Path(pdb_filename).stem infile_pdb = indir + '/' + pdb_filename actual_pdb_filename = Path(infile_pdb).name #actual_pdb_filename = os.path.abspath(infile_pdb) if mut_filename: mutation_file = os.path.abspath(mut_filename) infile_muts = mutation_file print('User-provided mutation file in use:', infile_muts) else: mutation_file = gene.lower() + '_mcsm_formatted_snps.csv' infile_muts = outdir + '/' + mutation_file print('WARNING: Assuming default mutation file:', infile_muts) #======= # output #======= out_filename = gene.lower() + '_foldx.csv' outfile_foldx = outdir + '/' + out_filename print('Arguments being passed:' , '\nDrug:', args.drug , '\ngene:', args.gene , '\ninput dir:', indir , '\nprocess dir:', process_dir , '\noutput dir:', outdir , '\npdb file:', infile_pdb , '\npdb name:', pdb_name , '\nactual pdb name:', actual_pdb_filename , '\nmutation file:', infile_muts , '\nchain1:', args.chain1 , '\noutput file:', outfile_foldx , '\n=============================================================') #======================================================================= def getInteractionEnergy(filename): data = pd.read_csv(filename,sep = '\t') return data['Interaction Energy'].loc[0] def getInteractions(filename): data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t') contactList = getIndexes(data,1) number = len(contactList) return number def formatMuts(mut_file,pdbname): with open(mut_file) as csvfile: readCSV = csv.reader(csvfile) muts = [] for row in readCSV: mut = row[0] muts.append(mut) mut_list = [] outfile = process_dir + '/individual_list_' + pdbname + '.txt' with open(outfile, 'w') as output: for m in muts: print(m) mut = m[:1] + chainA+ m[1:] mut_list.append(mut) mut = mut + ';' print(mut) output.write(mut) output.write('\n') return mut_list def getIndexes(data, value): colnames = data.columns.values listOfPos = list() result = data.isin([value]) result.columns = colnames seriesdata = result.any() columnNames = list(seriesdata[seriesdata==True].index) for col in columnNames: rows = list(result[col][result[col]==True].index) for row in rows: listOfPos.append((row,col)) return listOfPos def loadFiles(df): # load a text file in to np matrix resultList = [] f = open(df,'r') for line in f: line = line.rstrip('\n') aVals = line.split('\t') fVals = list(map(np.float32, sVals)) resultList.append(fVals) f.close() return np.asarray(resultList, dtype=np.float32) # TODO: use this code pattern rather than invoking bash #def repairPDB(): # subprocess.call(['foldx' # , '--command=RepairPDB' # , '--pdb-dir=' + indir # , '--pdb=' + actual_pdb_filename # , '--ionStrength=0.05'# # , '--pH=7' # , '--water=PREDICT' # , '--vdwDesign=1' # , 'outPDB=true' # , '--output-dir=' + process_dir]) #======================================================================= def main(): pdbname = pdb_name comp = '' # for complex only mut_filename = infile_muts #pnca_mcsm_snps.csv mutlist = formatMuts(mut_filename, pdbname) print(mutlist) nmuts = len(mutlist) print(nmuts) print(mutlist) print('start') #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir]) print('\033[95mSTAGE: repair PDB\033[0m') print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir)) #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir]) # once you decide to use the function # repairPDB(pdbname) # FIXME: put this hack elsewhere foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 ' subprocess.call(['foldx' , '--command=RepairPDB' , foldx_common , '--pdb-dir=' + indir , '--pdb=' + actual_pdb_filename , 'outPDB=true' , '--output-dir=' + process_dir]) print('\033[95mCOMPLETE: repair PDB\033[0m') print('\033[95mSTAGE: run FoldX (subprocess)\033[0m') print('EXECUTING: runfoldx.sh %s %s ' % (pdbname, process_dir)) #output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir]) print('Running foldx BuildModel') subprocess.call(['foldx' , '--command=BuildModel' , foldx_common , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--mutant-file="individual_list_' + pdbname +'.txt"' , 'outPDB=true' , '--numberOfRuns=1' , '--output-dir=' + process_dir], cwd=process_dir) print('Running foldx PrintNetworks') subprocess.call(['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' , '--output-dir=' + process_dir], cwd=process_dir) print('Running foldx SequenceDetail') subprocess.call(['foldx' , '--command=SequenceDetail' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' , '--output-dir=' + process_dir], cwd=process_dir) print('\033[95mCOMPLETE: run FoldX (subprocess)\033[0m') print('\033[95mSTAGE: Print Networks (shell)\033[0m') for n in range(1,nmuts+1): print('\033[95mNETWORK:\033[0m', n) #print('\033[96mCommand:\033[0m runPrintNetworks.sh %s %s %s' % (pdbname, str(n), process_dir )) #with suppress(Exception): #foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR} print('Running foldx PrintNetworks for mutation', n) subprocess.call(['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--water=PREDICT' , '--vdwDesign=1' , '--output-dir=' + process_dir], cwd=process_dir) #subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir]) print('\033[95mCOMPLETE: Print Networks (shell)\033[0m') print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m') for n in range(1,nmuts+1): print('\033[95mMUTATION:\033[0m', n) print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir )) # FIXME: this is bad design and needs to be done in a pythonic way with suppress(Exception): subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir]) print('\033[95mCOMPLETE: Rename Mutation Files (shell)\033[0m') print('\033[95mSTAGE: Rename Files (shell)\033[0m') # FIXME: this is bad design and needs to be done in a pythonic way out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir]) print('\033[95mCOMPLETE: Rename Files (shell)\033[0m') if comp=='y': print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess)\033[0m') chain1=chainA chain2=chainB #with suppress(Exception): #subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir]) subprocess.call(['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' , '--output-dir=' + process_dir], cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout' ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt' shutil.copyfile(ac_source, ac_dest) for n in range(1,nmuts+1): print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess) for mutation:\033[0m', n) #with suppress(Exception): # subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir]) subprocess.call(['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' , '--output-dir=' + process_dir], cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout' ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair)' + str(n) +'_AC.txt' shutil.copyfile(ac_mut_source, ac_mut_dest) print('\033[95mCOMPLETE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n) interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM', 'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt' dGdata = pd.read_csv(dGdatafile, sep = '\t') ddG=[] print('ddG') print(len(dGdata)) for i in range(0,len(dGdata)): ddG.append(dGdata['total energy'].loc[i]) nint = len(interactions) wt_int = [] for i in interactions: filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt' wt_int.append(getInteractions(filename)) print('wt') print(wt_int) ntotal = nint+1 print(ntotal) print(nmuts) data = np.empty((ntotal,nmuts)) data[0] = ddG print(data) for i in range(0,len(interactions)): d=[] p=0 for n in range(1, nmuts+1): print(i) filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt' mut = getInteractions(filename) diff = wt_int[i] - mut print(diff) print(wt_int[i]) print(mut) d.append(diff) print(d) data[i+1] = d interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] print(interactions) IE = [] if comp=='y': wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt' wtE = getInteractionEnergy(wtfilename) print(wtE) for n in range(1,nmuts+1): print(n) filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt' mutE = getInteractionEnergy(filename) print(mutE) diff = wtE - mutE print(diff) IE.append(diff) print(IE) IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist) IEfilename = 'foldx_complexresults_'+pdbname+'.csv' IEresults.to_csv(IEfilename) print(len(IE)) data = np.append(data,[IE], axis = 0) print(data) interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy'] mut_file = process_dir + '/individual_list_' + pdbname + '.txt' with open(mut_file) as csvfile: readCSV = csv.reader(csvfile) mutlist = [] for row in readCSV: mut = row[0] mutlist.append(mut) print(mutlist) print(len(mutlist)) print(data) results = pd.DataFrame(data, columns = mutlist, index = interactions) results.append(ddG) #print(results.head()) # my style formatted results results2 = results.T # transpose df results2.index.name = 'mutationinformation' # assign name to index results2 = results2.reset_index() # turn it into a columns results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id) results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space results2.rename(columns = {'Distances': 'Contacts'}, inplace = True) # lower case columns results2.columns = results2.columns.str.lower() print('Writing file in the format below:\n' , results2.head() , '\nNo. of rows:', len(results2) , '\nNo. of cols:', len(results2.columns)) outputfilename = outfile_foldx #outputfilename = 'foldx_results_' + pdbname + '.csv' #results.to_csv(outputfilename) results2.to_csv(outputfilename, index = False) if __name__ == '__main__': main()