#!/usr/bin/env python3 import subprocess import os import numpy as np import pandas as pd from contextlib import suppress from pathlib import Path import re import csv import argparse #https://realpython.com/python-pathlib/ # FIXME #strong dependency of file and path names #cannot pass file with path. Need to pass them separately #assumptions made for dir struc as standard #datadir + drug + input #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/foldx/') os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' #gene_match = gene + '_p.' #%%===================================================================== # Command line options drug = args.drug gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir process_dir = args.process_dir mut_filename = args.mutation_file chainA = args.chain1 chainB = args.chain2 pdb_filename = args.pdb_file # os.path.splitext will fail interestingly with file.pdb.txt.zip #pdb_name = os.path.splitext(pdb_file)[0] # Just the filename, thanks #pdb_name = Path(in_filename_pdb).stem #============== # directories #============== if not datadir: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/input' if not outdir: outdir = datadir + '/' + drug + '/output' #TODO: perhaps better handled by refactoring code to prevent generating lots of output files! if not process_dir: process_dir = datadir + '/' + drug +'/' + 'processing' #======= # input #======= # FIXME if pdb_filename: pdb_name = Path(pdb_filename).stem else: pdb_filename = gene.lower() + '_complex.pdb' pdb_name = Path(pdb_filename).stem infile_pdb = indir + '/' + pdb_filename actual_pdb_filename = Path(infile_pdb).name if mut_filename: mutation_file = mut_filename else: mutation_file = gene.lower() + '_mcsm_snps.csv' infile_muts = outdir + '/' + mutation_file #======= # output #======= out_filename = gene.lower() + '_foldx.csv' outfile_foldx = outdir + '/' + out_filename print('Arguments being passed:' , '\nDrug:', args.drug , '\ngene:', args.gene , '\ninput dir:', indir , '\noutput dir:', outdir , '\npdb file:', infile_pdb , '\npdb name:', pdb_name , '\nactual pdb name:', actual_pdb_filename , '\nmutation file:', infile_muts , '\nchain1:', args.chain1 , '\noutput file:', outfile_foldx , '\n=============================================================') #======================================================================= def getInteractionEnergy(filename): data = pd.read_csv(filename,sep = '\t') return data['Interaction Energy'].loc[0] def getInteractions(filename): data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t') contactList = getIndexes(data,1) number = len(contactList) return number def formatMuts(mut_file,pdbname): with open(mut_file) as csvfile: readCSV = csv.reader(csvfile) muts = [] for row in readCSV: mut = row[0] muts.append(mut) mut_list = [] outfile = process_dir + '/individual_list_' + pdbname + '.txt' with open(outfile, 'w') as output: for m in muts: print(m) mut = m[:1] + chainA+ m[1:] mut_list.append(mut) mut = mut + ';' print(mut) output.write(mut) output.write('\n') return mut_list def getIndexes(data, value): colnames = data.columns.values listOfPos = list() result = data.isin([value]) result.columns = colnames seriesdata = result.any() columnNames = list(seriesdata[seriesdata==True].index) for col in columnNames: rows = list(result[col][result[col]==True].index) for row in rows: listOfPos.append((row,col)) return listOfPos def loadFiles(df): # load a text file in to np matrix resultList = [] f = open(df,'r') for line in f: line = line.rstrip('\n') aVals = line.split('\t') fVals = list(map(np.float32, sVals)) resultList.append(fVals) f.close() return np.asarray(resultList, dtype=np.float32) #======================================================================= def main(): pdbname = pdb_name comp = '' # for complex only mut_filename = infile_muts #pnca_mcsm_snps.csv mutlist = formatMuts(mut_filename, pdbname) print(mutlist) nmuts = len(mutlist) print(nmuts) print(mutlist) print('start') #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir]) subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir]) print('end') output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir]) for n in range(1,nmuts+1): print(n) with suppress(Exception): subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir]) for n in range(1,nmuts+1): print(n) with suppress(Exception): subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir]) out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir]) if comp=='y': chain1=chainA chain2=chainB with suppress(Exception): subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir]) for n in range(1,nmuts+1): with suppress(Exception): subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir]) interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM', 'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt' dGdata = pd.read_csv(dGdatafile, sep = '\t') ddG=[] print('ddG') print(len(dGdata)) for i in range(0,len(dGdata)): ddG.append(dGdata['total energy'].loc[i]) nint = len(interactions) wt_int = [] for i in interactions: filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt' wt_int.append(getInteractions(filename)) print('wt') print(wt_int) ntotal = nint+1 print(ntotal) print(nmuts) data = np.empty((ntotal,nmuts)) data[0] = ddG print(data) for i in range(0,len(interactions)): d=[] p=0 for n in range(1, nmuts+1): print(i) filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt' mut = getInteractions(filename) diff = wt_int[i] - mut print(diff) print(wt_int[i]) print(mut) d.append(diff) print(d) data[i+1] = d interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM', 'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS'] print(interactions) IE = [] if comp=='y': wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt' wtE = getInteractionEnergy(wtfilename) print(wtE) for n in range(1,nmuts+1): print(n) filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt' mutE = getInteractionEnergy(filename) print(mutE) diff = wtE - mutE print(diff) IE.append(diff) print(IE) IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist) IEfilename = 'foldx_complexresults_'+pdbname+'.csv' IEresults.to_csv(IEfilename) print(len(IE)) data = np.append(data,[IE], axis = 0) print(data) interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM', 'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy'] mut_file = process_dir + '/individual_list_' + pdbname + '.txt' with open(mut_file) as csvfile: readCSV = csv.reader(csvfile) mutlist = [] for row in readCSV: mut = row[0] mutlist.append(mut) print(mutlist) print(len(mutlist)) print(data) results = pd.DataFrame(data, columns = mutlist, index = interactions) results.append(ddG) #print(results.head()) # my style formatted results results2 = results.T # transpose df results2.index.name = 'mutationinformation' # assign name to index results2 = results2.reset_index() # turn it into a columns results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id) results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space results2.rename(columns = {'Distances': 'Contacts'}, inplace = True) # lower case columns results2.columns = results2.columns.str.lower() print('Writing file in the format below:\n' , results2.head() , '\nNo. of rows:', len(results2) , '\nNo. of cols:', len(results2.columns)) outputfilename = outfile_foldx #outputfilename = 'foldx_results_' + pdbname + '.csv' #results.to_csv(outputfilename) results2.to_csv(outputfilename, index = False) if __name__ == '__main__': main()