refactor foldx pipeline to include:

* command-line args * creating necessary dirs automagically * code cleanup, syntax errors, etc etc
2020-06-30 17:14:30 +01:00 · 2020-06-30 17:14:30 +01:00 · 973a1a33da
commit 973a1a33da
parent e8a66a7a94
12 changed files with 700 additions and 56 deletions
--- a/foldx/runFoldx.py
+++ b/foldx/runFoldx.py
@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+import subprocess
+import os
+import numpy as np
+import pandas as pd
+from contextlib import suppress
+from pathlib import Path
+import re
+import csv
+import argparse
+#https://realpython.com/python-pathlib/
+
+# FIXME
+#strong dependency of file and path names
+#cannot pass file with path. Need to pass them separately
+#assumptions made for dir struc as standard
+#datadir + drug + input
+
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
+os.getcwd()
+
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+
+arg_parser.add_argument('-d', '--drug',     help = 'drug name', default = 'pyrazinamide')
+arg_parser.add_argument('-g', '--gene',     help = 'gene name', default = 'pncA') # case sensitive
+
+
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + drug + input', default = None)
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + drug + output', default = None)
+
+
+arg_parser.add_argument('-f', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called gene_complex.pdb', default = None)
+arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called gene_test_snps.csv exists', default = None)
+
+arg_parser.add_argument('-c1', '--chain1',    help = 'Chain1 ID', default = 'A') # case sensitive
+arg_parser.add_argument('-c2', '--chain2',    help = 'Chain2 ID', default = 'B') # case sensitive
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
+#%%=====================================================================
+# Command Line Options
+drug            = args.drug
+gene            = args.gene
+indir           = args.input_dir
+outdir          = args.output_dir
+mut_filename    = args.mutation_file
+chainA          = args.chain1
+chainB          = args.chain2
+pdb_filename    = args.pdb_file
+
+# os.path.splitext will fail interestingly with file.pdb.txt.zip
+#pdb_name = os.path.splitext(pdb_file)[0]
+# Just the filename, thanks
+#pdb_name = Path(in_filename_pdb).stem
+
+#==========
+# dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+if not indir:
+    indir = datadir + '/' + drug + '/' + 'input'
+
+if not outdir:
+    outdir = datadir + '/' + drug + '/' + 'output'
+# FIXME: 
+process_dir = datadir + '/' + drug +'/' + 'processing'
+# FIXME: this is a temporary directory and should be correctly handled
+os.mkdir(process_dir)
+#=======
+# input
+#=======
+# FIXME
+
+
+if pdb_filename:
+    pdb_name = Path(pdb_filename).stem
+else:
+    pdb_filename = gene.lower() + '_complex.pdb'
+    pdb_name = Path(pdb_filename).stem
+
+infile_pdb = indir + '/' + pdb_filename
+actual_pdb_filename = Path(infile_pdb).name
+
+if mut_filename:
+    mutation_file = mut_filename
+else:
+    #mutation_file =  gene.lower() + '_mcsm_snps.csv' #real
+    mutation_file =  gene.lower() + '_test_snps.csv' #test
+
+infile_muts = outdir + '/' + mutation_file
+
+#=======
+# output 
+#=======
+out_filename = gene.lower() + '_foldx_results.csv'
+outfile_foldx =  outdir + '/' + out_filename
+
+print('Arguments being passed:'
+, '\nDrug:', args.drug
+, '\ngene:', args.gene
+, '\ninput dir:', indir
+, '\noutput dir:', outdir
+, '\npdb file:', infile_pdb
+, '\npdb name:', pdb_name
+, '\nactual pdb name:', actual_pdb_filename
+, '\nmutation file:', infile_muts
+, '\nchain1:', args.chain1
+, '\noutput file:', outfile_foldx
+, '\n=============================================================')
+#=======================================================================
+
+def getInteractionEnergy(filename):
+    data = pd.read_csv(filename,sep = '\t')
+    return data['Interaction Energy'].loc[0]
+
+def getInteractions(filename):
+    data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
+    contactList = getIndexes(data,1)
+    number = len(contactList)
+    return number
+
+def formatMuts(mut_file,pdbname):
+    with open(mut_file) as csvfile:
+        readCSV = csv.reader(csvfile)
+        muts = []
+        for row in readCSV:
+                mut = row[0]
+                muts.append(mut)
+        
+    mut_list = []
+    outfile = process_dir + '/individual_list_' + pdbname + '.txt'
+    with open(outfile, 'w') as output:
+        for m in muts:
+                print(m)
+                mut = m[:1] + chainA+ m[1:] 
+                mut_list.append(mut)
+                mut = mut + ';'
+                print(mut)
+                output.write(mut)
+                output.write('\n')
+    return mut_list
+
+def getIndexes(data, value):
+    colnames = data.columns.values
+    listOfPos = list()
+    result = data.isin([value])
+    result.columns = colnames
+    seriesdata = result.any()
+    columnNames = list(seriesdata[seriesdata==True].index)
+    for col in columnNames:
+        rows = list(result[col][result[col]==True].index)
+        
+        for row in rows:
+            listOfPos.append((row,col))
+    
+    return listOfPos
+
+def loadFiles(df):
+    # load a text file in to np matrix
+    resultList = []
+    f = open(df,'r')
+    for line in f:
+        line = line.rstrip('\n')
+        aVals = line.split('\t')
+        fVals = list(map(np.float32, sVals))
+        resultList.append(fVals)
+    f.close()
+    return np.asarray(resultList, dtype=np.float32)
+
+#=======================================================================    
+def main():
+    pdbname = pdb_name
+    comp = '' # for complex only
+    mut_filename = infile_muts #pnca_test_snps.csv
+    mutlist = formatMuts(mut_filename, pdbname)
+
+    print(mutlist)
+    nmuts = len(mutlist)
+    print(nmuts)
+    print(mutlist)
+    print('start')
+    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
+    subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
+    
+    print('end')
+    output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
+    
+    for n in range(1,nmuts+1):
+        print(n)
+        with suppress(Exception):
+            subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
+        
+    for n in range(1,nmuts+1):
+        print(n)
+        with suppress(Exception):
+            subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
+            
+    out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
+
+    if comp=='y':
+        chain1=chainA
+        chain2=chainB
+        with suppress(Exception):
+            subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
+        for n in range(1,nmuts+1):
+            with suppress(Exception):
+                subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
+
+    interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
+                    'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
+                    'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
+    
+    dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
+    dGdata = pd.read_csv(dGdatafile, sep = '\t')
+    
+    ddG=[]
+    print('ddG')
+    print(len(dGdata))
+    for i in range(0,len(dGdata)):
+        ddG.append(dGdata['total energy'].loc[i])
+    
+
+    nint = len(interactions)
+    wt_int = []
+
+    for i in interactions:
+        filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
+        wt_int.append(getInteractions(filename))
+    print('wt')
+    print(wt_int)
+    
+    ntotal = nint+1
+    print(ntotal)
+    print(nmuts)
+    data = np.empty((ntotal,nmuts))
+    data[0] = ddG
+    print(data)
+    for i in range(0,len(interactions)):
+        d=[]
+        p=0
+        for n in range(1, nmuts+1):
+            print(i)
+            filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
+            mut = getInteractions(filename)
+            diff = wt_int[i] - mut
+            print(diff)
+            print(wt_int[i])
+            print(mut)
+            d.append(diff)
+        print(d)
+        data[i+1] = d    
+        
+    interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',              'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
+'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']   
+
+    print(interactions)
+
+    IE = []
+    if comp=='y':
+        wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
+        wtE = getInteractionEnergy(wtfilename)
+        print(wtE)
+        for n in range(1,nmuts+1):
+            print(n)
+            filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
+            mutE = getInteractionEnergy(filename)
+            print(mutE)
+            diff = wtE - mutE
+            print(diff)
+            IE.append(diff)
+        print(IE)
+        IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
+        IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
+        IEresults.to_csv(IEfilename)
+        print(len(IE))
+        data = np.append(data,[IE], axis = 0)
+        print(data)
+        interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',                'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
+'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']  
+
+    mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
+    with open(mut_file) as csvfile:
+        readCSV = csv.reader(csvfile)
+        mutlist = []
+        for row in readCSV:
+                mut = row[0]
+                mutlist.append(mut)
+    print(mutlist)
+    print(len(mutlist))
+    print(data)
+    results = pd.DataFrame(data, columns = mutlist, index = interactions)
+    results.append(ddG)
+    #print(results.head())
+    
+    # my style formatted results
+    results2 = results.T # transpose df
+    results2.index.name = 'mutationinformation' # assign name to index
+    results2 = results2.reset_index() # turn it into a columns
+  
+    results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
+    results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
+    
+    results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
+        
+    # lower case columns
+    results2.columns = results2.columns.str.lower()
+    
+    print('Writing file in the format below:\n'
+        , results2.head()
+        , '\nNo. of rows:', len(results2)
+        , '\nNo. of cols:', len(results2.columns))
+    
+    outputfilename = outfile_foldx   
+    #outputfilename = 'foldx_results_' + pdbname + '.csv'
+    #results.to_csv(outputfilename)
+    results2.to_csv(outputfilename, index = False)
+    
+if __name__ == '__main__':
+    main()