added scratch/

2020-05-22 12:03:11 +01:00 · 2020-05-22 12:03:11 +01:00 · f6fc6e47ab
commit f6fc6e47ab
parent 3fe1d35df5
10 changed files with 42018 additions and 0 deletions
--- a/scripts/scratch/3byw.pdb
+++ b/scripts/scratch/3byw.pdb
--- a/scripts/scratch/7bvf_b.pdb
+++ b/scripts/scratch/7bvf_b.pdb
--- a/scripts/scratch/align_fastas.txt
+++ b/scripts/scratch/align_fastas.txt
@ -0,0 +1,15 @@
+>Mycobacterium tuberculosis H37Rv|Rv3423c|alr
+VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
+GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
+VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
+AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
+AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
+DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR
+      
+>alr_complex | chain A | 371 aa
+LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
+APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
+LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
+VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
+GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
+RGRITRTYREA
--- a/scripts/scratch/align_template.py
+++ b/scripts/scratch/align_template.py
@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+# useful links
+#https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
+#https://www.biostars.org/p/265338/
+
+from Bio import SeqIO
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+import re
+import os
+
+#%%
+os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
+#%%
+def myalign(ref_seq, pdb_seq):
+    myalign_dict = {}
+    
+    alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)  
+    #alignments = pairwise2.align.localxx(ref, struct)
+
+    match = []
+
+    for a, b in zip(alignments[0][0], alignments[0][1]):
+            if a == b:
+                    match.append('|')
+            else:
+                    match.append(' ')
+
+
+    #print(match)
+    print(alignments[0][0])
+    print("".join(match))
+    print(alignments[0][1])
+
+    result_align = alignments[0][1]
+    #print(result_align)
+    print('===============================================================\n')
+
+    # update dict
+    myalign_dict.update({'aligned_fasta': result_align})
+    
+    
+    aa_regex = '\w'
+    m = re.search(aa_regex, result_align)
+    #m = my_match.span()
+    offset = m.start()
+    offset_end = m.end()
+
+    print('start of match:', offset
+    , '\nend of match:', offset_end)
+    print('===============================================================\n')
+    
+    # update dict
+    myalign_dict.update({'start_match' : offset})
+    myalign_dict.update({'end_match' : offset_end})
+
+    return myalign_dict
+
+   
+def main():
+    """
+    read file containing reference and pdb_sequence to align
+    """
+    my_dict = {}
+    align_fastas_to_align = open('align_fastas.txt', 'r') 
+    for record in SeqIO.parse(align_fastas_to_align,"fasta"):
+         myid = record.id
+         seq = str(record.seq)
+         my_dict.update({myid : seq})
+         
+    my_keys = list(my_dict.keys())
+    my_ref_seq = my_dict[my_keys[0]] # ref_seq
+    my_pdb_seq = my_dict[my_keys[1]] # pdb_seq
+    
+    fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
+    print('this is my result:', fasta_alignment)
+
+
+if __name__ == '__main__':
+    main()
+
+#%% debug: individually
+my_dict = {}
+align_fastas_to_align = open('align_fastas.txt', 'r') 
+for record in SeqIO.parse(align_fastas_to_align,"fasta"):
+     myid =record.id
+     seq=str(record.seq)
+     #print(myid, seq)
+     my_dict.update({myid: seq})
+     
+print(my_dict)
+print(my_dict.keys())
+
+my_keys = list(my_dict.keys())
+alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
+match = []
+
+for a, b in zip(alignments[0][0], alignments[0][1]):
+    if a == b:
+        match.append('|')
+    else:
+        match.append(' ')
+
+#print(match)
+print(alignments[0][0])
+print("".join(match))
+print(alignments[0][1])
+
+result_align = alignments[0][1]
+#print(result_align)
+print('===============================================================\n')
+
+#offset = ''
+aa_regex = '\w'
+m = re.search(aa_regex, result_align)
+#m = my_match.span()
+offset = m.start()
+offset_end = m.end()
+
+print('start of match:', offset, '\nend of match:', offset_end)
+print('===============================================================\n')
+
+    
+                         
--- a/scripts/scratch/alr_complex.pdb
+++ b/scripts/scratch/alr_complex.pdb
--- a/scripts/scratch/chain_extract_template.py
+++ b/scripts/scratch/chain_extract_template.py
@ -0,0 +1,52 @@
+#!/usr/bin/python3
+
+#=======================================================================
+# TASK: select specified chains from the structure & save a cropped structure with
+# the selected chains. Useful for dimer, etc modelling.
+
+# link for saving each chain as a separate file
+# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
+#=======================================================================
+#%%
+import os, sys
+from Bio.PDB import PDBParser, PDBIO, Select
+#%% homdir and curr dir and local imports
+homedir = os.path.expanduser('~') 
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/Data/ethambutol/input/')
+os.getcwd()
+#%%
+# Select() Method to return True for every chain in 'chains'
+class ChainExtract(Select):
+    def __init__(self, chain):
+        self.chain = chain
+        
+    def accept_chain(self, chain):
+        #print dir(chain)
+        if chain.id in self.chain:
+            return 1
+        else:
+            return 0
+             
+def main():
+    p = PDBParser(PERMISSIVE=1)  
+    structure = p.get_structure("3byw", "3byw.pdb")
+    
+    my_chains = ['G', 'H'] # specify selected chains
+    c_names = ''.join(my_chains)
+   
+    pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'
+    
+    io = PDBIO()
+    io.set_structure(structure)
+    #io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
+    io.save(pdb_chains_file, ChainExtract(my_chains))
+    
+if __name__ == '__main__':
+    main()
+#%%
+# test
+#my_chains = ['G', 'H'] # specify selected chains
+#foo = ''.join(my_chains) # to append to file name
+#pdb_chains_file = '_{}.pdb'.format(my_chains)
--- a/scripts/scratch/chain_splitter_template.py
+++ b/scripts/scratch/chain_splitter_template.py
@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+#=======================================================================
+# TASK: extract chain from pdb and save each chain as a separate file
+
+# link for saving each chain as a separate file
+# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
+# command line args
+# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
+#=======================================================================   
+
+#%%
+import os, sys
+from Bio.PDB import Select, PDBIO
+from Bio.PDB.PDBParser import PDBParser
+#%% homdir and curr dir and local imports
+homedir = os.path.expanduser('~') 
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#%%
+class ChainSelect(Select):
+    def __init__(self, chain):
+        self.chain = chain
+
+    def accept_chain(self, chain):
+        if chain.get_id() == self.chain:
+            return 1
+        else:          
+            return 0
+
+def main(): 
+    chains = ['A','B','C','F']
+    p = PDBParser(PERMISSIVE=1)       
+    #structure = p.get_structure(pdb_file, pdb_file)
+    structure = p.get_structure('/home/tanu/git/Data/ethambutol/input/3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb')
+    #print('STRUCTURE:', structure.get_id())
+#    pdb_filename = print()
+    for chain in chains:
+        pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)         
+                        
+        io = PDBIO()               
+        io.set_structure(structure)
+        io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
+
+# If run from command line...
+if __name__ == "__main__":
+    main()
--- a/scripts/scratch/check_ligand.py
+++ b/scripts/scratch/check_ligand.py
@ -0,0 +1,45 @@
+#!/usr/bin/env python
+import os
+from biopandas.pdb import PandasPdb
+
+#%%
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
+
+#%%
+file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']
+
+file_list = ['7bvf_b.pdb']
+#file_list = ['pnca_complex.pdb']
+file_list = ['alr_complex.pdb']
+BORING_LIGANDS = ["HOH","CA","SO4","IOD","NA","CL","GOL","PO4"]
+#%% df with list
+ligands_dict = {}
+for pdb_id in file_list:
+    ppdb = PandasPdb()
+    pdb_file = ppdb.read_pdb(pdb_id)        
+    het = pdb_file.df['HETATM']
+    het_list = list(set(het['residue_name']))
+    ligands = [ l for l in het_list if l not in BORING_LIGANDS]
+    lig_dict = {pdb_id:ligands}
+    #lig_dict = {pdb_id:het_list} # include BORING_LIGANDS
+    
+    ligands_dict.update(lig_dict)
+    print(ligands_dict)
+    print('pdb_code:', pdb_file.code) # works only in case of valid pdb
+    print('pdb_code:', pdb_file.pdb_path) #works for bespoke pdb but prints the full path
+    print('pdb_code:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e filename
+#%%   test with one
+ppdb = PandasPdb()
+pdb_file = ppdb.read_pdb('7bvf_b.pdb')        
+het = pdb_file.df['HETATM']
+het_list = list(set(het['residue_name']))
+print(het_list)
+ligands = [ l for l in het_list if l not in BORING_LIGANDS]
+print(ligands) 
+
+#%% extract last part from file path
+print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
+print(os.path.basename('alr_complex.pdb'))
+foo = os.path.basename(pdb_file.pdb_path)
+print(foo)
--- a/scripts/scratch/inspect.py
+++ b/scripts/scratch/inspect.py
@ -0,0 +1,81 @@
+#!/usr/bin/env python
+import os
+from Bio.PDB import *
+from biopandas.pdb import PandasPdb
+from collections import defaultdict, OrderedDict
+import pandas as pd
+from functools import reduce
+#%% see verison of pandas
+#print(pd.__version__)
+
+#%%
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
+# link
+#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
+#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
+
+# 3 way merge
+#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
+
+#%% Read data
+file_list = ['7bvf_b.pdb']
+file_list = ['3byw.pdb']
+#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
+#%%
+
+for pdb in file_list:
+    print(pdb)
+    p = PDBParser()
+    structure = p.get_structure(pdb, pdb)
+    for model in structure:
+        for chain in model:
+            for residue in chain:
+                for atom in residue:
+                    #print(atom)
+
+#%% biopandas
+pdb_dict = {}
+for pdb_id in file_list:
+    ppdb = PandasPdb()
+    pdb_file = ppdb.read_pdb(pdb_id)
+    #dir(pdb_file)
+    atm_df = pdb_file.df['ATOM']
+    #print('column names:', atm_df.columns)
+     
+    pdb_chains = list(set(atm_df['chain_id']))
+    print('pdb chains:', pdb_chains)
+    
+    total_chains = len(pdb_chains)
+    print('total no. of chains:', total_chains)
+    
+    chain_info = {}
+    #atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
+    c_start = atm_df.groupby('chain_id').residue_number.min()
+    print(c_start)
+    c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})    
+    
+    c_end = atm_df.groupby('chain_id').residue_number.max()
+    print(c_end)
+    c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})   
+    
+    c_length = atm_df.groupby('chain_id').residue_number.nunique()
+    print(c_length)
+    c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})   
+    
+    # combine 3 series into and assign 'chain_id' as a column
+    # using rlambda function works well (as it should work with whatever number of dataframes you want to merge)
+    # using pd.concat creates extra chain id cols
+    c_df = reduce(lambda left,right: pd.merge(left,right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])
+    
+    # convert df to dict with 'chain_id' as key and columns as list of values
+    chain_dict = c_df.set_index('chain_id').T.to_dict('list')
+    print(chain_dict)
+#%% Idea
+#protein_name: total_chains: 8, total ligands/hetatm = 3
+#df of chain details
+#chain start_res end_res len_chain
+#pdb tools script separate one chain 
+
+# remove water and 
--- a/scripts/scratch/pnca_complex.pdb
+++ b/scripts/scratch/pnca_complex.pdb