added scratch/

2020-05-22 12:03:11 +01:00 · 2020-05-22 12:03:11 +01:00 · 3a0ff9b35e
commit 3a0ff9b35e
parent 73762568e8
10 changed files with 42018 additions and 0 deletions
--- a/scripts/scratch/3byw.pdb
+++ b/scripts/scratch/3byw.pdb
--- a/scripts/scratch/7bvf_b.pdb
+++ b/scripts/scratch/7bvf_b.pdb
--- a/scripts/scratch/align_fastas.txt
+++ b/scripts/scratch/align_fastas.txt
@ -0,0 +1,15 @@
 >Mycobacterium tuberculosis H37Rv|Rv3423c|alr
 VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
 GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
 VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
 AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
 AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
 DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR
 >alr_complex | chain A | 371 aa
 LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
 APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
 LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
 VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
 GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
 RGRITRTYREA
--- a/scripts/scratch/align_template.py
+++ b/scripts/scratch/align_template.py
@ -0,0 +1,125 @@
 #!/usr/bin/env python3
 # useful links
 #https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
 #https://www.biostars.org/p/265338/
 from Bio import SeqIO
 from Bio import pairwise2
 from Bio.pairwise2 import format_alignment
 import re
 import os
 #%%
 os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
 #%%
 def myalign(ref_seq, pdb_seq):
    myalign_dict = {}
    alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)  
    #alignments = pairwise2.align.localxx(ref, struct)
    match = []
    for a, b in zip(alignments[0][0], alignments[0][1]):
            if a == b:
                    match.append('|')
            else:
                    match.append(' ')
    #print(match)
    print(alignments[0][0])
    print("".join(match))
    print(alignments[0][1])
    result_align = alignments[0][1]
    #print(result_align)
    print('===============================================================\n')
    # update dict
    myalign_dict.update({'aligned_fasta': result_align})
    aa_regex = '\w'
    m = re.search(aa_regex, result_align)
    #m = my_match.span()
    offset = m.start()
    offset_end = m.end()
    print('start of match:', offset
    , '\nend of match:', offset_end)
    print('===============================================================\n')
    # update dict
    myalign_dict.update({'start_match' : offset})
    myalign_dict.update({'end_match' : offset_end})
    return myalign_dict
 def main():
    """
    read file containing reference and pdb_sequence to align
    """
    my_dict = {}
    align_fastas_to_align = open('align_fastas.txt', 'r') 
    for record in SeqIO.parse(align_fastas_to_align,"fasta"):
         myid = record.id
         seq = str(record.seq)
         my_dict.update({myid : seq})
    my_keys = list(my_dict.keys())
    my_ref_seq = my_dict[my_keys[0]] # ref_seq
    my_pdb_seq = my_dict[my_keys[1]] # pdb_seq
    fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
    print('this is my result:', fasta_alignment)
 if __name__ == '__main__':
    main()
 #%% debug: individually
 my_dict = {}
 align_fastas_to_align = open('align_fastas.txt', 'r') 
 for record in SeqIO.parse(align_fastas_to_align,"fasta"):
     myid =record.id
     seq=str(record.seq)
     #print(myid, seq)
     my_dict.update({myid: seq})
 print(my_dict)
 print(my_dict.keys())
 my_keys = list(my_dict.keys())
 alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
 match = []
 for a, b in zip(alignments[0][0], alignments[0][1]):
    if a == b:
        match.append('|')
    else:
        match.append(' ')
 #print(match)
 print(alignments[0][0])
 print("".join(match))
 print(alignments[0][1])
 result_align = alignments[0][1]
 #print(result_align)
 print('===============================================================\n')
 #offset = ''
 aa_regex = '\w'
 m = re.search(aa_regex, result_align)
 #m = my_match.span()
 offset = m.start()
 offset_end = m.end()
 print('start of match:', offset, '\nend of match:', offset_end)
 print('===============================================================\n')
--- a/scripts/scratch/alr_complex.pdb
+++ b/scripts/scratch/alr_complex.pdb
--- a/scripts/scratch/chain_extract_template.py
+++ b/scripts/scratch/chain_extract_template.py
@ -0,0 +1,52 @@
 #!/usr/bin/python3
 #=======================================================================
 # TASK: select specified chains from the structure & save a cropped structure with
 # the selected chains. Useful for dimer, etc modelling.
 # link for saving each chain as a separate file
 # https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
 #=======================================================================
 #%%
 import os, sys
 from Bio.PDB import PDBParser, PDBIO, Select
 #%% homdir and curr dir and local imports
 homedir = os.path.expanduser('~') 
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/Data/ethambutol/input/')
 os.getcwd()
 #%%
 # Select() Method to return True for every chain in 'chains'
 class ChainExtract(Select):
    def __init__(self, chain):
        self.chain = chain
    def accept_chain(self, chain):
        #print dir(chain)
        if chain.id in self.chain:
            return 1
        else:
            return 0
 def main():
    p = PDBParser(PERMISSIVE=1)  
    structure = p.get_structure("3byw", "3byw.pdb")
    my_chains = ['G', 'H'] # specify selected chains
    c_names = ''.join(my_chains)
    pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'
    io = PDBIO()
    io.set_structure(structure)
    #io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
    io.save(pdb_chains_file, ChainExtract(my_chains))
 if __name__ == '__main__':
    main()
 #%%
 # test
 #my_chains = ['G', 'H'] # specify selected chains
 #foo = ''.join(my_chains) # to append to file name
 #pdb_chains_file = '_{}.pdb'.format(my_chains)
--- a/scripts/scratch/chain_splitter_template.py
+++ b/scripts/scratch/chain_splitter_template.py
@ -0,0 +1,49 @@
 #!/usr/bin/python3
 #=======================================================================
 # TASK: extract chain from pdb and save each chain as a separate file
 # link for saving each chain as a separate file
 # https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
 # command line args
 # https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
 #=======================================================================   
 #%%
 import os, sys
 from Bio.PDB import Select, PDBIO
 from Bio.PDB.PDBParser import PDBParser
 #%% homdir and curr dir and local imports
 homedir = os.path.expanduser('~') 
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 #%%
 class ChainSelect(Select):
    def __init__(self, chain):
        self.chain = chain
    def accept_chain(self, chain):
        if chain.get_id() == self.chain:
            return 1
        else:          
            return 0
 def main(): 
    chains = ['A','B','C','F']
    p = PDBParser(PERMISSIVE=1)       
    #structure = p.get_structure(pdb_file, pdb_file)
    structure = p.get_structure('/home/tanu/git/Data/ethambutol/input/3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb')
    #print('STRUCTURE:', structure.get_id())
 #    pdb_filename = print()
    for chain in chains:
        pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)         
        io = PDBIO()               
        io.set_structure(structure)
        io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
 # If run from command line...
 if __name__ == "__main__":
    main()
--- a/scripts/scratch/check_ligand.py
+++ b/scripts/scratch/check_ligand.py
@ -0,0 +1,45 @@
 #!/usr/bin/env python
 import os
 from biopandas.pdb import PandasPdb
 #%%
 homedir = os.path.expanduser('~')
 os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
 #%%
 file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']
 file_list = ['7bvf_b.pdb']
 #file_list = ['pnca_complex.pdb']
 file_list = ['alr_complex.pdb']
 BORING_LIGANDS = ["HOH","CA","SO4","IOD","NA","CL","GOL","PO4"]
 #%% df with list
 ligands_dict = {}
 for pdb_id in file_list:
    ppdb = PandasPdb()
    pdb_file = ppdb.read_pdb(pdb_id)        
    het = pdb_file.df['HETATM']
    het_list = list(set(het['residue_name']))
    ligands = [ l for l in het_list if l not in BORING_LIGANDS]
    lig_dict = {pdb_id:ligands}
    #lig_dict = {pdb_id:het_list} # include BORING_LIGANDS
    ligands_dict.update(lig_dict)
    print(ligands_dict)
    print('pdb_code:', pdb_file.code) # works only in case of valid pdb
    print('pdb_code:', pdb_file.pdb_path) #works for bespoke pdb but prints the full path
    print('pdb_code:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e filename
 #%%   test with one
 ppdb = PandasPdb()
 pdb_file = ppdb.read_pdb('7bvf_b.pdb')        
 het = pdb_file.df['HETATM']
 het_list = list(set(het['residue_name']))
 print(het_list)
 ligands = [ l for l in het_list if l not in BORING_LIGANDS]
 print(ligands) 
 #%% extract last part from file path
 print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
 print(os.path.basename('alr_complex.pdb'))
 foo = os.path.basename(pdb_file.pdb_path)
 print(foo)
--- a/scripts/examples/inspect.py
+++ b/scripts/examples/inspect.py
--- a/scripts/scratch/pnca_complex.pdb
+++ b/scripts/scratch/pnca_complex.pdb