added scratch/
This commit is contained in:
parent
3fe1d35df5
commit
f6fc6e47ab
10 changed files with 42018 additions and 0 deletions
11289
scripts/scratch/3byw.pdb
Normal file
11289
scripts/scratch/3byw.pdb
Normal file
File diff suppressed because it is too large
Load diff
17357
scripts/scratch/7bvf_b.pdb
Normal file
17357
scripts/scratch/7bvf_b.pdb
Normal file
File diff suppressed because it is too large
Load diff
15
scripts/scratch/align_fastas.txt
Normal file
15
scripts/scratch/align_fastas.txt
Normal file
|
@ -0,0 +1,15 @@
|
|||
>Mycobacterium tuberculosis H37Rv|Rv3423c|alr
|
||||
VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
|
||||
GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
|
||||
VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
|
||||
AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
|
||||
AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
|
||||
DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR
|
||||
|
||||
>alr_complex | chain A | 371 aa
|
||||
LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
|
||||
APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
|
||||
LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
|
||||
VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
|
||||
GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
|
||||
RGRITRTYREA
|
125
scripts/scratch/align_template.py
Executable file
125
scripts/scratch/align_template.py
Executable file
|
@ -0,0 +1,125 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# useful links
|
||||
#https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
|
||||
#https://www.biostars.org/p/265338/
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio import pairwise2
|
||||
from Bio.pairwise2 import format_alignment
|
||||
import re
|
||||
import os
|
||||
|
||||
#%%
|
||||
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
|
||||
#%%
|
||||
def myalign(ref_seq, pdb_seq):
|
||||
myalign_dict = {}
|
||||
|
||||
alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
|
||||
#alignments = pairwise2.align.localxx(ref, struct)
|
||||
|
||||
match = []
|
||||
|
||||
for a, b in zip(alignments[0][0], alignments[0][1]):
|
||||
if a == b:
|
||||
match.append('|')
|
||||
else:
|
||||
match.append(' ')
|
||||
|
||||
|
||||
#print(match)
|
||||
print(alignments[0][0])
|
||||
print("".join(match))
|
||||
print(alignments[0][1])
|
||||
|
||||
result_align = alignments[0][1]
|
||||
#print(result_align)
|
||||
print('===============================================================\n')
|
||||
|
||||
# update dict
|
||||
myalign_dict.update({'aligned_fasta': result_align})
|
||||
|
||||
|
||||
aa_regex = '\w'
|
||||
m = re.search(aa_regex, result_align)
|
||||
#m = my_match.span()
|
||||
offset = m.start()
|
||||
offset_end = m.end()
|
||||
|
||||
print('start of match:', offset
|
||||
, '\nend of match:', offset_end)
|
||||
print('===============================================================\n')
|
||||
|
||||
# update dict
|
||||
myalign_dict.update({'start_match' : offset})
|
||||
myalign_dict.update({'end_match' : offset_end})
|
||||
|
||||
return myalign_dict
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
read file containing reference and pdb_sequence to align
|
||||
"""
|
||||
my_dict = {}
|
||||
align_fastas_to_align = open('align_fastas.txt', 'r')
|
||||
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
|
||||
myid = record.id
|
||||
seq = str(record.seq)
|
||||
my_dict.update({myid : seq})
|
||||
|
||||
my_keys = list(my_dict.keys())
|
||||
my_ref_seq = my_dict[my_keys[0]] # ref_seq
|
||||
my_pdb_seq = my_dict[my_keys[1]] # pdb_seq
|
||||
|
||||
fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
|
||||
print('this is my result:', fasta_alignment)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
#%% debug: individually
|
||||
my_dict = {}
|
||||
align_fastas_to_align = open('align_fastas.txt', 'r')
|
||||
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
|
||||
myid =record.id
|
||||
seq=str(record.seq)
|
||||
#print(myid, seq)
|
||||
my_dict.update({myid: seq})
|
||||
|
||||
print(my_dict)
|
||||
print(my_dict.keys())
|
||||
|
||||
my_keys = list(my_dict.keys())
|
||||
alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
|
||||
match = []
|
||||
|
||||
for a, b in zip(alignments[0][0], alignments[0][1]):
|
||||
if a == b:
|
||||
match.append('|')
|
||||
else:
|
||||
match.append(' ')
|
||||
|
||||
#print(match)
|
||||
print(alignments[0][0])
|
||||
print("".join(match))
|
||||
print(alignments[0][1])
|
||||
|
||||
result_align = alignments[0][1]
|
||||
#print(result_align)
|
||||
print('===============================================================\n')
|
||||
|
||||
#offset = ''
|
||||
aa_regex = '\w'
|
||||
m = re.search(aa_regex, result_align)
|
||||
#m = my_match.span()
|
||||
offset = m.start()
|
||||
offset_end = m.end()
|
||||
|
||||
print('start of match:', offset, '\nend of match:', offset_end)
|
||||
print('===============================================================\n')
|
||||
|
||||
|
||||
|
11179
scripts/scratch/alr_complex.pdb
Normal file
11179
scripts/scratch/alr_complex.pdb
Normal file
File diff suppressed because it is too large
Load diff
52
scripts/scratch/chain_extract_template.py
Normal file
52
scripts/scratch/chain_extract_template.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
#=======================================================================
|
||||
# TASK: select specified chains from the structure & save a cropped structure with
|
||||
# the selected chains. Useful for dimer, etc modelling.
|
||||
|
||||
# link for saving each chain as a separate file
|
||||
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
|
||||
#=======================================================================
|
||||
#%%
|
||||
import os, sys
|
||||
from Bio.PDB import PDBParser, PDBIO, Select
|
||||
#%% homdir and curr dir and local imports
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/Data/ethambutol/input/')
|
||||
os.getcwd()
|
||||
#%%
|
||||
# Select() Method to return True for every chain in 'chains'
|
||||
class ChainExtract(Select):
|
||||
def __init__(self, chain):
|
||||
self.chain = chain
|
||||
|
||||
def accept_chain(self, chain):
|
||||
#print dir(chain)
|
||||
if chain.id in self.chain:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def main():
|
||||
p = PDBParser(PERMISSIVE=1)
|
||||
structure = p.get_structure("3byw", "3byw.pdb")
|
||||
|
||||
my_chains = ['G', 'H'] # specify selected chains
|
||||
c_names = ''.join(my_chains)
|
||||
|
||||
pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'
|
||||
|
||||
io = PDBIO()
|
||||
io.set_structure(structure)
|
||||
#io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
|
||||
io.save(pdb_chains_file, ChainExtract(my_chains))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
#%%
|
||||
# test
|
||||
#my_chains = ['G', 'H'] # specify selected chains
|
||||
#foo = ''.join(my_chains) # to append to file name
|
||||
#pdb_chains_file = '_{}.pdb'.format(my_chains)
|
49
scripts/scratch/chain_splitter_template.py
Executable file
49
scripts/scratch/chain_splitter_template.py
Executable file
|
@ -0,0 +1,49 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
#=======================================================================
|
||||
# TASK: extract chain from pdb and save each chain as a separate file
|
||||
|
||||
# link for saving each chain as a separate file
|
||||
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
|
||||
# command line args
|
||||
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
|
||||
#=======================================================================
|
||||
|
||||
#%%
|
||||
import os, sys
|
||||
from Bio.PDB import Select, PDBIO
|
||||
from Bio.PDB.PDBParser import PDBParser
|
||||
#%% homdir and curr dir and local imports
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
#%%
|
||||
class ChainSelect(Select):
|
||||
def __init__(self, chain):
|
||||
self.chain = chain
|
||||
|
||||
def accept_chain(self, chain):
|
||||
if chain.get_id() == self.chain:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def main():
|
||||
chains = ['A','B','C','F']
|
||||
p = PDBParser(PERMISSIVE=1)
|
||||
#structure = p.get_structure(pdb_file, pdb_file)
|
||||
structure = p.get_structure('/home/tanu/git/Data/ethambutol/input/3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb')
|
||||
#print('STRUCTURE:', structure.get_id())
|
||||
# pdb_filename = print()
|
||||
for chain in chains:
|
||||
pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)
|
||||
|
||||
io = PDBIO()
|
||||
io.set_structure(structure)
|
||||
io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
|
||||
|
||||
# If run from command line...
|
||||
if __name__ == "__main__":
|
||||
main()
|
45
scripts/scratch/check_ligand.py
Executable file
45
scripts/scratch/check_ligand.py
Executable file
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python
|
||||
import os
|
||||
from biopandas.pdb import PandasPdb
|
||||
|
||||
#%%
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
|
||||
|
||||
#%%
|
||||
file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']
|
||||
|
||||
file_list = ['7bvf_b.pdb']
|
||||
#file_list = ['pnca_complex.pdb']
|
||||
file_list = ['alr_complex.pdb']
|
||||
BORING_LIGANDS = ["HOH","CA","SO4","IOD","NA","CL","GOL","PO4"]
|
||||
#%% df with list
|
||||
ligands_dict = {}
|
||||
for pdb_id in file_list:
|
||||
ppdb = PandasPdb()
|
||||
pdb_file = ppdb.read_pdb(pdb_id)
|
||||
het = pdb_file.df['HETATM']
|
||||
het_list = list(set(het['residue_name']))
|
||||
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
|
||||
lig_dict = {pdb_id:ligands}
|
||||
#lig_dict = {pdb_id:het_list} # include BORING_LIGANDS
|
||||
|
||||
ligands_dict.update(lig_dict)
|
||||
print(ligands_dict)
|
||||
print('pdb_code:', pdb_file.code) # works only in case of valid pdb
|
||||
print('pdb_code:', pdb_file.pdb_path) #works for bespoke pdb but prints the full path
|
||||
print('pdb_code:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e filename
|
||||
#%% test with one
|
||||
ppdb = PandasPdb()
|
||||
pdb_file = ppdb.read_pdb('7bvf_b.pdb')
|
||||
het = pdb_file.df['HETATM']
|
||||
het_list = list(set(het['residue_name']))
|
||||
print(het_list)
|
||||
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
|
||||
print(ligands)
|
||||
|
||||
#%% extract last part from file path
|
||||
print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
|
||||
print(os.path.basename('alr_complex.pdb'))
|
||||
foo = os.path.basename(pdb_file.pdb_path)
|
||||
print(foo)
|
81
scripts/scratch/inspect.py
Normal file
81
scripts/scratch/inspect.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python
|
||||
import os
|
||||
from Bio.PDB import *
|
||||
from biopandas.pdb import PandasPdb
|
||||
from collections import defaultdict, OrderedDict
|
||||
import pandas as pd
|
||||
from functools import reduce
|
||||
#%% see verison of pandas
|
||||
#print(pd.__version__)
|
||||
|
||||
#%%
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
|
||||
# link
|
||||
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
|
||||
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
|
||||
|
||||
# 3 way merge
|
||||
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
|
||||
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
|
||||
|
||||
#%% Read data
|
||||
file_list = ['7bvf_b.pdb']
|
||||
file_list = ['3byw.pdb']
|
||||
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
|
||||
#%%
|
||||
|
||||
for pdb in file_list:
|
||||
print(pdb)
|
||||
p = PDBParser()
|
||||
structure = p.get_structure(pdb, pdb)
|
||||
for model in structure:
|
||||
for chain in model:
|
||||
for residue in chain:
|
||||
for atom in residue:
|
||||
#print(atom)
|
||||
|
||||
#%% biopandas
|
||||
pdb_dict = {}
|
||||
for pdb_id in file_list:
|
||||
ppdb = PandasPdb()
|
||||
pdb_file = ppdb.read_pdb(pdb_id)
|
||||
#dir(pdb_file)
|
||||
atm_df = pdb_file.df['ATOM']
|
||||
#print('column names:', atm_df.columns)
|
||||
|
||||
pdb_chains = list(set(atm_df['chain_id']))
|
||||
print('pdb chains:', pdb_chains)
|
||||
|
||||
total_chains = len(pdb_chains)
|
||||
print('total no. of chains:', total_chains)
|
||||
|
||||
chain_info = {}
|
||||
#atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
|
||||
c_start = atm_df.groupby('chain_id').residue_number.min()
|
||||
print(c_start)
|
||||
c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})
|
||||
|
||||
c_end = atm_df.groupby('chain_id').residue_number.max()
|
||||
print(c_end)
|
||||
c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})
|
||||
|
||||
c_length = atm_df.groupby('chain_id').residue_number.nunique()
|
||||
print(c_length)
|
||||
c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})
|
||||
|
||||
# combine 3 series into and assign 'chain_id' as a column
|
||||
# using rlambda function works well (as it should work with whatever number of dataframes you want to merge)
|
||||
# using pd.concat creates extra chain id cols
|
||||
c_df = reduce(lambda left,right: pd.merge(left,right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])
|
||||
|
||||
# convert df to dict with 'chain_id' as key and columns as list of values
|
||||
chain_dict = c_df.set_index('chain_id').T.to_dict('list')
|
||||
print(chain_dict)
|
||||
#%% Idea
|
||||
#protein_name: total_chains: 8, total ligands/hetatm = 3
|
||||
#df of chain details
|
||||
#chain start_res end_res len_chain
|
||||
#pdb tools script separate one chain
|
||||
|
||||
# remove water and
|
1907
scripts/scratch/pnca_complex.pdb
Normal file
1907
scripts/scratch/pnca_complex.pdb
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue