building script for inspecting pdb
This commit is contained in:
parent
bdad2dcfda
commit
73762568e8
3 changed files with 83 additions and 1 deletions
81
scripts/examples/inspect.py
Normal file
81
scripts/examples/inspect.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python
|
||||
import os
|
||||
from Bio.PDB import *
|
||||
from biopandas.pdb import PandasPdb
|
||||
from collections import defaultdict, OrderedDict
|
||||
import pandas as pd
|
||||
from functools import reduce
|
||||
#%% see version of pandas
|
||||
#print(pd.__version__)
|
||||
|
||||
#%%
|
||||
# Switch into the examples directory under the user's home before any file
# is opened; the PDB files further down are referred to by bare file name.
homedir = os.path.expanduser('~')
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/examples')
|
||||
# link
|
||||
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
|
||||
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
|
||||
|
||||
# 3 way merge
|
||||
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
|
||||
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
|
||||
|
||||
#%% Read data
|
||||
# PDB files to inspect, given relative to the current working directory.
# Exactly one assignment is active; the alternatives are kept commented out
# so that no value is assigned only to be overwritten on the next line.
#file_list = ['7bvf_b.pdb']
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw.pdb']
file_list = ['3byw.pdb']
|
||||
#%%
|
||||
|
||||
# Parse every file with Bio.PDB and walk the full hierarchy:
# structure -> model -> chain -> residue -> atom.
for pdb in file_list:
    print(pdb)
    p = PDBParser()
    # The file name is used both as the structure id and as the path to read.
    structure = p.get_structure(pdb, pdb)
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    # Uncomment the print to dump every atom while debugging.
                    #print(atom)
                    # A comment alone is not a statement: without this
                    # 'pass' the loop has no body and the file fails to parse.
                    pass
|
||||
|
||||
#%% biopandas
|
||||
# Chain summaries for every file processed, keyed by PDB file name:
#   {pdb_file: {chain_id: [start_res, end_res, chain_len]}}
pdb_dict = {}

# NOTE(review): the loop extent is reconstructed on the assumption that every
# statement down to print(chain_dict) runs once per file -- confirm against
# the original indentation.
for pdb_id in file_list:
    ppdb = PandasPdb()
    pdb_file = ppdb.read_pdb(pdb_id)
    #dir(pdb_file)
    atm_df = pdb_file.df['ATOM']
    #print('column names:', atm_df.columns)

    # sorted() makes the printed chain order repeatable; the iteration order
    # of a set of strings can differ from one interpreter run to the next.
    pdb_chains = sorted(set(atm_df['chain_id']))
    print('pdb chains:', pdb_chains)

    total_chains = len(pdb_chains)
    print('total no. of chains:', total_chains)

    #atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])

    # Group the residue numbers by chain once and reuse the grouping for all
    # three statistics below.
    res_by_chain = atm_df.groupby('chain_id').residue_number

    # lowest residue number in each chain
    c_start = res_by_chain.min()
    print(c_start)
    c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})

    # highest residue number in each chain
    c_end = res_by_chain.max()
    print(c_end)
    c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})

    # number of distinct residue numbers in each chain
    c_length = res_by_chain.nunique()
    print(c_length)
    c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})

    # Combine the three per-chain frames into one, joined on 'chain_id'.
    # reduce() with a merge lambda works for any number of dataframes;
    # pd.concat would create extra chain id cols.
    c_df = reduce(
        lambda left, right: pd.merge(left, right, on='chain_id'),
        [c_start_df, c_end_df, c_length_df],
    )

    # convert df to dict with 'chain_id' as key and columns as list of values
    chain_dict = c_df.set_index('chain_id').T.to_dict('list')
    print(chain_dict)

    # Keep the summary for this file; chain_dict itself is rebound on the
    # next iteration.
    pdb_dict[pdb_id] = chain_dict
|
||||
#%% Idea
|
||||
#protein_name: total_chains: 8, total ligands/hetatm = 3
|
||||
#df of chain details
|
||||
#chain start_res end_res len_chain
|
||||
#pdb tools script separate one chain
|
||||
|
||||
# remove water and
|
Loading…
Add table
Add a link
Reference in a new issue