diff --git a/scripts/examples/inspect.py b/scripts/examples/inspect.py
new file mode 100644
index 0000000..60d4667
--- /dev/null
+++ b/scripts/examples/inspect.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# Inspect PDB files: walk each structure with Bio.PDB, then use biopandas
+# to summarise every chain (first residue, last residue, residue count).
+#
+# NOTE(review): this file is named inspect.py. When it is run as a script its
+# directory is put first on sys.path, so it can shadow the standard library
+# `inspect` module for anything imported afterwards. Consider renaming it.
+import os
+from Bio.PDB import PDBParser
+from biopandas.pdb import PandasPdb
+from collections import defaultdict, OrderedDict
+import pandas as pd
+from functools import reduce
+#%% see version of pandas
+#print(pd.__version__)
+
+#%%
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
+# link
+#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
+#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
+
+# 3 way merge
+#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
+
+#%% Read data
+# only the last uncommented assignment takes effect
+#file_list = ['7bvf_b.pdb']
+file_list = ['3byw.pdb']
+#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
+#%%
+
+# Walk the model/chain/residue/atom hierarchy of each structure.
+for pdb in file_list:
+    print(pdb)
+    p = PDBParser()
+    structure = p.get_structure(pdb, pdb)
+    for model in structure:
+        for chain in model:
+            for residue in chain:
+                for atom in residue:
+                    # a loop body holding only a comment is a syntax error
+                    pass
+                    #print(atom)
+
+#%% biopandas
+# per-file results: file_list entry -> {chain_id: [start_res, end_res, chain_len]}
+pdb_dict = {}
+for pdb_id in file_list:
+    ppdb = PandasPdb()
+    pdb_file = ppdb.read_pdb(pdb_id)
+    #dir(pdb_file)
+    atm_df = pdb_file.df['ATOM']
+    #print('column names:', atm_df.columns)
+
+    # sorted so the printed chain order is the same on every run
+    pdb_chains = sorted(set(atm_df['chain_id']))
+    print('pdb chains:', pdb_chains)
+
+    total_chains = len(pdb_chains)
+    print('total no. of chains:', total_chains)
+
+    #atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
+    # lowest residue number seen in each chain
+    c_start = atm_df.groupby('chain_id').residue_number.min()
+    print(c_start)
+    c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})
+
+    # highest residue number seen in each chain
+    c_end = atm_df.groupby('chain_id').residue_number.max()
+    print(c_end)
+    c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})
+
+    # number of distinct residue numbers in each chain
+    c_length = atm_df.groupby('chain_id').residue_number.nunique()
+    print(c_length)
+    c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})
+
+    # combine the 3 dataframes into one, keeping 'chain_id' as a column
+    # using reduce with a lambda works for any number of dataframes to merge
+    # using pd.concat creates extra chain id cols
+    c_df = reduce(lambda left,right: pd.merge(left,right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])
+
+    # convert df to dict with 'chain_id' as key and columns as list of values
+    chain_dict = c_df.set_index('chain_id').T.to_dict('list')
+    print(chain_dict)
+    # keep this file's result so the next file does not overwrite it
+    pdb_dict[pdb_id] = chain_dict
+#%% Idea
+#protein_name: total_chains: 8, total ligands/hetatm = 3
+#df of chain details
+#chain start_res end_res len_chain
+#pdb tools script separate one chain
+
+# remove water and
diff --git a/scripts/pdbtools b/scripts/pdbtools
index 8c46611..881ff8f 160000
--- a/scripts/pdbtools
+++ b/scripts/pdbtools
@@ -1 +1 @@
-Subproject commit 8c46611c8ceb37b680bc7bbaa161f284f0742f24
+Subproject commit 881ff8f27aaf1db4266a84fb03baad3dab552c64
diff --git a/scripts/pdbtools_commands b/scripts/pdbtools_commands
index 9cf29da..28cb195 100644
--- a/scripts/pdbtools_commands
+++ b/scripts/pdbtools_commands
@@ -32,6 +32,7 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf.pdb
+/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb
 #======================================================
 # get torsion angles
 #======================================================