#!/usr/bin/env python
import os
from Bio.PDB import *
from biopandas.pdb import PandasPdb
from collections import defaultdict, OrderedDict
import pandas as pd
from functools import reduce

#%% check version of pandas
#print(pd.__version__)

#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')

# links
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
# 3-way merge
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column

#%% Read data
#file_list = ['7bvf_b.pdb']
file_list = ['3byw.pdb']
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']

#%% Bio.PDB: walk the structure hierarchy (model -> chain -> residue -> atom)
for pdb in file_list:
    print(pdb)
    p = PDBParser()
    structure = p.get_structure(pdb, pdb)  # structure id and file name
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    #print(atom)
                    pass  # placeholder: uncomment the print above to list every atom

#%% biopandas: per-chain start residue, end residue and chain length
pdb_dict = {}
for pdb_id in file_list:
    ppdb = PandasPdb()
    pdb_file = ppdb.read_pdb(pdb_id)
    #dir(pdb_file)
    atm_df = pdb_file.df['ATOM']
    #print('column names:', atm_df.columns)

    pdb_chains = list(set(atm_df['chain_id']))
    print('pdb chains:', pdb_chains)
    total_chains = len(pdb_chains)
    print('total no. of chains:', total_chains)

    chain_info = {}
    #atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])

    # first residue number of each chain
    c_start = atm_df.groupby('chain_id').residue_number.min()
    print(c_start)
    c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})

    # last residue number of each chain
    c_end = atm_df.groupby('chain_id').residue_number.max()
    print(c_end)
    c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})

    # number of distinct residues in each chain
    c_length = atm_df.groupby('chain_id').residue_number.nunique()
    print(c_length)
    c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})

    # combine the three dataframes, keeping 'chain_id' as a column
    # reduce() with a lambda works well (it scales to any number of dataframes you want to merge)
    # using pd.concat creates extra chain_id columns
    c_df = reduce(lambda left, right: pd.merge(left, right, on = 'chain_id'),
                  [c_start_df, c_end_df, c_length_df])

    # convert df to dict with 'chain_id' as key and the columns as a list of values
    chain_dict = c_df.set_index('chain_id').T.to_dict('list')
    print(chain_dict)

    # collect the per-structure chain details
    pdb_dict[pdb_id] = chain_dict

#%% Idea
# protein_name: total_chains: 8, total ligands/hetatm = 3
# df of chain details:
#   chain  start_res  end_res  len_chain
# pdb tools script: separate one chain
# remove water and
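
#%% A minimal sketch of the 'Idea' above, using the same biopandas calls as the
# loop before it: count the non-water HETATM (ligand) residues, drop the waters,
# and write a single chain out to its own file. The chain id 'A' and the output
# filename '3byw_chainA.pdb' are assumptions made purely for illustration.
ppdb_eg = PandasPdb().read_pdb('3byw.pdb')

# total ligands/hetatm: unique HETATM residue names, excluding water (HOH)
het_df = ppdb_eg.df['HETATM']
ligands = het_df.loc[het_df['residue_name'] != 'HOH', 'residue_name'].unique()
print('total ligands/hetatm:', len(ligands), list(ligands))

# remove water records before writing anything out
ppdb_eg.df['HETATM'] = het_df[het_df['residue_name'] != 'HOH']

# separate one chain: keep only chain 'A' ATOM records (assumed chain id) and
# write them, together with the remaining non-water HETATM records, to a new pdb
ppdb_eg.df['ATOM'] = ppdb_eg.df['ATOM'][ppdb_eg.df['ATOM']['chain_id'] == 'A']
ppdb_eg.to_pdb(path = '3byw_chainA.pdb', records = ['ATOM', 'HETATM'], gz = False)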