From 2ac4ea8f5cc888327981728e1e69336ae0ca9b6f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 16 Nov 2020 16:01:31 +0000 Subject: [PATCH] added script to interrogate pdb files mainly for res numbers --- scripts/pdb_data_exraction.py | 222 ++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100755 scripts/pdb_data_exraction.py diff --git a/scripts/pdb_data_exraction.py b/scripts/pdb_data_exraction.py new file mode 100755 index 0000000..512e54e --- /dev/null +++ b/scripts/pdb_data_exraction.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: Read pdb file and check if numbering is continuous +#======================================================================= +#%% load libraries +import os, sys +import re +import pandas as pd +import numpy as np +from biopandas.pdb import PandasPdb +import argparse +#======================================================================= +#%% dir and local imports +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +make_dirs = args.make_dirs + +#drug = 'ethambutol' +#gene = 'embB' + +#%% input and output dirs and files +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +if make_dirs: + print('make_dirs is turned on, creating data dir:', datadir) + try: + os.makedirs(datadir, exist_ok = True) + print("Directory '%s' created successfully" %datadir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating indir:', indir) + try: + os.makedirs(indir, exist_ok = True) + print("Directory '%s' created successfully" %indir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating outdir:', outdir) + try: + os.makedirs(outdir, exist_ok = True) + print("Directory '%s' created successfully" %outdir) + except OSError as error: + print("Directory '%s' can not be created") + +# handle missing dirs here +if not os.path.isdir(datadir): + print('ERROR: Data directory does not exist:', datadir + , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs') + sys.exit() +if not os.path.isdir(indir): + print('ERROR: Input directory does not exist:', indir + , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs') + sys.exit() +if not os.path.isdir(outdir): + print('ERROR: Output directory does not exist:', outdir + , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs') + sys.exit() + +# Requires +from reference_dict import up_3letter_aa_dict # CHECK DIR STRUC THERE! +#=================================================================== +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) +#======================================================================= +#======= +# input +#======= +in_filename_pdb = gene.lower() + '_complex.pdb' +infile_pdb = indir + '/' + in_filename_pdb +print('Input file: ', infile_pdb + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#%%end of variable assignment for input and output files +#======================================================================= +# fetch pdb +#ppdb = PandasPdb().fetch_pdb('XXX') +ppdb = PandasPdb().read_pdb(infile_pdb) + +print('PDB Code: %s' % ppdb.code) +print('PDB Header Line: %s' % ppdb.header) +print('\nRaw PDB file contents:\n\n%s\n...' % ppdb.pdb_text[:1000]) + +# extract the atom records into a df +pdb_atoms_df = ppdb.df['ATOM'] +pdb_atoms_df.head(3) + +# extract all residue numbers +pdb_resno_df = pdb_atoms_df[['chain_id', 'residue_number', 'residue_name']] + +# extract unique residue numbers +#pdb_resno_df_u = pdb_resno_df.drop_duplicates(subset = ['chain_id', 'residue_number'], keep = 'first') #hurr durr + +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols +lookup_dict = dict() +for k, v in up_3letter_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #print(lookup_dict) + #wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + aa_1let = pdb_resno_df['residue_name'] + #print(pdb_resno_df['residue_name'], ':', aa_1let) + pdb_resno_df['residue_name_1let'] = pdb_resno_df['residue_name'].map(lookup_dict) + + +# extract df with chain of interest +my_resno_df_all = pdb_resno_df[pdb_resno_df['chain_id'] == 'B'] +my_resno_df = my_resno_df_all.drop_duplicates(subset = 'residue_number', keep = 'first') +print('Length of extracted chain for', gene.lower(), 'is:', len(my_resno_df) ) + +pdb_numbers = my_resno_df['residue_number'].tolist() +#%% Check continuity of residue numbers + +# check if the residue numbering is continuous: as this will effect the 'valid' +# mcsm snps you can run +def CheckConsecutive(l): + n = len(l) - 1 + return (sum(np.diff(sorted(l)) == 1) >= n) + +def FindDiscontinuos(l): + #n = len(l) - 1 + #return (sum(np.diff(sorted(l)) == 1) >= n) + check_cont = np.diff(sorted(l)) == 1 # array of True/False + n_discont = np.size(check_cont ) - np.count_nonzero(check_cont) + print('Continuity broken', n_discont, 'times' + , '\nFinding out which sites...') + discont = np.diff(sorted(l)) != 1 + discont_i = list(np.where(discont)[0]) + if len(discont_i) == n_discont: + my_indices = discont_i[0:] + my_discont_resno = [l[i] for i in my_indices] + #after = [resno + 1 for resno in my_discont_resno] + print('Extracting PDB residue number/s with start of discontinuity:', my_discont_resno) + return(my_discont_resno) +#%% Call above functions +#test_numbers = [1, 2, 3, 4, 5, 9,10, 20, 30] + +CheckConsecutive(pdb_numbers) + +#if CheckConsecutive(test_numbers): +if CheckConsecutive(pdb_numbers): + print('PASS: PDB numbering continuous') +else: + print('WARNING:PDB numbering discontinuous...!') + +#FindDiscontinuos(test_numbers) +resno_check = FindDiscontinuos(pdb_numbers) + +#%% +#resno_check = resno_check + [248,249] +pdb_numbers_i = [i for i, v in enumerate(pdb_numbers) if v in resno_check] +pdb_numbers_i + +#ind_before = [i - 1 for i in pdb_numbers_i] +ind_after = [i + 1 for i in pdb_numbers_i] + +ind_checks = pdb_numbers_i + ind_after +ind_checks + +# extract values for these indices from pdb_numbers +resno_check = [pdb_numbers[i] for i in ind_checks] +resno_check + +check_df = my_resno_df[my_resno_df['residue_number'].isin(resno_check)] +check_df +#%%====================================================================== +# TODO: read muts file and extract structural pos numbers + +#======================================================================= +print(u'\u2698' * 50, + '\nEnd of script: PDB data extraction and writing files' + '\n' + u'\u2698' * 50 ) +#%% end of script