LSHTM_analysis/scripts/pdb_data_exraction.py

222 lines
7.9 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: Read pdb file and check if numbering is continuous
#=======================================================================
#%% load libraries
import os, sys
import re
import pandas as pd
import numpy as np
from biopandas.pdb import PandasPdb
import argparse
#=======================================================================
#%% dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
make_dirs = args.make_dirs
#drug = 'ethambutol'
#gene = 'embB'
#%% input and output dirs and files
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
if make_dirs:
print('make_dirs is turned on, creating data dir:', datadir)
try:
os.makedirs(datadir, exist_ok = True)
print("Directory '%s' created successfully" %datadir)
except OSError as error:
print("Directory '%s' can not be created")
print('make_dirs is turned on, creating indir:', indir)
try:
os.makedirs(indir, exist_ok = True)
print("Directory '%s' created successfully" %indir)
except OSError as error:
print("Directory '%s' can not be created")
print('make_dirs is turned on, creating outdir:', outdir)
try:
os.makedirs(outdir, exist_ok = True)
print("Directory '%s' created successfully" %outdir)
except OSError as error:
print("Directory '%s' can not be created")
# handle missing dirs here
if not os.path.isdir(datadir):
print('ERROR: Data directory does not exist:', datadir
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
sys.exit()
if not os.path.isdir(indir):
print('ERROR: Input directory does not exist:', indir
, '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
sys.exit()
if not os.path.isdir(outdir):
print('ERROR: Output directory does not exist:', outdir
, '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
sys.exit()
# Requires
from reference_dict import up_3letter_aa_dict # CHECK DIR STRUC THERE!
#===================================================================
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
#=======================================================================
#=======
# input
#=======
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input file: ', infile_pdb
, '\n============================================================')
#=======
# output
#=======
# several output files: in respective sections at the time of outputting files
print('Output filename: in the respective sections'
, '\nOutput path: ', outdir
, '\n=============================================================')
#%%end of variable assignment for input and output files
#=======================================================================
# fetch pdb
#ppdb = PandasPdb().fetch_pdb('XXX')
ppdb = PandasPdb().read_pdb(infile_pdb)
print('PDB Code: %s' % ppdb.code)
print('PDB Header Line: %s' % ppdb.header)
print('\nRaw PDB file contents:\n\n%s\n...' % ppdb.pdb_text[:1000])
# extract the atom records into a df
pdb_atoms_df = ppdb.df['ATOM']
pdb_atoms_df.head(3)
# extract all residue numbers
pdb_resno_df = pdb_atoms_df[['chain_id', 'residue_number', 'residue_name']]
# extract unique residue numbers
#pdb_resno_df_u = pdb_resno_df.drop_duplicates(subset = ['chain_id', 'residue_number'], keep = 'first') #hurr durr
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
# adding three more cols
lookup_dict = dict()
for k, v in up_3letter_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
#print(lookup_dict)
#wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
aa_1let = pdb_resno_df['residue_name']
#print(pdb_resno_df['residue_name'], ':', aa_1let)
pdb_resno_df['residue_name_1let'] = pdb_resno_df['residue_name'].map(lookup_dict)
# extract df with chain of interest
my_resno_df_all = pdb_resno_df[pdb_resno_df['chain_id'] == 'B']
my_resno_df = my_resno_df_all.drop_duplicates(subset = 'residue_number', keep = 'first')
print('Length of extracted chain for', gene.lower(), 'is:', len(my_resno_df) )
pdb_numbers = my_resno_df['residue_number'].tolist()
#%% Check continuity of residue numbers
# check if the residue numbering is continuous: as this will effect the 'valid'
# mcsm snps you can run
def CheckConsecutive(l):
n = len(l) - 1
return (sum(np.diff(sorted(l)) == 1) >= n)
def FindDiscontinuos(l):
#n = len(l) - 1
#return (sum(np.diff(sorted(l)) == 1) >= n)
check_cont = np.diff(sorted(l)) == 1 # array of True/False
n_discont = np.size(check_cont ) - np.count_nonzero(check_cont)
print('Continuity broken', n_discont, 'times'
, '\nFinding out which sites...')
discont = np.diff(sorted(l)) != 1
discont_i = list(np.where(discont)[0])
if len(discont_i) == n_discont:
my_indices = discont_i[0:]
my_discont_resno = [l[i] for i in my_indices]
#after = [resno + 1 for resno in my_discont_resno]
print('Extracting PDB residue number/s with start of discontinuity:', my_discont_resno)
return(my_discont_resno)
#%% Call above functions
#test_numbers = [1, 2, 3, 4, 5, 9,10, 20, 30]
CheckConsecutive(pdb_numbers)
#if CheckConsecutive(test_numbers):
if CheckConsecutive(pdb_numbers):
print('PASS: PDB numbering continuous')
else:
print('WARNING:PDB numbering discontinuous...!')
#FindDiscontinuos(test_numbers)
resno_check = FindDiscontinuos(pdb_numbers)
#%%
#resno_check = resno_check + [248,249]
pdb_numbers_i = [i for i, v in enumerate(pdb_numbers) if v in resno_check]
pdb_numbers_i
#ind_before = [i - 1 for i in pdb_numbers_i]
ind_after = [i + 1 for i in pdb_numbers_i]
ind_checks = pdb_numbers_i + ind_after
ind_checks
# extract values for these indices from pdb_numbers
resno_check = [pdb_numbers[i] for i in ind_checks]
resno_check
check_df = my_resno_df[my_resno_df['residue_number'].isin(resno_check)]
check_df
#%%======================================================================
# TODO: read muts file and extract structural pos numbers
#=======================================================================
print(u'\u2698' * 50,
'\nEnd of script: PDB data extraction and writing files'
'\n' + u'\u2698' * 50 )
#%% end of script