222 lines
7.9 KiB
Python
Executable file
222 lines
7.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
'''
|
|
Created on Tue Aug 6 12:56:03 2019
|
|
|
|
@author: tanu
|
|
'''
|
|
# FIXME: import dirs.py to get the basic dir paths available
|
|
#=======================================================================
|
|
# TASK: Read pdb file and check if numbering is continuous
|
|
#=======================================================================
|
|
#%% load libraries
|
|
import os, sys
|
|
import re
|
|
import pandas as pd
|
|
import numpy as np
|
|
from biopandas.pdb import PandasPdb
|
|
import argparse
|
|
#=======================================================================
|
|
#%% dir and local imports
|
|
homedir = os.path.expanduser('~')
|
|
# set working dir
|
|
os.getcwd()
|
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
os.getcwd()
|
|
#=======================================================================
|
|
#%% command line args
|
|
arg_parser = argparse.ArgumentParser()
|
|
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
|
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
|
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
|
|
|
|
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
|
|
|
|
|
args = arg_parser.parse_args()
|
|
#=======================================================================
|
|
#%% variable assignment: input and output paths & filenames
|
|
drug = args.drug
|
|
gene = args.gene
|
|
datadir = args.datadir
|
|
indir = args.input_dir
|
|
outdir = args.output_dir
|
|
make_dirs = args.make_dirs
|
|
|
|
#drug = 'ethambutol'
|
|
#gene = 'embB'
|
|
|
|
#%% input and output dirs and files
|
|
#=======
|
|
# dirs
|
|
#=======
|
|
if not datadir:
|
|
datadir = homedir + '/' + 'git/Data'
|
|
|
|
if not indir:
|
|
indir = datadir + '/' + drug + '/input'
|
|
|
|
if not outdir:
|
|
outdir = datadir + '/' + drug + '/output'
|
|
|
|
if make_dirs:
|
|
print('make_dirs is turned on, creating data dir:', datadir)
|
|
try:
|
|
os.makedirs(datadir, exist_ok = True)
|
|
print("Directory '%s' created successfully" %datadir)
|
|
except OSError as error:
|
|
print("Directory '%s' can not be created")
|
|
|
|
print('make_dirs is turned on, creating indir:', indir)
|
|
try:
|
|
os.makedirs(indir, exist_ok = True)
|
|
print("Directory '%s' created successfully" %indir)
|
|
except OSError as error:
|
|
print("Directory '%s' can not be created")
|
|
|
|
print('make_dirs is turned on, creating outdir:', outdir)
|
|
try:
|
|
os.makedirs(outdir, exist_ok = True)
|
|
print("Directory '%s' created successfully" %outdir)
|
|
except OSError as error:
|
|
print("Directory '%s' can not be created")
|
|
|
|
# handle missing dirs here
|
|
if not os.path.isdir(datadir):
|
|
print('ERROR: Data directory does not exist:', datadir
|
|
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
|
|
sys.exit()
|
|
if not os.path.isdir(indir):
|
|
print('ERROR: Input directory does not exist:', indir
|
|
, '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
|
|
sys.exit()
|
|
if not os.path.isdir(outdir):
|
|
print('ERROR: Output directory does not exist:', outdir
|
|
, '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
|
|
sys.exit()
|
|
|
|
# Requires
|
|
from reference_dict import up_3letter_aa_dict # CHECK DIR STRUC THERE!
|
|
#===================================================================
|
|
gene_match = gene + '_p.'
|
|
print('mut pattern for gene', gene, ':', gene_match)
|
|
#=======================================================================
|
|
#=======
|
|
# input
|
|
#=======
|
|
in_filename_pdb = gene.lower() + '_complex.pdb'
|
|
infile_pdb = indir + '/' + in_filename_pdb
|
|
print('Input file: ', infile_pdb
|
|
, '\n============================================================')
|
|
|
|
#=======
|
|
# output
|
|
#=======
|
|
# several output files: in respective sections at the time of outputting files
|
|
print('Output filename: in the respective sections'
|
|
, '\nOutput path: ', outdir
|
|
, '\n=============================================================')
|
|
|
|
#%%end of variable assignment for input and output files
|
|
#=======================================================================
|
|
# fetch pdb
|
|
#ppdb = PandasPdb().fetch_pdb('XXX')
|
|
ppdb = PandasPdb().read_pdb(infile_pdb)
|
|
|
|
print('PDB Code: %s' % ppdb.code)
|
|
print('PDB Header Line: %s' % ppdb.header)
|
|
print('\nRaw PDB file contents:\n\n%s\n...' % ppdb.pdb_text[:1000])
|
|
|
|
# extract the atom records into a df
|
|
pdb_atoms_df = ppdb.df['ATOM']
|
|
pdb_atoms_df.head(3)
|
|
|
|
# extract all residue numbers
|
|
pdb_resno_df = pdb_atoms_df[['chain_id', 'residue_number', 'residue_name']]
|
|
|
|
# extract unique residue numbers
|
|
#pdb_resno_df_u = pdb_resno_df.drop_duplicates(subset = ['chain_id', 'residue_number'], keep = 'first') #hurr durr
|
|
|
|
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
|
|
# adding three more cols
|
|
lookup_dict = dict()
|
|
for k, v in up_3letter_aa_dict.items():
|
|
lookup_dict[k] = v['one_letter_code']
|
|
#print(lookup_dict)
|
|
#wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
|
|
aa_1let = pdb_resno_df['residue_name']
|
|
#print(pdb_resno_df['residue_name'], ':', aa_1let)
|
|
pdb_resno_df['residue_name_1let'] = pdb_resno_df['residue_name'].map(lookup_dict)
|
|
|
|
|
|
# extract df with chain of interest
|
|
my_resno_df_all = pdb_resno_df[pdb_resno_df['chain_id'] == 'B']
|
|
my_resno_df = my_resno_df_all.drop_duplicates(subset = 'residue_number', keep = 'first')
|
|
print('Length of extracted chain for', gene.lower(), 'is:', len(my_resno_df) )
|
|
|
|
pdb_numbers = my_resno_df['residue_number'].tolist()
|
|
#%% Check continuity of residue numbers
|
|
|
|
# check if the residue numbering is continuous: as this will effect the 'valid'
|
|
# mcsm snps you can run
|
|
def CheckConsecutive(l):
|
|
n = len(l) - 1
|
|
return (sum(np.diff(sorted(l)) == 1) >= n)
|
|
|
|
def FindDiscontinuos(l):
|
|
#n = len(l) - 1
|
|
#return (sum(np.diff(sorted(l)) == 1) >= n)
|
|
check_cont = np.diff(sorted(l)) == 1 # array of True/False
|
|
n_discont = np.size(check_cont ) - np.count_nonzero(check_cont)
|
|
print('Continuity broken', n_discont, 'times'
|
|
, '\nFinding out which sites...')
|
|
discont = np.diff(sorted(l)) != 1
|
|
discont_i = list(np.where(discont)[0])
|
|
if len(discont_i) == n_discont:
|
|
my_indices = discont_i[0:]
|
|
my_discont_resno = [l[i] for i in my_indices]
|
|
#after = [resno + 1 for resno in my_discont_resno]
|
|
print('Extracting PDB residue number/s with start of discontinuity:', my_discont_resno)
|
|
return(my_discont_resno)
|
|
#%% Call above functions
|
|
#test_numbers = [1, 2, 3, 4, 5, 9,10, 20, 30]
|
|
|
|
CheckConsecutive(pdb_numbers)
|
|
|
|
#if CheckConsecutive(test_numbers):
|
|
if CheckConsecutive(pdb_numbers):
|
|
print('PASS: PDB numbering continuous')
|
|
else:
|
|
print('WARNING:PDB numbering discontinuous...!')
|
|
|
|
#FindDiscontinuos(test_numbers)
|
|
resno_check = FindDiscontinuos(pdb_numbers)
|
|
|
|
#%%
|
|
#resno_check = resno_check + [248,249]
|
|
pdb_numbers_i = [i for i, v in enumerate(pdb_numbers) if v in resno_check]
|
|
pdb_numbers_i
|
|
|
|
#ind_before = [i - 1 for i in pdb_numbers_i]
|
|
ind_after = [i + 1 for i in pdb_numbers_i]
|
|
|
|
ind_checks = pdb_numbers_i + ind_after
|
|
ind_checks
|
|
|
|
# extract values for these indices from pdb_numbers
|
|
resno_check = [pdb_numbers[i] for i in ind_checks]
|
|
resno_check
|
|
|
|
check_df = my_resno_df[my_resno_df['residue_number'].isin(resno_check)]
|
|
check_df
|
|
#%%======================================================================
|
|
# TODO: read muts file and extract structural pos numbers
|
|
|
|
#=======================================================================
|
|
print(u'\u2698' * 50,
|
|
'\nEnd of script: PDB data extraction and writing files'
|
|
'\n' + u'\u2698' * 50 )
|
|
#%% end of script
|