From bf3bb72bb884bb5c2321a1dfe354336ccc2deb82 Mon Sep 17 00:00:00 2001
From: mvaradi
Date: Sat, 13 Oct 2018 20:07:54 +0100
Subject: [PATCH] Adding residue index validator

---
 requirements.txt            |   3 +-
 tests/test_residue_index.py | 104 +++++++++++++++++++++++++
 validator/residue_index.py  | 146 ++++++++++++++++++++++++++++++++++++
 3 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_residue_index.py
 create mode 100644 validator/residue_index.py

diff --git a/requirements.txt b/requirements.txt
index 7b8f015..9dd5cad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-jsonschema
\ No newline at end of file
+jsonschema
+requests
\ No newline at end of file
diff --git a/tests/test_residue_index.py b/tests/test_residue_index.py
new file mode 100644
index 0000000..74bc717
--- /dev/null
+++ b/tests/test_residue_index.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+Copyright 2018 EMBL - European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied. See the License for the specific
+language governing permissions and limitations under the
+License.
+"""
+
+import json
+from unittest import TestCase
+
+from validator.residue_index import CheckResidueIndices
+
+with open("data/test_data.json", "r") as mock_data_file:
+    mock_data = json.load(mock_data_file)
+
+mock_data_no_pdb_id = {"foo": "bar"}
+
+mock_data_bad_numbering = {"pdb_id": "2aqa",
+                           "chains": [{"chain_label": "A",
+                                       "residues": [{"pdb_res_label": "2",
+                                                     "aa_type": "ALA"}]}]}
+
+
+def mock_get_residue_numbering_false(self):
+    return False
+
+
+def mock_get_residue_numbering_true(self):
+    return True
+
+
+def mock_compare_residue_number(self, foo, bar):
+    return False
+
+
+class TestCheckResidueIndices(TestCase):
+
+    def setUp(self):
+        self.cri = CheckResidueIndices(mock_data)
+
+    def test_loop_chains(self):
+        self.cri.get_residue_numbering = mock_get_residue_numbering_false
+        result = self.cri.loop_chains()
+        self.assertFalse(result)
+        self.cri.get_residue_numbering = mock_get_residue_numbering_true
+        result = self.cri.loop_chains()
+        self.assertTrue(result)
+        self.cri.pdb_id = None
+        self.assertFalse(self.cri.loop_chains())
+
+    def test_set_pdb_id(self):
+        self.assertIsNotNone(self.cri.set_pdb_id())
+        bad_cri = CheckResidueIndices(mock_data_no_pdb_id)
+        self.assertIsNone(bad_cri.set_pdb_id())
+
+    def test_check_numbering(self):
+        result = self.cri.check_numbering({}, {})
+        self.assertFalse(result)
+        self.cri.compare_residue_number = mock_compare_residue_number
+        result = self.cri.check_numbering({}, {"residues": [{"pdb_res_label": 0, "aa_type": "ALA"}]})
+        self.assertFalse(result)
+
+    def test_get_residue_numbering(self):
+        mock_data = {"chain_label": "A"}
+        self.cri.pdb_id = "1CBS"
+        self.cri.check_numbering = lambda x, y : True
+        result = self.cri.get_residue_numbering(mock_data)
+        self.assertTrue(result)
+        self.cri.pdb_id = "2H58"
+        result = self.cri.get_residue_numbering(mock_data)
+        self.assertFalse(result)
+
+    def test_recursive_loop(self):
+        result = self.cri.recursive_loop([{"foo": "bar"}], "foo", None, None)
+        self.assertFalse(result)
+
+    def test_with_bad_numbering(self):
+        cri_with_bad_numbering = CheckResidueIndices(mock_data_bad_numbering)
+        result = cri_with_bad_numbering.loop_chains()
+        self.assertFalse(result)
+
+    def test_process_residues(self):
+        result = self.cri.process_residues(
+            [{"author_residue_number": 1, "residue_name": "ALA", "author_insertion_code": ""}], "1", "ALA")
+        self.assertTrue(result)
+        result = self.cri.process_residues(
+            [{"author_residue_number": 1, "residue_name": "ALA", "author_insertion_code": "C"}], "1C", "ALA")
+        self.assertTrue(result)
+        result = self.cri.process_residues(
+            [{"author_residue_number": 2, "residue_name": "ALA", "author_insertion_code": ""}], "1", "ALA")
+        self.assertFalse(result)
+        result = self.cri.process_residues(
+            [{"author_residue_number": 1, "residue_name": "ALA", "author_insertion_code": ""}], "1", "HIS")
+        self.assertFalse(result)
\ No newline at end of file
diff --git a/validator/residue_index.py b/validator/residue_index.py
new file mode 100644
index 0000000..867fd99
--- /dev/null
+++ b/validator/residue_index.py
@@ -0,0 +1,146 @@
+import json
+import requests
+
+
+class CheckResidueIndices(object):
+    """
+    This class has all the methods required for validating the
+    residue indices that are in the user submitted data.
+    Each residue has an index number in the submitted JSON,
+    and each has to match the indices in the official PDB entry
+    This class relies on the PDBe API to get the current residue
+    indices
+    """
+
+    # Seconds to wait for the PDBe API before giving up on a request
+    request_timeout = 30
+
+    def __init__(self, data):
+        self.api_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/"
+        self.data = data
+        self.pdb_id = self.set_pdb_id()
+        self.mismatches = []
+        self.labels = ["residues", "chains", "molecules"]
+
+    def set_pdb_id(self):
+        """
+        Sets the PDB id based on the JSON data
+        :return: String, lower-cased PDB id or None
+        """
+        if "pdb_id" in self.data:
+            return self.data["pdb_id"].lower()
+        return None
+
+    def loop_chains(self):
+        """
+        Looping through all the chains that are present
+        in the JSON data
+        :return: True if the residue numbering is valid, False if not
+        """
+        if not self.pdb_id:
+            return False
+        for chain_data in self.data["chains"]:
+            if not self.get_residue_numbering(chain_data):
+                return False
+        return True
+
+    def get_residue_numbering(self, chain_data):
+        """
+        Gets the residue numbering from the PDBe API and
+        checks all residues
+        :param chain_data: JSON sub-data
+        :return: True if residue numbering is valid, False if not
+        """
+        chain_id = chain_data["chain_label"]
+        url = "%s%s/chain/%s" % (self.api_url, self.pdb_id, chain_id)
+        try:
+            # The timeout keeps an unresponsive API from blocking the validation
+            response = requests.get(url, timeout=self.request_timeout)
+            residue_numbering = json.loads(response.text)
+        except (requests.exceptions.RequestException, ValueError) as error:
+            # ValueError is raised when the response body is not valid JSON
+            self.mismatches.append("Could not get residue numbering from PDBe API: %s" % error)
+            return False
+        if not residue_numbering:
+            self.mismatches.append("No residues in PDB for this entry - probably obsoleted entry")
+            return False
+        return self.check_numbering(residue_numbering, chain_data)
+
+    def check_numbering(self, residue_numbering, chain_data):
+        """
+        This method loops through all the residues in a chain
+        and calls the residue index comparator method
+        :param residue_numbering: JSON data from PDBe API
+        :param chain_data: JSON data from user
+        :return: True if residue numbering is valid, False if not
+        """
+        if "residues" not in chain_data:
+            return False
+        for residue in chain_data["residues"]:
+            depositor_residue_number = residue["pdb_res_label"]
+            depositor_aa_type = residue["aa_type"]
+            if not self.compare_residue_number(depositor_residue_number, depositor_aa_type, residue_numbering):
+                return False
+        return True
+
+    def compare_residue_number(self, depositor_residue_number, depositor_aa_type, residue_numbering):
+        """
+        This method starts looping through the substructure of the PDBe API data
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :param residue_numbering: Residue numbering provided by PDBe API
+        :return: True if residue numbering is valid, False if not
+        """
+        molecules = residue_numbering[self.pdb_id]["molecules"]
+        return self.recursive_loop(molecules, "chains", depositor_residue_number, depositor_aa_type)
+
+    def recursive_loop(self, data, label, depositor_residue_number, depositor_aa_type):
+        """
+        A recursive loop that goes down to residue level and processes the
+        residues of every item in the data, not only those of the first one
+        :param data: JSON data
+        :param label: String, "chains" or "residues" depending on the level
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :return: True if residue numbering is valid, False if not
+        """
+        if label not in ("chains", "residues"):
+            return False
+        # Merge the sub-data of all items, so that no molecule or chain is skipped
+        sub_data = [sub_item for item in data for sub_item in item[label]]
+        if label == "chains":
+            return self.recursive_loop(sub_data, "residues", depositor_residue_number, depositor_aa_type)
+        return self.process_residues(sub_data, depositor_residue_number, depositor_aa_type)
+
+    def process_residues(self, residues, depositor_residue_number, depositor_aa_type):
+        """
+        This method grabs the residue information and calls the comparator if the
+        residue number of PDBe is the same as the user input
+        :param residues: Residue data from PDBe API
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :return: True if residue numbering is valid, False if not
+        """
+        for residue in residues:
+            # A missing (None) insertion code would otherwise be formatted as "None"
+            insertion_code = residue["author_insertion_code"] or ""
+            if "%i%s" % (residue["author_residue_number"], insertion_code) == depositor_residue_number:
+                return self.make_comparison(residue["residue_name"], depositor_aa_type, depositor_residue_number)
+        self.mismatches.append("residue numbering is completely mismatched between data and PDB entry")
+        return False
+
+    def make_comparison(self, residue_name, depositor_aa_type, depositor_residue_number):
+        """
+        This method does the comparison between two residues that have the same index number
+        The comparison is between amino acid code
+        :param residue_name: Residue amino acid code provided by PDBe API
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :param depositor_residue_number: Residue number provided by the user
+        :return: True if residue numbering is valid, False if not
+        """
+        if residue_name == depositor_aa_type:
+            return True
+        mismatch = "residue %s (%s) in data does not match residue %s (%s) in PDB" % (
+            depositor_residue_number, depositor_aa_type, depositor_residue_number, residue_name)
+        self.mismatches.append(mismatch)
+        return False
\ No newline at end of file