From bf3bb72bb884bb5c2321a1dfe354336ccc2deb82 Mon Sep 17 00:00:00 2001
From: mvaradi <mvaradi@ebi.ac.uk>
Date: Sat, 13 Oct 2018 20:07:54 +0100
Subject: [PATCH] Adding residue index validator

---
 requirements.txt            |   3 +-
 tests/test_residue_index.py | 104 ++++++++++++++++++++++++++++
 validator/residue_index.py  | 134 ++++++++++++++++++++++++++++++++++++
 3 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_residue_index.py
 create mode 100644 validator/residue_index.py

diff --git a/requirements.txt b/requirements.txt
index 7b8f015..9dd5cad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-jsonschema
\ No newline at end of file
+jsonschema
+requests
\ No newline at end of file
diff --git a/tests/test_residue_index.py b/tests/test_residue_index.py
new file mode 100644
index 0000000..74bc717
--- /dev/null
+++ b/tests/test_residue_index.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+Copyright 2018 EMBL - European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied. See the License for the specific
+language governing permissions and limitations under the
+License.
+"""
+
+import json
+from unittest import TestCase
+
+from validator.residue_index import CheckResidueIndices
+
+with open("data/test_data.json", "r") as mock_data_file:  # path is relative to the working directory
+    mock_data = json.load(mock_data_file)
+
+mock_data_no_pdb_id = {"foo": "bar"}  # has no "pdb_id" key, so set_pdb_id() returns None for it
+
+mock_data_bad_numbering = {"pdb_id": "2aqa",  # test_with_bad_numbering expects this to fail validation
+                           "chains": [{"chain_label": "A",
+                                       "residues": [{"pdb_res_label": "2",
+                                                     "aa_type": "ALA"}]}]}
+
+
+def mock_get_residue_numbering_false(self):  # set on an instance, so "self" receives chain_data
+    return False
+
+
+def mock_get_residue_numbering_true(self):  # set on an instance, so "self" receives chain_data
+    return True
+
+
+def mock_compare_residue_number(self, foo, bar):  # set on an instance: takes the three call arguments
+    return False
+
+
+class TestCheckResidueIndices(TestCase):
+    """Unit tests of the CheckResidueIndices validator"""
+
+    def setUp(self):
+        self.cri = CheckResidueIndices(mock_data)
+
+    def test_loop_chains(self):
+        """The result follows the per-chain check and is False without a PDB id"""
+        self.cri.get_residue_numbering = mock_get_residue_numbering_false
+        self.assertFalse(self.cri.loop_chains())
+        self.cri.get_residue_numbering = mock_get_residue_numbering_true
+        self.assertTrue(self.cri.loop_chains())
+        self.cri.pdb_id = None
+        self.assertFalse(self.cri.loop_chains())
+
+    def test_set_pdb_id(self):
+        """A PDB id is only returned when the data has a "pdb_id" key"""
+        self.assertIsNotNone(self.cri.set_pdb_id())
+        self.assertIsNone(CheckResidueIndices(mock_data_no_pdb_id).set_pdb_id())
+
+    def test_check_numbering(self):
+        """Chains without residues, or with a mismatching residue, are invalid"""
+        self.assertFalse(self.cri.check_numbering({}, {}))
+        self.cri.compare_residue_number = mock_compare_residue_number
+        chain_data = {"residues": [{"pdb_res_label": 0, "aa_type": "ALA"}]}
+        self.assertFalse(self.cri.check_numbering({}, chain_data))
+
+    def test_get_residue_numbering(self):
+        """Calls the PDBe API: expected True for 1CBS and False for 2H58"""
+        chain_data = {"chain_label": "A"}
+        # check_numbering is stubbed, so only the API response decides the result
+        self.cri.check_numbering = lambda x, y: True
+        self.cri.pdb_id = "1CBS"
+        self.assertTrue(self.cri.get_residue_numbering(chain_data))
+        self.cri.pdb_id = "2H58"
+        self.assertFalse(self.cri.get_residue_numbering(chain_data))
+
+    def test_recursive_loop(self):
+        """A label other than "chains" or "residues" is never valid"""
+        self.assertFalse(self.cri.recursive_loop([{"foo": "bar"}], "foo", None, None))
+
+    def test_with_bad_numbering(self):
+        """The mock_data_bad_numbering entry is expected to fail the validation"""
+        cri_with_bad_numbering = CheckResidueIndices(mock_data_bad_numbering)
+        self.assertFalse(cri_with_bad_numbering.loop_chains())
+
+    def test_process_residues(self):
+        """Residue number with insertion code and amino acid type both have to match"""
+        cases = [
+            (1, "", "1", "ALA", True),
+            (1, "C", "1C", "ALA", True),
+            (2, "", "1", "ALA", False),
+            (1, "", "1", "HIS", False),
+        ]
+        for number, insertion_code, label, aa_type, expected in cases:
+            residues = [{"author_residue_number": number, "residue_name": "ALA",
+                         "author_insertion_code": insertion_code}]
+            result = self.cri.process_residues(residues, label, aa_type)
+            self.assertEqual(expected, bool(result))
\ No newline at end of file
diff --git a/validator/residue_index.py b/validator/residue_index.py
new file mode 100644
index 0000000..867fd99
--- /dev/null
+++ b/validator/residue_index.py
@@ -0,0 +1,134 @@
+import json
+import requests
+
+
+class CheckResidueIndices(object):
+    """
+    Validates the residue indices that are in the user submitted data.
+    Each residue has an index number in the submitted JSON, and each
+    has to match, in number and amino acid type, a residue of the same
+    chain in the official PDB entry. The current residue indices come
+    from the PDBe API; problems found are collected in self.mismatches
+    """
+
+    # Seconds to wait for the PDBe API before giving up on a request
+    api_timeout = 30
+
+    def __init__(self, data):
+        self.api_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/"
+        self.data = data
+        self.pdb_id = self.set_pdb_id()
+        self.mismatches = []
+        self.labels = ["residues", "chains", "molecules"]
+
+    def set_pdb_id(self):
+        """
+        Sets the PDB id based on the JSON data
+        :return: String, lower case PDB id, or None if the data has no "pdb_id"
+        """
+        return self.data["pdb_id"].lower() if "pdb_id" in self.data else None
+
+    def loop_chains(self):
+        """
+        Looping through all the chains that are present in the JSON data,
+        stopping at the first chain that fails the validation
+        :return: True if the residue numbering is valid, False if not
+        """
+        if not self.pdb_id:
+            return False
+        return all(self.get_residue_numbering(chain_data) for chain_data in self.data["chains"])
+
+    def get_residue_numbering(self, chain_data):
+        """
+        Gets the residue numbering of a chain from the PDBe API and
+        checks all residues of that chain
+        :param chain_data: JSON sub-data of one chain, has "chain_label"
+        :return: True if residue numbering is valid, False if not
+        """
+        chain_id = chain_data["chain_label"]
+        url = "%s%s/chain/%s" % (self.api_url, self.pdb_id, chain_id)
+        try:
+            # Without a timeout an unresponsive API would hang the validation
+            response = requests.get(url, timeout=self.api_timeout)
+            residue_numbering = json.loads(response.text)
+        except (requests.exceptions.RequestException, ValueError) as error:
+            self.mismatches.append("Could not get residue numbering from PDBe API: %s" % error)
+            return False
+        if not residue_numbering:
+            self.mismatches.append("No residues in PDB for this entry - probably obsoleted entry")
+            return False
+        return self.check_numbering(residue_numbering, chain_data)
+
+    def check_numbering(self, residue_numbering, chain_data):
+        """
+        This method loops through all the residues in a chain
+        and calls the residue index comparator method
+        :param residue_numbering: JSON data from PDBe API
+        :param chain_data: JSON data from user
+        :return: True if residue numbering is valid, False if not
+        """
+        if "residues" not in chain_data:
+            return False
+        for residue in chain_data["residues"]:
+            if not self.compare_residue_number(residue["pdb_res_label"], residue["aa_type"], residue_numbering):
+                return False
+        return True
+
+    def compare_residue_number(self, depositor_residue_number, depositor_aa_type, residue_numbering):
+        """
+        This method starts looping through the substructure of the PDBe API data
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :param residue_numbering: Residue numbering provided by PDBe API
+        :return: True if residue numbering is valid, False if not
+        """
+        molecules = residue_numbering[self.pdb_id]["molecules"]
+        return self.recursive_loop(molecules, "chains", depositor_residue_number, depositor_aa_type)
+
+    def recursive_loop(self, data, label, depositor_residue_number, depositor_aa_type):
+        """
+        A recursive loop that goes down to residue level and checks the user
+        residue against the residues of all the items, not only the first one
+        :param data: JSON data, list of molecules or list of chains
+        :param label: String, "chains" or "residues" depending on the level
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :return: True if residue numbering is valid, False if not
+        """
+        if label == "chains":
+            # Flatten the molecules so the chains of each of them get visited
+            chains = [chain for item in data for chain in item[label]]
+            return self.recursive_loop(chains, "residues", depositor_residue_number, depositor_aa_type)
+        if label != "residues":
+            return False
+        residues = [residue for item in data for residue in item[label]]
+        return self.process_residues(residues, depositor_residue_number, depositor_aa_type)
+
+    def process_residues(self, residues, depositor_residue_number, depositor_aa_type):
+        """
+        This method grabs the residue information and calls the comparator if the
+        residue number of PDBe is the same as the user input
+        :param residues: Residue data from PDBe API
+        :param depositor_residue_number: Residue number provided by the user
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :return: True if residue numbering is valid, False if not
+        """
+        for residue in residues:
+            if "%i%s" % (residue["author_residue_number"], residue["author_insertion_code"]) == depositor_residue_number:
+                return self.make_comparison(residue["residue_name"], depositor_aa_type, depositor_residue_number)
+        self.mismatches.append("residue numbering is completely mismatched between data and PDB entry")
+        return False
+
+    def make_comparison(self, residue_name, depositor_aa_type, depositor_residue_number):
+        """
+        Compares the amino acid codes of two residues that have the same index number
+        :param residue_name: Residue amino acid code provided by PDBe API
+        :param depositor_aa_type: Residue amino acid code provided by user
+        :param depositor_residue_number: Residue number provided by the user
+        :return: True if the amino acid codes match, False if not
+        """
+        if residue_name == depositor_aa_type:
+            return True
+        self.mismatches.append("residue %s (%s) in data does not match residue %s (%s) in PDB" % (
+            depositor_residue_number, depositor_aa_type, depositor_residue_number, residue_name))
+        return False
\ No newline at end of file