feat: adding REST API server for prediction (#4)

varfish-org · Sep 7, 2023 · 8bb7516 · 8bb7516
1 parent 48d504c
commit 8bb7516
Show file tree

Hide file tree

Showing 12 changed files with 182 additions and 36 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v2
         with:
-          python-version: "3.8"
+          python-version: "3.9"
 
       - name: Install dependencies
         run: |
@@ -56,7 +56,7 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v2
         with:
-          python-version: "3.8"
+          python-version: "3.9"
 
       - name: Install dependencies
         run: |
@@ -74,7 +74,6 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - '3.8'
           - '3.9'
           - '3.10'
           - '3.11'

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -102,4 +102,4 @@ Before you submit a pull request, check that it meets these guidelines:
 1. The pull request should include tests.
 2. If the pull request adds functionality, the docs should be updated.
    Put your new functionality into a function with a docstring.
-3. The pull request should work for Python 3.8 to 3.11.
+3. The pull request should work for Python 3.9 to 3.11.
diff --git a/Makefile b/Makefile
@@ -39,3 +39,7 @@ flake8:
 mypy: export MYPYPATH=stubs
 mypy:
 	mypy cada_prio tests
+
+.PHONY: serve
+serve:
+	uvicorn cada_prio.rest_server:app --host 0.0.0.0 --port 8080 --reload
diff --git a/cada_prio/predict.py b/cada_prio/predict.py
@@ -2,34 +2,25 @@
 
 import json
 import os
+import pickle
 import typing
 
 import cattrs
 from gensim.models import Word2Vec
 from logzero import logger
-import networkx as nx
 import numpy as np
 
 from cada_prio import train_model
 
 
-def load_hgnc_info(path_hgnc_json: str) -> typing.List[train_model.GeneIds]:
-    result = []
-    with open(path_hgnc_json, "rt") as f:
-        for line in f:
-            result.append(cattrs.structure(json.loads(line), train_model.GeneIds))
-    return result
-
-
-def run(
-    path_model: str,
-    orig_hpo_terms: typing.List[str],
-    genes: typing.Optional[typing.List[str]] = None,
-) -> int:
-    # Load and prepare data
+def load_hgnc_info(path_model):
     logger.info("Loading HGNC info...")
     logger.info("- parsing")
-    hgnc_infos = load_hgnc_info(os.path.join(path_model, "hgnc_info.jsonl"))
+    hgnc_infos = []
+    path_hgnc_jsonl = os.path.join(path_model, "hgnc_info.jsonl")
+    with open(path_hgnc_jsonl, "rt") as f:
+        for line in f:
+            hgnc_infos.append(cattrs.structure(json.loads(line), train_model.GeneIds))
     logger.info("- create mapping")
     all_to_hgnc = {}
     for record in hgnc_infos:
@@ -39,21 +30,28 @@ def run(
         if record.ensembl_gene_id:
             all_to_hgnc[record.ensembl_gene_id] = record
     hgnc_info_by_id = {record.hgnc_id: record for record in hgnc_infos}
-    hgnc_ids = []
-    for gene in genes or []:
-        if gene in all_to_hgnc:
-            hgnc_ids.append(all_to_hgnc[gene].hgnc_id)
-        else:
-            logger.warning("could not resolve HGNC ID for gene %s", gene)
     logger.info("... done loading HGNC info")
+    return all_to_hgnc, hgnc_info_by_id
+
 
+def load_graph_model(path_model):
     logger.info("Loading graph...")
-    graph = nx.read_gpickle(os.path.join(path_model, "graph.gpickle"))
+    with open(os.path.join(path_model, "graph.gpickle"), "rb") as inputf:
+        graph = pickle.load(inputf)
     logger.info("... done loading graph")
     logger.info("Loading model...")
     model = Word2Vec.load(os.path.join(path_model, "model"))
     logger.info("... done loading model")
+    return graph, model
+
+
+class NoValidHpoTerms(ValueError):
+    pass
+
 
+def run_prediction(
+    orig_hpo_terms, orig_genes, all_to_hgnc, graph, model
+) -> typing.Tuple[typing.List[str], typing.Dict[str, float]]:
     # Lookup HPO term embeddings.
     hpo_terms = {}
     for hpo_term in orig_hpo_terms:
@@ -63,7 +61,16 @@ def run(
             hpo_terms[hpo_term] = model.wv[hpo_term]
     if not hpo_terms:
         logger.error("no valid HPO terms in model")
-        return 1
+        raise NoValidHpoTerms("no valid HPO terms in query")
+
+    # Map gene IDs to HGNC IDs
+    genes = []
+    for orig_gene in orig_genes or []:
+        print(all_to_hgnc.get(orig_gene))
+        if orig_gene in all_to_hgnc:
+            genes.append(all_to_hgnc[orig_gene].hgnc_id)
+        else:
+            genes.append(orig_gene)
 
     # Generate a score for each gene in the knowledge graph
     logger.info("Generating scores...")
@@ -81,11 +88,29 @@ def run(
                 this_gene_scores.append(score)
             gene_scores[hgnc_id] = sum(this_gene_scores) / len(hpo_terms)
 
+    sorted_scores = dict(sorted(gene_scores.items(), key=lambda x: x[1], reverse=True))
+    return list(hpo_terms.keys()), sorted_scores
+
+
+def run(
+    path_model: str,
+    orig_hpo_terms: typing.List[str],
+    orig_genes: typing.Optional[typing.List[str]] = None,
+) -> int:
+    # Load and prepare data
+    all_to_hgnc, hgnc_info_by_id = load_hgnc_info(path_model)
+    graph, model = load_graph_model(path_model)
+    try:
+        hpo_terms, sorted_scores = run_prediction(
+            orig_hpo_terms, orig_genes, all_to_hgnc, graph, model
+        )
+    except NoValidHpoTerms:
+        return 1
+
     # Write out results to stdout, largest score first
-    sorted_scores = sorted(gene_scores.items(), key=lambda x: x[1], reverse=True)
     print("# query (len=%d): %s" % (len(hpo_terms), ",".join(hpo_terms)))
     print("\t".join(["rank", "score", "gene_symbol", "ncbi_gene_id", "hgnc_id"]))
-    for rank, (hgnc_id, score) in enumerate(sorted_scores, start=1):
+    for rank, (hgnc_id, score) in enumerate(sorted_scores.items(), start=1):
         hgnc_info = hgnc_info_by_id[hgnc_id]
         print(
             "\t".join(

diff --git a/cada_prio/rest_server.py b/cada_prio/rest_server.py
@@ -0,0 +1,82 @@
+"""REST API for CADA using FastAPI."""
+from contextlib import asynccontextmanager
+import os
+import typing
+
+from dotenv import load_dotenv
+from fastapi import FastAPI, Query
+from pydantic import BaseModel
+from starlette.responses import Response
+
+import cada_prio
+from cada_prio import predict
+
+# Load environment
+env = os.environ
+load_dotenv()
+
+#: Debug mode
+DEBUG = env.get("CADA_DEBUG", "false").lower() in ("true", "1")
+#: Path to data / model
+PATH_DATA = env.get("CADA_PATH_DATA", "/data/cada")
+
+#: The CADA models, to be loaded on startup.
+GLOBAL_STATIC = {}
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Load the models
+    all_to_hgnc, hgnc_info_by_id = predict.load_hgnc_info(PATH_DATA)
+    GLOBAL_STATIC["all_to_hgnc"] = all_to_hgnc
+    GLOBAL_STATIC["hgnc_info_by_id"] = hgnc_info_by_id
+    graph, model = predict.load_graph_model(PATH_DATA)
+    GLOBAL_STATIC["graph"] = graph
+    GLOBAL_STATIC["model"] = model
+
+    yield
+
+    GLOBAL_STATIC.clear()
+
+
+app = FastAPI(lifespan=lifespan)
+
+
+# Register endpoint for returning CADA version.
+@app.get("/version")
+async def version():
+    return Response(content=cada_prio.__version__)
+
+
+class PredictionResult(BaseModel):
+    rank: int
+    score: float
+    gene_symbol: str
+    ncbi_gene_id: str
+    hgnc_id: str
+
+
+# Register endpoint for the prediction
+@app.get("/predict")
+async def handle_predict(
+    hpo_terms: typing.Annotated[typing.List[str], Query()],
+    genes: typing.Annotated[typing.Optional[typing.List[str]], Query()] = [],
+):
+    _, sorted_scores = predict.run_prediction(
+        hpo_terms,
+        genes,
+        GLOBAL_STATIC["all_to_hgnc"],
+        GLOBAL_STATIC["graph"],
+        GLOBAL_STATIC["model"],
+    )
+    hgnc_info_by_id = GLOBAL_STATIC["hgnc_info_by_id"]
+    return [
+        PredictionResult(
+            rank=rank,
+            score=score,
+            gene_symbol=hgnc_info_by_id[hgnc_id].symbol,
+            ncbi_gene_id=hgnc_info_by_id[hgnc_id].ncbi_gene_id,
+            hgnc_id=hgnc_info_by_id[hgnc_id].hgnc_id,
+        )
+        for rank, (hgnc_id, score) in enumerate(sorted_scores.items(), start=1)
+    ]
diff --git a/cada_prio/train_model.py b/cada_prio/train_model.py
@@ -3,6 +3,7 @@
 import itertools
 import json
 import os
+import pickle
 import typing
 import warnings
 
@@ -265,7 +266,8 @@ def write_graph_and_model(
 
     path_graph = os.path.join(path_out, "graph.gpickle")
     logger.info("Saving graph to %s...", path_graph)
-    nx.write_gpickle(training_graph, path_graph)
+    with open(path_graph, "wb") as outputf:
+        pickle.dump(training_graph, outputf)
     logger.info("... done saving graph")
 
     logger.info("Saving embedding to %s...", path_out)

diff --git a/requirements/base.txt b/requirements/base.txt
@@ -7,3 +7,6 @@ tqdm >=4.0
 pronto >=2.5, <3.0
 networkx
 node2vec >=0.4.6, <0.5
+uvicorn >=0.23.2
+fastapi >=0.103, <0.104
+python-dotenv >=1.0, <2.0
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -6,6 +6,8 @@ flake8 >=5.0.4, <7.0
 pytest
 pytest-cov
 pytest-snapshot
+pytest-asyncio
+httpx >=0.24, <0.25
 
 mypy ==1.5.1
 types-python-dateutil >=2.8.19.3

diff --git a/setup.py b/setup.py
@@ -47,7 +47,6 @@ def parse_requirements(path):
         "License :: OSI Approved :: MIT License",
         "Natural Language :: English",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",

diff --git a/tests/test_predict.py b/tests/test_predict.py
@@ -1,5 +1,5 @@
 from cada_prio import predict
 
 
-def test_predict_smoke_test(tmpdir):
-    predict.run("tests/data/model_smoke", "HP:0008551")
+def test_predict_smoke_test():
+    predict.run("tests/data/model_smoke", ["HP:0008551"])
diff --git a/tests/test_rest_server.py b/tests/test_rest_server.py
@@ -0,0 +1,30 @@
+from unittest import mock
+
+import pytest
+from starlette.testclient import TestClient
+
+from cada_prio import rest_server
+
+
+@mock.patch("cada_prio.rest_server.PATH_DATA", "tests/data/model_smoke")
+@pytest.mark.asyncio
+async def test_version():
+    with TestClient(rest_server.app) as client:
+        response = client.get("/version")
+        assert response.status_code == 200
+
+
+@mock.patch("cada_prio.rest_server.PATH_DATA", "tests/data/model_smoke")
+@pytest.mark.asyncio
+async def test_predict_with_gene():
+    with TestClient(rest_server.app) as client:
+        response = client.get("/predict/?hpo_terms=HP:0008551&hpo=HP:0000007&gene=MKS1")
+        assert response.status_code == 200
+
+
+@mock.patch("cada_prio.rest_server.PATH_DATA", "tests/data/model_smoke")
+@pytest.mark.asyncio
+async def test_predict_without_gene():
+    with TestClient(rest_server.app) as client:
+        response = client.get("/predict/?hpo_terms=HP:0008551&hpo=HP:0000007")
+        assert response.status_code == 200
diff --git a/utils/docker/Dockerfile b/utils/docker/Dockerfile
@@ -43,12 +43,12 @@ ENV PATH="/.venv/bin:$PATH"
 
 COPY cada_prio cada_prio
 COPY requirements requirements
-COPY setup.py requirements.txt README.md CHANGELOG.md .
+COPY setup.py requirements.txt README.md CHANGELOG.md ./
 RUN pip install .
 
 RUN useradd --create-home cada_prio
 WORKDIR /home/cada_prio
 USER cada_prio
 
-CMD ["uvicorn", "cada_prio.server:app", "--host", "0.0.0.0", "--port", "8080"]
+CMD ["uvicorn", "cada_prio.rest_server:app", "--host", "0.0.0.0", "--port", "8080"]
 EXPOSE 8080