
Feat/llm fact-checking in ranking based selector #555

Open
wants to merge 38 commits into base: dev
Changes from 28 commits
Commits
38 commits
0f5e5e8
first commit
smilni Aug 10, 2023
dd2a93f
fix format of output
smilni Aug 10, 2023
02c711e
fixes
smilni Aug 10, 2023
16d6f20
fact-checking annotator, working version
smilni Aug 14, 2023
82f98a2
hyp considered incorrect after first contradiction
smilni Aug 14, 2023
c5f0274
better logging & not checking some skills
smilni Aug 14, 2023
98f0f20
modifying response selector to include fact-checking
smilni Aug 14, 2023
1fe10da
added default ENABLE_FACT_CHECKING=0 to Dockerfile
smilni Aug 15, 2023
0ca6999
added default FACTUAL_CONFORMITY_SERVICE_TIMEOUT=0 to Dockerfile
smilni Aug 15, 2023
769dba1
improved logging + minor fixes
smilni Aug 15, 2023
c852ac3
upd timeout
smilni Aug 15, 2023
58d426d
update README for ranking_based_response_selector by adding fact-chec…
smilni Aug 15, 2023
0b5b157
removed or modified obsolete files
smilni Aug 15, 2023
f4c3b6b
test file for fact checking
smilni Aug 15, 2023
eab1017
moved SKILLS_NOT_TO_FACT_CHECK, updated README for response selector
smilni Aug 15, 2023
11094d1
add README.md for annotator
smilni Aug 15, 2023
add0a27
code style & moved SKILLS_NOT_TO_FACT_CHECK
smilni Aug 15, 2023
0524a72
moving EXTERNAL_SKILLS from args to common
smilni Aug 15, 2023
722d187
moving EXTERNAL_SKILLS from args to common again
smilni Aug 15, 2023
0e7e441
improve logging and make SERVICE_PORT an arg
smilni Aug 15, 2023
fd5943a
component cards for annotator
smilni Aug 15, 2023
a9772b5
component cards for fact-checking annotator and ranking-based-respons…
smilni Aug 15, 2023
16bec65
codestyle upd
smilni Aug 15, 2023
4b02fe1
fix typo
smilni Aug 15, 2023
64e9119
revert accidental change in ranking_based_response_selector/test.py
smilni Aug 15, 2023
ecf897c
remove debugging in dp_formatters
smilni Aug 15, 2023
165989a
Merge branch 'dev' into feat/llm_in_ranking_based_selector
smilni Aug 15, 2023
4bab168
added to components.tsv
smilni Aug 15, 2023
04466ef
ranking-based-response-selector-fact-checking correct name everywhere
smilni Aug 24, 2023
be6113d
added fact-checking to runtests
smilni Aug 24, 2023
f1fcfe4
fix ranking based response selector path
smilni Aug 24, 2023
0d552ad
Merge branch 'dev' into feat/llm_in_ranking_based_selector
smilni Sep 5, 2023
968f979
ranking-based-response-selector-fact-checking in tests
smilni Sep 5, 2023
88e2660
typo fix
smilni Sep 26, 2023
76b20da
Merge branch 'dev' into feat/llm_in_ranking_based_selector
smilni Sep 26, 2023
695dcbe
minor fixes, mainly container name
smilni Sep 26, 2023
7a4f978
add SERVICE_PORT and SERVICE_NAME everywhere
smilni Sep 27, 2023
8ccd3e0
small change in test for more visible effect
smilni Sep 27, 2023
21 changes: 21 additions & 0 deletions annotators/fact_checking/Dockerfile
@@ -0,0 +1,21 @@
FROM python:3.9.16
WORKDIR /src

COPY annotators/fact_checking/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

ARG SERVICE_PORT
ENV SERVICE_PORT ${SERVICE_PORT}
ARG GENERATIVE_SERVICE_URL
ENV GENERATIVE_SERVICE_URL ${GENERATIVE_SERVICE_URL}
ARG GENERATIVE_TIMEOUT=5
ENV GENERATIVE_TIMEOUT ${GENERATIVE_TIMEOUT}
ARG GENERATIVE_SERVICE_CONFIG
ENV GENERATIVE_SERVICE_CONFIG ${GENERATIVE_SERVICE_CONFIG}
ARG ENVVARS_TO_SEND
ENV ENVVARS_TO_SEND ${ENVVARS_TO_SEND}

COPY annotators/fact_checking /src
COPY common /src/common

CMD gunicorn --workers=1 server:app -b 0.0.0.0:8182 --timeout=1200
16 changes: 16 additions & 0 deletions annotators/fact_checking/README.md
@@ -0,0 +1,16 @@
# Fact Checking

## Description

Fact Checking conducts basic fact-checking of response candidates. As of now, it considers all hypotheses derived from external sources correct. Internally generated hypotheses are fact-checked by ensuring that they do not contradict any of the external hypotheses. For example, if `dff_google_api_skill`, which relies on Google as a source of external knowledge, responds _"Person X is 25 years old"_ and some solely LLM-based skill provides a hallucinated response _"Person X is 23 years old"_, the second hypothesis is considered incorrect as it contradicts the first, external one.

NB: Scripted responses from `dummy_skill` and `dff_intent_responder_skill` are not fact-checked for the sake of efficiency and are always deemed correct.

## Parameters

```
ENVVARS_TO_SEND: comma-separated list of API keys to be passed to the LLM service as env variables
GENERATIVE_SERVICE_URL: URL of the LLM service to utilize for fact-checking
GENERATIVE_TIMEOUT: timeout (in seconds) for requests to the LLM service
GENERATIVE_SERVICE_CONFIG: configuration file with generative parameters to utilize
```
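The pairwise contradiction check described in the README can be sketched in isolation. This is a simplified sketch, not the PR's actual service: `ask_llm` is a hypothetical stand-in for the real prompted-LLM request, while the skill lists and the prompt wording mirror the defaults introduced in this PR.

```python
EXTERNAL_SKILLS = ["factoid_qa", "dff_google_api_skill"]
SKILLS_NOT_TO_FACT_CHECK = ["dummy_skill", "dff_intent_responder_skill"]


def check_hypotheses(hypotheses, ask_llm):
    """Label each hypothesis "Correct"/"Incorrect" following the logic above."""
    external_facts = [h["text"] for h in hypotheses if h["skill_name"] in EXTERNAL_SKILLS]
    results = []
    for hyp in hypotheses:
        skill = hyp["skill_name"]
        if skill in EXTERNAL_SKILLS or skill in SKILLS_NOT_TO_FACT_CHECK or not external_facts:
            # external and scripted hypotheses are trusted as-is;
            # with no external facts there is nothing to check against
            results.append("Correct")
            continue
        # an internal hypothesis is rejected after the first contradiction found
        contradicts = any(
            "yes" in ask_llm(
                f'Hypothesis: "{hyp["text"]}"\n'
                f"Does Hypothesis contradict Fact that {fact}? "
                "Always answer only Yes or No without explanation."
            ).lower()
            for fact in external_facts
        )
        results.append("Incorrect" if contradicts else "Correct")
    return results
```

With a stubbed `ask_llm`, this reproduces the expected labels from the PR's `test.py`: external and `dummy_skill` hypotheses pass through as "Correct", while a contradicting LLM-generated hypothesis comes back "Incorrect".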
9 changes: 9 additions & 0 deletions annotators/fact_checking/requirements.txt
@@ -0,0 +1,9 @@
flask==1.1.1
itsdangerous==2.0.1
gunicorn==19.9.0
requests==2.22.0
sentry-sdk[flask]==0.14.1
healthcheck==1.3.3
jinja2<=3.0.3
Werkzeug<=2.0.3
deeppavlov==1.1.1
103 changes: 103 additions & 0 deletions annotators/fact_checking/server.py
@@ -0,0 +1,103 @@
import logging
from os import getenv
import sentry_sdk
import json
from flask import Flask, jsonify, request
from sentry_sdk.integrations.flask import FlaskIntegration
from common.prompts import send_request_to_prompted_generative_service, compose_sending_variables
from common.response_selection import EXTERNAL_SKILLS

# logging here because it conflicts with tf

logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
sentry_sdk.init(dsn=getenv("SENTRY_DSN"), integrations=[FlaskIntegration()])
app = Flask(__name__)

ENVVARS_TO_SEND = getenv("ENVVARS_TO_SEND", None)
ENVVARS_TO_SEND = [] if ENVVARS_TO_SEND is None else ENVVARS_TO_SEND.split(",")
GENERATIVE_SERVICE_URL = getenv("GENERATIVE_SERVICE_URL")
GENERATIVE_TIMEOUT = int(getenv("GENERATIVE_TIMEOUT"))
GENERATIVE_SERVICE_CONFIG = getenv("GENERATIVE_SERVICE_CONFIG")
if GENERATIVE_SERVICE_CONFIG:
    with open(f"common/generative_configs/{GENERATIVE_SERVICE_CONFIG}", "r") as f:
        GENERATIVE_SERVICE_CONFIG = json.load(f)
SKILLS_NOT_TO_FACT_CHECK = ["dummy_skill", "dff_intent_responder_skill"]


def check_hyp_with_llm(curr_prompt, human_uttr_attr):
    lm_service_kwargs = human_uttr_attr.pop("lm_service_kwargs", None)
    lm_service_kwargs = {} if lm_service_kwargs is None else lm_service_kwargs
    envvars_to_send = ENVVARS_TO_SEND if len(ENVVARS_TO_SEND) else human_uttr_attr.get("envvars_to_send", [])
    sending_variables = compose_sending_variables(
        lm_service_kwargs,
        envvars_to_send,
        **human_uttr_attr,
    )
    response = send_request_to_prompted_generative_service(
        "",
        curr_prompt,
        GENERATIVE_SERVICE_URL,
        GENERATIVE_SERVICE_CONFIG,
        GENERATIVE_TIMEOUT,
        sending_variables,
    )
    result = response[0]
    # the hypothesis is correct unless the LLM answers "Yes", i.e. detects a contradiction
    _is_hyp_correct = "yes" not in result.lower()
    return _is_hyp_correct


@app.route("/respond_batch", methods=["POST"])
def respond_batch():
    hypotheses = request.json["hypotheses"]
    human_uttr_attributes = request.json["human_uttr_attributes"]
    ie_types = ["external" if hyp["skill_name"] in EXTERNAL_SKILLS else "internal" for hyp in hypotheses]
    external_service_hyps = [
        (hyp["text"], hyp["skill_name"]) for hyp in hypotheses if hyp["skill_name"] in EXTERNAL_SKILLS
    ]
    results = []
    for hyp, human_uttr_attr, ie_type in zip(hypotheses, human_uttr_attributes, ie_types):
        hyp_text = hyp["text"]
        try:
            if ie_type == "external":
                logger.info(f"Hypothesis `{hyp_text}` is considered correct as it is external.")
                results += ["Correct"]
            elif hyp["skill_name"] in SKILLS_NOT_TO_FACT_CHECK:
                logger.info(f"Hypothesis `{hyp_text}` is not checked as it was produced by {hyp['skill_name']}.")
                results += ["Correct"]
            else:
                if len(external_service_hyps) == 0:
                    logger.info(
                        f"Internal hypothesis `{hyp_text}` is considered correct as there are no external hypotheses \
to check it upon."
                    )
                    results += ["Correct"]
                else:
                    _is_hyp_correct = True
                    for external_service_hyp, external_service_name in external_service_hyps:
                        curr_prompt = f"""Hypothesis: "{hyp_text}"
Does Hypothesis contradict Fact that {external_service_hyp}? Always answer only Yes or No without explanation."""
                        logger.info(f"Checking internal hypothesis `{hyp_text}` with LLM. Prompt: {curr_prompt}")
                        _is_hyp_correct_one_step = check_hyp_with_llm(curr_prompt, human_uttr_attr)
                        if not _is_hyp_correct_one_step:
                            _is_hyp_correct = False
                            logger.info(
                                f"""Internal hypothesis `{hyp_text}` is incorrect according to external service \
{external_service_name}."""
                            )
                            results += ["Incorrect"]
                            break
                    if _is_hyp_correct:
                        logger.info(f"Internal hypothesis `{hyp_text}` is correct according to all external services.")
                        results += ["Correct"]
        except Exception as e:
            logger.error(e)
            results += ["Correct"]
    return jsonify([{"batch": results}])


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=3000)
@@ -0,0 +1,6 @@
SERVICE_PORT: 8182
GENERATIVE_SERVICE_URL: http://openai-api-chatgpt:8145/respond
GENERATIVE_TIMEOUT: 5
GENERATIVE_SERVICE_CONFIG: openai-chatgpt.json
ENVVARS_TO_SEND: OPENAI_API_KEY,OPENAI_ORGANIZATION
FLASK_APP: server
30 changes: 30 additions & 0 deletions annotators/fact_checking/service_configs/fact-checking/service.yml
@@ -0,0 +1,30 @@
name: fact-checking
endpoints:
- respond_batch
compose:
  env_file: [.env, .env_secret]
  build:
    args:
      SERVICE_PORT: 8182
      GENERATIVE_SERVICE_URL: http://openai-api-chatgpt:8145/respond
      GENERATIVE_TIMEOUT: 5
      GENERATIVE_SERVICE_CONFIG: openai-chatgpt.json
      ENVVARS_TO_SEND: OPENAI_API_KEY,OPENAI_ORGANIZATION
    context: .
    dockerfile: annotators/fact_checking/Dockerfile
  command: flask run -h 0.0.0.0 -p 8182
  environment:
  - FLASK_APP=server
  deploy:
    resources:
      limits:
        memory: 128M
      reservations:
        memory: 128M
  volumes:
  - "./annotators/fact_checking:/src"
  - "./common:/src/common"
  ports:
  - 8182:8182
proxy: null
33 changes: 33 additions & 0 deletions annotators/fact_checking/test.py
@@ -0,0 +1,33 @@
import requests
from os import getenv

SERVICE_PORT = getenv("SERVICE_PORT")


def main():
    url = f"http://0.0.0.0:{SERVICE_PORT}/respond_batch"
    result = requests.post(
        url=url,
        json={
            "hypotheses": [
                {"skill_name": "dff_google_api_skill", "text": "Jack is 5 years old."},
                {
                    "skill_name": "dff_dream_persona_chatgpt_prompted_skill",
                    "text": "Jack is 25 years old.",
                },
                {
                    "skill_name": "dummy_skill",
                    "text": "Sorry, I cannot answer your question.",
                },
            ],
            "human_uttr_attributes": [{}, {}, {}],
        },
    )
    result = result.json()[0]["batch"]
    result_gold = ["Correct", "Incorrect", "Correct"]
    assert result == result_gold, f"Got\n{result}\n, something is wrong"
    print("Success!")


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions annotators/fact_checking/test.sh
@@ -0,0 +1,3 @@
#!/bin/bash

python test.py
6 changes: 6 additions & 0 deletions assistant_dists/dream/dev.yml
@@ -157,4 +157,10 @@ services:
      - "./common:/src/common"
    ports:
      - 8167:8167
  fact-checking:
    volumes:
      - "./annotators/fact_checking:/src"
      - "./common:/src/common"
    ports:
      - 8182:8182
version: "3.7"
27 changes: 26 additions & 1 deletion assistant_dists/dream/docker-compose.override.yml
@@ -8,7 +8,8 @@ services:
        combined-classification:8087, fact-retrieval:8100, entity-detection:8103,
        sentence-ranker:8128, property-extraction:8136, prompt-selector:8135, openai-api-chatgpt:8145,
        dff-dream-persona-chatgpt-prompted-skill:8137, dff-dream-faq-prompted-skill:8170,
        openai-api-chatgpt-16k:8167, summarization-annotator:8058, dialog-summarizer:8059,
        fact-checking:8182"
      WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-1000}
      HIGH_PRIORITY_INTENTS: 1
      RESTRICTION_FOR_SENSITIVE_CASE: 1
@@ -26,6 +27,10 @@ services:
      SENTENCE_RANKER_ANNOTATION_NAME: sentence_ranker
      SENTENCE_RANKER_SERVICE_URL: http://sentence-ranker:8128/respond
      SENTENCE_RANKER_TIMEOUT: 3
      ENABLE_FACT_CHECKING: 1
      FACTUAL_CONFORMITY_SERVICE_URL: http://fact-checking:8182/respond
      FACTUAL_CONFORMITY_SERVICE_TIMEOUT: 5
      FACTUAL_CONFORMITY_ANNOTATION_NAME: fact_checking
      N_UTTERANCES_CONTEXT: 5
      FILTER_TOXIC_OR_BADLISTED: 1
      FALLBACK_FILE: fallbacks_dream_en.json
@@ -482,4 +487,24 @@ services:
        reservations:
          memory: 4G

  fact-checking:
    env_file: [ .env, .env_secret ]
    build:
      args:
        GENERATIVE_SERVICE_URL: http://openai-api-chatgpt:8145/respond
        GENERATIVE_TIMEOUT: 5
        GENERATIVE_SERVICE_CONFIG: openai-chatgpt.json
        ENVVARS_TO_SEND: OPENAI_API_KEY,OPENAI_ORGANIZATION
      context: .
      dockerfile: annotators/fact_checking/Dockerfile
    command: flask run -h 0.0.0.0 -p 8182
    environment:
      - FLASK_APP=server
    deploy:
      resources:
        limits:
          memory: 128M
        reservations:
          memory: 128M

version: '3.7'
22 changes: 20 additions & 2 deletions assistant_dists/dream/pipeline_conf.json
@@ -395,6 +395,24 @@
        "component": "components/XGwmAHtAOu0NDqqG3QCJw.yml",
        "service": "services/sentence_ranker/service_configs/sentence-ranker"
      }
    },
    "fact_checking": {
      "connector": {
        "protocol": "http",
        "timeout": 5.0,
        "url": "http://fact-checking:8182/respond_batch"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypotheses_and_attributes",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch",
      "is_enabled": true,
      "source": {
        "component": "components/oijrtOIfj94jnvkf30n.yml",
        "service": "annotators/fact_checking/service_configs/fact-checking"
      }
    }
  },
  "skill_selectors": {
@@ -551,8 +569,8 @@
      "state_manager_method": "add_bot_utterance",
      "is_enabled": true,
      "source": {
        "component": "components/YJzc7NwGrLmKp6gfZJh7X1.yml",
        "service": "response_selectors/ranking_based_response_selector/service_configs/ranking-based-response-selector"
        "component": "components/dvpefjoPOHC90efoi.yml",
        "service": "response_selectors/ranking_based_response_selector/service_configs/ranking-based-response-selector-fact-checking"
      }
    }
  }
2 changes: 2 additions & 0 deletions common/response_selection.py
@@ -68,3 +68,5 @@
    "Oh, before I forget,",
    "I wanted to mention that,",
]

EXTERNAL_SKILLS = ["factoid_qa", "dff_google_api_skill"]
2 changes: 1 addition & 1 deletion components.tsv
@@ -183,6 +183,6 @@
8179 dff-robot-prompted-skill
8180
8181
8182
8182 fact-checking
8183 external-integration-skill
8184 external-fake-server
25 changes: 25 additions & 0 deletions components/dvpefjoPOHC90efoi.yml
@@ -0,0 +1,25 @@
name: response_selector
display_name: Response Selector (Fact-Checking enabled)
component_type: null
model_type: Dictionary/Pattern-based
is_customizable: false
author: publisher@deeppavlov.ai
description: Algorithm that selects the final response among the given list of candidate
  responses via the given ranking service and the given fact-checking service.
ram_usage: 100M
gpu_usage: null
group: response_selectors
connector:
  protocol: http
  timeout: 1.0
  url: http://ranking-based-response-selector:8002/respond
dialog_formatter: state_formatters.dp_formatters:cropped_dialog
response_formatter: state_formatters.dp_formatters:base_response_selector_formatter_service
previous_services:
- candidate_annotators
required_previous_services: null
state_manager_method: add_bot_utterance
tags: null
endpoint: respond
service: response_selectors/ranking_based_response_selector/service_configs/ranking-based-response-selector-fact-checking
date_created: '2023-03-16T09:45:32'
26 changes: 26 additions & 0 deletions components/oijrtOIfj94jnvkf30n.yml
@@ -0,0 +1,26 @@
name: fact_checking
display_name: Fact Checking
component_type: null
model_type: NN-based
is_customizable: false
author: publisher@deeppavlov.ai
description: Performs basic fact-checking of internal hypotheses
  by using an LLM to determine whether they contradict any of the hypotheses
  that rely on external knowledge sources.
ram_usage: 128M
gpu_usage: null
group: candidate_annotators
connector:
  protocol: http
  timeout: 5.0
  url: http://fact-checking:8182/respond_batch
dialog_formatter: state_formatters.dp_formatters:hypotheses_and_attributes
response_formatter: state_formatters.dp_formatters:simple_formatter_service
previous_services:
- skills
required_previous_services: null
state_manager_method: add_hypothesis_annotation_batch
tags: null
endpoint: respond_batch
service: annotators/fact_checking/service_configs/fact-checking
date_created: '2023-08-15T17:06:52'