Merge pull request #155 from deepmipt/dev
Release v0.1.7
dilyararimovna committed May 6, 2022
2 parents ed42f0c + 8476288 commit 30f290c
Showing 14 changed files with 62 additions and 35 deletions.
4 changes: 2 additions & 2 deletions .env
@@ -22,8 +22,8 @@ COMET_SERVICE_URL=http://comet-atomic:8053/comet
 CONCEPTNET_SERVICE_URL=http://comet-conceptnet:8065/comet
 MASKED_LM_SERVICE_URL=http://masked-lm:8088/respond
 SENTIMENT_CLASSIFICATION_SERVICE_URL=http://sentiment-classification:8024/model
-WIKIDATA_URL=http://wiki-parser:8077/model
-ENTITY_LINKING_URL=http://entity-linking:8075/model
+DP_WIKIDATA_URL=http://wiki-parser:8077/model
+DP_ENTITY_LINKING_URL=http://entity-linking:8075/model
 KNOWLEDGE_GROUNDING_SERVICE_URL=http://knowledge-grounding:8083/respond
 WIKIDATA_DIALOGUE_SERVICE_URL=http://wikidata-dial-service:8092/model
 NEWS_API_ANNOTATOR_URL=http://news-api-annotator:8112/respond
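The two renamed variables are read by services through os.getenv, as the common/custom_requests.py hunk further below shows. A minimal sketch of how a consumer would pick up the new DP_-prefixed names; the fallback to the old names is only an illustration for callers that have not migrated yet, not part of this commit:

    import os

    # New DP_-prefixed names introduced in this release; the fallback is illustrative only.
    WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL") or os.getenv("WIKIDATA_URL")
    ENTITY_LINKING_URL = os.getenv("DP_ENTITY_LINKING_URL") or os.getenv("ENTITY_LINKING_URL")
    assert WIKIDATA_URL and ENTITY_LINKING_URL, "wiki-parser / entity-linking URLs must be set"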
11 changes: 10 additions & 1 deletion annotators/NER/server.py
@@ -43,7 +43,8 @@ def extract_good_entities(preds, sentences):
         good_entities_for_sent = []

         for ent in entities_for_sent:
-            ent_text = ent["text"].lower()
+            ent_text = ent["text"]
+            ent_text = " ".join([ent_word[0].capitalize() + ent_word[1:] for ent_word in ent_text.split()])
             # remove everything except of letters, digitals, spaces and -
             ent_text = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", ent_text)
             ent_text = DOUBLE_SPACES.sub(" ", ent_text).strip()
@@ -56,6 +57,7 @@ def extract_good_entities(preds, sentences):
             is_long_enough = len(ent_text) > 2
             is_not_banned = not re.match(BANNED_ENTITIES, ent_text)
             if is_not_stopword and is_not_banned and is_long_enough:
+                ent["text"] = ent_text
                 good_entities_for_sent.append(deepcopy(ent))

         good_preds.append(good_entities_for_sent)
@@ -64,7 +66,14 @@ def extract_good_entities(preds, sentences):

 def get_predictions_for_list_sentences(sentences):
     sents = [word_tokenize(sent.lower()) for sent in sentences]
+    sents_upper = [word_tokenize(sent) for sent in sentences]
     preds = ner.predict(sents)
+    for i in range(len(preds)):
+        sent_upper = sents_upper[i]
+        for j in range(len(preds[i])):
+            ent_upper = " ".join(sent_upper[preds[i][j]["start_pos"] : preds[i][j]["end_pos"]])
+            if ent_upper.lower() == preds[i][j]["text"]:
+                preds[i][j]["text"] = ent_upper
     # each sample is a list of sentences of current utterance
     # so, preds is a list of length = number of sents in utterances
     # each element of preds is a list of entities.
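The change above stops lowercasing extracted entities: each word is capitalized, and get_predictions_for_list_sentences restores the original casing whenever the surface form in the raw sentence matches the lowercased prediction. A standalone sketch of the same idea, with hypothetical helper and argument names:

    def restore_casing(pred_text, original_tokens, start_pos, end_pos):
        """Return the entity as it appears in the original (non-lowercased) sentence
        when it matches the lowercased prediction; otherwise capitalize each word."""
        ent_upper = " ".join(original_tokens[start_pos:end_pos])
        if ent_upper.lower() == pred_text:
            return ent_upper
        return " ".join(word[0].capitalize() + word[1:] for word in pred_text.split())

    # e.g. restore_casing("new york", ["I", "love", "New", "York"], 2, 4) -> "New York"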
17 changes: 14 additions & 3 deletions annotators/entity_detection/server.py
@@ -49,6 +49,7 @@ def get_result(request, what_to_annotate):
     logger.info(f"annotating: {what_to_annotate}, input (the last utterances): {last_utts}")

     utts_list = []
+    utts_list_init = []
     utts_nums = []
     last_utt_starts = []
     for n, hist_utt in enumerate(last_utts):
@@ -71,6 +72,7 @@
                 utts_list.append(concat_utt.lower())
             else:
                 utts_list.append(concat_utt)
+            utts_list_init.append(concat_utt)
             utts_nums.append(n)

     utt_entities_batch = [{} for _ in last_utts]
@@ -91,14 +93,23 @@
     ) = entity_detection(utts_list)
     logger.info(f"entity_substr_batch {entity_substr_batch} finegrained_tags_batch {finegrained_tags_batch}")

-    for entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list, last_utt_start, num in zip(
-        entity_substr_batch, tags_batch, finegrained_tags_batch, entity_offsets_batch, last_utt_starts, utts_nums
+    for entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list, last_utt_start, uttr, num in zip(
+        entity_substr_batch,
+        tags_batch,
+        finegrained_tags_batch,
+        entity_offsets_batch,
+        last_utt_starts,
+        utts_list_init,
+        utts_nums,
     ):
         utt_entities = {}
         for entity, tag, finegrained_tag, (start_offset, end_offset) in zip(
             entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list
         ):
-            if entity not in stopwords and len(entity) > 2 and start_offset >= last_utt_start:
+            entity_init = uttr[start_offset:end_offset]
+            if entity_init.lower() == entity:
+                entity = entity_init
+            if entity.lower() not in stopwords and len(entity) > 2 and start_offset >= last_utt_start:
                 entity = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", entity)
                 entity = DOUBLE_SPACES.sub(" ", entity).strip()
                 if finegrained_tag[0][0] > 0.5:
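The new utts_list_init keeps the non-lowercased text alongside the lowercased input to the detector, so the character offsets returned for each entity can be mapped back onto the original string. A minimal sketch of that offset-based recovery (names are illustrative):

    def recover_original_span(original_utt, detected_entity, start_offset, end_offset):
        """Map a lowercased detection back to the original casing via its offsets."""
        candidate = original_utt[start_offset:end_offset]
        # Only trust the slice if it really is the same entity, just differently cased.
        return candidate if candidate.lower() == detected_entity else detected_entity

    # recover_original_span("I moved to San Francisco", "san francisco", 11, 24) -> "San Francisco"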
1 change: 1 addition & 0 deletions annotators/entity_linking/server.py
@@ -87,6 +87,7 @@ def respond():
     entity_substr_batch = inp.get("entity_substr", [[""]])
     template_batch = inp.get("template", [""])
     context_batch = inp.get("context", [[""]])
+    logger.info(f"entity linking, input {entity_substr_batch}")
     long_context_batch = []
     short_context_batch = []
     for entity_substr_list, context_list in zip(entity_substr_batch, context_batch):
8 changes: 5 additions & 3 deletions annotators/kbqa/kbqa_cq.json
@@ -37,14 +37,14 @@
       {
         "class_name": "api_requester",
         "id": "linker_entities",
-        "url": "http://entity-linking:8075/model",
+        "url": "{ENTITY_LINKING_URL}",
         "out": ["entity_ids"],
         "param_names": ["entity_substr", "template_found"]
       },
       {
         "class_name": "api_requester",
         "id": "wiki_p",
-        "url": "http://wiki-parser:8077/model",
+        "url": "{WIKIDATA_URL}",
         "out": ["wiki_parser_output"],
         "param_names": ["parser_info", "query"]
       },
@@ -132,7 +132,9 @@
       "MODELS_PATH": "{ROOT_PATH}/models",
       "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12",
       "NER_PATH": "{MODELS_PATH}/ner_lcquad_ent_and_type",
-      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
+      "ENTITY_LINKING_URL": "http://entity-linking:8075/model",
+      "WIKIDATA_URL": "http://wiki-parser:8077/model"
     },
     "requirements": [
       "{DEEPPAVLOV_PATH}/requirements/tf.txt",
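Moving the service URLs into the config's variables section lets the {ENTITY_LINKING_URL} and {WIKIDATA_URL} placeholders used in the pipe entries be resolved from a single place instead of being hard-coded per component. A rough sketch of how such placeholder substitution works; this illustrates the pattern only and is not DeepPavlov's actual resolver:

    import json

    def resolve_placeholders(node, variables):
        """Recursively substitute {VAR} placeholders in a JSON-like config."""
        if isinstance(node, dict):
            return {k: resolve_placeholders(v, variables) for k, v in node.items()}
        if isinstance(node, list):
            return [resolve_placeholders(v, variables) for v in node]
        if isinstance(node, str):
            for name, value in variables.items():
                node = node.replace("{" + name + "}", str(value))
            return node
        return node

    config = json.loads('{"url": "{ENTITY_LINKING_URL}"}')
    print(resolve_placeholders(config, {"ENTITY_LINKING_URL": "http://entity-linking:8075/model"}))
    # {'url': 'http://entity-linking:8075/model'}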
8 changes: 5 additions & 3 deletions annotators/kbqa/kbqa_cq_mt_bert.json
@@ -117,14 +117,14 @@
       {
         "class_name": "api_requester",
         "id": "linker_entities",
-        "url": "http://entity-linking:8075/model",
+        "url": "{ENTITY_LINKING_URL}",
         "out": ["entity_ids"],
         "param_names": ["entity_substr", "template_found"]
       },
       {
         "class_name": "api_requester",
         "id": "wiki_p",
-        "url": "http://wiki-parser:8077/model",
+        "url": "{WIKIDATA_URL}",
         "out": ["wiki_parser_output"],
         "param_names": ["parser_info", "query"]
       },
@@ -228,7 +228,9 @@
       "MODELS_PATH": "{ROOT_PATH}/models",
       "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12",
       "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
-      "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa"
+      "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa",
+      "ENTITY_LINKING_URL": "http://entity-linking:8075/model",
+      "WIKIDATA_URL": "http://wiki-parser:8077/model"
     },
     "requirements": [
       "{DEEPPAVLOV_PATH}/requirements/tf.txt",
5 changes: 3 additions & 2 deletions annotators/kbqa/kbqa_cq_mt_bert_lite.json
@@ -12,7 +12,7 @@
       {
         "class_name": "api_requester",
         "id": "linker_entities",
-        "url": "http://entity-linking:8075/model",
+        "url": "{ENTITY_LINKING_URL}",
         "out": ["entity_ids"],
         "param_names": ["entity_substr", "template_found"]
       },
@@ -90,7 +90,8 @@
       "MODELS_PATH": "{ROOT_PATH}/models",
       "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12",
       "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
-      "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa"
+      "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa",
+      "ENTITY_LINKING_URL": "http://entity-linking:8075/model"
     },
     "requirements": [
       "{DEEPPAVLOV_PATH}/requirements/tf.txt",
2 changes: 2 additions & 0 deletions annotators/wiki_parser/wiki_parser.py
@@ -620,6 +620,8 @@ def find_top_triplets(entity, entity_substr, pos=None, token_conf=None, conf=None
         triplets["token_conf"] = token_conf
     if conf is not None:
         triplets["conf"] = conf
+    if entity_substr.lower() in entity_label.lower():
+        entity_substr = entity_label
     triplets_info[entity_substr] = triplets
     return triplets_info

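With the two added lines, find_top_triplets keys its result by the canonical Wikidata label whenever the requested substring is contained in it, so downstream consumers see properly cased names. A tiny illustration with made-up values (the real entity_label comes from Wikidata inside the function):

    entity_substr, entity_label = "leonardo dicaprio", "Leonardo DiCaprio"
    if entity_substr.lower() in entity_label.lower():
        entity_substr = entity_label
    print(entity_substr)  # "Leonardo DiCaprio"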
4 changes: 2 additions & 2 deletions common/custom_requests.py
@@ -8,8 +8,8 @@
 sentry_sdk.init(getenv("SENTRY_DSN"))
 logger = logging.getLogger(__name__)

-WIKIDATA_URL = getenv("WIKIDATA_URL")
-ENTITY_LINKING_URL = getenv("ENTITY_LINKING_URL")
+WIKIDATA_URL = getenv("DP_WIKIDATA_URL")
+ENTITY_LINKING_URL = getenv("DP_ENTITY_LINKING_URL")
 assert WIKIDATA_URL and ENTITY_LINKING_URL

4 changes: 2 additions & 2 deletions skills/dff_bot_persona_skill/dialogflows/flows/shared.py
@@ -9,8 +9,8 @@
 sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"))


-ENTITY_LINKING_URL = os.getenv("ENTITY_LINKING_URL")
-WIKIDATA_URL = os.getenv("WIKIDATA_URL")
+ENTITY_LINKING_URL = os.getenv("DP_ENTITY_LINKING_URL")
+WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL")
 assert ENTITY_LINKING_URL, ENTITY_LINKING_URL
 assert WIKIDATA_URL, WIKIDATA_URL
4 changes: 2 additions & 2 deletions skills/dff_gossip_skill/dialogflows/flows/utils.py
@@ -18,8 +18,8 @@

 sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"))

-ENTITY_LINKING_URL = os.getenv("ENTITY_LINKING_URL")
-WIKIDATA_URL = os.getenv("WIKIDATA_URL")
+ENTITY_LINKING_URL = os.getenv("DP_ENTITY_LINKING_URL")
+WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL")
 assert ENTITY_LINKING_URL, ENTITY_LINKING_URL
 assert WIKIDATA_URL, WIKIDATA_URL
10 changes: 5 additions & 5 deletions skills/dff_sport_skill/dialogflows/flows/sport.py
@@ -245,7 +245,7 @@ def entity_in_last_uttr_from_sport_area(vars):

 def get_dict_entity(entity_substr, entity_ids, type):
     try:
-        WIKIDATA_URL = "http://wiki-parser:8077/model"
+        WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL")
         dict_result = requests.post(
             WIKIDATA_URL,
             json={
@@ -352,8 +352,8 @@ def user_ask_about_sport_request(ngrams, vars):
 def lets_chat_about_sport_response(vars):
     # USR_ASK_ABOUT_SPORT
     responses = [
-        f"I have no physical embodiment. Sport is interesting and useful. Tell me what sport do you enjoy?",
-        f"I live on a cloud, so i can't do sport , but I'm really curious about what sport are you fond of?",
+        "I have no physical embodiment. Sport is interesting and useful. Tell me what sport do you enjoy?",
+        "I live on a cloud, so i can't do sport , but I'm really curious about what sport are you fond of?",
     ]
     try:
         state_utils.set_confidence(vars, confidence=SUPER_CONFIDENCE)
@@ -398,7 +398,7 @@ def user_ask_about_athletes_response(vars):
     try:
         state_utils.set_confidence(vars, confidence=SUPER_CONFIDENCE)
         state_utils.set_can_continue(vars, continue_flag=MUST_CONTINUE)
-        return f"I know all the athletes on this planet. Which athlete do you like the most?"
+        return "I know all the athletes on this planet. Which athlete do you like the most?"
     except Exception as exc:
         logger.exception(exc)
         sentry_sdk.capture_exception(exc)
@@ -922,7 +922,7 @@ def last_chance_response(vars):

 def error_response(vars):
     state_utils.set_confidence(vars, ZERO_CONFIDENCE)
-    return f""
+    return ""


 ##################################################################################################################
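get_dict_entity now reads the wiki parser address from DP_WIKIDATA_URL instead of a hard-coded URL. A minimal sketch of such an environment-driven request; the payload keys parser_info and query mirror the wiki_p param_names in the kbqa configs above, but the exact values and the timeout are assumptions for illustration:

    import os
    import requests

    WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL", "http://wiki-parser:8077/model")

    def query_wiki_parser(parser_info, query, timeout=1.0):
        """Post a request to the wiki parser service and return its JSON answer."""
        response = requests.post(
            WIKIDATA_URL,
            json={"parser_info": parser_info, "query": query},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()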
3 changes: 0 additions & 3 deletions skills/factoid_qa/server.py
@@ -363,9 +363,6 @@ def respond():
             logger.info("Question is not classified as factoid.")
             response = ""
             confidence = 0.0
-
-        if confidence == 1.0:
-            confidence = 0.99
         responses.append(response)
         confidences.append(confidence)
         attributes.append(attr)
16 changes: 9 additions & 7 deletions state_formatters/dp_formatters.py
@@ -535,21 +535,23 @@ def el_formatter_dialog(dialog: Dict):
     # Used by: entity_linking annotator
     num_last_utterances = 2
     ner_output = get_entities(dialog["human_utterances"][-1], only_named=True, with_labels=True)
-    nounphrases = dialog["human_utterances"][-1]["annotations"].get("cobot_entities", {}).get("entities", [])
-    entity_substr = []
+    nounphrases = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=False)
+    entity_substr_list = []
     if ner_output:
         for entity in ner_output:
             if entity and isinstance(entity, dict) and "text" in entity and entity["text"].lower() != "alexa":
-                entity_substr.append(entity["text"].lower())
-
+                entity_substr_list.append(entity["text"])
+    entity_substr_lower_list = {entity_substr.lower() for entity_substr in entity_substr_list}
     dialog = utils.get_last_n_turns(dialog, bot_last_turns=1)
     dialog = utils.replace_with_annotated_utterances(dialog, mode="punct_sent")
     context = [[uttr["text"] for uttr in dialog["utterances"][-num_last_utterances:]]]
     if nounphrases:
-        entity_substr += [nounphrase.lower() for nounphrase in nounphrases]
-        entity_substr = list(set(entity_substr))
+        entity_substr_list += [
+            nounphrase for nounphrase in nounphrases if nounphrase.lower() not in entity_substr_lower_list
+        ]
+        entity_substr_list = list(set(entity_substr_list))

-    return [{"entity_substr": [entity_substr], "template": [""], "context": context}]
+    return [{"entity_substr": [entity_substr_list], "template": [""], "context": context}]


 def kbqa_formatter_dialog(dialog: Dict):
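The reworked el_formatter_dialog keeps the original casing of NER entities and only adds noun phrases whose lowercased form is not already covered by a named entity. The dedup idea in isolation, with hypothetical inputs:

    ner_entities = ["New York", "Brad Pitt"]
    nounphrases = ["new york", "favorite movie"]

    seen_lower = {ent.lower() for ent in ner_entities}
    entity_substr_list = ner_entities + [np for np in nounphrases if np.lower() not in seen_lower]
    print(entity_substr_list)  # ['New York', 'Brad Pitt', 'favorite movie']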
