Feat/nice formatting (#110)
* formatting: first commit

* unify summary descriptions

* formatting for titles completed

* fix compose_variables; fix getting parts of report; fix summary length prompts; fix formatting

* fix: verify=False for getting files

* improve some prompts

* working formatting

* codestyle

* add comments

* formatting fixes

* sent most of logic to utils

* codestyle
smilni committed Nov 7, 2023
1 parent a9c8995 commit 6b0f6dc
Showing 6 changed files with 295 additions and 188 deletions.
3 changes: 2 additions & 1 deletion annotators/doc_processor/utils.py
@@ -299,7 +299,8 @@ def upload_documents_save_info(
file_id = generate_unique_file_id(10, dialog_id)
all_docs_info_new[file_id] = {"initial_path_or_link": file_source}
if file_source_type == "link":
orig_file = requests.get(file_source, timeout=FILE_SERVER_TIMEOUT)
# without verify=False, downloading files from our service does not work
orig_file = requests.get(file_source, timeout=FILE_SERVER_TIMEOUT, verify=False)
content_headers = orig_file.headers.get("Content-disposition", "filename=NoNameFile")
filename = re.search(FIND_FILENAME, content_headers)
if filename:
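
For context, a minimal sketch of the patched download path above. The commit itself only adds verify=False; the FIND_FILENAME pattern, the timeout value, and the warning suppression below are assumptions for illustration, not the repo's exact code.

# Sketch of the patched download path (assumptions noted above).
import re
import requests
import urllib3

FILE_SERVER_TIMEOUT = 30.0  # hypothetical value; the repo reads it from an env var
FIND_FILENAME = r'filename="?([^";]+)"?'  # hypothetical filename pattern

# verify=False triggers an InsecureRequestWarning on every call, so silence it once
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def download_source_file(file_source: str):
    # without verify=False, downloading files from the internal service does not work
    orig_file = requests.get(file_source, timeout=FILE_SERVER_TIMEOUT, verify=False)
    content_headers = orig_file.headers.get("Content-disposition", "filename=NoNameFile")
    match = re.search(FIND_FILENAME, content_headers)
    filename = match.group(1) if match else "NoNameFile"
    return orig_file.content, filename

Note that verify=False disables TLS certificate checking entirely, which is acceptable only for a trusted internal file server.
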
4 changes: 2 additions & 2 deletions common/prompts/management_assistant/summary.json
@@ -1,5 +1,5 @@
{
"prompt": "Meeting transcript:\n{transcript_chunk}\nProvide a full summary of the meeting based on the meeting transcript.\nDo not include tasks separately.",
"prompt_concatenate": "Here are parts of a summary of a work meeting:\n{gpt_responses}\nCombine them to provide a complete and coherent summary.\nIt must include all the information from the parts above. Remove repetitions if necessary.",
"prompt": "Meeting transcript:\n{transcript_chunk}\nProvide a full summary of the meeting based on the meeting transcript.\nBe concise but mention all important details. Do not include tasks separately. Provide only the summary with title 'Summary of the meeting' and no additional comments.",
"prompt_concatenate": "Here are parts of a summary of a work meeting:\n{gpt_responses}\nCombine them to provide a complete and coherent summary.\nIt must include all the information from the parts above.\nYou should abridge the text and leave out minor detailes when possible. Provide only the summary with title 'Summary of the meeting' and no additional comments.",
"goals": "Summarizes meeting transcripts to provide a detailed overview of what was discussed."
}
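
The two templates above implement a map-reduce pattern: "prompt" summarizes each transcript chunk, and "prompt_concatenate" merges the partial summaries (see get_response_for_prompts in the response.py diff below). A minimal sketch of that flow, with call_llm as a hypothetical stand-in for the repo's prompted generative service:

# Sketch of the two-stage summarization these templates drive.
# call_llm is a hypothetical placeholder for send_request_to_prompted_generative_service.
import json

with open("common/prompts/management_assistant/summary.json", "r") as f:
    prompts = json.load(f)


def call_llm(prompt: str) -> str:
    raise NotImplementedError  # placeholder: send the prompt to the LLM service


def summarize(transcript_chunks):
    # stage 1: summarize every chunk independently
    partial = [call_llm(prompts["prompt"].replace("{transcript_chunk}", chunk))
               for chunk in transcript_chunks]
    if len(partial) == 1:
        return partial[0]  # transcript fit into one chunk; no merge step needed
    # stage 2: merge the partial summaries into one coherent summary
    return call_llm(prompts["prompt_concatenate"].replace("{gpt_responses}", "\n".join(partial)))

The templates use literal {placeholders} filled with str.replace rather than str.format, matching how response.py substitutes them.
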
4 changes: 2 additions & 2 deletions common/prompts/management_assistant/weekly_report.json
@@ -1,5 +1,5 @@
{
"prompt": "Here are summaries of daily meetings:\n{transcript_chunk}\nCombine them to provide an overview of the weekly progress. Preserve the sections.\nYour overview must include all the information from daily meetings, while also summarizing it and making conclusions to provide an informative weekly progress report. Start with a title stating the dates that the report covers.",
"prompt_concatenate": "Here are parts of weekly reports:\n{gpt_responses}\nCombine the parts to obtain one complete weekly report.",
"prompt": "Here are summaries of daily meetings:\n{transcript_chunk}\nCombine them to provide an overview of the weekly progress. Preserve the sections.\nYour overview must include all the information from daily meetings, while also summarizing it and making conclusions to provide an informative weekly progress report. Start with a title stating the dates that the report covers. Preserve the titles and structure of the parts. You must not add any symbols like #, _, * or try to control fonts.",
"prompt_concatenate": "Here are parts of weekly reports:\n{gpt_responses}\nCombine the parts to obtain one complete weekly report. Start with a title stating the dates that the report covers. Preserve the titles and structure of the parts. You must not add any symbols like #, _, * or try to control fonts.",
"goals": "Forms a weekly report based on daily reports."
}
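
The "must not add any symbols like #, _, *" instruction exists because the skill applies its own formatting afterwards (postprocess_formatting in the response.py diff below). Models do not always obey such instructions, so a defensive cleanup pass is plausible — the following is a hedged guess at such a guard, not the repo's actual helper:

# Hypothetical guard that strips markdown control characters the model may
# emit despite the prompt; the repo's real postprocess_formatting lives in the
# skill's utils module, which this page does not show.
import re

MARKDOWN_NOISE = re.compile(r"[#*_]+")


def strip_markdown_noise(text: str) -> str:
    # naive pass: removes heading/emphasis markers but keeps the text itself
    return MARKDOWN_NOISE.sub("", text).strip()
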
4 changes: 2 additions & 2 deletions components/OFinvkfdeoih3849onv.yml
@@ -4,8 +4,8 @@ component_type: Generative
model_type: NN-based
is_customizable: true
author: publisher@deeppavlov.ai
description: This skill analyses meeting transcripts. Only this skill must be selected when summary, decisions,
current or completed tasks are requested. It must also be selected when any questions about
description: This skill analyses meeting transcripts. Only this skill must be selected when full or weekly report,
summary, decisions, current or completed tasks are requested. It must also be selected when any questions about
the meeting content or progress are asked.
ram_usage: 500M
gpu_usage: null
219 changes: 38 additions & 181 deletions skills/dff_meeting_analysis_skill/scenario/response.py
@@ -2,26 +2,22 @@
import logging
import sentry_sdk
import os
from typing import Any, List
import re
import requests
from typing import Any
import time

import common.dff.integration.context as int_ctx
import common.dff.integration.response as int_rsp
from common.constants import CAN_NOT_CONTINUE
from common.containers import get_envvars_for_llm, get_max_tokens_for_llm, is_container_running
from common.prompts import (
send_request_to_prompted_generative_service,
compose_sending_variables,
)
from common.text_processing_for_prompts import (
check_token_number,
decide_where_to_break,
split_transcript_into_chunks,
)
from common.files_and_folders_processing import upload_document
from common.containers import get_envvars_for_llm, is_container_running
from common.prompts import compose_sending_variables
from df_engine.core import Context, Actor
from .utils import (
postprocess_formatting,
get_and_upload_response_for_one_doc,
set_correct_type_and_id,
compose_and_upload_final_response,
get_older_gen_response,
)


sentry_sdk.init(os.getenv("SENTRY_DSN"))
@@ -44,9 +40,6 @@
with open(f"common/generative_configs/{GENERATIVE_SERVICE_CONFIG}", "r") as f:
GENERATIVE_SERVICE_CONFIG = json.load(f)

with open("common/prompts/management_assistant/default_system_prompt_assistant.json", "r") as f:
DEFAULT_SYSTEM_PROMPT = json.load(f)["prompt"]

GENERATIVE_TIMEOUT = float(os.getenv("GENERATIVE_TIMEOUT", 5))
GENERATIVE_TIMEOUT = (
GENERATIVE_SERVICE_CONFIG.pop("timeout", GENERATIVE_TIMEOUT) if GENERATIVE_SERVICE_CONFIG else GENERATIVE_TIMEOUT
@@ -55,152 +48,17 @@
FILE_SERVER_URL = os.getenv("FILE_SERVER_URL")
FILE_SERVER_TIMEOUT = float(os.getenv("FILE_SERVER_TIMEOUT"))
ENVVARS_TO_SEND = get_envvars_for_llm(GENERATIVE_SERVICE_URL)
INCLUDE_INTO_REPORT = ["progress_by_areas", "problems", "summary"]
SEP_FOR_DOC_RESPONSES = "\n****************\n"
LONG_SUMMARY_REQUEST = re.compile(r"(detailed)|(long)", flags=re.IGNORECASE)
SHORT_SUMMARY_REQUEST = re.compile(r"(short)|(concise)", flags=re.IGNORECASE)

DEFAULT_CONFIDENCE = 0.9
SUPER_CONFIDENCE = 1.0
LOW_CONFIDENCE = 0.7

management_prompts_dict = {
"summary": {},
"future_tasks": {},
"completed_tasks": {},
"decisions": {},
"question_answering": {},
"progress_by_areas": {},
"weekly_report": {},
"problems": {},
"combine_responses": {},
}

for key in management_prompts_dict.keys():
with open(f"common/prompts/management_assistant/{key}.json", "r") as f:
prompt_dict = json.load(f)
management_prompts_dict[key]["prompt"] = prompt_dict["prompt"]
management_prompts_dict[key]["prompt_concatenate"] = prompt_dict["prompt_concatenate"]


def get_response_for_prompts(
transcript_chunks: list, prompt_type: str, dialog_context: list, sending_variables: dict
) -> str:
if "summary" in prompt_type:
if "summary_short" in prompt_type:
length_request = " Your summary must be very concise. Make it as short as possible."
elif "summary_long" in prompt_type:
length_request = " Your summary must be as detailed as possible. Cover all aspects of the discussions."
else:
length_request = " Be concise but mention all important details. Leave out minor detailes when possible."
prompt_type = "summary"
else:
length_request = ""
logger.info(f"length_request {length_request}")
all_gpt_responses = []
# this is the prompt for processing each separate chunk of text
prompt = management_prompts_dict[prompt_type]["prompt"]
# this is the prompt for processing the results for all chunks to get the final response
# only used if the document is longer than the model's context window - 1000
prompt_final = management_prompts_dict[prompt_type]["prompt_concatenate"]
request = dialog_context[-1]
for n, chunk in enumerate(transcript_chunks):
prompt_to_send = prompt.replace("{transcript_chunk}", chunk)
prompt_to_send += length_request
dialog_context[-1] = prompt_to_send.replace("{request}", request)
response = send_request_to_prompted_generative_service(
dialog_context,
DEFAULT_SYSTEM_PROMPT,
GENERATIVE_SERVICE_URL,
GENERATIVE_SERVICE_CONFIG,
GENERATIVE_TIMEOUT,
sending_variables,
)
logger.info(f"Got response for chunk {n+1}, tokens in response: {check_token_number(response[0])}.")
all_gpt_responses += response
if len(all_gpt_responses) > 1:
logger.info("Processing multiple LLM's responses to get the final answer.")
gpt_responses = "\n".join(all_gpt_responses)
prompt_final_to_send = prompt_final.replace("{gpt_responses}", gpt_responses)
prompt_final_to_send += length_request
dialog_context[-1] = prompt_final_to_send.replace("{request}", request)
final_response = send_request_to_prompted_generative_service(
dialog_context,
DEFAULT_SYSTEM_PROMPT,
GENERATIVE_SERVICE_URL,
GENERATIVE_SERVICE_CONFIG,
GENERATIVE_TIMEOUT,
sending_variables,
)
all_gpt_responses += final_response
return all_gpt_responses[-1]


def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str):
filename = f"{prompt_type_and_id}.txt"
uploaded_doc_link = upload_document(hypothesis, filename, FILE_SERVER_URL, FILE_SERVER_TIMEOUT, type_ref="text")
return uploaded_doc_link


def compose_and_upload_final_response(
hyps_all_docs: list,
prompt_type_and_id: str,
dialog_context: List[str],
sending_variables: dict,
bot_attrs_files: dict,
):
# note that we join the responses for all docs with a special separator SEP_FOR_DOC_RESPONSES;
# when sending them to the LLM, if we need to split the info into chunks,
# we split on SEP_FOR_DOC_RESPONSES, not on newlines
info_from_all_docs = SEP_FOR_DOC_RESPONSES.join(hyps_all_docs)
if "weekly_report" not in prompt_type_and_id:
prompt_type_and_id = f"combine_responses__{prompt_type_and_id.split('__')[1]}"
hyp_week, bot_attrs_files = get_and_upload_response_for_one_doc(
info_from_all_docs, prompt_type_and_id, dialog_context, sending_variables, bot_attrs_files
)
return hyp_week, bot_attrs_files


def get_and_upload_response_for_one_doc(
orig_text: str, prompt_type_and_id: str, dialog_context: List[str], sending_variables: dict, bot_attrs_files: dict
) -> str:
prompt_type = prompt_type_and_id.split("__")[0]
document_in_use_id = prompt_type_and_id.split("__")[1]
# hard-coded limit: we reserve 1000 tokens for the LLM answer
token_limit = get_max_tokens_for_llm(GENERATIVE_SERVICE_URL) - 1000

# if we have multiple docs, we would rather not split one doc across two chunks,
# so in this case we split on the special separator
if prompt_type == "weekly_report" or prompt_type == "combine_responses":
break_points = decide_where_to_break(orig_text, limit=token_limit, sep=SEP_FOR_DOC_RESPONSES)
else:
break_points = decide_where_to_break(orig_text, limit=token_limit)
transcript_chunks = split_transcript_into_chunks(orig_text, break_points)

# if asked for a full report, we get its parts separately and then just concatenate them
if prompt_type == "full_report":
hypothesis = ""
for item in INCLUDE_INTO_REPORT:
item_type_and_id = f"{item}__{document_in_use_id}"
part_of_report = get_response_for_prompts(transcript_chunks, item, dialog_context, sending_variables)
uploaded_doc_link = upload_generated_item_return_link(part_of_report, item_type_and_id)
bot_attrs_files[item_type_and_id] = uploaded_doc_link
hypothesis += f"{part_of_report}\n\n"
else:
hypothesis = get_response_for_prompts(transcript_chunks, prompt_type, dialog_context, sending_variables)

# we save each hyp to the server under the name of the request and the doc_in_use id,
# except for question_answering and combine_responses, which we don't save as questions may vary
if prompt_type != "question_answering" and prompt_type != "combine_responses":
uploaded_doc_link = upload_generated_item_return_link(hypothesis, prompt_type_and_id)
bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
return [hypothesis], bot_attrs_files


def analyze_transcript(prompt_type: str):
"""
prompt_type: "summary", "future_tasks", "completed_tasks",
"decisions", "question_answering", "progress_by_areas", "full_report"
"decisions", "question_answering", "progress_by_areas",
"full_report", "weekly_report"
"""

def transcript_analysis_handler(ctx: Context, actor: Actor, *args, **kwargs) -> Any:
@@ -219,7 +77,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
dialog = int_ctx.get_dialog(ctx, actor)
context = dialog.get("utterances", [])[-N_UTTERANCES_CONTEXT:]
bot_attrs_files = {}
filepaths_on_server = []
if context:
dialog_context = [uttr["text"] for uttr in dialog["utterances"][-N_UTTERANCES_CONTEXT:]]
request = dialog_context[-1]
@@ -231,42 +88,34 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
# check if we already received such a request before and saved a hyp for it to the server
documents_in_use = context[-1].get("user", {}).get("attributes", {}).get("documents_in_use", [])
all_docs_info = context[-1].get("user", {}).get("attributes", {}).get("processed_documents", {})
sending_variables = compose_sending_variables({}, ENVVARS_TO_SEND, human_uttr_attributes)
hyps_all_docs = []
if documents_in_use:
for document_in_use_id in documents_in_use:
# if we need a weekly report, at this step we gather separate daily reports for each doc
if prompt_type_local == "weekly_report":
prompt_type_and_id = f"full_report__{document_in_use_id}"
elif prompt_type_local == "summary":
is_long_request = LONG_SUMMARY_REQUEST.search(request)
is_short_request = SHORT_SUMMARY_REQUEST.search(request)
if is_long_request:
prompt_type_local += "_long"
elif is_short_request:
prompt_type_local += "_short"
prompt_type_and_id = f"{prompt_type_local}__{document_in_use_id}"
else:
prompt_type_and_id = f"{prompt_type_local}__{document_in_use_id}"
# here we also adjust the summary prompt type based on the requested summary length
prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
request, prompt_type_local, document_in_use_id
)

# here we check if we already generated something for the same request and the same doc
if prompt_type_and_id in bot_attrs_files.keys():
hypothesis_link = bot_attrs_files[prompt_type_and_id]
hyp_one_doc = [requests.get(hypothesis_link, timeout=FILE_SERVER_TIMEOUT).text]
logger.info(f"Found and downloaded {prompt_type_and_id} generated earlier.")
hyp_one_doc = [get_older_gen_response(prompt_type_and_id, bot_attrs_files)]
# if not, let's generate it
else:
logger.info(
f"""No earlier {prompt_type_and_id} found. Got transcript text from \
{filepaths_on_server}, now sending request to generative model."""
f"No earlier {prompt_type_and_id} found. \
Sending request to generative model."
)
transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
sending_variables = compose_sending_variables({}, ENVVARS_TO_SEND, human_uttr_attributes)
if transcript_link:
try:
orig_file = requests.get(transcript_link, timeout=FILE_SERVER_TIMEOUT)
orig_text = orig_file.text
hyp_one_doc, bot_attrs_files = get_and_upload_response_for_one_doc(
orig_text, prompt_type_and_id, dialog_context, sending_variables, bot_attrs_files
transcript_link,
prompt_type_and_id,
dialog_context,
sending_variables,
bot_attrs_files,
)
except Exception as e:
sentry_sdk.capture_exception(e)
@@ -277,17 +126,25 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
hyps_all_docs += hyp_one_doc

# having got responses for all docs, let's make one response from them

# just return the response if we have one document and one response
if len(hyps_all_docs) == 1:
if len(hyps_all_docs) == 1 and prompt_type_local != "weekly_report":
hypotheses = hyps_all_docs
else:
# earlier we set prompt_type_and_id for weekly_report to full_report for each doc,
# now we need to set it back
prompt_type_and_id = f"{prompt_type_local}__{document_in_use_id}"
hypotheses, bot_attrs_files = compose_and_upload_final_response(
hyps_all_docs, prompt_type_and_id, dialog_context, sending_variables, bot_attrs_files
)
try:
hypotheses, bot_attrs_files = compose_and_upload_final_response(
hyps_all_docs, prompt_type_and_id, dialog_context, sending_variables, bot_attrs_files
)
except Exception as e:
sentry_sdk.capture_exception(e)
logger.exception(e)
hypotheses = []

# for full report and weekly report, add formatting
if prompt_type_local == "weekly_report" or prompt_type_local == "full_report":
hypotheses = postprocess_formatting(hypotheses, prompt_type=prompt_type_local)
# if there are docs in human utt attributes, but no processed docs in use were found
elif docs_in_attributes:
hyp_excuse = """Sorry, I failed to process the file you provided. \
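
The helpers imported from .utils at the top of response.py are not shown in this commit page. compose_and_upload_final_response and get_and_upload_response_for_one_doc appear verbatim in the deleted block above (only the latter's first argument changed from raw text to a transcript link), while the other two can be inferred from the deleted inline logic. A hedged reconstruction — not the actual utils file:

# Hedged reconstruction of two helpers moved to scenario/utils.py, inferred
# from the inline logic deleted from response.py above; the real file is not
# part of this diff, and the env-var default is an assumption.
import os
import re
import requests

LONG_SUMMARY_REQUEST = re.compile(r"(detailed)|(long)", flags=re.IGNORECASE)
SHORT_SUMMARY_REQUEST = re.compile(r"(short)|(concise)", flags=re.IGNORECASE)
FILE_SERVER_TIMEOUT = float(os.getenv("FILE_SERVER_TIMEOUT", 30))


def set_correct_type_and_id(request: str, prompt_type_local: str, document_in_use_id: str):
    # weekly reports are assembled from per-document full reports first;
    # the caller resets prompt_type_and_id back to weekly_report afterwards
    if prompt_type_local == "weekly_report":
        return prompt_type_local, f"full_report__{document_in_use_id}"
    # summary length is steered by the wording of the user's request
    if prompt_type_local == "summary":
        if LONG_SUMMARY_REQUEST.search(request):
            prompt_type_local += "_long"
        elif SHORT_SUMMARY_REQUEST.search(request):
            prompt_type_local += "_short"
    return prompt_type_local, f"{prompt_type_local}__{document_in_use_id}"


def get_older_gen_response(prompt_type_and_id: str, bot_attrs_files: dict) -> str:
    # re-download a hypothesis generated earlier for the same request and document
    hypothesis_link = bot_attrs_files[prompt_type_and_id]
    return requests.get(hypothesis_link, timeout=FILE_SERVER_TIMEOUT).text
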
