
Commit

Merge pull request #605 from deeppavlov/dev
Release v1.13.0
dilyararimovna committed Jan 23, 2024
2 parents 8dab819 + 141a8db commit 00bba31
Showing 78 changed files with 5,080 additions and 54,987 deletions.
38 changes: 38 additions & 0 deletions annotators/emotion_detection/Dockerfile
@@ -0,0 +1,38 @@
FROM nvidia/cuda:12.1.1-base-ubuntu20.04

RUN apt update
RUN apt install -y python3.9
RUN apt install -y git python3-pip

ARG VIDEO_PRETRAINED
ARG TEXT_PRETRAINED
ARG MODEL_PATH
ARG MULTIMODAL_MODEL
ARG REDUNDANT_FEATURES

ENV VIDEO_PRETRAINED=$VIDEO_PRETRAINED
ENV TEXT_PRETRAINED=$TEXT_PRETRAINED
ENV MULTIMODAL_MODEL=$MULTIMODAL_MODEL
ENV MODEL_PATH=$MODEL_PATH
ENV REDUNDANT_FEATURES=$REDUNDANT_FEATURES

WORKDIR /src

COPY . /src
RUN mkdir /data
RUN pip install -r requirements.txt

RUN apt install -y ffmpeg=7:4.2.7-0ubuntu0.1 libsm6=2:1.2.3-1 libxext6=2:1.3.4-0ubuntu1

RUN pip install gdown==4.7.1

RUN git clone https://github.com/anna-a-m/MultimodalERC /data/repo && cd /data/repo && git reset --hard 84097d442b23b5a9238b5090a04e2625741314ae

RUN mv -f /data/repo/* /data/ && rm -rf /data/repo

RUN touch /data/multimodal_concat/__init__.py

RUN apt-get install -y wget

RUN wget -O models http://files.deeppavlov.ai/dream_data/emotion_detection/emotion_detection_v1.tar.gz && tar -xf models -C /data/
RUN wget -O redundant_feat http://files.deeppavlov.ai/dream_data/emotion_detection/redundant_feat.txt && mv -f redundant_feat /data/
4 changes: 4 additions & 0 deletions annotators/emotion_detection/aux.py
@@ -0,0 +1,4 @@
import sys

# Make the MultimodalERC code importable: the Dockerfile clones it into /data
# and creates /data/multimodal_concat/__init__.py.
sys.path.append("/data")
sys.path.append("/data/multimodal_concat")
16 changes: 16 additions & 0 deletions annotators/emotion_detection/requirements.txt
@@ -0,0 +1,16 @@
pandas==1.5.3
scikit-learn==1.3.0
tqdm==4.64.1
opencv-python==4.7.0.68
opensmile==2.4.2
sentry-sdk==1.15.0
torch==1.13.1
transformers==4.31.0
fastapi==0.103.0
blinker==1.5.0
pydantic==2.3.0
numpy==1.24.4
starlette==0.27.0
uvicorn==0.23.2
Pillow==9.3.0
wandb==0.13.9
199 changes: 199 additions & 0 deletions annotators/emotion_detection/server.py
@@ -0,0 +1,199 @@
import logging
import os
import opensmile
import torch
import numpy as np
import sentry_sdk
import cv2
import aux # noqa: F401

from multimodal_concat.models import MultimodalClassificationModel, MainModel
from multimodal_concat.utils import prepare_models

from fastapi import FastAPI
from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoTokenizer, AutoProcessor
from typing import List
from urllib.request import urlretrieve

sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"))

label2id = {
"anger": 0,
"disgust": 1,
"fear": 2,
"joy": 3,
"neutral": 4,
"sadness": 5,
"surprise": 6,
}
num_labels = 7
text_model, video_model, audio_model = prepare_models(num_labels, os.getenv("MODEL_PATH"))

logger = logging.getLogger(__name__)


def sample_frame_indices(seg_len, clip_len=16, frame_sample_rate=4, mode="video"):
    converted_len = int(clip_len * frame_sample_rate)
    converted_len = min(converted_len, seg_len - 1)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    if mode == "video":
        indices = np.linspace(start_idx, end_idx, num=clip_len)
    else:
        indices = np.linspace(start_idx, end_idx, num=clip_len * frame_sample_rate)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


def get_frames(
    file_path,
    clip_len=16,
):
    cap = cv2.VideoCapture(file_path)
    v_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = sample_frame_indices(v_len)

    frames = []
    for fn in range(v_len):
        success, frame = cap.read()
        if success is False:
            continue
        if fn in indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            res = cv2.resize(frame, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
            frames.append(res)
    cap.release()

    if len(frames) < clip_len:
        add_num = clip_len - len(frames)
        frames_to_add = [frames[-1]] * add_num
        frames.extend(frames_to_add)

    return frames


def create_final_model():
    multi_model = MultimodalClassificationModel(
        text_model,
        video_model,
        audio_model,
        num_labels,
        input_size=4885,
        hidden_size=512,
    )
    checkpoint = torch.load(os.getenv("MULTIMODAL_MODEL"))
    multi_model.load_state_dict(checkpoint)

    device = "cuda"
    return MainModel(multi_model, device=device)


def process_text(input_tokens: str):
    text_model_name = os.getenv("TEXT_PRETRAINED")
    logger.info(f"{text_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(text_model_name)

    return tokenizer(
        input_tokens,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )


def process_video(video_path: str):
    video_frames = get_frames(video_path)

    video_model_name = os.getenv("VIDEO_PRETRAINED")
    video_feature_extractor = AutoProcessor.from_pretrained(video_model_name)

    return video_feature_extractor(videos=video_frames, return_tensors="pt")


def process_audio(file_path: str):
    smile = opensmile.Smile(
        opensmile.FeatureSet.ComParE_2016,
        opensmile.FeatureLevel.Functionals,
        sampling_rate=16000,
        resample=True,
        num_workers=5,
        verbose=True,
    )

    redundant_features = os.getenv("REDUNDANT_FEATURES")
    with open(redundant_features, "r") as features_file:
        redundant_features_list = features_file.read().split(",")

    audio_features = smile.process_files([file_path])
    audio_features = audio_features.drop(columns=redundant_features_list, inplace=False)
    return audio_features.values.reshape(audio_features.shape[0], 1, audio_features.shape[1])


def inference(text: str, video_path: str):
    text_encoding = process_text(text)
    video_encoding = process_video(video_path)
    audio_features = process_audio(video_path)
    batch = {
        "text": text_encoding,
        "video": video_encoding,
        "audio": audio_features,
        "label": None,
    }
    label = final_model(batch)
    id2label = {v: k for k, v in label2id.items()}
    return id2label[int(label.detach().cpu())]


def predict_emotion(text: str, video_path: str):
    try:
        emotion = inference(text, video_path)
        logger.warning(f"{emotion}")
        return emotion
    except Exception as e:
        sentry_sdk.capture_exception(e)
        raise e


final_model = create_final_model()


class EmotionsPayload(BaseModel):
    personality: List[str]
    video_path: List[str]


def subinfer(msg_text: str, video_path: str):
emotion = "Emotion detection unsuccessfull. An error occured during inference."
filepath = "undefined"
try:
filename = video_path.split("=")[-1]
filepath = f"/data/{filename}"
urlretrieve(video_path, filepath)
if not os.path.exists(filepath):
raise ValueError(f"Failed to retrieve videofile from {filepath}")
emotion = predict_emotion(msg_text + " ", filepath)
logger.info(f"Detected emotion: {jsonable_encoder(emotion)}")
except Exception as e:
raise ValueError(f"The message format is correct, but: {e}")

return emotion


app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/model")
def infer(payload: EmotionsPayload):
logger.info(f"Emotion Detection: {payload}")
emotion = [subinfer(p[0], p[1]) for p in zip(payload.personality, payload.video_path)]
return jsonable_encoder(emotion)
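
For reference, the new annotator exposes a single POST endpoint, /model, which pairs each entry of `personality` with the matching `video_path` URL and returns one emotion label per pair. A minimal client sketch follows; the host, port, and file-server URL are assumptions based on the service config added in this commit, not guaranteed by the diff itself.

# Hypothetical client call to the emotion_detection annotator.
# Assumes the service is reachable on localhost:8040 and that the video URL is
# downloadable; the annotator saves the file under /data/ using everything
# after the last "=" in the URL as the file name.
import requests

payload = {
    "personality": ["Hello, how was your day?"],
    "video_path": ["http://files:3000/file?file=example.mp4"],  # illustrative URL
}

response = requests.post("http://localhost:8040/model", json=payload, timeout=120)
print(response.json())  # e.g. ["joy"] -- one label per (text, video) pair
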
@@ -0,0 +1,9 @@
SERVICE_PORT: 8040
SERVICE_NAME: emotion_detection
CUDA_VISIBLE_DEVICES: 0
VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
TEXT_PRETRAINED: "bert-large-uncased"
MULTIMODAL_MODEL: "final_model.pt"
REDUNDANT_FEATURES: "redundant_features.txt"
MODEL_PATH: "/data/"
PREFIX: "Detect emotions:"
@@ -0,0 +1,29 @@
name: emotion-detection
endpoints:
  - model
compose:
  env_file:
    - .env
  build:
    args:
      SERVICE_PORT: 8040
      SERVICE_NAME: emotion_detection
      VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
      TEXT_PRETRAINED: "bert-large-uncased"
      MULTIMODAL_MODEL: "final_model.pt"
      REDUNDANT_FEATURES: "redundant_features.txt"
      MODEL_PATH: "/data/"
      PREFIX: "Detect emotions:"
    context: .
    dockerfile: ./annotators/emotion_detection/Dockerfile
  command: uvicorn server:app --host 0.0.0.0 --port 8040
  deploy:
    resources:
      limits:
        memory: 1G
      reservations:
        memory: 1G
  environment:
    - CUDA_VISIBLE_DEVICES=0
  ports:
    - 8040:8040
2 changes: 1 addition & 1 deletion annotators/kbqa/tests/test_kbqa.py
@@ -13,7 +13,7 @@
        ),
        (
            {"x_init": ["How old is Donald Trump?"], "entities": [["Donald Trump"]], "entity_tags": [[["per", 1.0]]]},
            "Donald Trump is 77 years old.",
            "Donald Trump is 78 years old.",
        ),
    ],
)
6 changes: 5 additions & 1 deletion annotators/personality_detection/Dockerfile
@@ -2,10 +2,14 @@ FROM python:3.7

WORKDIR /src

RUN git clone https://github.com/jkwieser/personality-detection-text.git /personality-detection-text
COPY . /src
RUN mkdir /data

COPY requirements.txt .

RUN pip install -r requirements.txt

RUN wget -O models http://files.deeppavlov.ai/dream_data/personality_detection/personality_detection_models_v0.tar.gz && tar -xf models
RUN mv -f models_v0 /data/models && ls /data/models && rm -rf models_v0

COPY . .
1 change: 0 additions & 1 deletion annotators/personality_detection/requirements.txt
@@ -1,5 +1,4 @@
scikit-learn==0.22.1
plotly==4.14.3
pandas==1.2.4
uvicorn==0.13.4
fastapi==0.65.1
14 changes: 7 additions & 7 deletions annotators/personality_detection/server.py
@@ -12,13 +12,13 @@

sentry_sdk.init(os.getenv("SENTRY_DSN"))

cEXT = pickle.load(open("/personality-detection-text/data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("/personality-detection-text/data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("/personality-detection-text/data/models/cAGR.p", "rb"))
cCON = pickle.load(open("/personality-detection-text/data/models/cCON.p", "rb"))
cOPN = pickle.load(open("/personality-detection-text/data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("/personality-detection-text/data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("/personality-detection-text/data/models/vectorizer_30.p", "rb"))
cEXT = pickle.load(open("/data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("/data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("/data/models/cAGR.p", "rb"))
cCON = pickle.load(open("/data/models/cCON.p", "rb"))
cOPN = pickle.load(open("/data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("/data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("/data/models/vectorizer_30.p", "rb"))


logger = logging.getLogger(__name__)
17 changes: 6 additions & 11 deletions annotators/speech_function_classifier/Dockerfile
@@ -4,21 +4,16 @@ WORKDIR /src

RUN apt-get update && apt-get install -y --allow-unauthenticated git curl && rm -rf /var/lib/apt/lists/*

RUN mkdir /models && \
    curl https://files.deeppavlov.ai/alexaprize_data/speech_function_classifier.tar.gz --output arch.tgz && \
    tar zxfv arch.tgz && \
    rm -rf arch.tgz && \
    curl https://files.deeppavlov.ai/dream/speech_function_classifier/res_cor.json --output data/res_cor.json && \
    mv data/* /models && \
    rm -rf data
# RUN git clone -b feat/speech-fn https://github.com/deeppavlov/DeepPavlov.git && \
# cd DeepPavlov && pip install -e . && cd .. && rm -rf DeepPavlov && python -m deeppavlov download speech_fn

COPY annotators/speech_function_classifier/requirements.txt requirements.txt

RUN pip install -r requirements.txt && \
    python -c "import nltk; nltk.download('punkt')"
RUN pip install git+https://github.com/deeppavlov/DeepPavlov.git@e2ae2d91edad414736d15520eb1820cb6c06565d && \
    python -m deeppavlov download speech_fn

COPY . ./

RUN pip install -r requirements.txt

ARG SERVICE_NAME
ENV SERVICE_NAME ${SERVICE_NAME}

