
Commit

Merge pull request #605 from deeppavlov/dev
Release v1.13.0
dilyararimovna committed Jan 23, 2024
2 parents 8dab819 + 141a8db commit 00bba31
Showing 78 changed files with 5,080 additions and 54,987 deletions.
38 changes: 38 additions & 0 deletions annotators/emotion_detection/Dockerfile
@@ -0,0 +1,38 @@
FROM nvidia/cuda:12.1.1-base-ubuntu20.04

RUN apt update
RUN apt install -y python3.9
RUN apt install -y git python3-pip

ARG VIDEO_PRETRAINED
ARG TEXT_PRETRAINED
ARG MODEL_PATH
ARG MULTIMODAL_MODEL
ARG REDUNDANT_FEATURES

ENV VIDEO_PRETRAINED=$VIDEO_PRETRAINED
ENV TEXT_PRETRAINED=$TEXT_PRETRAINED
ENV MULTIMODAL_MODEL=$MULTIMODAL_MODEL
ENV MODEL_PATH=$MODEL_PATH
ENV REDUNDANT_FEATURES=$REDUNDANT_FEATURES

WORKDIR /src

COPY . /src
RUN mkdir /data
RUN pip install -r requirements.txt

RUN apt install -y ffmpeg=7:4.2.7-0ubuntu0.1 libsm6=2:1.2.3-1 libxext6=2:1.3.4-0ubuntu1

RUN pip install gdown==4.7.1

RUN git clone https://github.com/anna-a-m/MultimodalERC /data/repo && cd /data/repo && git reset --hard 84097d442b23b5a9238b5090a04e2625741314ae

RUN mv -f /data/repo/* /data/ && rm -rf /data/repo

RUN touch /data/multimodal_concat/__init__.py

RUN apt-get install -y wget

RUN wget -O models http://files.deeppavlov.ai/dream_data/emotion_detection/emotion_detection_v1.tar.gz && tar -xf models -C /data/
RUN wget -O redundant_feat http://files.deeppavlov.ai/dream_data/emotion_detection/redundant_feat.txt && mv -f redundant_feat /data/
4 changes: 4 additions & 0 deletions annotators/emotion_detection/aux.py
@@ -0,0 +1,4 @@
import sys

# Make the MultimodalERC code importable: the Dockerfile clones it into /data
# and creates /data/multimodal_concat/__init__.py.
sys.path.append("/data")
sys.path.append("/data/multimodal_concat")
16 changes: 16 additions & 0 deletions annotators/emotion_detection/requirements.txt
@@ -0,0 +1,16 @@
pandas==1.5.3
scikit-learn==1.3.0
tqdm==4.64.1
opencv-python==4.7.0.68
opensmile==2.4.2
sentry-sdk==1.15.0
torch==1.13.1
transformers==4.31.0
fastapi==0.103.0
blinker==1.5.0
pydantic==2.3.0
numpy==1.24.4
starlette==0.27.0
uvicorn==0.23.2
Pillow==9.3.0
wandb==0.13.9
199 changes: 199 additions & 0 deletions annotators/emotion_detection/server.py
@@ -0,0 +1,199 @@
import logging
import os
import opensmile
import torch
import numpy as np
import sentry_sdk
import cv2
import aux # noqa: F401

from multimodal_concat.models import MultimodalClassificationModel, MainModel
from multimodal_concat.utils import prepare_models

from fastapi import FastAPI
from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoTokenizer, AutoProcessor
from typing import List
from urllib.request import urlretrieve

sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"))

label2id = {
"anger": 0,
"disgust": 1,
"fear": 2,
"joy": 3,
"neutral": 4,
"sadness": 5,
"surprise": 6,
}
num_labels = 7
text_model, video_model, audio_model = prepare_models(num_labels, os.getenv("MODEL_PATH"))

logger = logging.getLogger(__name__)


def sample_frame_indices(seg_len, clip_len=16, frame_sample_rate=4, mode="video"):
    converted_len = int(clip_len * frame_sample_rate)
    converted_len = min(converted_len, seg_len - 1)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    if mode == "video":
        indices = np.linspace(start_idx, end_idx, num=clip_len)
    else:
        indices = np.linspace(start_idx, end_idx, num=clip_len * frame_sample_rate)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


def get_frames(
    file_path,
    clip_len=16,
):
    cap = cv2.VideoCapture(file_path)
    v_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = sample_frame_indices(v_len)

    frames = []
    for fn in range(v_len):
        success, frame = cap.read()
        if success is False:
            continue
        if fn in indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            res = cv2.resize(frame, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
            frames.append(res)
    cap.release()

    if len(frames) < clip_len:
        add_num = clip_len - len(frames)
        frames_to_add = [frames[-1]] * add_num
        frames.extend(frames_to_add)

    return frames


def create_final_model():
    multi_model = MultimodalClassificationModel(
        text_model,
        video_model,
        audio_model,
        num_labels,
        input_size=4885,
        hidden_size=512,
    )
    checkpoint = torch.load(os.getenv("MULTIMODAL_MODEL"))
    multi_model.load_state_dict(checkpoint)

    device = "cuda"
    return MainModel(multi_model, device=device)


def process_text(input_tokens: str):
    text_model_name = os.getenv("TEXT_PRETRAINED")
    logger.info(f"{text_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(text_model_name)

    return tokenizer(
        input_tokens,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )


def process_video(video_path: str):
    video_frames = get_frames(video_path)

    video_model_name = os.getenv("VIDEO_PRETRAINED")
    video_feature_extractor = AutoProcessor.from_pretrained(video_model_name)

    return video_feature_extractor(videos=video_frames, return_tensors="pt")


def process_audio(file_path: str):
    smile = opensmile.Smile(
        opensmile.FeatureSet.ComParE_2016,
        opensmile.FeatureLevel.Functionals,
        sampling_rate=16000,
        resample=True,
        num_workers=5,
        verbose=True,
    )

    redundant_features = os.getenv("REDUNDANT_FEATURES")
    with open(redundant_features, "r") as features_file:
        redundant_features_list = features_file.read().split(",")

    audio_features = smile.process_files([file_path])
    audio_features = audio_features.drop(columns=redundant_features_list, inplace=False)
    return audio_features.values.reshape(audio_features.shape[0], 1, audio_features.shape[1])


def inference(text: str, video_path: str):
    text_encoding = process_text(text)
    video_encoding = process_video(video_path)
    audio_features = process_audio(video_path)
    batch = {
        "text": text_encoding,
        "video": video_encoding,
        "audio": audio_features,
        "label": None,
    }
    label = final_model(batch)
    id2label = {v: k for k, v in label2id.items()}
    return id2label[int(label.detach().cpu())]


def predict_emotion(text: str, video_path: str):
    try:
        emotion = inference(text, video_path)
        logger.warning(f"{emotion}")
        return emotion
    except Exception as e:
        sentry_sdk.capture_exception(e)
        raise e


final_model = create_final_model()


class EmotionsPayload(BaseModel):
    personality: List[str]
    video_path: List[str]


def subinfer(msg_text: str, video_path: str):
emotion = "Emotion detection unsuccessfull. An error occured during inference."
filepath = "undefined"
try:
filename = video_path.split("=")[-1]
filepath = f"/data/{filename}"
urlretrieve(video_path, filepath)
if not os.path.exists(filepath):
raise ValueError(f"Failed to retrieve videofile from {filepath}")
emotion = predict_emotion(msg_text + " ", filepath)
logger.info(f"Detected emotion: {jsonable_encoder(emotion)}")
except Exception as e:
raise ValueError(f"The message format is correct, but: {e}")

return emotion


app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/model")
def infer(payload: EmotionsPayload):
logger.info(f"Emotion Detection: {payload}")
emotion = [subinfer(p[0], p[1]) for p in zip(payload.personality, payload.video_path)]
return jsonable_encoder(emotion)
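
For reference, the new annotator exposes a single POST endpoint, /model, which pairs each entry of `personality` with the matching `video_path` URL and returns one emotion label per pair. A minimal client sketch follows; the host, port, and file-server URL are assumptions based on the service config added in this commit, not guaranteed by the diff itself.

# Hypothetical client call to the emotion_detection annotator.
# Assumes the service is reachable on localhost:8040 and that the video URL is
# downloadable; the annotator saves the file under /data/ using everything
# after the last "=" in the URL as the file name.
import requests

payload = {
    "personality": ["Hello, how was your day?"],
    "video_path": ["http://files:3000/file?file=example.mp4"],  # illustrative URL
}

response = requests.post("http://localhost:8040/model", json=payload, timeout=120)
print(response.json())  # e.g. ["joy"] -- one label per (text, video) pair
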
@@ -0,0 +1,9 @@
SERVICE_PORT: 8040
SERVICE_NAME: emotion_detection
CUDA_VISIBLE_DEVICES: 0
VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
TEXT_PRETRAINED: "bert-large-uncased"
MULTIMODAL_MODEL: "final_model.pt"
REDUNDANT_FEATURES: "redundant_features.txt"
MODEL_PATH: "/data/"
PREFIX: "Detect emotions:"
@@ -0,0 +1,29 @@
name: emotion-detection
endpoints:
  - model
compose:
  env_file:
    - .env
  build:
    args:
      SERVICE_PORT: 8040
      SERVICE_NAME: emotion_detection
      VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
      TEXT_PRETRAINED: "bert-large-uncased"
      MULTIMODAL_MODEL: "final_model.pt"
      REDUNDANT_FEATURES: "redundant_features.txt"
      MODEL_PATH: "/data/"
      PREFIX: "Detect emotions:"
    context: .
    dockerfile: ./annotators/emotion_detection/Dockerfile
  command: uvicorn server:app --host 0.0.0.0 --port 8040
  deploy:
    resources:
      limits:
        memory: 1G
      reservations:
        memory: 1G
  environment:
    - CUDA_VISIBLE_DEVICES=0
  ports:
    - 8040:8040
2 changes: 1 addition & 1 deletion annotators/kbqa/tests/test_kbqa.py
@@ -13,7 +13,7 @@
        ),
        (
            {"x_init": ["How old is Donald Trump?"], "entities": [["Donald Trump"]], "entity_tags": [[["per", 1.0]]]},
            "Donald Trump is 77 years old.",
            "Donald Trump is 78 years old.",
        ),
    ],
)
6 changes: 5 additions & 1 deletion annotators/personality_detection/Dockerfile
@@ -2,10 +2,14 @@ FROM python:3.7

WORKDIR /src

RUN git clone https://github.com/jkwieser/personality-detection-text.git /personality-detection-text
COPY . /src
RUN mkdir /data

COPY requirements.txt .

RUN pip install -r requirements.txt

RUN wget -O models http://files.deeppavlov.ai/dream_data/personality_detection/personality_detection_models_v0.tar.gz && tar -xf models
RUN mv -f models_v0 /data/models && ls /data/models && rm -rf models_v0

COPY . .
1 change: 0 additions & 1 deletion annotators/personality_detection/requirements.txt
@@ -1,5 +1,4 @@
scikit-learn==0.22.1
plotly==4.14.3
pandas==1.2.4
uvicorn==0.13.4
fastapi==0.65.1
14 changes: 7 additions & 7 deletions annotators/personality_detection/server.py
@@ -12,13 +12,13 @@

sentry_sdk.init(os.getenv("SENTRY_DSN"))

cEXT = pickle.load(open("/personality-detection-text/data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("/personality-detection-text/data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("/personality-detection-text/data/models/cAGR.p", "rb"))
cCON = pickle.load(open("/personality-detection-text/data/models/cCON.p", "rb"))
cOPN = pickle.load(open("/personality-detection-text/data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("/personality-detection-text/data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("/personality-detection-text/data/models/vectorizer_30.p", "rb"))
cEXT = pickle.load(open("/data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("/data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("/data/models/cAGR.p", "rb"))
cCON = pickle.load(open("/data/models/cCON.p", "rb"))
cOPN = pickle.load(open("/data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("/data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("/data/models/vectorizer_30.p", "rb"))


logger = logging.getLogger(__name__)
17 changes: 6 additions & 11 deletions annotators/speech_function_classifier/Dockerfile
@@ -4,21 +4,16 @@ WORKDIR /src

RUN apt-get update && apt-get install -y --allow-unauthenticated git curl && rm -rf /var/lib/apt/lists/*

RUN mkdir /models && \
    curl https://files.deeppavlov.ai/alexaprize_data/speech_function_classifier.tar.gz --output arch.tgz && \
    tar zxfv arch.tgz && \
    rm -rf arch.tgz && \
    curl https://files.deeppavlov.ai/dream/speech_function_classifier/res_cor.json --output data/res_cor.json && \
    mv data/* /models && \
    rm -rf data
# RUN git clone -b feat/speech-fn https://github.com/deeppavlov/DeepPavlov.git && \
# cd DeepPavlov && pip install -e . && cd .. && rm -rf DeepPavlov && python -m deeppavlov download speech_fn

COPY annotators/speech_function_classifier/requirements.txt requirements.txt

RUN pip install -r requirements.txt && \
    python -c "import nltk; nltk.download('punkt')"
RUN pip install git+https://github.com/deeppavlov/DeepPavlov.git@e2ae2d91edad414736d15520eb1820cb6c06565d && \
    python -m deeppavlov download speech_fn

COPY . ./

RUN pip install -r requirements.txt

ARG SERVICE_NAME
ENV SERVICE_NAME ${SERVICE_NAME}

