From 176660b4424d10b76858b5bceb3bb4d57dfdd22f Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:45:05 +0800
Subject: [PATCH 1/4] support new Azure speech voices

---
 app/config/config.py  |   5 +-
 app/services/voice.py | 132 ++++++++++++++++++++++++++++++++++++++++--
 config.example.toml   |   8 ++-
 requirements.txt      |   8 ++-
 webui/Main.py         |  26 +++++----
 webui/i18n/de.json    |   2 +
 webui/i18n/en.json    |   2 +
 webui/i18n/zh.json    |   2 +
 8 files changed, 167 insertions(+), 18 deletions(-)

diff --git a/app/config/config.py b/app/config/config.py
index ca8a20e..414b444 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -36,6 +36,8 @@ def save_config():
         _cfg["app"] = app
         _cfg["whisper"] = whisper
         _cfg["pexels"] = pexels
+        _cfg["azure"] = azure
+        _cfg["ui"] = ui
         f.write(toml.dumps(_cfg))
 
 
@@ -43,6 +45,7 @@
 app = _cfg.get("app", {})
 whisper = _cfg.get("whisper", {})
 pexels = _cfg.get("pexels", {})
+azure = _cfg.get("azure", {})
 ui = _cfg.get("ui", {})
 
 hostname = socket.gethostname()
@@ -53,7 +56,7 @@
 project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
 project_description = _cfg.get("project_description", "https://github.com/harry0703/MoneyPrinterTurbo")
-project_version = _cfg.get("project_version", "1.1.1")
+project_version = _cfg.get("project_version", "1.1.2")
 reload_debug = False
 
 imagemagick_path = app.get("imagemagick_path", "")
diff --git a/app/services/voice.py b/app/services/voice.py
index 03fa104..5b6a460 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -1,6 +1,7 @@
 import asyncio
 import os
 import re
+from datetime import datetime
 from xml.sax.saxutils import unescape
 from edge_tts.submaker import mktimestamp
 from loguru import logger
@@ -8,10 +9,11 @@
 import edge_tts
 from moviepy.video.tools import subtitles
 
+from app.config import config
 from app.utils import utils
 
 
-def get_all_voices(filter_locals=None) -> list[str]:
+def get_all_azure_voices(filter_locals=None) -> list[str]:
     if filter_locals is None:
         filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
     voices_str = """
@@ -956,6 +958,34 @@ def get_all_voices(filter_locals=None) -> list[str]:
 
 Name: zu-ZA-ThembaNeural
 Gender: Male
+
+
+Name: en-US-AvaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-AndrewMultilingualNeural-V2
+Gender: Male
+
+Name: en-US-EmmaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-BrianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-FlorianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-SeraphinaMultilingualNeural-V2
+Gender: Female
+
+Name: fr-FR-RemyMultilingualNeural-V2
+Gender: Male
+
+Name: fr-FR-VivienneMultilingualNeural-V2
+Gender: Female
+
+Name: zh-CN-XiaoxiaoMultilingualNeural-V2
+Gender: Female
     """.strip()
     voices = []
     name = ''
@@ -986,11 +1016,25 @@ def get_all_voices(filter_locals=None) -> list[str]:
 def parse_voice_name(name: str):
     # zh-CN-XiaoyiNeural-Female
     # zh-CN-YunxiNeural-Male
+    # zh-CN-XiaoxiaoMultilingualNeural-V2-Female
     name = name.replace("-Female", "").replace("-Male", "").strip()
     return name
 
 
+def is_azure_v2_voice(voice_name: str):
+    voice_name = parse_voice_name(voice_name)
+    if voice_name.endswith("-V2"):
+        return voice_name.replace("-V2", "").strip()
+    return ""
+
+
 def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+    if is_azure_v2_voice(voice_name):
+        return azure_tts_v2(text, voice_name, voice_file)
+    return azure_tts_v1(text, voice_name, voice_file)
+
+
+def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
     text = text.strip()
     for i in range(3):
         try:
@@ -1019,6 +1063,80 @@ async def _do() -> SubMaker:
     return None
 
 
+def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+    if not is_azure_v2_voice(voice_name):
+        logger.error(f"invalid voice name: {voice_name}")
+        raise ValueError(f"invalid voice name: {voice_name}")
+    voice_name = is_azure_v2_voice(voice_name)
+    text = text.strip()
+
+    def _format_duration_to_offset(duration) -> int:
+        if isinstance(duration, str):
+            time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
+            milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
+                    time_obj.microsecond // 1000)
+            return milliseconds * 10000
+
+        if isinstance(duration, int):
+            return duration
+
+        return 0
+
+    for i in range(3):
+        try:
+            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+
+            import azure.cognitiveservices.speech as speechsdk
+
+            sub_maker = SubMaker()
+
+            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
+                # print('WordBoundary event:')
+                # print('\tBoundaryType: {}'.format(evt.boundary_type))
+                # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
+                # print('\tDuration: {}'.format(evt.duration))
+                # print('\tText: {}'.format(evt.text))
+                # print('\tTextOffset: {}'.format(evt.text_offset))
+                # print('\tWordLength: {}'.format(evt.word_length))
+
+                duration = _format_duration_to_offset(str(evt.duration))
+                offset = _format_duration_to_offset(evt.audio_offset)
+                sub_maker.subs.append(evt.text)
+                sub_maker.offset.append((offset, offset + duration))
+
+            # Creates an instance of a speech config with the specified subscription key and service region.
+            speech_key = config.azure.get("speech_key", "")
+            service_region = config.azure.get("speech_region", "")
+            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
+            speech_config = speechsdk.SpeechConfig(subscription=speech_key,
+                                                   region=service_region)
+            speech_config.speech_synthesis_voice_name = voice_name
+            # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
+            #                            value='true')
+            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
+                                       value='true')
+
+            speech_config.set_speech_synthesis_output_format(
+                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
+            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
+                                                             speech_config=speech_config)
+            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
+
+            result = speech_synthesizer.speak_text_async(text).get()
+            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
+                return sub_maker
+            elif result.reason == speechsdk.ResultReason.Canceled:
+                cancellation_details = result.cancellation_details
+                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
+                if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
+            logger.warning(f"azure v2 speech synthesis did not complete: {voice_file}")
+        except Exception as e:
+            logger.error(f"failed, error: {str(e)}")
+    return None
+
+
 def _format_text(text: str) -> str:
     # text = text.replace("\n", " ")
     text = text.replace("[", " ")
@@ -1131,8 +1249,12 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
 
 
 if __name__ == "__main__":
-    voices = get_all_voices()
-    print(voices)
+    voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
+    voice_name = parse_voice_name(voice_name)
+    voice_name = is_azure_v2_voice(voice_name)
+    print(voice_name)
+
+    voices = get_all_azure_voices()
     print(len(voices))
 
@@ -1140,6 +1262,7 @@
     async def _do():
         temp_dir = utils.storage_dir("temp")
         voice_names = [
+            "zh-CN-XiaoxiaoMultilingualNeural-V2",
             # 女性
             "zh-CN-XiaoxiaoNeural",
             "zh-CN-XiaoyiNeural",
@@ -1174,6 +1297,7 @@
 业绩解读
 利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
     """
+    text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"
     text = _format_text(text)
     lines = utils.split_string_by_punctuations(text)
 
@@ -1182,7 +1306,7 @@
         for voice_name in voice_names:
             voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
             subtitle_file = f"{temp_dir}/tts.mp3.srt"
-            sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
+            sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
             create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
             audio_duration = get_audio_duration(sub_maker)
             print(f"voice: {voice_name}, audio duration: {audio_duration}s")
diff --git a/config.example.toml b/config.example.toml
index 048e4c0..c19ac57 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -161,4 +161,10 @@
     ### Example: "http://user:pass@proxy:1234"
     ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
     # http = "http://10.10.1.10:3128"
-    # https = "http://10.10.1.10:1080"
\ No newline at end of file
+    # https = "http://10.10.1.10:1080"
+
+[azure]
+    # Azure Speech API Key
+    # Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
+    speech_key = ""
+    speech_region = ""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 28358af..db91b1a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,10 @@ g4f~=0.2.5.4
 dashscope~=1.15.0
 google.generativeai~=0.4.1
 python-multipart~=0.0.9
-redis==5.0.3
\ No newline at end of file
+redis==5.0.3
+# With pillow~=10.3.0, resizing video fails with "'PIL.Image' has no attribute 'ANTIALIAS'"
+# Installing opencv-python works around the ANTIALIAS error
+opencv-python
+# For Azure speech synthesis
+# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
+azure-cognitiveservices-speech~=1.37.0
\ No newline at end of file
diff --git a/webui/Main.py b/webui/Main.py
index 16ddaef..481f78e 100644
--- a/webui/Main.py
+++ b/webui/Main.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import time
 
 # Add the root directory of the project to the system path to allow importing modules from the project
 root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
@@ -165,7 +166,6 @@ def tr(key):
         code = selected_language.split(" - ")[0].strip()
         st.session_state['ui_language'] = code
         config.ui['language'] = code
-        config.save_config()
 
 with middle_config_panel:
     # openai
@@ -207,8 +207,6 @@ def tr(key):
         if st_llm_account_id:
             config.app[f"{llm_provider}_account_id"] = st_llm_account_id
 
-    config.save_config()
-
 with right_config_panel:
     pexels_api_keys = config.app.get("pexels_api_keys", [])
     if isinstance(pexels_api_keys, str):
         pexels_api_key = pexels_api_key.replace(" ", "")
         if pexels_api_key:
             config.app["pexels_api_keys"] = pexels_api_key.split(",")
-            config.save_config()
 
 panel = st.columns(3)
 left_panel = panel[0]
@@ -302,20 +299,20 @@ def tr(key):
                                               index=0)
     with st.container(border=True):
         st.write(tr("Audio Settings"))
-        voices = voice.get_all_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US"])
+        voices = voice.get_all_azure_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US", "fr-FR"])
         friendly_names = {
-            voice: voice.
+            v: v.
             replace("Female", tr("Female")).
             replace("Male", tr("Male")).
             replace("Neural", "") for
-            voice in voices}
+            v in voices}
         saved_voice_name = config.ui.get("voice_name", "")
         saved_voice_name_index = 0
         if saved_voice_name in friendly_names:
             saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
         else:
-            for i, voice in enumerate(voices):
-                if voice.lower().startswith(st.session_state['ui_language'].lower()):
+            for i, v in enumerate(voices):
+                if v.lower().startswith(st.session_state['ui_language'].lower()):
                     saved_voice_name_index = i
                     break
@@ -326,7 +323,13 @@ def tr(key):
             voice_name = list(friendly_names.keys())[list(friendly_names.values()).index(selected_friendly_name)]
         params.voice_name = voice_name
         config.ui['voice_name'] = voice_name
-        config.save_config()
+        if voice.is_azure_v2_voice(voice_name):
+            saved_azure_speech_region = config.azure.get("speech_region", "")
+            saved_azure_speech_key = config.azure.get("speech_key", "")
+            azure_speech_region = st.text_input(tr("Speech Region"), value=saved_azure_speech_region)
+            azure_speech_key = st.text_input(tr("Speech Key"), value=saved_azure_speech_key, type="password")
+            config.azure["speech_region"] = azure_speech_region
+            config.azure["speech_key"] = azure_speech_key
 
         params.voice_volume = st.selectbox(tr("Speech Volume"),
                                            options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],
                                            index=2)
@@ -363,7 +366,6 @@ def tr(key):
             saved_font_name_index = font_names.index(saved_font_name)
         params.font_name = st.selectbox(tr("Font"), font_names, index=saved_font_name_index)
         config.ui['font_name'] = params.font_name
-        config.save_config()
 
         subtitle_positions = [
             (tr("Top"), "top"),
@@ -446,3 +448,5 @@ def log_received(msg):
             open_task_folder(task_id)
             logger.info(tr("Video Generation Completed"))
             scroll_to_bottom()
+
+config.save_config()
diff --git a/webui/i18n/de.json b/webui/i18n/de.json
index d68899b..4e3b22f 100644
--- a/webui/i18n/de.json
+++ b/webui/i18n/de.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos",
     "Audio Settings": "**Audio Einstellungen**",
     "Speech Synthesis": "Sprachausgabe",
+    "Speech Region": "Region(:red[Erforderlich,[Region abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[Erforderlich,[API Key abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "Lautstärke der Sprachausgabe",
     "Male": "Männlich",
     "Female": "Weiblich",
diff --git a/webui/i18n/en.json b/webui/i18n/en.json
index 5fca842..472248f 100644
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
     "Audio Settings": "**Audio Settings**",
     "Speech Synthesis": "Speech Synthesis Voice",
+    "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "Speech Volume (1.0 represents 100%)",
     "Male": "Male",
     "Female": "Female",
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index 28f719d..5dbc4bc 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "同时生成视频数量",
     "Audio Settings": "**音频设置**",
     "Speech Synthesis": "朗读声音(:red[尽量与文案语言保持一致])",
+    "Speech Region": "服务区域(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "朗读音量(1.0表示100%)",
     "Male": "男性",
     "Female": "女性",

From 2e58d7ccf211b57620300989def5a3ab196eec2a Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:46:24 +0800
Subject: [PATCH 2/4] fix the bug where clips were not closed

---
 app/services/video.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/app/services/video.py b/app/services/video.py
index 3a6a61b..814be27 100644
--- a/app/services/video.py
+++ b/app/services/video.py
@@ -100,17 +100,18 @@ def combine_videos(combined_video_path: str,
         clips.append(clip)
         video_duration += clip.duration
 
-    final_clip = concatenate_videoclips(clips)
-    final_clip = final_clip.set_fps(30)
+    video_clip = concatenate_videoclips(clips)
+    video_clip = video_clip.set_fps(30)
     logger.info(f"writing")
     # https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
-    final_clip.write_videofile(filename=combined_video_path,
+    video_clip.write_videofile(filename=combined_video_path,
                                threads=threads,
                                logger=None,
                                temp_audiofile_path=output_dir,
                                audio_codec="aac",
                                fps=30,
                                )
+    video_clip.close()
    logger.success(f"completed")
    return combined_video_path
 
@@ -263,7 +264,7 @@ def create_text_clip(subtitle_item):
                                logger=None,
                                fps=30,
                                )
-
+    video_clip.close()
     logger.success(f"completed")

From 1e96357f00f3521698650854bc2fe455b62903a3 Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:46:56 +0800
Subject: [PATCH 3/4] fix the bug where the last subtitle line was missing

---
 app/utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/utils/utils.py b/app/utils/utils.py
index 3a813f0..b7041dc 100644
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -188,7 +188,7 @@ def split_string_by_punctuations(s):
         else:
             result.append(txt.strip())
             txt = ""
-
+    result.append(txt.strip())
     # filter empty string
     result = list(filter(None, result))
     return result

From d4eb7bc3335342b31a2fe93708fb912c75cb483e Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:47:10 +0800
Subject: [PATCH 4/4] optimize code

---
 app/controllers/v1/video.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py
index e6e613e..ab1a3a0 100644
--- a/app/controllers/v1/video.py
+++ b/app/controllers/v1/video.py
@@ -91,7 +91,7 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
         sm.state.delete_task(task_id)
         logger.success(f"video deleted: {utils.to_json(task)}")
-        return utils.get_response(200, task)
+        return utils.get_response(200)
 
     raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
 
@@ -190,4 +190,5 @@ async def download_video(_: Request, file_path: str):
     headers = {
         "Content-Disposition": f"attachment; filename={filename}{extension}"
     }
-    return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}", media_type=f'video/{extension[1:]}')
+    return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
+                        media_type=f'video/{extension[1:]}')