From 176660b4424d10b76858b5bceb3bb4d57dfdd22f Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:45:05 +0800
Subject: [PATCH 1/4] support new Azure speech voices

---
 app/config/config.py  |   5 +-
 app/services/voice.py | 132 ++++++++++++++++++++++++++++++++++++++++--
 config.example.toml   |   8 ++-
 requirements.txt      |   8 ++-
 webui/Main.py         |  26 +++++----
 webui/i18n/de.json    |   2 +
 webui/i18n/en.json    |   2 +
 webui/i18n/zh.json    |   2 +
 8 files changed, 167 insertions(+), 18 deletions(-)

diff --git a/app/config/config.py b/app/config/config.py
index ca8a20e..414b444 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -36,6 +36,8 @@ def save_config():
         _cfg["app"] = app
         _cfg["whisper"] = whisper
         _cfg["pexels"] = pexels
+        _cfg["azure"] = azure
+        _cfg["ui"] = ui
         f.write(toml.dumps(_cfg))
 
 
@@ -43,6 +45,7 @@
 app = _cfg.get("app", {})
 whisper = _cfg.get("whisper", {})
 pexels = _cfg.get("pexels", {})
+azure = _cfg.get("azure", {})
 ui = _cfg.get("ui", {})
 
 hostname = socket.gethostname()
@@ -53,7 +56,7 @@
 project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
 project_description = _cfg.get("project_description", "https://github.com/harry0703/MoneyPrinterTurbo")
-project_version = _cfg.get("project_version", "1.1.1")
+project_version = _cfg.get("project_version", "1.1.2")
 reload_debug = False
 
 imagemagick_path = app.get("imagemagick_path", "")
diff --git a/app/services/voice.py b/app/services/voice.py
index 03fa104..5b6a460 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -1,6 +1,7 @@
 import asyncio
 import os
 import re
+from datetime import datetime
 from xml.sax.saxutils import unescape
 from edge_tts.submaker import mktimestamp
 from loguru import logger
@@ -8,10 +9,11 @@
 import edge_tts
 from moviepy.video.tools import subtitles
 
+from app.config import config
 from app.utils import utils
 
 
-def get_all_voices(filter_locals=None) -> list[str]:
+def get_all_azure_voices(filter_locals=None) -> list[str]:
     if filter_locals is None:
         filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
     voices_str = """
@@ -956,6 +958,34 @@ def get_all_voices(filter_locals=None) -> list[str]:
 
 Name: zu-ZA-ThembaNeural
 Gender: Male
+
+
+Name: en-US-AvaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-AndrewMultilingualNeural-V2
+Gender: Male
+
+Name: en-US-EmmaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-BrianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-FlorianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-SeraphinaMultilingualNeural-V2
+Gender: Female
+
+Name: fr-FR-RemyMultilingualNeural-V2
+Gender: Male
+
+Name: fr-FR-VivienneMultilingualNeural-V2
+Gender: Female
+
+Name: zh-CN-XiaoxiaoMultilingualNeural-V2
+Gender: Female
     """.strip()
     voices = []
     name = ''
@@ -986,11 +1016,25 @@ def get_all_voices(filter_locals=None) -> list[str]:
 def parse_voice_name(name: str):
     # zh-CN-XiaoyiNeural-Female
     # zh-CN-YunxiNeural-Male
+    # zh-CN-XiaoxiaoMultilingualNeural-V2-Female
     name = name.replace("-Female", "").replace("-Male", "").strip()
     return name
 
 
+def is_azure_v2_voice(voice_name: str):
+    voice_name = parse_voice_name(voice_name)
+    if voice_name.endswith("-V2"):
+        return voice_name.replace("-V2", "").strip()
+    return ""
+
+
 def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+    if is_azure_v2_voice(voice_name):
+        return azure_tts_v2(text, voice_name, voice_file)
+    return azure_tts_v1(text, voice_name, voice_file)
+
+
+def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
     text = text.strip()
     for i in range(3):
         try:
@@ -1019,6 +1063,80 @@ async def _do() -> SubMaker:
     return None
 
 
+def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+    if not is_azure_v2_voice(voice_name):
+        logger.error(f"invalid voice name: {voice_name}")
+        raise ValueError(f"invalid voice name: {voice_name}")
+    voice_name = is_azure_v2_voice(voice_name)
+    text = text.strip()
+
+    def _format_duration_to_offset(duration) -> int:
+        if isinstance(duration, str):
+            time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
+            milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
+                    time_obj.microsecond // 1000)
+            return milliseconds * 10000
+
+        if isinstance(duration, int):
+            return duration
+
+        return 0
+
+    for i in range(3):
+        try:
+            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+
+            import azure.cognitiveservices.speech as speechsdk
+
+            sub_maker = SubMaker()
+
+            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
+                # print('WordBoundary event:')
+                # print('\tBoundaryType: {}'.format(evt.boundary_type))
+                # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
+                # print('\tDuration: {}'.format(evt.duration))
+                # print('\tText: {}'.format(evt.text))
+                # print('\tTextOffset: {}'.format(evt.text_offset))
+                # print('\tWordLength: {}'.format(evt.word_length))
+
+                duration = _format_duration_to_offset(str(evt.duration))
+                offset = _format_duration_to_offset(evt.audio_offset)
+                sub_maker.subs.append(evt.text)
+                sub_maker.offset.append((offset, offset + duration))
+
+            # Creates an instance of a speech config with the specified subscription key and service region.
+            speech_key = config.azure.get("speech_key", "")
+            service_region = config.azure.get("speech_region", "")
+            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
+            speech_config = speechsdk.SpeechConfig(subscription=speech_key,
+                                                   region=service_region)
+            speech_config.speech_synthesis_voice_name = voice_name
+            # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
+            #                            value='true')
+            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
+                                       value='true')
+
+            speech_config.set_speech_synthesis_output_format(
+                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
+            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
+                                                             speech_config=speech_config)
+            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
+
+            result = speech_synthesizer.speak_text_async(text).get()
+            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
+                return sub_maker
+            elif result.reason == speechsdk.ResultReason.Canceled:
+                cancellation_details = result.cancellation_details
+                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
+                if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
+            logger.warning(f"azure v2 speech synthesis did not complete: {voice_file}")
+        except Exception as e:
+            logger.error(f"failed, error: {str(e)}")
+    return None
+
+
 def _format_text(text: str) -> str:
     # text = text.replace("\n", " ")
     text = text.replace("[", " ")
@@ -1131,8 +1249,12 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
 
 
 if __name__ == "__main__":
-    voices = get_all_voices()
-    print(voices)
+    voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
+    voice_name = parse_voice_name(voice_name)
+    voice_name = is_azure_v2_voice(voice_name)
+    print(voice_name)
+
+    voices = get_all_azure_voices()
     print(len(voices))
 
@@ -1140,6 +1262,7 @@
     async def _do():
         temp_dir = utils.storage_dir("temp")
         voice_names = [
+            "zh-CN-XiaoxiaoMultilingualNeural-V2",
             # 女性
             "zh-CN-XiaoxiaoNeural",
             "zh-CN-XiaoyiNeural",
@@ -1174,6 +1297,7 @@
 业绩解读
 利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
     """
+    text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"
     text = _format_text(text)
     lines = utils.split_string_by_punctuations(text)
 
@@ -1182,7 +1306,7 @@
         for voice_name in voice_names:
             voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
             subtitle_file = f"{temp_dir}/tts.mp3.srt"
-            sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
+            sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
             create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
             audio_duration = get_audio_duration(sub_maker)
             print(f"voice: {voice_name}, audio duration: {audio_duration}s")
diff --git a/config.example.toml b/config.example.toml
index 048e4c0..c19ac57 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -161,4 +161,10 @@
     ### Example: "http://user:pass@proxy:1234"
     ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
     # http = "http://10.10.1.10:3128"
-    # https = "http://10.10.1.10:1080"
\ No newline at end of file
+    # https = "http://10.10.1.10:1080"
+
+[azure]
+    # Azure Speech API Key
+    # Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
+    speech_key = ""
+    speech_region = ""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 28358af..db91b1a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,10 @@ g4f~=0.2.5.4
 dashscope~=1.15.0
 google.generativeai~=0.4.1
 python-multipart~=0.0.9
-redis==5.0.3
\ No newline at end of file
+redis==5.0.3
+# With pillow~=10.3.0, resizing video fails with "'PIL.Image' has no attribute 'ANTIALIAS'"
+# Installing opencv-python works around the ANTIALIAS error
+opencv-python
+# For Azure speech synthesis
+# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
+azure-cognitiveservices-speech~=1.37.0
\ No newline at end of file
diff --git a/webui/Main.py b/webui/Main.py
index 16ddaef..481f78e 100644
--- a/webui/Main.py
+++ b/webui/Main.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import time
 
 # Add the root directory of the project to the system path to allow importing modules from the project
 root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
@@ -165,7 +166,6 @@ def tr(key):
         code = selected_language.split(" - ")[0].strip()
         st.session_state['ui_language'] = code
         config.ui['language'] = code
-        config.save_config()
 
 with middle_config_panel:
     # openai
@@ -207,8 +207,6 @@ def tr(key):
         if st_llm_account_id:
             config.app[f"{llm_provider}_account_id"] = st_llm_account_id
 
-    config.save_config()
-
 with right_config_panel:
     pexels_api_keys = config.app.get("pexels_api_keys", [])
     if isinstance(pexels_api_keys, str):
         pexels_api_key = pexels_api_key.replace(" ", "")
         if pexels_api_key:
             config.app["pexels_api_keys"] = pexels_api_key.split(",")
-            config.save_config()
 
 panel = st.columns(3)
 left_panel = panel[0]
@@ -302,20 +299,20 @@ def tr(key):
                                               index=0)
     with st.container(border=True):
         st.write(tr("Audio Settings"))
-        voices = voice.get_all_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US"])
+        voices = voice.get_all_azure_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US", "fr-FR"])
         friendly_names = {
-            voice: voice.
+            v: v.
             replace("Female", tr("Female")).
             replace("Male", tr("Male")).
             replace("Neural", "") for
-            voice in voices}
+            v in voices}
         saved_voice_name = config.ui.get("voice_name", "")
         saved_voice_name_index = 0
         if saved_voice_name in friendly_names:
             saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
         else:
-            for i, voice in enumerate(voices):
-                if voice.lower().startswith(st.session_state['ui_language'].lower()):
+            for i, v in enumerate(voices):
+                if v.lower().startswith(st.session_state['ui_language'].lower()):
                     saved_voice_name_index = i
                     break
@@ -326,7 +323,13 @@ def tr(key):
             voice_name = list(friendly_names.keys())[list(friendly_names.values()).index(selected_friendly_name)]
         params.voice_name = voice_name
         config.ui['voice_name'] = voice_name
-        config.save_config()
+        if voice.is_azure_v2_voice(voice_name):
+            saved_azure_speech_region = config.azure.get("speech_region", "")
+            saved_azure_speech_key = config.azure.get("speech_key", "")
+            azure_speech_region = st.text_input(tr("Speech Region"), value=saved_azure_speech_region)
+            azure_speech_key = st.text_input(tr("Speech Key"), value=saved_azure_speech_key, type="password")
+            config.azure["speech_region"] = azure_speech_region
+            config.azure["speech_key"] = azure_speech_key
 
         params.voice_volume = st.selectbox(tr("Speech Volume"),
                                            options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],
                                            index=2)
@@ -363,7 +366,6 @@ def tr(key):
             saved_font_name_index = font_names.index(saved_font_name)
         params.font_name = st.selectbox(tr("Font"), font_names, index=saved_font_name_index)
         config.ui['font_name'] = params.font_name
-        config.save_config()
 
         subtitle_positions = [
             (tr("Top"), "top"),
@@ -446,3 +448,5 @@ def log_received(msg):
             open_task_folder(task_id)
             logger.info(tr("Video Generation Completed"))
             scroll_to_bottom()
+
+config.save_config()
diff --git a/webui/i18n/de.json b/webui/i18n/de.json
index d68899b..4e3b22f 100644
--- a/webui/i18n/de.json
+++ b/webui/i18n/de.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos",
     "Audio Settings": "**Audio Einstellungen**",
     "Speech Synthesis": "Sprachausgabe",
+    "Speech Region": "Region(:red[Erforderlich,[Region abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[Erforderlich,[API Key abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "Lautstärke der Sprachausgabe",
     "Male": "Männlich",
     "Female": "Weiblich",
diff --git a/webui/i18n/en.json b/webui/i18n/en.json
index 5fca842..472248f 100644
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
     "Audio Settings": "**Audio Settings**",
     "Speech Synthesis": "Speech Synthesis Voice",
+    "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "Speech Volume (1.0 represents 100%)",
     "Male": "Male",
     "Female": "Female",
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index 28f719d..5dbc4bc 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -23,6 +23,8 @@
     "Number of Videos Generated Simultaneously": "同时生成视频数量",
     "Audio Settings": "**音频设置**",
     "Speech Synthesis": "朗读声音(:red[尽量与文案语言保持一致])",
+    "Speech Region": "服务区域(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "朗读音量(1.0表示100%)",
     "Male": "男性",
     "Female": "女性",

From 2e58d7ccf211b57620300989def5a3ab196eec2a Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:46:24 +0800
Subject: [PATCH 2/4] fix the bug where clips were not closed

---
 app/services/video.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/app/services/video.py b/app/services/video.py
index 3a6a61b..814be27 100644
--- a/app/services/video.py
+++ b/app/services/video.py
@@ -100,17 +100,18 @@ def combine_videos(combined_video_path: str,
         clips.append(clip)
         video_duration += clip.duration
 
-    final_clip = concatenate_videoclips(clips)
-    final_clip = final_clip.set_fps(30)
+    video_clip = concatenate_videoclips(clips)
+    video_clip = video_clip.set_fps(30)
     logger.info(f"writing")
     # https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
-    final_clip.write_videofile(filename=combined_video_path,
+    video_clip.write_videofile(filename=combined_video_path,
                                threads=threads,
                                logger=None,
                                temp_audiofile_path=output_dir,
                                audio_codec="aac",
                                fps=30,
                                )
+    video_clip.close()
    logger.success(f"completed")
    return combined_video_path
 
@@ -263,7 +264,7 @@ def create_text_clip(subtitle_item):
                                logger=None,
                                fps=30,
                                )
-
+    video_clip.close()
     logger.success(f"completed")

From 1e96357f00f3521698650854bc2fe455b62903a3 Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:46:56 +0800
Subject: [PATCH 3/4] fix the bug where the last subtitle line was missing

---
 app/utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/utils/utils.py b/app/utils/utils.py
index 3a813f0..b7041dc 100644
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -188,7 +188,7 @@ def split_string_by_punctuations(s):
         else:
             result.append(txt.strip())
             txt = ""
-
+    result.append(txt.strip())
     # filter empty string
     result = list(filter(None, result))
     return result

From d4eb7bc3335342b31a2fe93708fb912c75cb483e Mon Sep 17 00:00:00 2001
From: harry
Date: Mon, 15 Apr 2024 17:47:10 +0800
Subject: [PATCH 4/4] optimize code

---
 app/controllers/v1/video.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py
index e6e613e..ab1a3a0 100644
--- a/app/controllers/v1/video.py
+++ b/app/controllers/v1/video.py
@@ -91,7 +91,7 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
         sm.state.delete_task(task_id)
         logger.success(f"video deleted: {utils.to_json(task)}")
-        return utils.get_response(200, task)
+        return utils.get_response(200)
 
     raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
 
@@ -190,4 +190,5 @@ async def download_video(_: Request, file_path: str):
     headers = {
         "Content-Disposition": f"attachment; filename={filename}{extension}"
     }
-    return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}", media_type=f'video/{extension[1:]}')
+    return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
+                        media_type=f'video/{extension[1:]}')