Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add external speech region control by using an external subtitle file. #159

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
92 changes: 73 additions & 19 deletions autosub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import wave
import json
import requests
import pysubs2
try:
from json.decoder import JSONDecodeError
except ImportError:
Expand All @@ -34,7 +35,9 @@
DEFAULT_CONCURRENCY = 10
DEFAULT_SRC_LANGUAGE = 'en'
DEFAULT_DST_LANGUAGE = 'en'

MAX_EXT_REGION_LENGTH = 10000
# Maximum length, in milliseconds, of a single speech-to-text region
# when using external speech region control; longer regions are split.

def percentile(arr, percent):
"""
Expand Down Expand Up @@ -210,41 +213,67 @@ def find_speech_regions(filename, frame_width=4096, min_region_size=0.5, max_reg

threshold = percentile(energies, 0.2)

elapsed_time = 0
region_end = 0

regions = []
region_start = None

for energy in energies:
is_silence = energy <= threshold
max_exceeded = region_start and elapsed_time - region_start >= max_region_size
max_exceeded = region_start and region_end - region_start >= max_region_size

if (max_exceeded or is_silence) and region_start:
if elapsed_time - region_start >= min_region_size:
regions.append((region_start, elapsed_time))
if region_end - region_start >= min_region_size:
regions.append((region_start, region_end))
region_start = None

elif (not region_start) and (not is_silence):
region_start = elapsed_time
elapsed_time += chunk_duration
region_start = region_end
region_end += chunk_duration
return regions


def generate_subtitles( # pylint: disable=too-many-locals,too-many-arguments
def generate_subtitles( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements
source_path,
output=None,
concurrency=DEFAULT_CONCURRENCY,
src_language=DEFAULT_SRC_LANGUAGE,
dst_language=DEFAULT_DST_LANGUAGE,
subtitle_file_format=DEFAULT_SUBTITLE_FORMAT,
api_key=None,
ext_regions=None,
ext_max_length=MAX_EXT_REGION_LENGTH
):
"""
Given an input audio/video file, generate subtitles in the specified language and format.
"""
audio_filename, audio_rate = extract_audio(source_path)

regions = find_speech_regions(audio_filename)
if not ext_regions:
regions = find_speech_regions(audio_filename)
else:
regions = []
for event in ext_regions.events:
if not event.is_comment:
# not a comment region
if event.duration <= ext_max_length:
regions.append((float(event.start) / 1000.0,
float(event.start + event.duration) / 1000.0))
else:
# split too long regions
elapsed_time = event.duration
start_time = event.start
reader = wave.open(audio_filename)
audio_file_length = float(reader.getnframes()) / float(reader.getframerate())
if float(elapsed_time) / 1000.0 > audio_file_length:
elapsed_time = math.floor(audio_file_length) * 1000
while elapsed_time > ext_max_length:
regions.append((float(start_time) / 1000.0,
float(start_time + ext_max_length) / 1000.0))
elapsed_time = elapsed_time - ext_max_length
start_time = start_time + ext_max_length
regions.append((float(start_time) / 1000.0,
float(start_time + elapsed_time) / 1000.0))

pool = multiprocessing.Pool(concurrency)
converter = FLACConverter(source_path=audio_filename)
Expand Down Expand Up @@ -362,6 +391,11 @@ def main():
parser.add_argument('-o', '--output',
help="Output path for subtitles (by default, subtitles are saved in \
the same directory and name as the source path)")
parser.add_argument('-esr', '--external-speech-regions',
help="Path to the external speech regions, \
which is one of the formats that pysubs2 supports \
and overrides the default method to find speech regions",
nargs="?", metavar="path")
parser.add_argument('-F', '--format', help="Destination subtitle format",
default=DEFAULT_SUBTITLE_FORMAT)
parser.add_argument('-S', '--src-language', help="Language spoken in source file",
Expand Down Expand Up @@ -394,18 +428,38 @@ def main():
return 1

try:
subtitle_file_path = generate_subtitles(
source_path=args.source_path,
concurrency=args.concurrency,
src_language=args.src_language,
dst_language=args.dst_language,
api_key=args.api_key,
subtitle_file_format=args.format,
output=args.output,
)
print("Subtitles file created at {}".format(subtitle_file_path))
if args.external_speech_regions:
print("Using external speech regions.")
ext_regions = pysubs2.SSAFile.load(args.external_speech_regions)
subtitle_file_path = generate_subtitles(
source_path=args.source_path,
concurrency=args.concurrency,
src_language=args.src_language,
dst_language=args.dst_language,
api_key=args.api_key,
subtitle_file_format=args.format,
output=args.output,
ext_regions=ext_regions
)
print("Subtitles file created at {}".format(subtitle_file_path))

else:
subtitle_file_path = generate_subtitles(
source_path=args.source_path,
concurrency=args.concurrency,
src_language=args.src_language,
dst_language=args.dst_language,
api_key=args.api_key,
subtitle_file_format=args.format,
output=args.output
)
print("Subtitles file created at {}".format(subtitle_file_path))

except KeyboardInterrupt:
return 1
except pysubs2.exceptions.Pysubs2Error:
print("Error: pysubs2.exceptions. Check your file format.")
return 1

return 0

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
install_requires=[
'google-api-python-client>=1.4.2',
'requests>=2.3.0',
'pysubs2>=0.2.4',
'pysrt>=1.0.1',
'progressbar2>=3.34.3',
'six>=1.11.0',
Expand Down