From 0bb8156ddda4fde4bbdda5f48d21fbbc34a2b0e8 Mon Sep 17 00:00:00 2001
From: Tianzi Cai
Date: Fri, 16 Nov 2018 14:00:27 -0800
Subject: [PATCH] feat: video speech transcription [(#1849)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1849)

---
 samples/analyze/analyze.py      | 54 +++++++++++++++++++++++++++++++++
 samples/analyze/analyze_test.py |  8 +++++
 2 files changed, 62 insertions(+)

diff --git a/samples/analyze/analyze.py b/samples/analyze/analyze.py
index 5a13d01c..ab16e453 100644
--- a/samples/analyze/analyze.py
+++ b/samples/analyze/analyze.py
@@ -229,6 +229,55 @@ def analyze_shots(path):
     # [END video_analyze_shots]
 
 
+def speech_transcription(path):
+    # [START video_speech_transcription]
+    """Transcribe speech from a video stored on GCS."""
+    from google.cloud import videointelligence
+
+    video_client = videointelligence.VideoIntelligenceServiceClient()
+    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]
+
+    config = videointelligence.types.SpeechTranscriptionConfig(
+        language_code='en-US',
+        enable_automatic_punctuation=True)
+    video_context = videointelligence.types.VideoContext(
+        speech_transcription_config=config)
+
+    operation = video_client.annotate_video(
+        path, features=features,
+        video_context=video_context)
+
+    print('\nProcessing video for speech transcription.')
+
+    result = operation.result(timeout=600)
+
+    # There is only one annotation_result since only
+    # one video is processed.
+    annotation_results = result.annotation_results[0]
+    for speech_transcription in annotation_results.speech_transcriptions:
+
+        # The number of alternatives for each transcription is limited by
+        # SpeechTranscriptionConfig.max_alternatives.
+        # Each alternative is a different possible transcription
+        # and has its own confidence score.
+        for alternative in speech_transcription.alternatives:
+            print('Alternative level information:')
+
+            print('Transcript: {}'.format(alternative.transcript))
+            print('Confidence: {}\n'.format(alternative.confidence))
+
+            print('Word level information:')
+            for word_info in alternative.words:
+                word = word_info.word
+                start_time = word_info.start_time
+                end_time = word_info.end_time
+                print('\t{}s - {}s: {}'.format(
+                    start_time.seconds + start_time.nanos * 1e-9,
+                    end_time.seconds + end_time.nanos * 1e-9,
+                    word))
+    # [END video_speech_transcription]
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description=__doc__,
@@ -246,6 +295,9 @@ def analyze_shots(path):
     analyze_shots_parser = subparsers.add_parser(
         'shots', help=analyze_shots.__doc__)
     analyze_shots_parser.add_argument('path')
+    transcribe_speech_parser = subparsers.add_parser(
+        'transcribe', help=speech_transcription.__doc__)
+    transcribe_speech_parser.add_argument('path')
 
     args = parser.parse_args()
 
@@ -257,3 +309,5 @@ def analyze_shots(path):
         analyze_shots(args.path)
     if args.command == 'explicit_content':
         analyze_explicit_content(args.path)
+    if args.command == 'transcribe':
+        speech_transcription(args.path)
diff --git a/samples/analyze/analyze_test.py b/samples/analyze/analyze_test.py
index 13126c52..f7606f03 100644
--- a/samples/analyze/analyze_test.py
+++ b/samples/analyze/analyze_test.py
@@ -38,3 +38,11 @@ def test_analyze_explicit_content(capsys):
     analyze.analyze_explicit_content('gs://demomaker/cat.mp4')
     out, _ = capsys.readouterr()
     assert 'pornography' in out
+
+
+@pytest.mark.slow
+def test_speech_transcription(capsys):
+    analyze.speech_transcription(
+        'gs://python-docs-samples-tests/video/googlework_short.mp4')
+    out, _ = capsys.readouterr()
+    assert 'cultural' in out