import os
import sys
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from manim import logger
from manim_voiceover.helper import (
create_dotenv_file,
prompt_ask_missing_extras,
remove_bookmarks,
)
from manim_voiceover.services.base import SpeechService
# The Azure Speech SDK is an optional extra; emit an actionable install hint
# instead of letting a bare ImportError surface at import time.
try:
    import azure.cognitiveservices.speech as speechsdk
except ImportError:
    logger.error(
        'Missing packages. Run `pip install "manim-voiceover[azure]"` to use AzureService.'
    )

# Pick up AZURE_SUBSCRIPTION_KEY / AZURE_SERVICE_REGION from a .env file
# located relative to the current working directory, if one exists.
load_dotenv(find_dotenv(usecwd=True))
def serialize_word_boundary(wb: dict) -> dict:
    """Convert a word-boundary event dict into a JSON-serializable dict.

    Args:
        wb (dict): Word-boundary data captured during synthesis. The
            ``duration_milliseconds`` value is a ``datetime.timedelta``;
            every other value is already a plain str/int.

    Returns:
        dict: The same fields with the duration flattened to integer
        milliseconds, safe to dump as JSON.
    """
    return {
        "audio_offset": wb["audio_offset"],
        # Bug fix: timedelta.microseconds holds only the sub-second part,
        # so durations of one second or more were truncated (1.25s -> 250).
        # total_seconds() covers the full duration.
        "duration_milliseconds": int(
            wb["duration_milliseconds"].total_seconds() * 1000
        ),
        "text_offset": wb["text_offset"],
        "word_length": wb["word_length"],
        "text": wb["text"],
        "boundary_type": wb["boundary_type"],
    }
def create_dotenv_azure():
    """Interactively create a .env file holding the Azure credentials.

    Points the user at the documentation, prompts for the two required
    variables via :func:`create_dotenv_file`, and exits the process so
    Manim can be re-run with the new environment in place.

    Raises:
        Exception: If the user declines to create the .env file and the
            variables are still unset.
    """
    logger.info(
        "Check out https://voiceover.manim.community/en/stable/services.html#azureservice to learn how to create an account and get your subscription key."
    )
    created = create_dotenv_file(["AZURE_SUBSCRIPTION_KEY", "AZURE_SERVICE_REGION"])
    if not created:
        raise Exception(
            "The environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION are not set. Please set them or create a .env file with the variables."
        )
    logger.info("The .env file has been created. Please run Manim again.")
    # A fresh process is needed for the new environment to take effect.
    sys.exit()
class AzureService(SpeechService):
    """Speech service for Azure TTS API."""

    def __init__(
        self,
        voice: str = "en-US-AriaNeural",
        style: str = None,
        output_format: str = "Audio48Khz192KBitRateMonoMp3",
        prosody: dict = None,
        **kwargs,
    ):
        """
        Args:
            voice (str, optional): The voice to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts>`__ for all the available options. Defaults to ``en-US-AriaNeural``.
            style (str, optional): The style to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#style>`__ to see how you can see available styles for a given voice. Defaults to None.
            output_format (str, optional): The output format to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs>`__ for all the available options. Defaults to ``Audio48Khz192KBitRateMonoMp3``.
            prosody (dict, optional): Global prosody settings to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup#adjust-prosody>`__ for all the available options. Defaults to None.
        """
        # Offer to install the azure extra if the SDK import failed above.
        prompt_ask_missing_extras(
            "azure.cognitiveservices.speech", "azure", "AzureService"
        )
        self.voice = voice
        self.style = style
        self.output_format = output_format
        self.prosody = prosody
        SpeechService.__init__(self, **kwargs)

    def _build_ssml(self, inner: str, prosody: dict):
        """Wrap ``inner`` text in the SSML envelope for the configured voice.

        Args:
            inner (str): Plain text to synthesize (bookmarks already removed).
            prosody (dict): Prosody attributes to apply, or None.

        Returns:
            tuple: ``(ssml, initial_offset)``. Azure reports word-boundary
            text offsets relative to the full SSML document, so
            ``initial_offset`` (the length of the opening markup) is later
            subtracted to rebase offsets onto the inner text.

        Raises:
            ValueError: If ``prosody`` is given but is not a dict.
        """
        ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="%s">
""" % (
            self.voice
        )
        ssml_end = r"""
</voice>
</speak>
"""
        if prosody is not None:
            if not isinstance(prosody, dict):
                raise ValueError(
                    "The prosody argument must be a dict that contains at least one of the following keys: 'pitch', 'contour', 'range', 'rate', 'volume'."
                )
            prosody_opening_tag = (
                "<prosody "
                + " ".join(
                    ['%s="%s"' % (key, str(val)) for key, val in prosody.items()]
                )
                + ">"
            )
            ssml_beginning = ssml_beginning + prosody_opening_tag
            ssml_end = "</prosody>" + ssml_end
        if self.style is not None:
            ssml_beginning = ssml_beginning + (
                '<mstts:express-as style="%s">' % self.style
            )
            ssml_end = "</mstts:express-as>" + ssml_end
        return ssml_beginning + inner + ssml_end, len(ssml_beginning)

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """Synthesize ``text`` with the Azure TTS API, using the local cache.

        Args:
            text (str): Text to synthesize; may contain bookmark tags, which
                are stripped before being sent to Azure.
            cache_dir (str, optional): Cache directory; defaults to the
                service-level ``self.cache_dir``.
            path (str, optional): Output audio filename; auto-generated from
                the input hash when None.

        Returns:
            dict: Metadata including the SSML sent, serialized word
            boundaries, and the audio file name.

        Raises:
            Exception: If synthesis is canceled/fails, or if credentials are
                missing and the user declines to create a .env file.
        """
        # Bookmarks are Manim-side markup; Azure must not receive them.
        inner = remove_bookmarks(text)
        if cache_dir is None:
            cache_dir = self.cache_dir

        # A per-call prosody dict overrides the service-level default.
        prosody = kwargs.get("prosody", self.prosody)
        ssml, initial_offset = self._build_ssml(inner, prosody)

        input_data = {
            "input_text": text,
            "ssml": ssml,
            "service": "azure",
            "config": {
                "voice": self.voice,
                "style": self.style,
                "output_format": self.output_format,
                "prosody": self.prosody,
            },
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_audio_basename(input_data) + ".mp3"
        else:
            audio_path = path

        try:
            azure_subscription_key = os.environ["AZURE_SUBSCRIPTION_KEY"]
            azure_service_region = os.environ["AZURE_SERVICE_REGION"]
        except KeyError:
            logger.error(
                "Could not find the environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION. Microsoft Azure's text-to-speech API needs account credentials to connect. You can create an account for free and (as of writing this) get a free quota of TTS minutes."
            )
            # Prompts for credentials and exits; never returns normally.
            create_dotenv_azure()

        speech_config = speechsdk.SpeechConfig(
            subscription=azure_subscription_key,
            region=azure_service_region,
        )
        speech_config.set_speech_synthesis_output_format(
            speechsdk.SpeechSynthesisOutputFormat[self.output_format]
        )
        audio_config = speechsdk.audio.AudioOutputConfig(
            filename=str(Path(cache_dir) / audio_path)
        )
        speech_service = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_config
        )

        word_boundaries = []

        def process_event(evt):
            # Event attributes are name-mangled with a leading underscore;
            # strip it to get plain field names.
            result = {label[1:]: val for label, val in evt.__dict__.items()}
            result["boundary_type"] = result["boundary_type"].name
            # Rebase the offset from the full SSML onto the inner text.
            result["text_offset"] = result["text_offset"] - initial_offset
            return result

        # Collect word boundaries as they stream in during synthesis.
        speech_service.synthesis_word_boundary.connect(
            lambda evt: word_boundaries.append(process_event(evt))
        )
        speech_synthesis_result = speech_service.speak_ssml_async(ssml).get()

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "ssml": ssml,
            "word_boundaries": [serialize_word_boundary(wb) for wb in word_boundaries],
            "original_audio": audio_path,
        }

        if (
            speech_synthesis_result.reason
            == speechsdk.ResultReason.SynthesizingAudioCompleted
        ):
            pass
        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_synthesis_result.cancellation_details
            logger.error(
                "Speech synthesis canceled: {}".format(cancellation_details.reason)
            )
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                if cancellation_details.error_details:
                    logger.error(
                        "Error details: {}".format(cancellation_details.error_details)
                    )
                    # Bad credentials are the most common failure; offer to
                    # re-enter them interactively.
                    if "authentication" in cancellation_details.error_details.lower():
                        logger.error(
                            "The authentication credentials are invalid. Please check the environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION."
                        )
                        logger.info(
                            "Would you like to enter new values for the variables in the .env file? [Y/n]"
                        )
                        if input().lower() in ["y", "yes", ""]:
                            create_dotenv_azure()
            raise Exception("Speech synthesis failed")

        return json_dict