import os
import sys
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from manim import logger
from manim_voiceover.helper import (
create_dotenv_file,
prompt_ask_missing_extras,
remove_bookmarks,
)
from manim_voiceover.services.base import SpeechService
# The Azure Speech SDK is an optional extra; emit an actionable install hint
# instead of letting a bare ImportError surface at import time.
try:
    import azure.cognitiveservices.speech as speechsdk
except ImportError:
    logger.error(
        'Missing packages. Run `pip install "manim-voiceover[azure]"` to use AzureService.'
    )

# Pick up AZURE_SUBSCRIPTION_KEY / AZURE_SERVICE_REGION from a .env file
# located relative to the current working directory, if one exists.
load_dotenv(find_dotenv(usecwd=True))
def serialize_word_boundary(wb: dict) -> dict:
    """Convert a word-boundary event dict into a JSON-serializable dict.

    Args:
        wb (dict): Word-boundary data captured during synthesis. The
            ``duration_milliseconds`` value is a ``datetime.timedelta``;
            every other value is already a plain str/int.

    Returns:
        dict: The same fields with the duration flattened to integer
        milliseconds, safe to dump as JSON.
    """
    return {
        "audio_offset": wb["audio_offset"],
        # Bug fix: timedelta.microseconds holds only the sub-second part,
        # so durations of one second or more were truncated (1.25s -> 250).
        # total_seconds() covers the full duration.
        "duration_milliseconds": int(
            wb["duration_milliseconds"].total_seconds() * 1000
        ),
        "text_offset": wb["text_offset"],
        "word_length": wb["word_length"],
        "text": wb["text"],
        "boundary_type": wb["boundary_type"],
    }
def create_dotenv_azure():
    """Interactively create a .env file holding the Azure credentials.

    Points the user at the documentation, prompts for the two required
    variables via :func:`create_dotenv_file`, and exits the process so
    Manim can be re-run with the new environment in place.

    Raises:
        Exception: If the user declines to create the .env file and the
            variables are still unset.
    """
    logger.info(
        "Check out https://voiceover.manim.community/en/stable/services.html#azureservice to learn how to create an account and get your subscription key."
    )
    created = create_dotenv_file(["AZURE_SUBSCRIPTION_KEY", "AZURE_SERVICE_REGION"])
    if not created:
        raise Exception(
            "The environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION are not set. Please set them or create a .env file with the variables."
        )
    logger.info("The .env file has been created. Please run Manim again.")
    # A fresh process is needed for the new environment to take effect.
    sys.exit()
class AzureService(SpeechService):
    """Speech service for Azure TTS API."""

    def __init__(
        self,
        voice: str = "en-US-AriaNeural",
        style: str = None,
        output_format: str = "Audio48Khz192KBitRateMonoMp3",
        prosody: dict = None,
        **kwargs,
    ):
        """
        Args:
            voice (str, optional): The voice to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts>`__ for all the available options. Defaults to ``en-US-AriaNeural``.
            style (str, optional): The style to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#style>`__ to see how you can see available styles for a given voice. Defaults to None.
            output_format (str, optional): The output format to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs>`__ for all the available options. Defaults to ``Audio48Khz192KBitRateMonoMp3``.
            prosody (dict, optional): Global prosody settings to use. See the `API page <https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup#adjust-prosody>`__ for all the available options. Defaults to None.
        """
        # Offer to install the azure extra if the SDK import failed above.
        prompt_ask_missing_extras(
            "azure.cognitiveservices.speech", "azure", "AzureService"
        )
        self.voice = voice
        self.style = style
        self.output_format = output_format
        self.prosody = prosody
        SpeechService.__init__(self, **kwargs)

    def _build_ssml(self, inner: str, prosody: dict):
        """Wrap ``inner`` text in the SSML envelope for the configured voice.

        Args:
            inner (str): Plain text to synthesize (bookmarks already removed).
            prosody (dict): Prosody attributes to apply, or None.

        Returns:
            tuple: ``(ssml, initial_offset)``. Azure reports word-boundary
            text offsets relative to the full SSML document, so
            ``initial_offset`` (the length of the opening markup) is later
            subtracted to rebase offsets onto the inner text.

        Raises:
            ValueError: If ``prosody`` is given but is not a dict.
        """
        ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="%s">
""" % (
            self.voice
        )
        ssml_end = r"""
</voice>
</speak>
"""
        if prosody is not None:
            if not isinstance(prosody, dict):
                raise ValueError(
                    "The prosody argument must be a dict that contains at least one of the following keys: 'pitch', 'contour', 'range', 'rate', 'volume'."
                )
            prosody_opening_tag = (
                "<prosody "
                + " ".join(
                    ['%s="%s"' % (key, str(val)) for key, val in prosody.items()]
                )
                + ">"
            )
            ssml_beginning = ssml_beginning + prosody_opening_tag
            ssml_end = "</prosody>" + ssml_end
        if self.style is not None:
            ssml_beginning = ssml_beginning + (
                '<mstts:express-as style="%s">' % self.style
            )
            ssml_end = "</mstts:express-as>" + ssml_end
        return ssml_beginning + inner + ssml_end, len(ssml_beginning)

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """Synthesize ``text`` with the Azure TTS API, using the local cache.

        Args:
            text (str): Text to synthesize; may contain bookmark tags, which
                are stripped before being sent to Azure.
            cache_dir (str, optional): Cache directory; defaults to the
                service-level ``self.cache_dir``.
            path (str, optional): Output audio filename; auto-generated from
                the input hash when None.

        Returns:
            dict: Metadata including the SSML sent, serialized word
            boundaries, and the audio file name.

        Raises:
            Exception: If synthesis is canceled/fails, or if credentials are
                missing and the user declines to create a .env file.
        """
        # Bookmarks are Manim-side markup; Azure must not receive them.
        inner = remove_bookmarks(text)
        if cache_dir is None:
            cache_dir = self.cache_dir

        # A per-call prosody dict overrides the service-level default.
        prosody = kwargs.get("prosody", self.prosody)
        ssml, initial_offset = self._build_ssml(inner, prosody)

        input_data = {
            "input_text": text,
            "ssml": ssml,
            "service": "azure",
            "config": {
                "voice": self.voice,
                "style": self.style,
                "output_format": self.output_format,
                "prosody": self.prosody,
            },
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_audio_basename(input_data) + ".mp3"
        else:
            audio_path = path

        try:
            azure_subscription_key = os.environ["AZURE_SUBSCRIPTION_KEY"]
            azure_service_region = os.environ["AZURE_SERVICE_REGION"]
        except KeyError:
            logger.error(
                "Could not find the environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION. Microsoft Azure's text-to-speech API needs account credentials to connect. You can create an account for free and (as of writing this) get a free quota of TTS minutes."
            )
            # Prompts for credentials and exits; never returns normally.
            create_dotenv_azure()

        speech_config = speechsdk.SpeechConfig(
            subscription=azure_subscription_key,
            region=azure_service_region,
        )
        speech_config.set_speech_synthesis_output_format(
            speechsdk.SpeechSynthesisOutputFormat[self.output_format]
        )
        audio_config = speechsdk.audio.AudioOutputConfig(
            filename=str(Path(cache_dir) / audio_path)
        )
        speech_service = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_config
        )

        word_boundaries = []

        def process_event(evt):
            # Event attributes are name-mangled with a leading underscore;
            # strip it to get plain field names.
            result = {label[1:]: val for label, val in evt.__dict__.items()}
            result["boundary_type"] = result["boundary_type"].name
            # Rebase the offset from the full SSML onto the inner text.
            result["text_offset"] = result["text_offset"] - initial_offset
            return result

        # Collect word boundaries as they stream in during synthesis.
        speech_service.synthesis_word_boundary.connect(
            lambda evt: word_boundaries.append(process_event(evt))
        )
        speech_synthesis_result = speech_service.speak_ssml_async(ssml).get()

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "ssml": ssml,
            "word_boundaries": [serialize_word_boundary(wb) for wb in word_boundaries],
            "original_audio": audio_path,
        }

        if (
            speech_synthesis_result.reason
            == speechsdk.ResultReason.SynthesizingAudioCompleted
        ):
            pass
        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_synthesis_result.cancellation_details
            logger.error(
                "Speech synthesis canceled: {}".format(cancellation_details.reason)
            )
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                if cancellation_details.error_details:
                    logger.error(
                        "Error details: {}".format(cancellation_details.error_details)
                    )
                    # Bad credentials are the most common failure; offer to
                    # re-enter them interactively.
                    if "authentication" in cancellation_details.error_details.lower():
                        logger.error(
                            "The authentication credentials are invalid. Please check the environment variables AZURE_SUBSCRIPTION_KEY and AZURE_SERVICE_REGION."
                        )
                        logger.info(
                            "Would you like to enter new values for the variables in the .env file? [Y/n]"
                        )
                        if input().lower() in ["y", "yes", ""]:
                            create_dotenv_azure()
            raise Exception("Speech synthesis failed")

        return json_dict