# Source code for manim_voiceover.services.coqui

from pathlib import Path

from manim import logger
from manim_voiceover.helper import prompt_ask_missing_package, remove_bookmarks, wav2mp3
from manim_voiceover.services.base import SpeechService

# Coqui TTS is treated as an optional dependency: a failed import is only
# logged, so this module still imports, but the name ``TTS`` is left undefined.
try:
    from TTS.api import TTS
except ImportError:
    logger.error("Missing packages. Run `pip install TTS` to use CoquiService.")

# Model name used by CoquiService when no ``model_name`` is passed.
# Disabled alternative that would look the name up from the library instead:
# DEFAULT_MODEL = TTS.list_models()[0]
DEFAULT_MODEL = "tts_models/en/ljspeech/tacotron2-DDC"


class CoquiService(SpeechService):
    """Speech service for Coqui TTS.

    Default model: ``tts_models/en/ljspeech/tacotron2-DDC``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        config_path: str = None,
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
        gpu: bool = False,
        speaker_idx: int = 0,
        language_idx: int = 0,
        **kwargs,
    ):
        """Create the Coqui ``TTS`` object and pick the speaker and language.

        Args:
            model_name: Name of the Coqui model, forwarded to ``TTS``.
            config_path: Forwarded to ``TTS`` unchanged.
            vocoder_path: Forwarded to ``TTS`` unchanged.
            vocoder_config_path: Forwarded to ``TTS`` unchanged.
            progress_bar: Forwarded to ``TTS`` unchanged.
            gpu: Forwarded to ``TTS`` unchanged.
            speaker_idx: Index into ``TTS.speakers``; ignored when the model
                reports no speakers (``None``).
            language_idx: Index into ``TTS.languages``; ignored when the model
                reports no languages (``None``).
            **kwargs: Passed on to ``SpeechService.__init__``.
        """
        # Check for the package *before* using ``TTS``: the module-level import
        # only logs on failure, so using the name first would raise NameError
        # and this check would never be reached.
        prompt_ask_missing_package("TTS", "TTS>=0.13.3")
        # Import locally as well, because the module-level import may have
        # failed and left ``TTS`` undefined in the module namespace.
        from TTS.api import TTS

        # Remembered so that it can be part of the cache key.
        self.model_name = model_name
        self.tts = TTS(
            model_name=model_name,
            config_path=config_path,
            vocoder_path=vocoder_path,
            vocoder_config_path=vocoder_config_path,
            progress_bar=progress_bar,
            gpu=gpu,
        )

        # ``speakers`` / ``languages`` are None when the model offers no choice.
        speakers = self.tts.speakers
        self.speaker = speakers[speaker_idx] if speakers is not None else None
        languages = self.tts.languages
        self.language = languages[language_idx] if languages is not None else None

        self.init_kwargs = kwargs
        SpeechService.__init__(self, **kwargs)

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """Synthesize ``text`` to an audio file, reusing a cached result if any.

        Args:
            text: Text to speak. Bookmarks are stripped with
                ``remove_bookmarks`` before synthesis; the original text is
                kept in the returned dict.
            cache_dir: Directory for the audio files. Defaults to
                ``self.cache_dir``.
            path: File name (relative to ``cache_dir``) for the final audio.
                Defaults to ``self.get_audio_basename(...) + ".mp3"``.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The entry from ``self.get_cached_result`` when one exists,
            otherwise a dict with the keys ``input_text``, ``input_data`` and
            ``original_audio``.
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        input_text = remove_bookmarks(text)
        # Everything that determines the generated audio belongs in the cache
        # key; otherwise audio produced by a different model, speaker or
        # language would be returned for the same text.
        input_data = {
            "input_text": text,
            "service": "coqui",
            "model_name": self.model_name,
            "speaker": self.speaker,
            "language": self.language,
        }

        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_audio_basename(input_data) + ".mp3"
        else:
            audio_path = path

        output_path = str(Path(cache_dir) / audio_path)
        # Coqui writes a WAV file next to the target, which is then converted.
        wav_path = Path(output_path).with_suffix(".wav")

        # Text to speech to a file
        self.tts.tts_to_file(
            text=input_text,
            speaker=self.speaker,
            language=self.language,
            file_path=wav_path,
        )
        wav2mp3(wav_path, output_path)

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
            # "word_boundaries": word_boundaries,
        }

        return json_dict