Source code for manim_voiceover.services.base

from abc import ABC, abstractmethod
import typing as t
import os
import json
import sys
import hashlib
from pathlib import Path
from manim import config, logger
from slugify import slugify
from manim_voiceover.defaults import (
    DEFAULT_VOICEOVER_CACHE_DIR,
    DEFAULT_VOICEOVER_CACHE_JSON_FILENAME,
)
from manim_voiceover.helper import (
    append_to_json_file,
    prompt_ask_missing_extras,
    remove_bookmarks,
)
from manim_voiceover.modify_audio import adjust_speed
from manim_voiceover.tracker import AUDIO_OFFSET_RESOLUTION


def timestamps_to_word_boundaries(segments):
    """Flatten transcription segments into a list of word-boundary records.

    Args:
        segments: Iterable of mappings, each with a ``"words"`` entry that is
            an iterable of mappings holding a ``"word"`` string and a
            ``"start"`` number.

    Returns:
        list: One dict per word, in input order, with the keys
        ``audio_offset`` (``start`` scaled by ``AUDIO_OFFSET_RESOLUTION`` and
        truncated to ``int``), ``text_offset`` (sum of the lengths of all
        preceding words), ``word_length``, ``text`` and ``boundary_type``
        (always ``"Word"``).
    """
    boundaries = []
    running_offset = 0

    # Walk every word of every segment as one flat stream.
    flat_words = (entry for seg in segments for entry in seg["words"])
    for entry in flat_words:
        token = entry["word"]
        token_length = len(token)
        boundaries.append(
            dict(
                audio_offset=int(entry["start"] * AUDIO_OFFSET_RESOLUTION),
                text_offset=running_offset,
                word_length=token_length,
                text=token,
                boundary_type="Word",
            )
        )
        # Offsets advance by the word length only; no separator is inserted
        # between consecutive words.
        running_offset += token_length

    return boundaries


class SpeechService(ABC):
    """Abstract base class for a speech service.

    Subclasses implement :meth:`generate_from_text`. This base class handles
    the cache directory, optional transcription of the generated audio,
    speed adjustment and appending each result to the cache JSON file.
    """

    def __init__(
        self,
        global_speed: float = 1.00,
        cache_dir: t.Optional[str] = None,
        transcription_model: t.Optional[str] = None,
        transcription_kwargs: t.Optional[dict] = None,
        **kwargs,
    ):
        """
        Args:
            global_speed (float, optional): The speed at which to play the audio.
                Defaults to 1.00.
            cache_dir (str, optional): The directory to save the audio
                files to. Defaults to ``voiceovers/``.
            transcription_model (str, optional): The
                `OpenAI Whisper model <https://github.com/openai/whisper#available-models-and-languages>`_
                to use for transcription. Defaults to None.
            transcription_kwargs (dict, optional): Keyword arguments to
                pass to the transcribe() function. Defaults to None, which
                is treated as an empty dict.
        """
        self.global_speed = global_speed

        if cache_dir is not None:
            self.cache_dir = cache_dir
        else:
            self.cache_dir = Path(config.media_dir) / DEFAULT_VOICEOVER_CACHE_DIR

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.cache_dir, exist_ok=True)

        # Both attributes must exist before set_transcription() reads them.
        self.transcription_model = None
        self._whisper_model = None
        self.set_transcription(model=transcription_model, kwargs=transcription_kwargs)

        self.additional_kwargs = kwargs

    def _wrap_generate_from_text(
        self, text: str, path: t.Optional[str] = None, **kwargs
    ) -> dict:
        """Generate speech for ``text`` and post-process the result.

        Normalizes whitespace, calls :meth:`generate_from_text`, transcribes
        the audio when the result has no ``"word_boundaries"`` and a Whisper
        model is loaded, invokes :meth:`audio_callback`, applies the global
        speed, and appends the result to the cache JSON file.

        Args:
            text (str): The text to synthesize speech from.
            path (str, optional): Forwarded to :meth:`generate_from_text`.
                Defaults to None.
            **kwargs: Forwarded to :meth:`generate_from_text` and
                :meth:`audio_callback`.

        Returns:
            dict: The dictionary returned by :meth:`generate_from_text`,
            with ``"final_audio"`` added, plus ``"word_boundaries"`` and
            ``"transcribed_text"`` when transcription ran.
        """
        # Replace newlines with spaces, reduce multiple consecutive spaces to single
        text = " ".join(text.split())

        dict_ = self.generate_from_text(text, cache_dir=None, path=path, **kwargs)
        original_audio = dict_["original_audio"]

        # Check whether word boundaries exist and if not run stt
        if "word_boundaries" not in dict_ and self._whisper_model is not None:
            transcription_result = self._whisper_model.transcribe(
                str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs
            )
            logger.info("Transcription: " + transcription_result.text)
            word_boundaries = timestamps_to_word_boundaries(
                transcription_result.segments_to_dicts()
            )
            dict_["word_boundaries"] = word_boundaries
            dict_["transcribed_text"] = transcription_result.text

        # Audio callback
        self.audio_callback(original_audio, dict_, **kwargs)

        if self.global_speed != 1:
            split_path = os.path.splitext(original_audio)
            adjusted_path = split_path[0] + "_adjusted" + split_path[1]

            adjust_speed(
                str(Path(self.cache_dir) / dict_["original_audio"]),
                str(Path(self.cache_dir) / adjusted_path),
                self.global_speed,
            )
            dict_["final_audio"] = adjusted_path
            # Rescale the offsets by the same factor as the audio speed.
            if "word_boundaries" in dict_:
                for word_boundary in dict_["word_boundaries"]:
                    word_boundary["audio_offset"] = int(
                        word_boundary["audio_offset"] / self.global_speed
                    )
        else:
            dict_["final_audio"] = dict_["original_audio"]

        append_to_json_file(
            Path(self.cache_dir) / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME, dict_
        )
        return dict_

    def set_transcription(
        self, model: t.Optional[str] = None, kwargs: t.Optional[dict] = None
    ):
        """Set the transcription model and keyword arguments to be passed
        to the transcribe() function.

        The model is only (re)loaded when ``model`` differs from the one
        currently recorded in ``self.transcription_model``.

        Args:
            model (str, optional): The Whisper model to use for transcription.
                Passing None disables transcription. Defaults to None.
            kwargs (dict, optional): Keyword arguments to pass to the
                transcribe() function. Defaults to None, which is treated as
                an empty dict.

        Raises:
            ImportError: If a model is requested and the transcription
                packages are still missing after the install prompt returns.
        """
        if model != self.transcription_model:
            if model is not None:
                try:
                    # The first import only verifies that whisper is installed.
                    import whisper as _openai_whisper  # noqa: F401
                    import stable_whisper
                except ImportError:
                    logger.error(
                        'Missing packages. Run `pip install "manim-voiceover[transcribe]"` to be able to transcribe voiceovers.'
                    )
                    prompt_ask_missing_extras(
                        ["whisper", "stable_whisper"],
                        "transcribe",
                        "SpeechService.set_transcription()",
                    )
                    # If the prompt returns, retry so that a still-missing
                    # package surfaces as ImportError instead of an unbound
                    # name at the load_model() call below.
                    import whisper as _openai_whisper  # noqa: F401
                    import stable_whisper
                self._whisper_model = stable_whisper.load_model(model)
            else:
                self._whisper_model = None
            # Record the active model; without this the comparison above
            # always runs against the initial None, so a loaded model could
            # never be cleared by passing model=None.
            self.transcription_model = model

        # A fresh dict per call avoids sharing one default across instances.
        self.transcription_kwargs = {} if kwargs is None else kwargs

    def get_audio_basename(self, data: dict) -> str:
        """Build a file basename from the input data.

        Args:
            data (dict): JSON-serializable input data containing an
                ``"input_text"`` key.

        Returns:
            str: ``"<slug>-<suffix>"``, where the slug is derived from the
            input text with bookmarks removed and the suffix is the first 8
            hex digits of the SHA-256 hash of ``json.dumps(data)``.
        """
        dumped_data = json.dumps(data)
        data_hash = hashlib.sha256(dumped_data.encode("utf-8")).hexdigest()
        suffix = data_hash[:8]
        input_text = data["input_text"]
        input_text = remove_bookmarks(input_text)
        slug = slugify(input_text, max_length=50, word_boundary=True, save_order=True)
        ret = f"{slug}-{suffix}"
        return ret

    @abstractmethod
    def generate_from_text(
        self, text: str, cache_dir: t.Optional[str] = None, path: t.Optional[str] = None
    ) -> dict:
        """Implement this method for each speech service. Refer to `AzureService` for an example.

        Note that :meth:`_wrap_generate_from_text` forwards any extra keyword
        arguments it receives to this method.

        Args:
            text (str): The text to synthesize speech from.
            cache_dir (str, optional): The output directory to save the audio file and data to. Defaults to None.
            path (str, optional): The path to save the audio file to. Defaults to None.

        Returns:
            dict: Output data dictionary. It must contain an
            ``"original_audio"`` key, which :meth:`_wrap_generate_from_text`
            reads. TODO: Define the full format.
        """
        raise NotImplementedError

    def get_cached_result(self, input_data, cache_dir):
        """Look up a previously stored result in the cache JSON file.

        Args:
            input_data: Value compared against each entry's ``"input_data"``.
            cache_dir (str or Path): Directory containing the cache JSON file.

        Returns:
            The first entry whose ``"input_data"`` equals ``input_data``, or
            None if the cache file does not exist or no entry matches.
        """
        # Path() lets callers pass either a str or a Path for cache_dir.
        json_path = Path(cache_dir) / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME
        if not os.path.exists(json_path):
            return None

        # The context manager closes the file even if parsing fails.
        with open(json_path, "r") as json_file:
            json_data = json.load(json_file)

        for entry in json_data:
            if entry["input_data"] == input_data:
                return entry
        return None

    def audio_callback(self, audio_path: str, data: dict, **kwargs):
        """Callback function for when the audio file is ready.
        Override this method to do something with the audio file, e.g. noise reduction.

        Args:
            audio_path (str): The path to the audio file.
            data (dict): The data dictionary.
        """