Source code for manim_voiceover.services.recorder

from pathlib import Path
from manim_voiceover.helper import msg_box, prompt_ask_missing_extras, remove_bookmarks

from manim_voiceover.services.base import SpeechService
from manim import logger

try:
    import pyaudio
    from manim_voiceover.services.recorder.utility import Recorder

    # Workaround to get this included in the docs
    DEFAULT_FORMAT = pyaudio.paInt16
except ImportError:
    logger.error(
        'Missing packages. Run `pip install "manim-voiceover[recorder]"` to use RecorderService.'
    )
    DEFAULT_FORMAT = None


[docs]class RecorderService(SpeechService):
    """Speech service that records from a microphone during rendering."""

    def __init__(
        self,
        format: int = DEFAULT_FORMAT,
        channels: int = 1,
        rate: int = 44100,
        chunk: int = 512,
        device_index: int = None,
        transcription_model: str = "base",
        trim_silence_threshold: float = -40.0,
        trim_buffer_start: int = 200,
        trim_buffer_end: int = 200,
        callback_delay: float = 0.05,
        **kwargs,
    ):
        """Initialize the speech service.

        Args:
            format (int, optional): Format of the audio. Defaults to pyaudio.paInt16.
            channels (int, optional): Number of channels. Defaults to 1.
            rate (int, optional): Sampling rate. Defaults to 44100.
            chunk (int, optional): Chunk size. Defaults to 512.
            device_index (int, optional): Device index, if you don't want to choose it every time you render. Defaults to None.
            transcription_model (str, optional): The `OpenAI Whisper model <https://github.com/openai/whisper#available-models-and-languages>`_ to use for transcription. Defaults to "base".
            trim_silence_threshold (float, optional): Threshold for trimming silence in decibels. Defaults to -40.0 dB.
            trim_buffer_start (int, optional): Buffer duration for trimming silence at the start. Defaults to 200 ms.
            trim_buffer_end (int, optional): Buffer duration for trimming silence at the end. Defaults to 200 ms.
        """
        prompt_ask_missing_extras(["pyaudio", "pynput"], "recorder", "RecorderService")

        self.recorder = Recorder(
            format=format,
            channels=channels,
            rate=rate,
            chunk=chunk,
            device_index=device_index,
            trim_silence_threshold=trim_silence_threshold,
            trim_buffer_start=trim_buffer_start,
            trim_buffer_end=trim_buffer_end,
            callback_delay=callback_delay,
        )

        SpeechService.__init__(self, transcription_model=transcription_model, **kwargs)

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """"""

        # Remove bookmarks
        input_text = remove_bookmarks(text)

        if cache_dir is None:
            cache_dir = self.cache_dir

        input_data = {
            # Remove bookmarks so that we don't record a voiceover every time we change a bookmark
            "input_text": input_text,
            "config": {
                "format": self.recorder.format,
                "channels": self.recorder.channels,
                "rate": self.recorder.rate,
                "chunk": self.recorder.chunk,
            },
            "service": "recorder",
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_audio_basename(input_data) + ".mp3"
        else:
            audio_path = path

        self.recorder._trigger_set_device()
        box = msg_box("Voiceover:\n\n" + input_text)
        self.recorder.record(str(Path(cache_dir) / audio_path), box)

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }

        return json_dict