from pathlib import Path
from manim_voiceover.helper import msg_box, prompt_ask_missing_extras, remove_bookmarks
from manim_voiceover.services.base import SpeechService
from manim import logger
# pyaudio and the Recorder helper belong to the optional "recorder" extra, so
# importing this module must not fail when they are absent.
try:
    import pyaudio
    from manim_voiceover.services.recorder.utility import Recorder
except ImportError:
    logger.error(
        'Missing packages. Run `pip install "manim-voiceover[recorder]"` to use RecorderService.'
    )
    DEFAULT_FORMAT = None
else:
    # Workaround: keep the default as a module-level name so that it gets
    # included in the docs.
    DEFAULT_FORMAT = pyaudio.paInt16
class RecorderService(SpeechService):
    """Speech service that records from a microphone during rendering."""

    def __init__(
        self,
        format: int = DEFAULT_FORMAT,
        channels: int = 1,
        rate: int = 44100,
        chunk: int = 512,
        device_index: int = None,
        transcription_model: str = "base",
        trim_silence_threshold: float = -40.0,
        trim_buffer_start: int = 200,
        trim_buffer_end: int = 200,
        callback_delay: float = 0.05,
        **kwargs,
    ):
        """Create the underlying recorder and initialize the base speech service.

        Args:
            format (int, optional): Format of the audio. Defaults to pyaudio.paInt16.
            channels (int, optional): Number of channels. Defaults to 1.
            rate (int, optional): Sampling rate. Defaults to 44100.
            chunk (int, optional): Chunk size. Defaults to 512.
            device_index (int, optional): Device index, if you don't want to choose it every time you render. Defaults to None.
            transcription_model (str, optional): The `OpenAI Whisper model <https://github.com/openai/whisper#available-models-and-languages>`_ to use for transcription. Defaults to "base".
            trim_silence_threshold (float, optional): Threshold for trimming silence in decibels. Defaults to -40.0 dB.
            trim_buffer_start (int, optional): Buffer duration for trimming silence at the start. Defaults to 200 ms.
            trim_buffer_end (int, optional): Buffer duration for trimming silence at the end. Defaults to 200 ms.
            callback_delay (float, optional): Passed through unchanged to the ``Recorder``. Defaults to 0.05.
            **kwargs: Remaining keyword arguments, forwarded to ``SpeechService.__init__``.
        """
        # Check for the optional dependencies before the recorder is built.
        prompt_ask_missing_extras(["pyaudio", "pynput"], "recorder", "RecorderService")

        recorder_options = dict(
            format=format,
            channels=channels,
            rate=rate,
            chunk=chunk,
            device_index=device_index,
            trim_silence_threshold=trim_silence_threshold,
            trim_buffer_start=trim_buffer_start,
            trim_buffer_end=trim_buffer_end,
            callback_delay=callback_delay,
        )
        self.recorder = Recorder(**recorder_options)

        # The recorder is created first; the base class is initialized afterwards.
        SpeechService.__init__(self, transcription_model=transcription_model, **kwargs)

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """Return voiceover data for ``text``, recording it unless a cached result exists.

        Args:
            text (str): Voiceover text; bookmarks are removed before it is
                used for the cache lookup and shown in the prompt.
            cache_dir (str, optional): Directory used for the cache lookup and
                as the parent of the audio file. Defaults to ``self.cache_dir``.
            path (str, optional): Audio file name to use instead of the one
                derived from the input data. Defaults to None.
            **kwargs: Accepted but not used by this method.

        Returns:
            dict: The cached result when one is found; otherwise a dict with
            the keys ``"input_text"`` (the original ``text``),
            ``"input_data"`` and ``"original_audio"``.
        """
        # Bookmarks are stripped so that we don't record a voiceover every
        # time we change a bookmark.
        spoken_text = remove_bookmarks(text)
        cache_dir = self.cache_dir if cache_dir is None else cache_dir

        # Key order is kept stable: this payload is handed to the cache and
        # basename helpers.
        recorder_config = {
            key: getattr(self.recorder, key)
            for key in ("format", "channels", "rate", "chunk")
        }
        input_data = {
            "input_text": spoken_text,
            "config": recorder_config,
            "service": "recorder",
        }

        hit = self.get_cached_result(input_data, cache_dir)
        if hit is not None:
            return hit

        audio_path = (
            path if path is not None else self.get_audio_basename(input_data) + ".mp3"
        )

        # Nothing cached: select the input device, then record while the
        # voiceover text is handed to the recorder as a message box.
        self.recorder._trigger_set_device()
        prompt_box = msg_box("Voiceover:\n\n" + spoken_text)
        target_file = Path(cache_dir, audio_path)
        self.recorder.record(str(target_file), prompt_box)

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }