Source code for manim_voiceover.tracker

from pathlib import Path
import re
import numpy as np
from manim import logger

from typing import Optional, List
from scipy.interpolate import interp1d

from manim import Scene
from manim_voiceover.modify_audio import get_duration
from manim_voiceover.helper import remove_bookmarks

# Number of `audio_offset` units per unit of time. Word-boundary audio offsets
# are divided by this constant to put them on the same scale as the audio
# duration (presumably seconds -- confirm against `get_duration`).
AUDIO_OFFSET_RESOLUTION = 10_000_000


class TimeInterpolator:
    """Maps a text offset to an audio time by interpolating word boundaries."""

    def __init__(self, word_boundaries: List[dict]):
        """Builds the interpolation function from word boundaries.

        Args:
            word_boundaries (List[dict]): Word boundary records. Each record
                must provide a ``"text_offset"`` and an ``"audio_offset"``;
                the latter is divided by ``AUDIO_OFFSET_RESOLUTION``.
        """
        self.x = [wb["text_offset"] for wb in word_boundaries]
        self.y = [
            wb["audio_offset"] / AUDIO_OFFSET_RESOLUTION for wb in word_boundaries
        ]

        self.f = interp1d(self.x, self.y)

    def interpolate(self, distance: int) -> np.ndarray:
        """Returns the interpolated audio time for a text offset.

        Args:
            distance (int): The text offset to look up.

        Returns:
            np.ndarray: The interpolated time. If the offset cannot be
            interpolated (e.g. it lies outside the range covered by the word
            boundaries), a warning is logged and the time of the last word
            boundary is returned instead.
        """
        try:
            return self.f(distance)
        except ValueError:
            # interp1d raises ValueError for offsets outside the known range.
            # Only that error is handled, so that KeyboardInterrupt,
            # SystemExit and genuine bugs are no longer swallowed.
            logger.warning(
                "TimeInterpolator received weird input, there may be something wrong with the word boundaries."
            )
            return self.y[-1]


class VoiceoverTracker:
    """Class to track the progress of a voiceover in a scene."""

    def __init__(self, scene: "Scene", data: dict, cache_dir: str):
        """Initializes a VoiceoverTracker object.

        Args:
            scene (Scene): The scene to which the voiceover belongs.
            data (dict): The voiceover data. Must contain ``"final_audio"``
                (audio file name relative to ``cache_dir``). If it contains
                ``"word_boundaries"``, bookmark times are computed, which also
                requires ``"input_text"``; ``"transcribed_text"`` is optional.
            cache_dir (str): The directory that contains the audio file.
        """
        self.scene = scene
        self.data = data
        self.cache_dir = cache_dir
        self.duration = get_duration(Path(cache_dir) / self.data["final_audio"])

        # The renderer may not have a time yet; treat that as time zero.
        last_t = scene.renderer.time
        if last_t is None:
            last_t = 0
        self.start_t = last_t
        self.end_t = last_t + self.duration

        # Bookmarks can only be timed when word boundaries are available.
        if "word_boundaries" in self.data:
            self._process_bookmarks()

    def _get_fallback_word_boundaries(self):
        """
        Returns dummy word boundaries assuming a linear mapping between
        text and audio. Used when word boundaries are not available.
        """
        input_text = remove_bookmarks(self.data["input_text"])
        return [
            {
                "audio_offset": 0,
                "text_offset": 0,
                "word_length": len(input_text),
                "text": self.data["input_text"],
                "boundary_type": "Word",
            },
            {
                "audio_offset": self.duration * AUDIO_OFFSET_RESOLUTION,
                "text_offset": len(input_text),
                "word_length": 1,
                "text": ".",
                "boundary_type": "Word",
            },
        ]

    def _process_bookmarks(self) -> None:
        """Computes the scene time of every bookmark in the input text.

        Populates ``bookmark_distances`` (bookmark name -> character offset in
        the text with bookmarks removed), ``bookmark_times`` (bookmark name ->
        scene time), ``content`` (input text without the bookmark tags),
        ``input_text`` and ``time_interpolator``.
        """
        self.bookmark_times = {}
        self.bookmark_distances = {}

        word_boundaries = self.data["word_boundaries"]
        if not word_boundaries or len(word_boundaries) < 2:
            logger.warning(
                f"Word boundaries for voiceover {self.data['input_text']} are not "
                "available or are insufficient. Using fallback word boundaries."
            )
            word_boundaries = self._get_fallback_word_boundaries()

        self.time_interpolator = TimeInterpolator(word_boundaries)

        net_text_len = len(remove_bookmarks(self.data["input_text"]))
        if "transcribed_text" in self.data:
            transcribed_text_len = len(self.data["transcribed_text"].strip())
        else:
            transcribed_text_len = net_text_len

        self.input_text = self.data["input_text"]

        # Mark bookmark distances: split the text on bookmark tags (the
        # capturing group keeps the tags in the result) and record, for each
        # tag, how many non-tag characters precede it.
        parts = re.split(r"(<bookmark\s*mark\s*=[\'\"]\w*[\"\']\s*/>)", self.input_text)
        content_parts = []
        content_len = 0
        for part in parts:
            matched = re.match(r"<bookmark\s*mark\s*=[\'\"](.*)[\"\']\s*/>", part)
            if matched:
                self.bookmark_distances[matched.group(1)] = content_len
            else:
                content_parts.append(part)
                content_len += len(part)
        self.content = "".join(content_parts)

        for mark, dist in self.bookmark_distances.items():
            if net_text_len:
                # Normalize text offset: rescale from input-text coordinates
                # to transcribed-text coordinates before interpolating.
                elapsed = self.time_interpolator.interpolate(
                    dist * transcribed_text_len / net_text_len
                )
            else:
                # The text holds nothing but bookmarks, so there is no length
                # to scale by; every bookmark sits at the start of the audio.
                elapsed = 0
            self.bookmark_times[mark] = self.start_t + elapsed

    def get_remaining_duration(self, buff: float = 0.0) -> float:
        """Returns the remaining duration of the voiceover.

        Args:
            buff (float, optional): A buffer to add to the remaining duration.
                Defaults to 0.

        Returns:
            float: The remaining duration of the voiceover in seconds, never
            less than 0.
        """
        return max(self.end_t - self.scene.renderer.time + buff, 0)

    def _check_bookmarks(self) -> None:
        """Raises an exception if no bookmark times have been computed.

        Raises:
            Exception: If the voiceover data had no word boundaries, so
                ``bookmark_times`` was never set.
        """
        if not hasattr(self, "bookmark_times"):
            raise Exception(
                "Word boundaries are required for timing with bookmarks. "
                "Manim Voiceover currently supports auto-transcription using OpenAI Whisper, "
                "but this is not enabled for each speech service by default. "
                "You can enable it by setting transcription_model='base' in your speech service initialization. "
                "If the performance of the base model is not satisfactory, you can use one of the larger models. "
                "See https://github.com/openai/whisper for a list of all the available models."
            )

    def time_until_bookmark(
        self, mark: str, buff: float = 0, limit: Optional[float] = None
    ) -> float:
        """Returns the time until a bookmark.

        Args:
            mark (str): The `mark` attribute of the bookmark to count up to.
            buff (float, optional): A buffer to add to the remaining duration,
                in seconds. Defaults to 0.
            limit (Optional[float], optional): A maximum value to return.
                Defaults to None.

        Returns:
            float: The time from the renderer's current time until the
            bookmark, plus ``buff``; never less than 0 and, if ``limit`` is
            given, never more than ``limit``.

        Raises:
            Exception: If bookmark times are unavailable or there is no
                bookmark named ``mark``.
        """
        self._check_bookmarks()
        if mark not in self.bookmark_times:
            raise Exception(f"There is no <bookmark mark='{mark}' />")
        result = max(self.bookmark_times[mark] - self.scene.renderer.time + buff, 0)
        if limit is not None:
            result = min(limit, result)
        return result