Upgrade whisper and stable-whisper versions, remove humanhash dependency, fix AzureService text_offset issue (#53)

osolmaz · web-flow · commit 41be9e142ab4 · 2023-05-21T21:29:53.000+02:00
* Upgraded whisper and stable-whisper versions

* Replaced playsound with pydub

* Removed humanhash from dependencies

* Fixed issue with AzureService text offset
diff --git a/manim_voiceover/services/azure.py b/manim_voiceover/services/azure.py
@@ -86,39 +86,40 @@ def generate_from_text(
         # Apply prosody
         prosody = kwargs.get("prosody", self.prosody)
 
+        ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
+    xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
+    <voice name="%s">
+        """ % (
+            self.voice
+        )
+        ssml_end = r"""
+    </voice>
+</speak>
+        """
+
         if prosody is not None:
             if not isinstance(prosody, dict):
                 raise ValueError(
                     "The prosody argument must be a dict that contains at least one of the following keys: 'pitch', 'contour', 'range', 'rate', 'volume'."
                 )
 
-            opening_tag = (
+            prosody_opening_tag = (
                 "<prosody "
                 + " ".join(
                     ['%s="%s"' % (key, str(val)) for key, val in prosody.items()]
                 )
                 + ">"
             )
-            inner = opening_tag + inner + "</prosody>"
+            prosody_closing_tag = "</prosody>"
+            ssml_beginning = ssml_beginning + prosody_opening_tag
+            ssml_end = prosody_closing_tag + ssml_end
 
         if self.style is not None:
-            inner = r"""<mstts:express-as style="%s">
-    %s
-</mstts:express-as>""" % (
-                self.style,
-                inner,
-            )
+            style_opening_tag = '<mstts:express-as style="%s">' % self.style
+            style_closing_tag = "</mstts:express-as>"
+            ssml_beginning = ssml_beginning + style_opening_tag
+            ssml_end = style_closing_tag + ssml_end
 
-        ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
-    xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
-    <voice name="%s">
-        """ % (
-            self.voice
-        )
-        ssml_end = r"""
-    </voice>
-</speak>
-        """
         ssml = ssml_beginning + inner + ssml_end
         initial_offset = len(ssml_beginning)
 
@@ -139,7 +140,7 @@ def generate_from_text(
             return cached_result
 
         if path is None:
-            audio_path = self.get_data_hash(input_data) + ".mp3"
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
 
diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py
@@ -1,11 +1,12 @@
 from abc import ABC, abstractmethod
+import typing as t
 import os
 import json
 import sys
 import hashlib
-import humanhash
 from pathlib import Path
 from manim import config, logger
+from slugify import slugify
 from manim_voiceover.defaults import (
     DEFAULT_VOICEOVER_CACHE_DIR,
     DEFAULT_VOICEOVER_CACHE_JSON_FILENAME,
@@ -19,19 +20,19 @@ def timestamps_to_word_boundaries(segments):
     word_boundaries = []
     current_text_offset = 0
     for segment in segments:
-        for dict_ in segment["word_timestamps"]:
+        for dict_ in segment["words"]:
             word = dict_["word"]
             word_boundaries.append(
                 {
-                    "audio_offset": int(dict_["timestamp"] * AUDIO_OFFSET_RESOLUTION),
+                    "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION),
                     # "duration_milliseconds": 0,
                     "text_offset": current_text_offset,
-                    "word_length": len(dict_["word"]),
+                    "word_length": len(word),
                     "text": word,
                     "boundary_type": "Word",
                 }
             )
-            current_text_offset += len(dict_["word"])
+            current_text_offset += len(word)
             # If word is not punctuation, add a space
             # if word not in [".", ",", "!", "?", ";", ":", "(", ")"]:
             # current_text_offset += 1
@@ -45,10 +46,10 @@ class SpeechService(ABC):
     def __init__(
         self,
         global_speed: float = 1.00,
-        cache_dir: str = None,
-        transcription_model: str = None,
+        cache_dir: t.Optional[str] = None,
+        transcription_model: t.Optional[str] = None,
         transcription_kwargs: dict = {},
-        **kwargs
+        **kwargs,
     ):
         """
         Args:
@@ -90,12 +91,12 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic
             transcription_result = self._whisper_model.transcribe(
                 str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs
             )
-            logger.info("Transcription: " + transcription_result["text"])
+            logger.info("Transcription: " + transcription_result.text)
             word_boundaries = timestamps_to_word_boundaries(
-                transcription_result["segments"]
+                transcription_result.segments_to_dicts()
             )
             dict_["word_boundaries"] = word_boundaries
-            dict_["transcribed_text"] = transcription_result["text"]
+            dict_["transcribed_text"] = transcription_result.text
 
         # Audio callback
         self.audio_callback(original_audio, dict_, **kwargs)
@@ -152,10 +153,14 @@ def set_transcription(self, model: str = None, kwargs: dict = {}):
 
         self.transcription_kwargs = kwargs
 
-    def get_data_hash(self, data: dict) -> str:
+    def get_audio_basename(self, data: dict) -> str:
         dumped_data = json.dumps(data)
         data_hash = hashlib.sha256(dumped_data.encode("utf-8")).hexdigest()
-        return humanhash.humanize(data_hash)
+        suffix = data_hash[:8]
+        input_string = data["input_text"]
+        slug = slugify(input_string)
+        ret = f"{slug}-{suffix}"
+        return ret
 
     @abstractmethod
     def generate_from_text(
diff --git a/manim_voiceover/services/coqui.py b/manim_voiceover/services/coqui.py
@@ -65,7 +65,7 @@ def generate_from_text(
             return cached_result
 
         if path is None:
-            audio_path = self.get_data_hash(input_data) + ".mp3"
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
 
diff --git a/manim_voiceover/services/gtts.py b/manim_voiceover/services/gtts.py
@@ -46,7 +46,7 @@ def generate_from_text(
             return cached_result
 
         if path is None:
-            audio_path = self.get_data_hash(input_data) + ".mp3"
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
 
diff --git a/manim_voiceover/services/pyttsx3.py b/manim_voiceover/services/pyttsx3.py
@@ -39,7 +39,7 @@ def generate_from_text(
             return cached_result
 
         if path is None:
-            audio_path = self.get_data_hash(input_data) + ".mp3"
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
 
diff --git a/manim_voiceover/services/recorder/__init__.py b/manim_voiceover/services/recorder/__init__.py
@@ -48,7 +48,7 @@ def __init__(
             trim_buffer_end (int, optional): Buffer duration for trimming silence at the end. Defaults to 200 ms.
         """
         prompt_ask_missing_extras(
-            ["pyaudio", "pynput", "playsound"], "recorder", "RecorderService"
+            ["pyaudio", "pynput"], "recorder", "RecorderService"
         )
 
         self.recorder = Recorder(
@@ -92,7 +92,7 @@ def generate_from_text(
             return cached_result
 
         if path is None:
-            audio_path = self.get_data_hash(input_data) + ".mp3"
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
 
diff --git a/manim_voiceover/services/recorder/utility.py b/manim_voiceover/services/recorder/utility.py
@@ -11,7 +11,8 @@
 
 from pynput import keyboard
 import pyaudio
-import playsound
+from pydub import AudioSegment
+from pydub.playback import play
 
 
 class MyListener(keyboard.Listener):
@@ -235,7 +236,8 @@ def record(self, path: str, message: str = None):
             try:
                 key = input()[-1].lower()
                 if key == "l":
-                    playsound.playsound(path)
+                    audio = AudioSegment.from_file(path)
+                    play(audio)
                 elif key == "r":
                     if message is not None:
                         print(message)
diff --git a/manim_voiceover/tracker.py b/manim_voiceover/tracker.py
@@ -87,11 +87,11 @@ def _process_bookmarks(self) -> None:
             )
             self.bookmark_times[mark] = self.start_t + elapsed
 
-    def get_remaining_duration(self, buff: int = 0) -> int:
+    def get_remaining_duration(self, buff: float = 0.) -> float:
         """Returns the remaining duration of the voiceover.
 
         Args:
-            buff (int, optional): A buffer to add to the remaining duration. Defaults to 0.
+            buff (float, optional): A buffer to add to the remaining duration. Defaults to 0.
 
         Returns:
             int: The remaining duration of the voiceover in seconds.
diff --git a/manim_voiceover/voiceover_scene.py b/manim_voiceover/voiceover_scene.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from typing import Optional, Generator
 import re
+import typing as t
 
 from manim import Scene, config
 from manim_voiceover.services.base import SpeechService
@@ -163,7 +164,7 @@ def wait_until_bookmark(self, mark: str) -> None:
 
     @contextmanager
     def voiceover(
-        self, text: str = None, ssml: str = None, **kwargs
+        self, text: t.Optional[str] = None, ssml: t.Optional[str] = None, **kwargs
     ) -> Generator[VoiceoverTracker, None, None]:
         """The main function to be used for adding voiceover to a scene.
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml