Skip to content

Commit 41be9e1

Browse files
authored
Upgrade whisper and stable-whisper versions, fix humanhash issue, fix AzureService text_offset issue (#53)
* Upgraded whisper and stable-whisper versions
* Replaced playsound with pydub
* Removed humanhash from dependencies
* Fixed issue with AzureService text offset
1 parent b4d7a5c commit 41be9e1

File tree

11 files changed

+311
-95
lines changed

11 files changed

+311
-95
lines changed

manim_voiceover/services/azure.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -86,39 +86,40 @@ def generate_from_text(
8686
# Apply prosody
8787
prosody = kwargs.get("prosody", self.prosody)
8888

89+
ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
90+
xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
91+
<voice name="%s">
92+
""" % (
93+
self.voice
94+
)
95+
ssml_end = r"""
96+
</voice>
97+
</speak>
98+
"""
99+
89100
if prosody is not None:
90101
if not isinstance(prosody, dict):
91102
raise ValueError(
92103
"The prosody argument must be a dict that contains at least one of the following keys: 'pitch', 'contour', 'range', 'rate', 'volume'."
93104
)
94105

95-
opening_tag = (
106+
prosody_opening_tag = (
96107
"<prosody "
97108
+ " ".join(
98109
['%s="%s"' % (key, str(val)) for key, val in prosody.items()]
99110
)
100111
+ ">"
101112
)
102-
inner = opening_tag + inner + "</prosody>"
113+
prosody_closing_tag = "</prosody>"
114+
ssml_beginning = ssml_beginning + prosody_opening_tag
115+
ssml_end = prosody_closing_tag + ssml_end
103116

104117
if self.style is not None:
105-
inner = r"""<mstts:express-as style="%s">
106-
%s
107-
</mstts:express-as>""" % (
108-
self.style,
109-
inner,
110-
)
118+
style_opening_tag = '<mstts:express-as style="%s">' % self.style
119+
style_closing_tag = "</mstts:express-as>"
120+
ssml_beginning = ssml_beginning + style_opening_tag
121+
ssml_end = style_closing_tag + ssml_end
111122

112-
ssml_beginning = r"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
113-
xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
114-
<voice name="%s">
115-
""" % (
116-
self.voice
117-
)
118-
ssml_end = r"""
119-
</voice>
120-
</speak>
121-
"""
122123
ssml = ssml_beginning + inner + ssml_end
123124
initial_offset = len(ssml_beginning)
124125

@@ -139,7 +140,7 @@ def generate_from_text(
139140
return cached_result
140141

141142
if path is None:
142-
audio_path = self.get_data_hash(input_data) + ".mp3"
143+
audio_path = self.get_audio_basename(input_data) + ".mp3"
143144
else:
144145
audio_path = path
145146

manim_voiceover/services/base.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
from abc import ABC, abstractmethod
2+
import typing as t
23
import os
34
import json
45
import sys
56
import hashlib
6-
import humanhash
77
from pathlib import Path
88
from manim import config, logger
9+
from slugify import slugify
910
from manim_voiceover.defaults import (
1011
DEFAULT_VOICEOVER_CACHE_DIR,
1112
DEFAULT_VOICEOVER_CACHE_JSON_FILENAME,
@@ -19,19 +20,19 @@ def timestamps_to_word_boundaries(segments):
1920
word_boundaries = []
2021
current_text_offset = 0
2122
for segment in segments:
22-
for dict_ in segment["word_timestamps"]:
23+
for dict_ in segment["words"]:
2324
word = dict_["word"]
2425
word_boundaries.append(
2526
{
26-
"audio_offset": int(dict_["timestamp"] * AUDIO_OFFSET_RESOLUTION),
27+
"audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION),
2728
# "duration_milliseconds": 0,
2829
"text_offset": current_text_offset,
29-
"word_length": len(dict_["word"]),
30+
"word_length": len(word),
3031
"text": word,
3132
"boundary_type": "Word",
3233
}
3334
)
34-
current_text_offset += len(dict_["word"])
35+
current_text_offset += len(word)
3536
# If word is not punctuation, add a space
3637
# if word not in [".", ",", "!", "?", ";", ":", "(", ")"]:
3738
# current_text_offset += 1
@@ -45,10 +46,10 @@ class SpeechService(ABC):
4546
def __init__(
4647
self,
4748
global_speed: float = 1.00,
48-
cache_dir: str = None,
49-
transcription_model: str = None,
49+
cache_dir: t.Optional[str] = None,
50+
transcription_model: t.Optional[str] = None,
5051
transcription_kwargs: dict = {},
51-
**kwargs
52+
**kwargs,
5253
):
5354
"""
5455
Args:
@@ -90,12 +91,12 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic
9091
transcription_result = self._whisper_model.transcribe(
9192
str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs
9293
)
93-
logger.info("Transcription: " + transcription_result["text"])
94+
logger.info("Transcription: " + transcription_result.text)
9495
word_boundaries = timestamps_to_word_boundaries(
95-
transcription_result["segments"]
96+
transcription_result.segments_to_dicts()
9697
)
9798
dict_["word_boundaries"] = word_boundaries
98-
dict_["transcribed_text"] = transcription_result["text"]
99+
dict_["transcribed_text"] = transcription_result.text
99100

100101
# Audio callback
101102
self.audio_callback(original_audio, dict_, **kwargs)
@@ -152,10 +153,14 @@ def set_transcription(self, model: str = None, kwargs: dict = {}):
152153

153154
self.transcription_kwargs = kwargs
154155

155-
def get_data_hash(self, data: dict) -> str:
156+
def get_audio_basename(self, data: dict) -> str:
156157
dumped_data = json.dumps(data)
157158
data_hash = hashlib.sha256(dumped_data.encode("utf-8")).hexdigest()
158-
return humanhash.humanize(data_hash)
159+
suffix = data_hash[:8]
160+
input_string = data["input_text"]
161+
slug = slugify(input_string)
162+
ret = f"{slug}-{suffix}"
163+
return ret
159164

160165
@abstractmethod
161166
def generate_from_text(

manim_voiceover/services/coqui.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def generate_from_text(
6565
return cached_result
6666

6767
if path is None:
68-
audio_path = self.get_data_hash(input_data) + ".mp3"
68+
audio_path = self.get_audio_basename(input_data) + ".mp3"
6969
else:
7070
audio_path = path
7171

manim_voiceover/services/gtts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def generate_from_text(
4646
return cached_result
4747

4848
if path is None:
49-
audio_path = self.get_data_hash(input_data) + ".mp3"
49+
audio_path = self.get_audio_basename(input_data) + ".mp3"
5050
else:
5151
audio_path = path
5252

manim_voiceover/services/pyttsx3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def generate_from_text(
3939
return cached_result
4040

4141
if path is None:
42-
audio_path = self.get_data_hash(input_data) + ".mp3"
42+
audio_path = self.get_audio_basename(input_data) + ".mp3"
4343
else:
4444
audio_path = path
4545

manim_voiceover/services/recorder/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
trim_buffer_end (int, optional): Buffer duration for trimming silence at the end. Defaults to 200 ms.
4949
"""
5050
prompt_ask_missing_extras(
51-
["pyaudio", "pynput", "playsound"], "recorder", "RecorderService"
51+
["pyaudio", "pynput"], "recorder", "RecorderService"
5252
)
5353

5454
self.recorder = Recorder(
@@ -92,7 +92,7 @@ def generate_from_text(
9292
return cached_result
9393

9494
if path is None:
95-
audio_path = self.get_data_hash(input_data) + ".mp3"
95+
audio_path = self.get_audio_basename(input_data) + ".mp3"
9696
else:
9797
audio_path = path
9898

manim_voiceover/services/recorder/utility.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111

1212
from pynput import keyboard
1313
import pyaudio
14-
import playsound
14+
from pydub import AudioSegment
15+
from pydub.playback import play
1516

1617

1718
class MyListener(keyboard.Listener):
@@ -235,7 +236,8 @@ def record(self, path: str, message: str = None):
235236
try:
236237
key = input()[-1].lower()
237238
if key == "l":
238-
playsound.playsound(path)
239+
audio = AudioSegment.from_file(path)
240+
play(audio)
239241
elif key == "r":
240242
if message is not None:
241243
print(message)

manim_voiceover/tracker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@ def _process_bookmarks(self) -> None:
8787
)
8888
self.bookmark_times[mark] = self.start_t + elapsed
8989

90-
def get_remaining_duration(self, buff: int = 0) -> int:
90+
def get_remaining_duration(self, buff: float = 0.) -> float:
9191
"""Returns the remaining duration of the voiceover.
9292
9393
Args:
94-
buff (int, optional): A buffer to add to the remaining duration. Defaults to 0.
94+
buff (float, optional): A buffer to add to the remaining duration. Defaults to 0.
9595
9696
Returns:
9797
int: The remaining duration of the voiceover in seconds.

manim_voiceover/voiceover_scene.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pathlib import Path
44
from typing import Optional, Generator
55
import re
6+
import typing as t
67

78
from manim import Scene, config
89
from manim_voiceover.services.base import SpeechService
@@ -163,7 +164,7 @@ def wait_until_bookmark(self, mark: str) -> None:
163164

164165
@contextmanager
165166
def voiceover(
166-
self, text: str = None, ssml: str = None, **kwargs
167+
self, text: t.Optional[str] = None, ssml: t.Optional[str] = None, **kwargs
167168
) -> Generator[VoiceoverTracker, None, None]:
168169
"""The main function to be used for adding voiceover to a scene.
169170

0 commit comments

Comments
 (0)