sanitize text.

closes #59
4 years ago · 45531df823
parent e678d3e5ca
commit 45531df823
2 changed files with 19 additions and 2 deletions
--- a/utils/voice.py
+++ b/utils/voice.py
@ -0,0 +1,16 @@
+import re
+
+
+def sanitize_text(text):
+    """
+    Sanitizes the text for tts.
+    What gets removed:
+    - the following characters: `^_~@!&;#:-%“”‘"*/{}[]()\|<>=+`
+    """
+
+    # note: apostrophes are kept (e.g. "don't") unless they touch whitespace, in which case they are dropped
+    regex_expr = r"\s['|’]|['|’]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]"
+    result = re.sub(regex_expr, " ", text)
+
+    # remove extra whitespace
+    return " ".join(result.split())
--- a/video_creation/voices.py
+++ b/video_creation/voices.py
@ -6,6 +6,7 @@ from mutagen.mp3 import MP3, HeaderNotFoundError
 from rich.progress import track

 from utils.console import print_step, print_substep
+from utils.voice import sanitize_text
 from video_creation.TTSwrapper import TTTTSWrapper

 VIDEO_LENGTH: int = 40  # secs
@ -24,7 +25,7 @@ def save_text_to_mp3(reddit_obj):
    Path("assets/temp/mp3").mkdir(parents=True, exist_ok=True)

    ttttsw = TTTTSWrapper()  # tiktok text to speech wrapper
-    ttttsw.tts(reddit_obj["thread_title"], filename=f"assets/temp/mp3/title.mp3", random_speaker=False)
+    ttttsw.tts(sanitize_text(reddit_obj["thread_title"]), filename=f"assets/temp/mp3/title.mp3", random_speaker=False)
    try:
        length += MP3(f"assets/temp/mp3/title.mp3").info.length
    except HeaderNotFoundError:  # note to self AudioFileClip
@ -35,7 +36,7 @@ def save_text_to_mp3(reddit_obj):
        if length > VIDEO_LENGTH:
            break

-        ttttsw.tts(comment["comment_body"], filename=f"assets/temp/mp3/{com}.mp3", random_speaker=False)
+        ttttsw.tts(sanitize_text(comment["comment_body"]), filename=f"assets/temp/mp3/{com}.mp3", random_speaker=False)
        try:
            length += MP3(f"assets/temp/mp3/{com}.mp3").info.length
            com += 1