Merge pull request #1177 from Trichtern/feat/add-silence-between-clips

Minor changes to #990 (fixed audio glitches caused by previous audio concatenation method)
4 years ago · c995811a23
parent 40e4fafa05 9609025bba
commit c995811a23
2 changed files with 66 additions and 38 deletions
--- a/TTS/engine_wrapper.py
+++ b/TTS/engine_wrapper.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import os
 import re
 from pathlib import Path
 from typing import Tuple
@ -6,8 +7,11 @@ from typing import Tuple
 # import sox
 # from mutagen import MutagenError
 # from mutagen.mp3 import MP3, HeaderNotFoundError
+import numpy as np
 import translators as ts
-from moviepy.editor import AudioFileClip, CompositeAudioClip, concatenate_audioclips
+from moviepy.audio.AudioClip import AudioClip
+from moviepy.audio.fx.volumex import volumex
+from moviepy.editor import AudioFileClip
 from rich.progress import track

 from utils import settings
@ -22,7 +26,7 @@ class TTSEngine:
    """Calls the given TTS engine to reduce code duplication and allow multiple TTS engines.

    Args:
-        tts_module          : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method.
+        tts_module            : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method.
        reddit_object         : The reddit object that contains the posts to read.
        path (Optional)       : The unix style path to save the mp3 files to. This must not have leading or trailing slashes.
        max_length (Optional) : The maximum length of the mp3 files in total.
@ -84,45 +88,68 @@ class TTSEngine:
        split_text = [
            x.group().strip() for x in re.finditer(r" *(((.|\n){0," + str(self.tts_module.max_chars) + "})(\.|.$))", text)
        ]
-        offset = 0
-        for idy, text_cut in enumerate(split_text):
-            # print(f"{idx}-{idy}: {text_cut}\n")
-            new_text = process_text(text_cut)
-            if not new_text or new_text.isspace():
-                offset += 1
-                continue
-
-            self.call_tts(f"{idx}-{idy - offset}.part", new_text)
-            split_files.append(AudioFileClip(f"{self.path}/{idx}-{idy - offset}.part.mp3"))
+        self.create_silence_mp3()

-        CompositeAudioClip([concatenate_audioclips(split_files)]).write_audiofile(
-            f"{self.path}/{idx}.mp3", fps=44100, verbose=False, logger=None
-        )
-
-        for i in split_files:
-            name = i.filename
-            i.close()
-            Path(name).unlink()
-
-        # for i in range(0, idy + 1):
-        # print(f"Cleaning up {self.path}/{idx}-{i}.part.mp3")
+        idy = None
+        for idy, text_cut in enumerate(split_text):
+            newtext = process_text(text_cut)
+            # print(f"{idx}-{idy}: {newtext}\n")

-        # Path(f"{self.path}/{idx}-{i}.part.mp3").unlink()
+            if not newtext or newtext.isspace():
+                print("newtext was blank because sanitized split text resulted in none")
+                continue
+            else:
+                self.call_tts(f"{idx}-{idy}.part", newtext)
+                with open(f"{self.path}/list.txt", 'w') as f:
+                    for idz in range(0, len(split_text)):
+                        f.write("file " + f"'{idx}-{idz}.part.mp3'" + "\n")
+                    split_files.append(str(f"{self.path}/{idx}-{idy}.part.mp3"))
+                    f.write("file " + f"'silence.mp3'" + "\n")
+
+                os.system("ffmpeg -f concat -y -hide_banner -loglevel panic -safe 0 " +
+                          "-i " + f"{self.path}/list.txt " +
+                          "-c copy " + f"{self.path}/{idx}.mp3")
+        try:
+            for i in range(0, len(split_files)):
+                os.unlink(split_files[i])
+        except FileNotFoundError as e:
+            print("File not found: " + e.filename)
+        except OSError:
+            print("OSError")

    def call_tts(self, filename: str, text: str):
-        self.tts_module.run(text, filepath=f"{self.path}/{filename}.mp3")
-        # try:
-        #     self.length += MP3(f"{self.path}/{filename}.mp3").info.length
-        # except (MutagenError, HeaderNotFoundError):
-        #     self.length += sox.file_info.duration(f"{self.path}/{filename}.mp3")
+        """Synthesize ``text`` and save it, followed by a short silence, as ``{self.path}/{filename}.mp3``.
+
+        The TTS module writes ``{filename}_no_silence.mp3``; ffmpeg's concat demuxer then joins
+        it with ``silence.mp3`` into the final file. The duration of the final file is added to
+        ``self.length`` and the intermediate files (including ``silence.mp3``) are deleted.
+
+        Args:
+            filename: Base name of the output file inside ``self.path``, without the ``.mp3`` suffix.
+            text: The text handed to ``self.tts_module.run``.
+
+        If any step fails (TTS, ffmpeg, or opening the result), ``self.length`` is reset to 0.
+        """
+        # Imported locally so this method does not rely on a file-level import being present.
+        import subprocess
+
        try:
+            self.tts_module.run(text, filepath=f"{self.path}/{filename}_no_silence.mp3")
+            self.create_silence_mp3()
+
+            # Concat list: entries are bare file names; the list file sits in self.path next to them.
+            with open(f"{self.path}/{filename}.txt", "w") as f:
+                f.write(f"file '{filename}_no_silence.mp3'\n")
+                f.write("file 'silence.mp3'\n")
+            # Argument list instead of a shell string: paths containing spaces or shell
+            # metacharacters are passed through intact, and a failing ffmpeg raises.
+            subprocess.run(
+                [
+                    "ffmpeg", "-f", "concat", "-y", "-hide_banner", "-loglevel", "panic", "-safe", "0",
+                    "-i", f"{self.path}/{filename}.txt",
+                    "-c", "copy", f"{self.path}/{filename}.mp3",
+                ],
+                check=True,
+            )
            clip = AudioFileClip(f"{self.path}/{filename}.mp3")
-            self.last_clip_length = clip.duration
            self.length += clip.duration
            clip.close()
+            # Remove each intermediate on its own so that one missing file does not
+            # leave the remaining ones behind.
+            for name in (f"{filename}_no_silence.mp3", "silence.mp3", f"{filename}.txt"):
+                try:
+                    os.unlink(f"{self.path}/{name}")
+                except FileNotFoundError as e:
+                    print("File not found: " + e.filename)
+                except OSError:
+                    print("OSError")
-        except:
+        except Exception:
            self.length = 0

+    def create_silence_mp3(self):
+        """Write a silent clip to ``{self.path}/silence.mp3``.
+
+        The clip length is read from the ``silence_duration`` entry under ``settings.tts``
+        in the configuration. The clip is generated as a 440 Hz sine tone whose volume is
+        scaled by 0, and it is written at 44100 fps.
+        """
+        gap_length = settings.config["settings"]["tts"]["silence_duration"]
+
+        def tone_frame(t):
+            # 440 Hz sine; it is muted below, so only the clip's length matters.
+            return np.sin(440 * 2 * np.pi * t)
+
+        muted_tone = volumex(AudioClip(make_frame=tone_frame, duration=gap_length, fps=44100), 0)
+        muted_tone.write_audiofile(f"{self.path}/silence.mp3", fps=44100, verbose=False, logger=None)
+

 def process_text(text: str):
    lang = settings.config["reddit"]["thread"]["post_lang"]
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -3,11 +3,11 @@ client_id = { optional = false, nmin = 12, nmax = 30, explanation = "The ID of y
 client_secret = { optional = false, nmin = 20, nmax = 40, explanation = "The SECRET of your Reddit app of SCRIPT type", example = "fFAGRNJru1FTz70BzhT3Zg", regex = "^[-a-zA-Z0-9._~+/]+=*$", input_error = "The client secret can only contain printable characters.", oob_error = "The secret should be over 20 and under 40 characters, double check your input." }
 username = { optional = false, nmin = 3, nmax = 20, explanation = "The username of your reddit account", example = "JasonLovesDoggo", regex = "^[-_0-9a-zA-Z]+$", oob_error = "A username HAS to be between 3 and 20 characters" }
 password = { optional = false, nmin = 8, explanation = "The password of your reddit account", example = "fFAGRNJru1FTz70BzhT3Zg", oob_error = "Password too short" }
-2fa = { optional = true, type = "bool", options = [true, false,], default = false, explanation = "Whether you have Reddit 2FA enabled, Valid options are True and False", example = true }
+2fa = { optional = true, type = "bool", options = [true, false, ], default = false, explanation = "Whether you have Reddit 2FA enabled, Valid options are True and False", example = true }


 [reddit.thread]
-random = { optional = true, options = [true, false,], default = false, type = "bool", explanation = "If set to no, it will ask you a thread link to extract the thread, if yes it will randomize it. Default: 'False'", example = "True" }
+random = { optional = true, options = [true, false, ], default = false, type = "bool", explanation = "If set to no, it will ask you a thread link to extract the thread, if yes it will randomize it. Default: 'False'", example = "True" }
 subreddit = { optional = false, regex = "[_0-9a-zA-Z]+$", nmin = 3, explanation = "What subreddit to pull posts from, the name of the sub, not the URL. You can have multiple subreddits, add an + with no spaces.", example = "AskReddit+Redditdev", oob_error = "A subreddit name HAS to be between 3 and 20 characters" }
 post_id = { optional = true, default = "", regex = "^((?!://|://)[+a-zA-Z0-9])*$", explanation = "Used if you want to use a specific post.", example = "urdtfx" }
 max_comment_length = { default = 500, optional = false, nmin = 10, nmax = 10000, type = "int", explanation = "max number of characters a comment can have. default is 500", example = 500, oob_error = "the max comment length should be between 10 and 10000" }
@ -16,12 +16,12 @@ min_comments = { default = 20, optional = false, nmin = 15, type = "int", explan


 [settings]
-allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to allow NSFW content, True or False" }
-theme = { optional = false, default = "dark", example = "light", options = ["dark", "light",], explanation = "Sets the Reddit theme, either LIGHT or DARK" }
+allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Whether to allow NSFW content, True or False" }
+theme = { optional = false, default = "dark", example = "light", options = ["dark", "light", ], explanation = "Sets the Reddit theme, either LIGHT or DARK" }
 times_to_run = { optional = false, default = 1, example = 2, explanation = "Used if you want to run multiple times. Set to an int e.g. 4 or 29 or 1", type = "int", nmin = 1, oob_error = "It's very hard to run something less than once." }
 opacity = { optional = false, default = 0.9, example = 0.8, explanation = "Sets the opacity of the comments when overlayed over the background", type = "float", nmin = 0, nmax = 1, oob_error = "The opacity HAS to be between 0 and 1", input_error = "The opacity HAS to be a decimal number between 0 and 1" }
 transition = { optional = true, default = 0.2, example = 0.2, explanation = "Sets the transition time (in seconds) between the comments. Set to 0 if you want to disable it.", type = "float", nmin = 0, nmax = 2, oob_error = "The transition HAS to be between 0 and 2", input_error = "The transition HAS to be a decimal number between 0 and 2" }
-storymode = { optional = true, type = "bool", default = false, example = false, options = [true, false,], explanation = "Only read out title and post content, not yet implemented" }
+storymode = { optional = true, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Only read out title and post content, not yet implemented" }


 [settings.background]
@ -31,9 +31,10 @@ background_choice = { optional = true, default = "minecraft", example = "rocket-


 [settings.tts]
-voice_choice = { optional = false, default = "", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx",], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
+voice_choice = { optional = false, default = "", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
 aws_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
 streamlabs_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
 tiktok_voice = { optional = false, default = "en_us_006", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
-python_voice = {optional = false, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)"}
-py_voice_num = {optional = false, default = "2", example = "2", explanation= "the number of system voices(2 are pre-installed in windows)"}
+python_voice = { optional = false, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
+py_voice_num = { optional = false, default = "2", example = "2", explanation = "the number of system voices(2 are pre-installed in windows)" }
+silence_duration = { optional = true, example = 0.1, explanation = "time in seconds between TTS comments", default = 0.3, type = "float", nmin = 0, oob_error = "The silence duration can't be negative" }