diff --git a/GUI/settings.html b/GUI/settings.html index f3f1751..8135387 100644 --- a/GUI/settings.html +++ b/GUI/settings.html @@ -258,6 +258,7 @@ + @@ -408,6 +409,46 @@ +
+ +
+
+
+ +
+ +
+
+
+
+ +
+ +
+
+ +
+ +
+ +
+
class AzureTTS:
    """Text-to-speech provider backed by the Azure Speech service.

    Connection details, the default voice and the speed boost are read from
    ``settings.config["settings"]["tts"]``. ``run`` renders one piece of text
    to an audio file by sending an SSML document to the service.

    Third-party and project imports are performed lazily inside the methods
    that need them, so importing this module does not require the Azure SDK
    to be installed.
    """

    # Voices eligible for random selection. Each name appears exactly once so
    # that ``random.choice`` picks uniformly.
    _VOICES = (
        "en-US-AndrewMultilingualNeural",
        "en-US-AvaMultilingualNeural",
        "de-DE-FlorianMultilingualNeural",
        "en-US-EmmaMultilingualNeural",
        "de-DE-SeraphinaMultilingualNeural",
        "fr-FR-VivienneMultilingualNeural",
        "fr-FR-RemyMultilingualNeural",
        "zh-CN-XiaoxiaoMultilingualNeural",
        "zh-CN-XiaochenMultilingualNeural",
        "zh-CN-XiaoyuMultilingualNeural",
        "zh-CN-YunyiMultilingualNeural",
    )

    def __init__(self):
        """Load the Azure TTS options from the project settings."""
        from utils import settings

        tts_config = settings.config["settings"]["tts"]
        self.voices = []  # filled on first call to random_voice()
        self.api_key = tts_config["azure_api_key"]
        self.region = tts_config["azure_region"]
        self.default_voice = tts_config["azure_voice_name"]
        self.rate = tts_config["azure_voice_speed_boost"]

    def run(self, text: str, filepath: str, random_voice=False):
        """Synthesize ``text`` and write the audio to ``filepath``.

        Args:
            text: Plain text to speak; it is XML-escaped before being sent.
            filepath: Destination file handed to the SDK's audio output.
            random_voice: Pick a voice at random instead of the default one.

        Raises:
            ValueError: If the API key or region is missing, or the speed
                boost is not an integer between 0 and 100.
        """
        if not self.api_key or not self.region:
            raise ValueError("Azure API key and region must be set in settings.")

        rate = self._validated_rate()
        voice_name = self.random_voice() if random_voice else self.default_voice
        ssml_text = self._build_ssml(text, voice_name, rate)

        # Deferred until the inputs are known to be valid.
        import azure.cognitiveservices.speech as speechsdk

        speech_config = speechsdk.SpeechConfig(
            subscription=self.api_key, region=self.region
        )
        speech_config.speech_synthesis_voice_name = voice_name
        audio_config = speechsdk.audio.AudioOutputConfig(filename=filepath)
        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_config
        )

        result = speech_synthesizer.speak_ssml_async(ssml_text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized for text [{text}] and saved to [{filepath}]")
            return

        # Failures stay non-fatal (best effort), but report as much as the
        # SDK tells us so the cause is diagnosable.
        print(f"Speech synthesis failed: {result.reason}")
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            print(f"Cancellation reason: {details.reason}")
            if details.error_details:
                print(f"Error details: {details.error_details}")

    def _validated_rate(self) -> int:
        """Return the speed boost as an int in [0, 100] or raise ValueError."""
        rate = self.rate
        # The config template declares no ``type`` for azure_voice_speed_boost,
        # so the value may arrive as a string -- TODO confirm against the
        # settings loader. Integer-looking strings are accepted.
        if isinstance(rate, str):
            try:
                rate = int(rate)
            except ValueError:
                rate = None
        # bool is a subclass of int; True/False are not meaningful rates.
        is_int = isinstance(rate, int) and not isinstance(rate, bool)
        if not is_int or not 0 <= rate <= 100:
            raise ValueError(
                "azure_voice_speed_boost must be an integer between 0 and 100."
            )
        return rate

    @staticmethod
    def _build_ssml(text: str, voice_name: str, rate: int) -> str:
        """Build the SSML document that applies the voice and speaking rate."""
        from xml.sax.saxutils import escape, quoteattr

        # Escape user text and the voice attribute so characters such as
        # '&' or '<' cannot produce malformed SSML.
        return (
            "<speak version='1.0' "
            "xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
            f"<voice name={quoteattr(str(voice_name))}>"
            f"<prosody rate='+{rate}%'>{escape(text)}</prosody>"
            "</voice>"
            "</speak>"
        )

    def random_voice(self):
        """Return a randomly chosen voice name, caching the voice list."""
        import random

        if not self.voices:
            self.voices = self.fetch_available_voices()
        return random.choice(self.voices)

    def fetch_available_voices(self):
        """Return a fresh list of the supported voice names."""
        return list(self._VOICES)
"streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " } +voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "azuretts", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " } random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" } elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] } elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" } @@ -56,3 +56,7 @@ python_voice = { optional = false, default = "1", example = "1", explanation = " py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" } silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" } no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" } +azure_api_key = { optional = true, example = "fb37d9e152864e1d8992a70154cf3bba", explanation = "Azure Speech service API key" } +azure_region = { optional = true, default = "westus2", example = "westus2", explanation = "Azure region" } +azure_voice_name = { optional = false, default = "en-US-AvaMultilingualNeural", options = ["en-US-AndrewMultilingualNeural", "en-US-AvaMultilingualNeural", "de-DE-FlorianMultilingualNeural", "en-US-EmmaMultilingualNeural", "de-DE-SeraphinaMultilingualNeural", 
"de-DE-FlorianMultilingualNeural", "fr-FR-VivienneMultilingualNeural", "fr-FR-RemyMultilingualNeural", "zh-CN-XiaoxiaoMultilingualNeural", "zh-CN-XiaochenMultilingualNeural", "zh-CN-XiaoyuMultilingualNeural", "zh-CN-YunyiMultilingualNeural"], example = "en-US-AvaMultilingualNeural", explanation = "Azure voice name" } +azure_voice_speed_boost = { optional = false, default = 0, example = 50, explanation = "Azuretts voice speed boost (in %). Has to be a number between 0 and 100" } \ No newline at end of file diff --git a/video_creation/voices.py b/video_creation/voices.py index ad94a14..6b12104 100644 --- a/video_creation/voices.py +++ b/video_creation/voices.py @@ -7,6 +7,7 @@ from TTS.elevenlabs import elevenlabs from TTS.engine_wrapper import TTSEngine from TTS.GTTS import GTTS from TTS.pyttsx import pyttsx +from TTS.azuretts import AzureTTS from TTS.streamlabs_polly import StreamlabsPolly from TTS.TikTok import TikTok from utils import settings @@ -21,6 +22,7 @@ TTSProviders = { "TikTok": TikTok, "pyttsx": pyttsx, "ElevenLabs": elevenlabs, + "AzureTTS": AzureTTS, } @@ -36,7 +38,9 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]: voice = settings.config["settings"]["tts"]["voice_choice"] if str(voice).casefold() in map(lambda _: _.casefold(), TTSProviders): - text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, voice), reddit_obj) + text_to_mp3 = TTSEngine( + get_case_insensitive_key_value(TTSProviders, voice), reddit_obj + ) else: while True: print_step("Please choose one of the following TTS providers: ") @@ -45,12 +49,18 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]: if choice.casefold() in map(lambda _: _.casefold(), TTSProviders): break print("Unknown Choice") - text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, choice), reddit_obj) + text_to_mp3 = TTSEngine( + get_case_insensitive_key_value(TTSProviders, choice), reddit_obj + ) return text_to_mp3.run() def get_case_insensitive_key_value(input_dict, key): return 
next( - (value for dict_key, value in input_dict.items() if dict_key.lower() == key.lower()), + ( + value + for dict_key, value in input_dict.items() + if dict_key.lower() == key.lower() + ), None, )