diff --git a/TTS/azuretts.py b/TTS/azuretts.py
new file mode 100644
index 0000000..f158d92
--- /dev/null
+++ b/TTS/azuretts.py
@@ -0,0 +1,77 @@
+import os
+import io
+import random
+from pydub import AudioSegment
+from pydub.playback import play
+import azure.cognitiveservices.speech as speechsdk
+from utils import settings
+
+
+class AzureTTS:
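+    """Text-to-speech provider backed by Azure Cognitive Services Speech."""
+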
+    def __init__(self):
+        self.max_chars = 5000  # assumed per-request cap so the TTS engine wrapper can split long posts
+        self.voices = []
+        self.api_key = settings.config["settings"]["tts"]["azure_api_key"]
+        self.region = settings.config["settings"]["tts"]["azure_region"]
+        self.default_voice = settings.config["settings"]["tts"]["azure_voice_name"]
+        self.rate = settings.config["settings"]["tts"]["azure_voice_speed_boost"]
+
+    def run(self, text: str, filepath: str, random_voice=False):
+        if not self.api_key or not self.region:
+            raise ValueError("Azure API key and region must be set in settings.")
+
+        if not isinstance(self.rate, int) or not (0 <= self.rate <= 100):
+            raise ValueError(
+                "azure_voice_speed_boost must be an integer between 0 and 100."
+            )
+
+        speech_config = speechsdk.SpeechConfig(
+            subscription=self.api_key, region=self.region
+        )
+        audio_config = speechsdk.audio.AudioOutputConfig(filename=filepath)
+
+        if random_voice:
+            voice_name = self.random_voice()
+        else:
+            voice_name = self.default_voice
+
+        speech_config.speech_synthesis_voice_name = voice_name
+        speech_synthesizer = speechsdk.SpeechSynthesizer(
+            speech_config=speech_config, audio_config=audio_config
+        )
+        rate_with_percent = f"{self.rate}%"
+
+        # Construct SSML with the selected voice and the specified rate
+        ssml_text = f"""
+        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
+            <voice name="{voice_name}">
+                <prosody rate="+{rate_with_percent}">{text}</prosody>
+            </voice>
+        </speak>
+        """
+        result = speech_synthesizer.speak_ssml_async(ssml_text).get()
+
+        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+            print(f"Speech synthesized for text [{text}] and saved to [{filepath}]")
+        else:
+            print(f"Speech synthesis failed: {result.reason}")
+
+    def random_voice(self):
+        # Lazily fetch and cache the voice list on first use
+        if not self.voices:
+            self.voices = self.fetch_available_voices()
+        return random.choice(self.voices)
+
+    def fetch_available_voices(self):
+        # Subset of Azure multilingual neural voices available for random selection
+        return [
+            "en-US-AndrewMultilingualNeural",
+            "en-US-AvaMultilingualNeural",
+            "de-DE-FlorianMultilingualNeural",
+            "en-US-EmmaMultilingualNeural",
+            "de-DE-SeraphinaMultilingualNeural",
+            "fr-FR-VivienneMultilingualNeural",
+            "fr-FR-RemyMultilingualNeural",
+            "zh-CN-XiaoxiaoMultilingualNeural",
+            "zh-CN-XiaochenMultilingualNeural",
+            "zh-CN-XiaoyuMultilingualNeural",
+            "zh-CN-YunyiMultilingualNeural",
+        ]
diff --git a/utils/.config.template.toml b/utils/.config.template.toml
index f4a3af0..6708630 100644
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@@ -44,7 +44,7 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }
[settings.tts]
-voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
+voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "azuretts", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" }
elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] }
elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" }
@@ -56,3 +56,7 @@ python_voice = { optional = false, default = "1", example = "1", explanation = "
py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
+azure_api_key = { optional = true, example = "fb37d9e152864e1d8992a70154cf3bba", explanation = "Azure Speech service API key" }
+azure_region = { optional = true, default = "westus2", example = "westus2", explanation = "Azure Speech service region" }
+azure_voice_name = { optional = false, default = "en-US-AvaMultilingualNeural", options = ["en-US-AndrewMultilingualNeural", "en-US-AvaMultilingualNeural", "de-DE-FlorianMultilingualNeural", "en-US-EmmaMultilingualNeural", "de-DE-SeraphinaMultilingualNeural", "fr-FR-VivienneMultilingualNeural", "fr-FR-RemyMultilingualNeural", "zh-CN-XiaoxiaoMultilingualNeural", "zh-CN-XiaochenMultilingualNeural", "zh-CN-XiaoyuMultilingualNeural", "zh-CN-YunyiMultilingualNeural"], example = "en-US-AvaMultilingualNeural", explanation = "Azure voice name" }
+azure_voice_speed_boost = { optional = false, type = "int", default = 0, example = 50, explanation = "Azure TTS voice speed boost (in %). Must be an integer between 0 and 100" }
\ No newline at end of file
diff --git a/video_creation/voices.py b/video_creation/voices.py
index ad94a14..6b12104 100644
--- a/video_creation/voices.py
+++ b/video_creation/voices.py
@@ -7,6 +7,7 @@ from TTS.elevenlabs import elevenlabs
from TTS.engine_wrapper import TTSEngine
from TTS.GTTS import GTTS
from TTS.pyttsx import pyttsx
+from TTS.azuretts import AzureTTS
from TTS.streamlabs_polly import StreamlabsPolly
from TTS.TikTok import TikTok
from utils import settings
@@ -21,6 +22,7 @@ TTSProviders = {
"TikTok": TikTok,
"pyttsx": pyttsx,
"ElevenLabs": elevenlabs,
+ "AzureTTS": AzureTTS,
}
@@ -36,7 +38,9 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]:
voice = settings.config["settings"]["tts"]["voice_choice"]
if str(voice).casefold() in map(lambda _: _.casefold(), TTSProviders):
- text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, voice), reddit_obj)
+ text_to_mp3 = TTSEngine(
+ get_case_insensitive_key_value(TTSProviders, voice), reddit_obj
+ )
else:
while True:
print_step("Please choose one of the following TTS providers: ")
@@ -45,12 +49,18 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]:
if choice.casefold() in map(lambda _: _.casefold(), TTSProviders):
break
print("Unknown Choice")
- text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, choice), reddit_obj)
+ text_to_mp3 = TTSEngine(
+ get_case_insensitive_key_value(TTSProviders, choice), reddit_obj
+ )
return text_to_mp3.run()
def get_case_insensitive_key_value(input_dict, key):
return next(
- (value for dict_key, value in input_dict.items() if dict_key.lower() == key.lower()),
+ (
+ value
+ for dict_key, value in input_dict.items()
+ if dict_key.lower() == key.lower()
+ ),
None,
)