add azure speech service support (azuretts)

pull/2169/head
mvenus 11 months ago
parent 2efd617c1a
commit ab4f5bd46b

@@ -258,6 +258,7 @@
<option value="googletranslate">Google Translate</option>
<option value="awspolly">AWS Polly</option>
<option value="pyttsx">Python TTS (pyttsx)</option>
<option value="azuretts">Azure TTS (Speech service)</option>
</select>
</div>
</div>
@@ -408,6 +409,46 @@
</div>
</div>
</div>
<div class="row mb-2">
<label for="azure_api_key" class="col-4">Azure API Key</label>
<div class="col-8">
<div class="input-group">
<div class="input-group-text">
<i class="bi bi-key-fill"></i>
</div>
<input name="azure_api_key" value="{{ data.azure_api_key }}" type="text" class="form-control"
placeholder="Azure Speech service API key" data-toggle="tooltip"
data-original-title="Azure Speech service API key">
</div>
</div>
</div>
<div class="row mb-2">
<label for="azure_region" class="col-4">Azure Region</label>
<div class="col-8">
<input name="azure_region" type="text" class="form-control"
placeholder="arial" value="{{ data.azure_region }}">
</div>
</div>
<div class="row mb-2">
<label for="azure_voice_name" class="col-4">Azure Voice Name</label>
<div class="col-8">
<select name="azure_voice_name" class="form-select" data-toggle="tooltip"
data-original-title="Azure voice name">
<option value="en-US-AndrewMultilingualNeural">en-US-AndrewMultilingualNeural</option>
<option value="en-US-AvaMultilingualNeural">en-US-AvaMultilingualNeural</option>
<option value="de-DE-FlorianMultilingualNeural">de-DE-FlorianMultilingualNeural</option>
<option value="en-US-EmmaMultilingualNeural">en-US-EmmaMultilingualNeural</option>
<option value="de-DE-SeraphinaMultilingualNeural">de-DE-SeraphinaMultilingualNeural</option>
<option value="fr-FR-VivienneMultilingualNeural">fr-FR-VivienneMultilingualNeural</option>
<option value="fr-FR-RemyMultilingualNeural">fr-FR-RemyMultilingualNeural</option>
<option value="zh-CN-XiaoxiaoMultilingualNeural">zh-CN-XiaoxiaoMultilingualNeural</option>
<option value="zh-CN-XiaochenMultilingualNeural">zh-CN-XiaochenMultilingualNeural</option>
<option value="zh-CN-XiaoyuMultilingualNeural">zh-CN-XiaoyuMultilingualNeural</option>
<option value="zh-CN-YunyiMultilingualNeural">zh-CN-YunyiMultilingualNeural</option>
</select>
</div>
</div>
<div class="row mb-2">
<label for="silence_duration" class="col-4">Silence Duration</label>
<div class="col-8">

@@ -0,0 +1,77 @@
import random

import azure.cognitiveservices.speech as speechsdk

from utils import settings


class AzureTTS:
    """Azure Speech service TTS provider."""

    def __init__(self):
        self.voices = []
        self.api_key = settings.config["settings"]["tts"]["azure_api_key"]
        self.region = settings.config["settings"]["tts"]["azure_region"]
        self.default_voice = settings.config["settings"]["tts"]["azure_voice_name"]
        self.rate = settings.config["settings"]["tts"]["azure_voice_speed_boost"]

    def run(self, text: str, filepath: str, random_voice=False):
        if not self.api_key or not self.region:
            raise ValueError("Azure API key and region must be set in settings.")
        if not isinstance(self.rate, int) or not (0 <= self.rate <= 100):
            raise ValueError(
                "azure_voice_speed_boost must be an integer between 0 and 100."
            )

        speech_config = speechsdk.SpeechConfig(
            subscription=self.api_key, region=self.region
        )
        audio_config = speechsdk.audio.AudioOutputConfig(filename=filepath)

        if random_voice:
            voice_name = self.random_voice()
        else:
            voice_name = self.default_voice
        speech_config.speech_synthesis_voice_name = voice_name

        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_config
        )

        # Construct SSML with the specified speaking rate
        rate_with_percent = f"{self.rate}%"
        ssml_text = f"""
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
            <voice name="{voice_name}">
                <prosody rate="{rate_with_percent}">{text}</prosody>
            </voice>
        </speak>
        """

        result = speech_synthesizer.speak_ssml_async(ssml_text).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized for text [{text}] and saved to [{filepath}]")
        else:
            print(f"Speech synthesis failed: {result.reason}")

    def random_voice(self):
        if not self.voices:
            self.voices = self.fetch_available_voices()
        return random.choice(self.voices)

    def fetch_available_voices(self):
        return [
            "en-US-AndrewMultilingualNeural",
            "en-US-AvaMultilingualNeural",
            "de-DE-FlorianMultilingualNeural",
            "en-US-EmmaMultilingualNeural",
            "de-DE-SeraphinaMultilingualNeural",
            "fr-FR-VivienneMultilingualNeural",
            "fr-FR-RemyMultilingualNeural",
            "zh-CN-XiaoxiaoMultilingualNeural",
            "zh-CN-XiaochenMultilingualNeural",
            "zh-CN-XiaoyuMultilingualNeural",
            "zh-CN-YunyiMultilingualNeural",
        ]
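For reference, a minimal usage sketch of the new class, assuming settings.config has already been loaded and the azure_* keys from the config diff below are set (the output path is illustrative, not taken from this commit):

from TTS.azuretts import AzureTTS

tts = AzureTTS()
# Normally driven through TTSEngine (see the voices.py hunk further down);
# called directly here only to illustrate the interface.
tts.run(
    text="Hello from Azure Speech",
    filepath="example.mp3",  # illustrative output path
    random_voice=False,
)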

@@ -44,7 +44,7 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }
[settings.tts]
voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "azuretts", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" }
elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] }
elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" }
@@ -56,3 +56,7 @@ python_voice = { optional = false, default = "1", example = "1", explanation = "
py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
azure_api_key = { optional = true, example = "fb37d9e152864e1d8992a70154cf3bba", explanation = "Azure Speech service API key" }
azure_region = { optional = true, default = "westus2", example = "westus2", explanation = "Azure region" }
azure_voice_name = { optional = false, default = "en-US-AvaMultilingualNeural", options = ["en-US-AndrewMultilingualNeural", "en-US-AvaMultilingualNeural", "de-DE-FlorianMultilingualNeural", "en-US-EmmaMultilingualNeural", "de-DE-SeraphinaMultilingualNeural", "fr-FR-VivienneMultilingualNeural", "fr-FR-RemyMultilingualNeural", "zh-CN-XiaoxiaoMultilingualNeural", "zh-CN-XiaochenMultilingualNeural", "zh-CN-XiaoyuMultilingualNeural", "zh-CN-YunyiMultilingualNeural"], example = "en-US-AvaMultilingualNeural", explanation = "Azure voice name" }
azure_voice_speed_boost = { optional = false, type = "int", default = 0, example = 50, explanation = "Azure TTS voice speed boost (in %). Must be an integer between 0 and 100" }
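Taken together, the new keys would look roughly like this in a user's config file (a sketch; values are illustrative and come from the defaults/examples in the template above):

[settings.tts]
voice_choice = "azuretts"
azure_api_key = "fb37d9e152864e1d8992a70154cf3bba"  # example key from the template
azure_region = "westus2"
azure_voice_name = "en-US-AvaMultilingualNeural"
azure_voice_speed_boost = 0  # 0-100, applied as the SSML prosody rate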

@@ -7,6 +7,7 @@ from TTS.elevenlabs import elevenlabs
from TTS.engine_wrapper import TTSEngine
from TTS.GTTS import GTTS
from TTS.pyttsx import pyttsx
from TTS.azuretts import AzureTTS
from TTS.streamlabs_polly import StreamlabsPolly
from TTS.TikTok import TikTok
from utils import settings
@@ -21,6 +22,7 @@ TTSProviders = {
    "TikTok": TikTok,
    "pyttsx": pyttsx,
    "ElevenLabs": elevenlabs,
    "AzureTTS": AzureTTS,
}
@@ -36,7 +38,9 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]:
    voice = settings.config["settings"]["tts"]["voice_choice"]
    if str(voice).casefold() in map(lambda _: _.casefold(), TTSProviders):
        text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, voice), reddit_obj)
        text_to_mp3 = TTSEngine(
            get_case_insensitive_key_value(TTSProviders, voice), reddit_obj
        )
    else:
        while True:
            print_step("Please choose one of the following TTS providers: ")
@@ -45,12 +49,18 @@ def save_text_to_mp3(reddit_obj) -> Tuple[int, int]:
            if choice.casefold() in map(lambda _: _.casefold(), TTSProviders):
                break
            print("Unknown Choice")
        text_to_mp3 = TTSEngine(get_case_insensitive_key_value(TTSProviders, choice), reddit_obj)
        text_to_mp3 = TTSEngine(
            get_case_insensitive_key_value(TTSProviders, choice), reddit_obj
        )
    return text_to_mp3.run()
def get_case_insensitive_key_value(input_dict, key):
    return next(
        (value for dict_key, value in input_dict.items() if dict_key.lower() == key.lower()),
        (
            value
            for dict_key, value in input_dict.items()
            if dict_key.lower() == key.lower()
        ),
        None,
    )
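A quick sketch of how the "azuretts" choice resolves to the new provider through the case-insensitive lookup above (names are the ones defined in this diff; reddit_obj comes from the caller):

choice = settings.config["settings"]["tts"]["voice_choice"]  # "azuretts"
tts_class = get_case_insensitive_key_value(TTSProviders, choice)
assert tts_class is AzureTTS  # "azuretts" matches the "AzureTTS" key case-insensitively
text_to_mp3 = TTSEngine(tts_class, reddit_obj)
text_to_mp3.run()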
