diff --git a/TTS/azuretts.py b/TTS/azuretts.py index f158d92..5674513 100644 --- a/TTS/azuretts.py +++ b/TTS/azuretts.py @@ -1,14 +1,11 @@ -import os -import io import random -from pydub import AudioSegment -from pydub.playback import play import azure.cognitiveservices.speech as speechsdk from utils import settings class AzureTTS: def __init__(self): + # Initialize the AzureTTS class with necessary configurations self.voices = [] self.api_key = settings.config["settings"]["tts"]["azure_api_key"] self.region = settings.config["settings"]["tts"]["azure_region"] @@ -16,31 +13,36 @@ class AzureTTS: self.rate = settings.config["settings"]["tts"]["azure_voice_speed_boost"] def run(self, text: str, filepath: str, random_voice=False): + # Validate API key and region if not self.api_key or not self.region: raise ValueError("Azure API key and region must be set in settings.") + # Validate the rate value if not isinstance(self.rate, int) or not (0 <= self.rate <= 100): raise ValueError( "azure_voice_speed_boost must be an integer between 0 and 100." ) + # Configure speech synthesis with Azure speech_config = speechsdk.SpeechConfig( subscription=self.api_key, region=self.region ) audio_config = speechsdk.audio.AudioOutputConfig(filename=filepath) + # Select voice: random or default if random_voice: voice_name = self.random_voice() else: voice_name = self.default_voice + # Set the voice name in the speech configuration speech_config.speech_synthesis_voice_name = voice_name speech_synthesizer = speechsdk.SpeechSynthesizer( speech_config=speech_config, audio_config=audio_config ) rate_with_percent = f"{self.rate}%" - # Construct SSML with the specified rate + # Construct SSML (Speech Synthesis Markup Language) with the specified rate ssml_text = f""" @@ -48,19 +50,23 @@ class AzureTTS: """ + # Perform speech synthesis result = speech_synthesizer.speak_ssml_async(ssml_text).get() + # Check the result of the synthesis if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: print(f"Speech synthesized for text [{text}] and saved to [{filepath}]") else: print(f"Speech synthesis failed: {result.reason}") def random_voice(self): + # Return a random voice from the available voices if not self.voices: self.voices = self.fetch_available_voices() return random.choice(self.voices) def fetch_available_voices(self): + # Return a list of available voices return [ "en-US-AndrewMultilingualNeural", "en-US-AvaMultilingualNeural", @@ -74,4 +80,4 @@ class AzureTTS: "zh-CN-XiaochenMultilingualNeural", "zh-CN-XiaoyuMultilingualNeural", "zh-CN-YunyiMultilingualNeural", - ] + ] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 95fa560..a5a5987 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ ffmpeg-python==0.2.0 elevenlabs==1.8.1 yt-dlp==2024.10.7 numpy==1.26.4 +azure-cognitiveservices-speech==1.41.1 \ No newline at end of file