clean-up, add comments, add azure-cognitiveservices-speech to requirements.txt

Branch: pull/2169/head
Author: mvenus (11 months ago)
Parent: ab4f5bd46b
Commit: 34ca6c79ca

@@ -1,14 +1,11 @@
-import os
-import io
 import random
-from pydub import AudioSegment
-from pydub.playback import play
 import azure.cognitiveservices.speech as speechsdk
 from utils import settings
 
 
 class AzureTTS:
     def __init__(self):
+        # Initialize the AzureTTS class with necessary configurations
         self.voices = []
         self.api_key = settings.config["settings"]["tts"]["azure_api_key"]
         self.region = settings.config["settings"]["tts"]["azure_region"]
@@ -16,31 +13,36 @@ class AzureTTS:
         self.rate = settings.config["settings"]["tts"]["azure_voice_speed_boost"]
 
     def run(self, text: str, filepath: str, random_voice=False):
+        # Validate API key and region
         if not self.api_key or not self.region:
             raise ValueError("Azure API key and region must be set in settings.")
 
+        # Validate the rate value
         if not isinstance(self.rate, int) or not (0 <= self.rate <= 100):
             raise ValueError(
                 "azure_voice_speed_boost must be an integer between 0 and 100."
             )
 
+        # Configure speech synthesis with Azure
         speech_config = speechsdk.SpeechConfig(
             subscription=self.api_key, region=self.region
         )
         audio_config = speechsdk.audio.AudioOutputConfig(filename=filepath)
 
+        # Select voice: random or default
         if random_voice:
             voice_name = self.random_voice()
         else:
             voice_name = self.default_voice
 
+        # Set the voice name in the speech configuration
         speech_config.speech_synthesis_voice_name = voice_name
         speech_synthesizer = speechsdk.SpeechSynthesizer(
             speech_config=speech_config, audio_config=audio_config
         )
 
         rate_with_percent = f"{self.rate}%"
-        # Construct SSML with the specified rate
+        # Construct SSML (Speech Synthesis Markup Language) with the specified rate
         ssml_text = f"""
         <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
             <voice name="{voice_name}">
@@ -48,19 +50,23 @@ class AzureTTS:
             </voice>
         </speak>
         """
 
+        # Perform speech synthesis
         result = speech_synthesizer.speak_ssml_async(ssml_text).get()
+        # Check the result of the synthesis
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
             print(f"Speech synthesized for text [{text}] and saved to [{filepath}]")
         else:
             print(f"Speech synthesis failed: {result.reason}")
 
     def random_voice(self):
+        # Return a random voice from the available voices
         if not self.voices:
             self.voices = self.fetch_available_voices()
         return random.choice(self.voices)
 
     def fetch_available_voices(self):
+        # Return a list of available voices
         return [
             "en-US-AndrewMultilingualNeural",
             "en-US-AvaMultilingualNeural",
@@ -74,4 +80,4 @@ class AzureTTS:
             "zh-CN-XiaochenMultilingualNeural",
             "zh-CN-XiaoyuMultilingualNeural",
             "zh-CN-YunyiMultilingualNeural",
         ]
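
For context, a minimal usage sketch of the class touched above (not part of the commit). The import path and output path are placeholders, since the diff does not show the module's file name, and it assumes settings.config["settings"]["tts"] already holds the azure_api_key, azure_region, default voice, and azure_voice_speed_boost entries the constructor reads:

    # Hypothetical usage sketch -- not part of the commit.
    # The import path below is a placeholder; adjust it to wherever AzureTTS lives.
    from TTS.azure_tts import AzureTTS

    tts = AzureTTS()
    # Synthesize to an audio file, picking a random voice from fetch_available_voices()
    tts.run(
        text="Hello from Azure neural text to speech.",
        filepath="assets/temp/azure_example.wav",  # placeholder output path
        random_voice=True,
    )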

requirements.txt
@@ -22,3 +22,4 @@ ffmpeg-python==0.2.0
 elevenlabs==1.8.1
 yt-dlp==2024.10.7
 numpy==1.26.4
+azure-cognitiveservices-speech==1.41.1
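
For a quick post-install sanity check (a sketch, not part of the commit; it uses only the standard library plus the package pinned above):

    # Verify the newly pinned Azure Speech SDK resolves in the environment.
    from importlib.metadata import version

    import azure.cognitiveservices.speech as speechsdk  # should import without error

    print(version("azure-cognitiveservices-speech"))  # expected: 1.41.1 per requirements.txt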