class OpenAITTS:
    """
    A Text-to-Speech engine that uses an OpenAI-like TTS API endpoint to generate audio from text.

    Attributes:
        max_chars (int): Maximum number of characters allowed per API call.
        api_key (str): API key loaded from settings.
        api_url (str): The complete API endpoint URL, built from a base URL provided in the config.
        available_voices (list): Static list of supported voices (according to current docs).
    """

    # Base URL used when the config provides no (or an empty) 'openai_api_url'.
    DEFAULT_BASE_URL = "https://api.openai.com/v1"
    # Path of the speech-synthesis endpoint, appended to the base URL.
    SPEECH_PATH = "/audio/speech"
    # (connect, read) timeouts in seconds, so a stalled endpoint cannot block the run forever.
    REQUEST_TIMEOUT = (10, 120)

    def __init__(self):
        """
        Load the API key and endpoint from the project settings.

        Raises:
            ValueError: If 'openai_api_key' is missing or empty in the config.
        """
        tts_config = settings.config["settings"]["tts"]

        # Set maximum input size based on API limits (4096 characters per request)
        self.max_chars = 4096
        self.api_key = tts_config.get("openai_api_key")
        if not self.api_key:
            raise ValueError("No OpenAI API key provided in settings! Please set 'openai_api_key' in your config.")

        # The configured value is a base URL (e.g. "https://api.openai.com/v1" or
        # "https://api.openai.com/v1/"); the TTS-specific path is appended to it.
        self.api_url = self._build_api_url(tts_config.get("openai_api_url"))

        # Set the available voices to a static list as per OpenAI TTS documentation.
        self.available_voices = self.get_available_voices()

    @staticmethod
    def _build_api_url(base_url) -> str:
        """
        Build the full speech endpoint URL from a configured base URL.

        Args:
            base_url (str | None): Base URL from the config. A missing or empty
                value falls back to DEFAULT_BASE_URL.

        Returns:
            str: The base URL without any trailing slashes, followed by SPEECH_PATH.
        """
        effective_base = base_url or OpenAITTS.DEFAULT_BASE_URL
        # rstrip removes every trailing slash, not just one, so "…/v1//" cannot
        # produce a double slash in the final URL.
        return str(effective_base).rstrip("/") + OpenAITTS.SPEECH_PATH

    def get_available_voices(self) -> list:
        """
        Return a static list of supported voices for the OpenAI TTS API.

        According to the documentation, supported voices include:
            "alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"
        """
        return ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]

    def randomvoice(self) -> str:
        """
        Select and return a random voice from the available voices.
        """
        return random.choice(self.available_voices)

    def run(self, text, filepath, random_voice: bool = False) -> None:
        """
        Convert the provided text to speech and save the resulting audio to the specified filepath.

        Args:
            text (str): The input text to convert.
            filepath (str): The file path where the generated audio will be saved.
            random_voice (bool): If True, select a random voice from the available voices.

        Raises:
            RuntimeError: If the HTTP request fails, the API answers with a
                non-200 status, or the audio file cannot be written.
        """
        tts_config = settings.config["settings"]["tts"]

        # Choose voice based on configuration or randomly if requested.
        if random_voice:
            voice = self.randomvoice()
        else:
            # Ensure lower-case as expected by the API
            voice = str(tts_config.get("openai_voice_name", "alloy")).lower()

        # Select the model from configuration; default to 'tts-1'
        model = tts_config.get("openai_model", "tts-1")

        # Debug output: print which voice and model will be used
        print(f"Using OpenAI TTS model: {model} with voice: {voice}")

        # Build the payload for the API request.
        payload = {
            "model": model,
            "voice": voice,
            "input": text,
            "response_format": "mp3",  # allowed formats: "mp3", "aac", "opus", "flac", "pcm" or "wav"
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Only the network call sits in this try, so transport failures are the
        # only thing translated here and the original cause stays chained.
        try:
            response = requests.post(
                self.api_url,
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as err:
            raise RuntimeError(f"Failed to generate audio with OpenAI TTS API: {err}") from err

        if response.status_code != 200:
            raise RuntimeError(
                "Failed to generate audio with OpenAI TTS API: "
                f"Error from TTS API: {response.status_code} {response.text}"
            )

        # Write the binary data (mp3) directly to the file.
        try:
            with open(filepath, "wb") as audio_file:
                audio_file.write(response.content)
        except OSError as err:
            raise RuntimeError(f"Failed to generate audio with OpenAI TTS API: {err}") from err
" } random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" } elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] } elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" } @@ -56,3 +55,7 @@ python_voice = { optional = false, default = "1", example = "1", explanation = " py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" } silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" } no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" } +openai_api_url = { optional = true, default = "https://api.openai.com/v1/", example = "https://api.openai.com/v1/", explanation = "The API endpoint URL for OpenAI TTS generation" } +openai_api_key = { optional = true, example = "sk-abc123def456...", explanation = "Your OpenAI API key for TTS generation" } +openai_voice_name = { optional = false, default = "alloy", example = "alloy", explanation = "The voice used for OpenAI TTS generation", options = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"] } +openai_model = { optional = false, default = "tts-1", example = "tts-1", explanation = "The model variant used for OpenAI TTS generation", options = ["tts-1", "tts-1-hd"] } \ No newline at end of file diff --git a/video_creation/voices.py b/video_creation/voices.py index ad94a14..13ded7c 100644 --- a/video_creation/voices.py +++ b/video_creation/voices.py @@ -9,6 +9,7 @@ from TTS.GTTS 
import GTTS from TTS.pyttsx import pyttsx from TTS.streamlabs_polly import StreamlabsPolly from TTS.TikTok import TikTok +from TTS.openai_tts import OpenAITTS from utils import settings from utils.console import print_step, print_table @@ -21,6 +22,7 @@ TTSProviders = { "TikTok": TikTok, "pyttsx": pyttsx, "ElevenLabs": elevenlabs, + "OpenAI": OpenAITTS, }