Add OpenAI TTS API option

This allows users to configure the OpenAI TTS API to generate voices for their videos, which might be a cheaper option than ElevenLabs (especially if one runs KokoroTTS locally [which has an API structure similar to OpenAI's and is fully compatible]).
10 months ago · e137dbb40d
parent 3d4c34d60c
commit e137dbb40d
4 changed files with 96 additions and 1 deletions
--- a/TTS/openai_tts.py
+++ b/TTS/openai_tts.py
@ -0,0 +1,89 @@
+import random
+import requests
+from utils import settings
+
+class OpenAITTS:
+    """
+    A Text-to-Speech engine that uses an OpenAI-like TTS API endpoint to generate audio from text.
+
+    Attributes:
+        max_chars (int): Maximum number of characters allowed per API call.
+        api_key (str): API key loaded from settings.
+        api_url (str): The complete API endpoint URL, built from a base URL provided in the config.
+        available_voices (list): Static list of supported voices (according to current docs).
+    """
+    def __init__(self):
+        # Set maximum input size based on API limits (4096 characters per request)
+        self.max_chars = 4096
+        self.api_key = settings.config["settings"]["tts"].get("openai_api_key")
+        if not self.api_key:
+            raise ValueError("No OpenAI API key provided in settings! Please set 'openai_api_key' in your config.")
+
+        # Read the base URL from the config; fall back to the official endpoint when it is missing or empty
+        base_url = settings.config["settings"]["tts"].get("openai_api_url") or "https://api.openai.com/v1"
+        # Strip trailing slashes so both ".../v1" and ".../v1/" produce the same endpoint
+        base_url = base_url.rstrip("/")
+        # Append the TTS-specific path
+        self.api_url = base_url + "/audio/speech"
+        # Set the available voices to a static list as per OpenAI TTS documentation.
+        self.available_voices = self.get_available_voices()
+
+    def get_available_voices(self):
+        """
+        Return a static list of supported voices for the OpenAI TTS API.
+
+        According to the documentation, supported voices include:
+            "alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"
+        """
+        return ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]
+
+    def randomvoice(self):
+        """
+        Select and return a random voice from the available voices.
+        """
+        return random.choice(self.available_voices)
+
+    def run(self, text, filepath, random_voice: bool = False):
+        """
+        Convert the provided text to speech and save the resulting audio to the specified filepath.
+
+        Args:
+            text (str): The input text to convert.
+            filepath (str): The file path where the generated audio will be saved.
+            random_voice (bool): If True, select a random voice from the available voices.
+        Raises:
+            RuntimeError: If the request fails or times out, the API answers with a non-200 status, or the file cannot be written.
+        """
+        # Choose voice based on configuration or randomly if requested.
+        if random_voice:
+            voice = self.randomvoice()
+        else:
+            voice = settings.config["settings"]["tts"].get("openai_voice_name", "alloy")
+            voice = str(voice).lower()  # Ensure lower-case as expected by the API
+
+        # Select the model from configuration; default to 'tts-1'
+        model = settings.config["settings"]["tts"].get("openai_model", "tts-1")
+        # Debug output: print which voice and model will be used
+        print(f"Using OpenAI TTS model: {model} with voice: {voice}")
+
+        # Build the payload for the API request
+        payload = {
+            "model": model,
+            "voice": voice,
+            "input": text,
+            "response_format": "mp3"  # allowed formats: "mp3", "aac", "opus", "flac", "pcm" or "wav"
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+        # (connect, read) timeouts in seconds: requests has no default timeout, so a stalled endpoint would hang forever
+        try:
+            response = requests.post(self.api_url, headers=headers, json=payload, timeout=(10, 300))
+            if response.status_code != 200:
+                raise RuntimeError(f"Error from TTS API: {response.status_code} {response.text}")
+            # Write the binary data (mp3) directly to the file.
+            with open(filepath, "wb") as f:
+                f.write(response.content)
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate audio with OpenAI TTS API: {str(e)}") from e
--- a/main.py
+++ b/main.py
@ -122,6 +122,7 @@ if __name__ == "__main__":
    except Exception as err:
        config["settings"]["tts"]["tiktok_sessionid"] = "REDACTED"
        config["settings"]["tts"]["elevenlabs_api_key"] = "REDACTED"
+        config["settings"]["tts"]["openai_api_key"] = "REDACTED"
        print_step(
            f"Sorry, something went wrong with this version! Try again, and feel free to report this issue at GitHub or the Discord community.\n"
            f"Version: {__VERSION__} \n"
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -44,7 +44,6 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
 background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }

 [settings.tts]
-voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
 random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" }
 elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] }
 elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" }
@ -56,3 +55,7 @@ python_voice = { optional = false, default = "1", example = "1", explanation = "
 py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
 silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
 no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
+openai_api_url = { optional = true, default = "https://api.openai.com/v1/", example = "https://api.openai.com/v1/", explanation = "The API endpoint URL for OpenAI TTS generation" }
+openai_api_key = { optional = true, example = "sk-abc123def456...", explanation = "Your OpenAI API key for TTS generation" }
+openai_voice_name = { optional = false, default = "alloy", example = "alloy", explanation = "The voice used for OpenAI TTS generation", options = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"] }
+openai_model = { optional = false, default = "tts-1", example = "tts-1", explanation = "The model variant used for OpenAI TTS generation", options = ["tts-1", "tts-1-hd"] }
--- a/video_creation/voices.py
+++ b/video_creation/voices.py
@ -9,6 +9,7 @@ from TTS.GTTS import GTTS
 from TTS.pyttsx import pyttsx
 from TTS.streamlabs_polly import StreamlabsPolly
 from TTS.TikTok import TikTok
+from TTS.openai_tts import OpenAITTS
 from utils import settings
 from utils.console import print_step, print_table

@ -21,6 +22,7 @@ TTSProviders = {
    "TikTok": TikTok,
    "pyttsx": pyttsx,
    "ElevenLabs": elevenlabs,
+    "OpenAI": OpenAITTS,
 }