From 7c1922d0cc581120b707a50bfee9cd093b88b268 Mon Sep 17 00:00:00 2001
From: octo-patch
Date: Wed, 8 Apr 2026 02:39:20 +0800
Subject: [PATCH] feat: add MiniMax TTS provider support

- Add MiniMax TTS provider (TTS/minimax_tts.py) using the MiniMax T2A v2 API
- Register MiniMax in TTSProviders dict (video_creation/voices.py)
- Add MiniMax config options to .config.template.toml
  (minimax_api_key, minimax_api_url, minimax_voice_name, minimax_tts_model)
- Add unit and integration tests (tests/test_minimax_tts.py)
- Support MINIMAX_API_KEY environment variable
- Default model: speech-2.8-hd; default voice: English_Graceful_Lady
- Send API requests with a timeout and raise RuntimeError when a response
  carries no audio data
---
Review notes (not part of the commit message):
- TTS/minimax_tts.py was edited during review (request timeout, missing-audio
  check, Raises docs); the blob id on its "index" line was not regenerated.

 TTS/minimax_tts.py          | 121 ++++++++++++++++
 tests/__init__.py           |   0
 tests/test_minimax_tts.py   | 243 ++++++++++++++++++++++++++++++++++++
 utils/.config.template.toml |   6 +-
 video_creation/voices.py    |   2 +
 5 files changed, 371 insertions(+), 1 deletion(-)
 create mode 100644 TTS/minimax_tts.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_minimax_tts.py

diff --git a/TTS/minimax_tts.py b/TTS/minimax_tts.py
new file mode 100644
index 0000000..dadd013
--- /dev/null
+++ b/TTS/minimax_tts.py
@@ -0,0 +1,121 @@
+import os
+import random
+
+import requests
+
+from utils import settings
+
+MINIMAX_TTS_VOICES = [
+    "English_Graceful_Lady",
+    "English_Insightful_Speaker",
+    "English_radiant_girl",
+    "English_Persuasive_Man",
+    "English_Lucky_Robot",
+    "English_expressive_narrator",
+]
+
+# Seconds to wait for the MiniMax API; requests has no default timeout.
+MINIMAX_REQUEST_TIMEOUT = 60
+
+
+class MiniMaxTTS:
+    """
+    A Text-to-Speech engine that uses the MiniMax TTS API to generate audio from text.
+
+    Attributes:
+        max_chars (int): Maximum number of characters allowed per API call.
+        api_key (str): MiniMax API key loaded from settings or environment.
+        base_url (str): The base URL for the MiniMax API.
+        available_voices (list): Supported voice IDs.
+    """
+
+    def __init__(self):
+        self.max_chars = 4096
+        self.api_key = settings.config["settings"]["tts"].get("minimax_api_key") or os.environ.get(
+            "MINIMAX_API_KEY"
+        )
+        if not self.api_key:
+            raise ValueError(
+                "No MiniMax API key provided! Set 'minimax_api_key' in your config or "
+                "the MINIMAX_API_KEY environment variable."
+            )
+        self.base_url = settings.config["settings"]["tts"].get(
+            "minimax_api_url", "https://api.minimax.io"
+        ).rstrip("/")
+        self.available_voices = MINIMAX_TTS_VOICES
+
+    def randomvoice(self):
+        """Return a random voice ID from the available voices."""
+        return random.choice(self.available_voices)
+
+    def run(self, text, filepath, random_voice: bool = False):
+        """
+        Convert the provided text to speech and save the resulting audio to the specified filepath.
+
+        Args:
+            text (str): The input text to convert.
+            filepath (str): The file path where the generated audio will be saved.
+            random_voice (bool): If True, select a random voice from the available voices.
+
+        Raises:
+            RuntimeError: If the API responds with a non-200 HTTP status, reports a
+                non-zero ``base_resp.status_code``, or returns no audio data.
+        """
+        if random_voice:
+            voice = self.randomvoice()
+        else:
+            voice = settings.config["settings"]["tts"].get(
+                "minimax_voice_name", "English_Graceful_Lady"
+            )
+
+        model = settings.config["settings"]["tts"].get("minimax_tts_model", "speech-2.8-hd")
+
+        payload = {
+            "model": model,
+            "text": text,
+            "stream": False,
+            "voice_setting": {
+                "voice_id": voice,
+                "speed": 1,
+                "vol": 1,
+                "pitch": 0,
+            },
+            "audio_setting": {
+                "sample_rate": 32000,
+                "bitrate": 128000,
+                "format": "mp3",
+                "channel": 1,
+            },
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        response = requests.post(
+            f"{self.base_url}/v1/t2a_v2",
+            headers=headers,
+            json=payload,
+            timeout=MINIMAX_REQUEST_TIMEOUT,
+        )
+
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"MiniMax TTS API error: {response.status_code} {response.text}"
+            )
+
+        result = response.json()
+        if result.get("base_resp", {}).get("status_code") != 0:
+            raise RuntimeError(
+                f"MiniMax TTS API returned error: "
+                f"{result.get('base_resp', {}).get('status_msg', 'Unknown error')}"
+            )
+
+        # The audio is read as a hex string; fail clearly if it is missing or empty.
+        audio_hex = (result.get("data") or {}).get("audio")
+        if not audio_hex:
+            raise RuntimeError("MiniMax TTS API returned no audio data")
+        audio_bytes = bytes.fromhex(audio_hex)
+
+        with open(filepath, "wb") as f:
+            f.write(audio_bytes)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_minimax_tts.py b/tests/test_minimax_tts.py
new file mode 100644
index 0000000..3b556c7
--- /dev/null
+++ b/tests/test_minimax_tts.py
@@ -0,0 +1,243 @@
+"""Unit and integration tests for the MiniMax TTS provider."""
+import os
+import tempfile
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Helpers – fake out the settings module so we can import without a config file
+# ---------------------------------------------------------------------------
+
+FAKE_SETTINGS = {
+    "settings": {
+        "tts": {
+            "minimax_api_key": "test-api-key",
+            "minimax_api_url": "https://api.minimax.io",
+            "minimax_voice_name": "English_Graceful_Lady",
+            "minimax_tts_model": "speech-2.8-hd",
+        }
+    }
+}
+
+
+def _patch_settings(config=None):
+    """Return a patcher that replaces utils.settings.config."""
+    mock_settings = MagicMock()
+    mock_settings.config = config or FAKE_SETTINGS
+    return patch.dict("sys.modules", {"utils": MagicMock(settings=mock_settings)})
+
+
+# ---------------------------------------------------------------------------
+# Unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestMiniMaxTTSInit:
+    """Provider instantiation and configuration parsing."""
+
+    def test_creates_instance_with_valid_config(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            assert tts is not None
+
+    def test_raises_when_api_key_missing(self):
+        config = {"settings": {"tts": {}}}
+        with _patch_settings(config), patch.dict(os.environ, {}, clear=False):
+            # Make sure env var is absent too
+            env = {k: v for k, v in os.environ.items() if k != "MINIMAX_API_KEY"}
+            with patch.dict(os.environ, env, clear=True):
+                from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+                with pytest.raises(ValueError, match="No MiniMax API key"):
+                    MiniMaxTTS()
+
+    def test_reads_api_key_from_env(self):
+        config = {"settings": {"tts": {}}}
+        with _patch_settings(config), patch.dict(os.environ, {"MINIMAX_API_KEY": "env-key"}):
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            assert tts.api_key == "env-key"
+
+    def test_default_base_url(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            assert tts.base_url == "https://api.minimax.io"
+
+    def test_available_voices_not_empty(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            assert len(tts.available_voices) > 0
+
+    def test_max_chars(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            assert tts.max_chars == 4096
+
+
+class TestMiniMaxTTSRandomVoice:
+    def test_randomvoice_returns_valid_voice(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS, MINIMAX_TTS_VOICES  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            voice = tts.randomvoice()
+            assert voice in MINIMAX_TTS_VOICES
+
+
+class TestMiniMaxTTSRun:
+    """Tests for the run() method using a mocked requests.post."""
+
+    def _make_mock_response(self, audio_hex="494433"):
+        mock_resp = MagicMock()
+        mock_resp.status_code = 200
+        mock_resp.json.return_value = {
+            "data": {"audio": audio_hex, "status": 2},
+            "base_resp": {"status_code": 0, "status_msg": "success"},
+        }
+        return mock_resp
+
+    def test_sends_request_to_correct_url(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            with patch("requests.post", return_value=self._make_mock_response()) as mock_post:
+                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                    tts.run("Hello world.", f.name)
+                mock_post.assert_called_once()
+                call_url = mock_post.call_args[0][0]
+                assert "/v1/t2a_v2" in call_url
+                assert "api.minimax.io" in call_url
+
+    def test_sends_correct_payload(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            with patch("requests.post", return_value=self._make_mock_response()) as mock_post:
+                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                    tts.run("Hello world.", f.name, random_voice=False)
+                payload = mock_post.call_args[1]["json"]
+                assert payload["model"] == "speech-2.8-hd"
+                assert payload["text"] == "Hello world."
+                assert payload["voice_setting"]["voice_id"] == "English_Graceful_Lady"
+                assert payload["audio_setting"]["format"] == "mp3"
+
+    def test_writes_audio_bytes_to_file(self):
+        audio_hex = "494433"  # hex for 'ID3' — valid-ish mp3 header start
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            with patch("requests.post", return_value=self._make_mock_response(audio_hex)):
+                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                    tmp_path = f.name
+                tts.run("Hello.", tmp_path)
+            with open(tmp_path, "rb") as f:
+                content = f.read()
+            assert content == bytes.fromhex(audio_hex)
+
+    def test_raises_on_http_error(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            mock_resp = MagicMock()
+            mock_resp.status_code = 401
+            mock_resp.text = "Unauthorized"
+            with patch("requests.post", return_value=mock_resp):
+                with pytest.raises(RuntimeError, match="MiniMax TTS API error: 401"):
+                    with tempfile.NamedTemporaryFile(suffix=".mp3") as f:
+                        tts.run("Hello.", f.name)
+
+    def test_raises_on_api_status_error(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = {
+                "data": {},
+                "base_resp": {"status_code": 2013, "status_msg": "invalid voice_id"},
+            }
+            with patch("requests.post", return_value=mock_resp):
+                with pytest.raises(RuntimeError, match="invalid voice_id"):
+                    with tempfile.NamedTemporaryFile(suffix=".mp3") as f:
+                        tts.run("Hello.", f.name)
+
+    def test_uses_random_voice_when_requested(self):
+        with _patch_settings():
+            from TTS.minimax_tts import MiniMaxTTS, MINIMAX_TTS_VOICES  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            captured_payloads = []
+
+            def fake_post(url, **kwargs):
+                captured_payloads.append(kwargs.get("json", {}))
+                return self._make_mock_response()
+
+            with patch("requests.post", side_effect=fake_post):
+                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                    tts.run("Hello.", f.name, random_voice=True)
+            voice_used = captured_payloads[0]["voice_setting"]["voice_id"]
+            assert voice_used in MINIMAX_TTS_VOICES
+
+
+class TestTTSProvidersRegistry:
+    """Verify MiniMax is registered in voices.py."""
+
+    def test_minimax_in_providers_source(self):
+        """Verify voices.py source contains MiniMax registration."""
+        import pathlib
+
+        voices_path = pathlib.Path(__file__).parent.parent / "video_creation" / "voices.py"
+        source = voices_path.read_text()
+        assert "MiniMax" in source, "MiniMax not found in TTSProviders registry"
+        assert "MiniMaxTTS" in source, "MiniMaxTTS class not imported in voices.py"
+        assert "minimax_tts" in source, "minimax_tts module not imported in voices.py"
+
+
+# ---------------------------------------------------------------------------
+# Integration test – calls the real MiniMax API (skipped if no key set)
+# ---------------------------------------------------------------------------
+
+MINIMAX_API_KEY = os.environ.get("MINIMAX_API_KEY")
+
+
+@pytest.mark.skipif(not MINIMAX_API_KEY, reason="MINIMAX_API_KEY not set")
+class TestMiniMaxTTSIntegration:
+    """Live API calls — only run when MINIMAX_API_KEY is available."""
+
+    def test_synthesizes_speech_to_file(self):
+        config = {
+            "settings": {
+                "tts": {
+                    "minimax_api_key": MINIMAX_API_KEY,
+                    "minimax_api_url": "https://api.minimax.io",
+                    "minimax_voice_name": "English_Graceful_Lady",
+                    "minimax_tts_model": "speech-2.8-hd",
+                }
+            }
+        }
+        with _patch_settings(config):
+            from TTS.minimax_tts import MiniMaxTTS  # noqa: PLC0415
+
+            tts = MiniMaxTTS()
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                tmp_path = f.name
+            tts.run("Hello, this is a MiniMax TTS test.", tmp_path)
+            size = os.path.getsize(tmp_path)
+            assert size > 100, f"Audio file too small ({size} bytes), likely empty or error"
+            os.unlink(tmp_path)
diff --git a/utils/.config.template.toml b/utils/.config.template.toml
index 9b13657..d08c967 100644
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@@ -45,7 +45,7 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
 background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }
 
 [settings.tts]
-voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "OpenAI"], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
+voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "OpenAI", "MiniMax"], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
 random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" }
 elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] }
 elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" }
@@ -61,3 +61,7 @@ openai_api_url = { optional = true, default = "https://api.openai.com/v1/", exam
 openai_api_key = { optional = true, example = "sk-abc123def456...", explanation = "Your OpenAI API key for TTS generation" }
 openai_voice_name = { optional = false, default = "alloy", example = "alloy", explanation = "The voice used for OpenAI TTS generation", options = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "af_heart"] }
 openai_model = { optional = false, default = "tts-1", example = "tts-1", explanation = "The model variant used for OpenAI TTS generation", options = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"] }
+minimax_api_key = { optional = true, example = "sk-api-xxx...", explanation = "MiniMax API key for TTS generation (also read from MINIMAX_API_KEY env var)" }
+minimax_api_url = { optional = true, default = "https://api.minimax.io", example = "https://api.minimax.io", explanation = "The base URL for the MiniMax API" }
+minimax_voice_name = { optional = false, default = "English_Graceful_Lady", example = "English_Graceful_Lady", explanation = "The voice used for MiniMax TTS generation", options = ["English_Graceful_Lady", "English_Insightful_Speaker", "English_radiant_girl", "English_Persuasive_Man", "English_Lucky_Robot", "English_expressive_narrator"] }
+minimax_tts_model = { optional = false, default = "speech-2.8-hd", example = "speech-2.8-hd", explanation = "The model variant used for MiniMax TTS generation", options = ["speech-2.8-hd", "speech-2.8-turbo"] }
diff --git a/video_creation/voices.py b/video_creation/voices.py
index 3d48e9e..24dca8a 100644
--- a/video_creation/voices.py
+++ b/video_creation/voices.py
@@ -6,6 +6,7 @@ from TTS.aws_polly import AWSPolly
 from TTS.elevenlabs import elevenlabs
 from TTS.engine_wrapper import TTSEngine
 from TTS.GTTS import GTTS
+from TTS.minimax_tts import MiniMaxTTS
 from TTS.openai_tts import OpenAITTS
 from TTS.pyttsx import pyttsx
 from TTS.streamlabs_polly import StreamlabsPolly
@@ -23,6 +24,7 @@ TTSProviders = {
     "pyttsx": pyttsx,
     "ElevenLabs": elevenlabs,
     "OpenAI": OpenAITTS,
+    "MiniMax": MiniMaxTTS,
 }
 
 