From c220409a758aeeeab1f1da20fc0af71901267d57 Mon Sep 17 00:00:00 2001
From: MinhVu2711
Date: Wed, 13 May 2026 16:50:54 +0000
Subject: [PATCH] feat(TTS): add Crikk TTS integration with voice selection and API handling

---
 TTS/Crikk.py             | 139 +++++++++++++++++++++++++++++++++++++++
 TTS/OhFreeMe.py          |   6 +-
 config/crikk_voices.json |  14 +++++
 manual/tts_processor.py  |   6 +-
 manual/video_builder.py  |   3 +-
 manual_main.py           |   4 +-
 6 files changed, 161 insertions(+), 11 deletions(-)
 create mode 100644 TTS/Crikk.py
 create mode 100644 config/crikk_voices.json

diff --git a/TTS/Crikk.py b/TTS/Crikk.py
new file mode 100644
index 0000000..9cea124
--- /dev/null
+++ b/TTS/Crikk.py
@@ -0,0 +1,139 @@
+import base64
+import json
+import os
+import random
+from pathlib import Path
+
+import requests
+from dotenv import load_dotenv
+
+from utils import settings
+from utils.console import print_substep
+
+# Load environment variables from .env file
+load_dotenv()
+
+CRIKK_API_URL = os.getenv("CRIKK_API_URL", "")
+CRIKK_BASE_URL = os.getenv("CRIKK_BASE_URL", "")
+CONFIG_DIR = Path(__file__).resolve().parent.parent / "config"
+VOICES_FILE = CONFIG_DIR / "crikk_voices.json"
+USER_AGENTS_FILE = CONFIG_DIR / "user_agents.json"
+# Seconds to wait for the API; without a timeout requests can block forever.
+REQUEST_TIMEOUT = 60
+
+
+def _load_voices() -> list[dict]:
+    """Return the voices from VOICES_FILE, or [] if it cannot be read."""
+    try:
+        with open(VOICES_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError) as e:
+        print_substep(f"Warning: Could not load voices from {VOICES_FILE}: {e}", style="yellow")
+        return []
+
+
+class Crikk:
+    """Text-to-speech engine backed by the Crikk HTTP API."""
+
+    # User-Agent strings, loaded lazily and shared by every instance
+    _user_agents = None
+
+    def __init__(self):
+        self.max_chars = 1200
+        self.voices = _load_voices()
+
+    def _load_user_agents(self) -> list[str]:
+        """Read user_agents.json and cache the result on the class.
+
+        Returns an empty list on failure and logs a warning.
+        """
+        cls = type(self)
+        if cls._user_agents is None:
+            try:
+                with open(USER_AGENTS_FILE, "r", encoding="utf-8") as f:
+                    cls._user_agents = json.load(f)
+            except (OSError, ValueError) as e:
+                print_substep(f"Warning: Could not load user agents: {e}", style="yellow")
+                cls._user_agents = []
+        return cls._user_agents
+
+    def _pick_user_agent(self) -> str:
+        """Return a random User-Agent, or a generic one if none are loaded."""
+        agents = self._load_user_agents()
+        if agents:
+            return random.choice(agents)
+        return "Mozilla/5.0"
+
+    def run(self, text, filepath, random_voice: bool = False):
+        """Synthesize text and write the audio bytes to filepath."""
+        voice = self._pick_voice(random_voice)
+        audio_bytes = self._call_api(text, voice["id"])
+        with open(filepath, "wb") as f:
+            f.write(audio_bytes)
+
+    def randomvoice(self) -> dict:
+        """Return a random voice for the configured language."""
+        return self._choose_voice("random")
+
+    def _pick_voice(self, random_voice: bool) -> dict:
+        """Return a voice, honouring the configured gender unless random_voice."""
+        if random_voice:
+            return self.randomvoice()
+        gender = settings.config["settings"]["tts"].get("ohfreeme_gender", "random")
+        return self._choose_voice(gender)
+
+    def _choose_voice(self, gender: str) -> dict:
+        """Pick a random voice by language and gender, or any voice if none match."""
+        if not self.voices:
+            raise RuntimeError(f"No Crikk voices available; check {VOICES_FILE}")
+        # NOTE: the language/gender settings are shared with the OhFreeMe engine.
+        lang = settings.config["settings"]["tts"].get("ohfreeme_lang", "vi")
+        candidates = [v for v in self.voices if v["lang"] == lang]
+        if gender != "random":
+            candidates = [v for v in candidates if v["gender"] == gender]
+        return random.choice(candidates or self.voices)
+
+    def _call_api(self, text: str, voice_id: str) -> bytes:
+        """POST text to the Crikk API and return the decoded audio bytes.
+
+        Raises RuntimeError when the API is unconfigured, unreachable, or
+        returns anything other than a success payload.
+        """
+        if not CRIKK_API_URL:
+            raise RuntimeError("CRIKK_API_URL is not set")
+        payload = {
+            "text": text,
+            "voice": voice_id,
+        }
+        headers = {
+            "cache-control": "no-cache",
+            "content-type": "application/json",
+            "accept": "*/*",
+            "user-agent": self._pick_user_agent(),  # important
+            "origin": CRIKK_BASE_URL,  # important
+        }
+
+        try:
+            resp = requests.post(
+                CRIKK_API_URL, json=payload, headers=headers, timeout=REQUEST_TIMEOUT
+            )
+            data = resp.json()
+        except (requests.RequestException, ValueError) as e:
+            raise RuntimeError(f"Crikk TTS request failed: {e}") from e
+
+        if not isinstance(data, dict) or data.get("message") != "success":
+            raise RuntimeError(f"Crikk TTS failed (HTTP {resp.status_code}): {resp.text[:200]}")
+        print_substep("[Crikk debug] Received")
+        return self._extract_audio(data)
+
+    def _extract_audio(self, data: dict) -> bytes:
+        """Decode the base64 data URI stored under "audio_data"."""
+        # Expected format: "data:audio/mpeg;base64,<payload>"
+        uri = data.get("audio_data")
+        if not isinstance(uri, str) or not uri:
+            raise RuntimeError("Missing 'audio_data' in API response data")
+        if not (uri.startswith("data:") and ";base64," in uri):
+            # Truncate: a malformed payload may still be very large.
+            raise RuntimeError(f"Unexpected audio_data format in API response: {uri[:64]}")
+        b64_part = uri.split(";base64,", 1)[1]
+        return base64.b64decode(b64_part)
diff --git a/TTS/OhFreeMe.py b/TTS/OhFreeMe.py
index a3ca218..48a41bb 100644
--- a/TTS/OhFreeMe.py
+++ b/TTS/OhFreeMe.py
@@ -19,14 +19,10 @@ OHFREEME_BASE_URL = os.getenv("OHFREEME_BASE_URL", "")
 OHFREEME_JWT_TOKEN = os.getenv("OHFREEME_JWT_TOKEN", "")
 VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "ohfreeme_voices.json"
 MAX_RETRIES = 3
-RATE_LIMIT_WAIT = 10
+RATE_LIMIT_WAIT = 20
 
 
 def _load_voices() -> list[dict]:
-    # existing function unchanged
-
-    # existing function unchanged
-
     try:
         with open(VOICES_FILE, "r", encoding="utf-8") as f:
             return json.load(f)
diff --git a/config/crikk_voices.json b/config/crikk_voices.json
new file mode 100644
index 0000000..19e30ff
--- /dev/null
+++ b/config/crikk_voices.json
@@ -0,0 +1,14 @@
+[
+  {
+    "id": "vi-VN-NamMinhNeural",
+    "name": "NamMinh",
+    "gender": "male",
+    "lang": "vi"
+  },
+  {
+    "id": "vi-VN-HoaiMyNeural",
+    "name": "HoaiMy",
+    "gender": "female",
+    "lang": "vi"
+  }
+]
\ No newline at end of file
diff --git a/manual/tts_processor.py b/manual/tts_processor.py
index 8d7281e..df750fc 100644
--- a/manual/tts_processor.py
+++ b/manual/tts_processor.py
@@ -155,6 +155,7 @@ class ManualTTSProcessor:
         """
         from TTS.GTTS import GTTS
         from TTS.OhFreeMe import OhFreeMe
+        from TTS.Crikk import Crikk
         from TTS.TikTok import TikTok
         from TTS.aws_polly import AWSPolly
         from TTS.elevenlabs import elevenlabs
@@ -165,6 +166,7 @@ class ManualTTSProcessor:
         providers = {
             "googletranslate": GTTS,
             "ohfreeme": OhFreeMe,
+            "crikk": Crikk,
             "awspolly": AWSPolly,
             "streamlabspolly": StreamlabsPolly,
             "tiktok": TikTok,
@@ -181,7 +183,7 @@ class ManualTTSProcessor:
-                f"Unknown TTS provider: {voice_choice}. Falling back to GoogleTranslate.",
+                f"Unknown TTS provider: {voice_choice}. Falling back to Crikk.",
                 style="yellow",
             )
-            engine_class = GTTS
+            engine_class = Crikk
 
         print_substep(f"Using TTS engine: {engine_class.__name__}")
         return engine_class()
diff --git a/manual/video_builder.py b/manual/video_builder.py
index 6ad5f7e..2bad286 100644
--- a/manual/video_builder.py
+++ b/manual/video_builder.py
@@ -11,7 +11,6 @@ Uses libx264 encoder (CPU-based) by default.
 
 import math
 import multiprocessing
-import os
 import re
 import tempfile
 import threading
@@ -140,7 +139,7 @@ class ManualVideoBuilder:
 
         total_duration = sum(s["audio_duration"] for s in clips)
         video_length = math.ceil(total_duration)
-        console.log(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
+        print_substep(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
 
         # Ensure temp directory exists
         self.temp_dir.mkdir(parents=True, exist_ok=True)
diff --git a/manual_main.py b/manual_main.py
index 74dd68b..d609a9a 100644
--- a/manual_main.py
+++ b/manual_main.py
@@ -43,7 +43,7 @@ MANUAL_DEFAULTS = {
     "encoder": "libx264",
     "resolution_w": 1080,
     "resolution_h": 1920,
-    "opacity": 0.9,
+    "opacity": 1,
     "background_video": "random",
     "background_audio": "random",
     "background_video_dir": "assets/backgrounds/video",
@@ -103,7 +103,7 @@ _BASE_SETTINGS_DEFAULTS = {
             "background_thumbnail_font_color": "255,255,255",
         },
         "tts": {
-            "voice_choice": "ohfreeme",
+            "voice_choice": "crikk",
             "random_voice": False,
             "elevenlabs_voice_name": "Bella",
             "elevenlabs_api_key": "",