feat(TTS): add Crikk TTS integration with voice selection and API handling

2 months ago · c220409a75
parent 16de343e0b
commit c220409a75
6 changed files with 138 additions and 10 deletions
--- a/TTS/Crikk.py
+++ b/TTS/Crikk.py
@ -0,0 +1,117 @@
+import base64
+import json
+import os
+import random
+from pathlib import Path
+
+import requests
+from dotenv import load_dotenv
+
+from utils import settings
+from utils.console import print_substep
+
+# Load environment variables from .env file
+load_dotenv()
+
+CRIKK_API_URL = os.getenv("CRIKK_API_URL", "")
+CRIKK_BASE_URL = os.getenv("CRIKK_BASE_URL", "")
+VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "crikk_voices.json"
+
+def _load_voices() -> list[dict]:
+    """Read the Crikk voice catalogue from ``VOICES_FILE``.
+
+    Returns:
+        The parsed JSON content of the file, or an empty list (after
+        printing a warning) when the file is missing or is not valid JSON.
+    """
+    try:
+        raw_text = VOICES_FILE.read_text(encoding="utf-8")
+        catalogue = json.loads(raw_text)
+    except (FileNotFoundError, json.JSONDecodeError) as e:
+        print_substep(f"Warning: Could not load voices from {VOICES_FILE}: {e}", style="yellow")
+        catalogue = []
+    return catalogue
+
+
+class Crikk:
+    """Text-to-speech engine that posts text to the Crikk API endpoint.
+
+    The endpoint is read from ``CRIKK_API_URL``; a successful response is a
+    JSON object whose ``audio_data`` field holds a base64 data URI, which is
+    decoded and written to the requested file.
+    """
+
+    # Seconds to wait for the API. Without a timeout ``requests`` blocks
+    # forever on a stalled connection.
+    request_timeout = 60
+
+    # Cache of User-Agent strings shared by all instances (None = not loaded).
+    _user_agents = None
+
+    def __init__(self):
+        self.max_chars = 1200
+        self.voices = _load_voices()
+
+    def run(self, text, filepath, random_voice: bool = False):
+        """Synthesize ``text`` and write the resulting audio to ``filepath``.
+
+        Args:
+            text: Text to convert to speech.
+            filepath: Destination path; opened in binary write mode.
+            random_voice: When true, ignore the configured gender and pick
+                any voice of the configured language.
+
+        Raises:
+            RuntimeError: If no voice is available or the API call fails.
+        """
+        voice = self._pick_voice(random_voice)
+        audio_bytes = self._call_api(text, voice["id"])
+        with open(filepath, "wb") as f:
+            f.write(audio_bytes)
+
+    def randomvoice(self) -> dict:
+        """Return a random voice of the configured language.
+
+        Falls back to the whole catalogue when no voice matches the language.
+
+        Raises:
+            RuntimeError: If the voice catalogue is empty.
+        """
+        return random.choice(self._language_voices() or self.voices)
+
+    def _language_voices(self) -> list[dict]:
+        """Return the voices whose ``lang`` equals the configured language.
+
+        The result may be empty; callers decide on the fallback.
+
+        Raises:
+            RuntimeError: If the voice catalogue itself is empty, so that the
+                caller does not hit an opaque ``IndexError`` in
+                ``random.choice``.
+        """
+        if not self.voices:
+            raise RuntimeError(f"No Crikk voices available; check {VOICES_FILE}")
+        # NOTE(review): these are the "ohfreeme_*" settings keys, reused for
+        # Crikk; confirm whether dedicated "crikk_*" keys were intended.
+        lang = settings.config["settings"]["tts"].get("ohfreeme_lang", "vi")
+        return [v for v in self.voices if v.get("lang") == lang]
+
+    def _pick_voice(self, random_voice: bool) -> dict:
+        """Select a voice according to the language/gender settings.
+
+        Args:
+            random_voice: When true, delegate to :meth:`randomvoice`.
+
+        Returns:
+            A voice matching the configured language (and gender, unless it
+            is ``"random"``); any voice from the catalogue when nothing
+            matches.
+
+        Raises:
+            RuntimeError: If the voice catalogue is empty.
+        """
+        if random_voice:
+            return self.randomvoice()
+        candidates = self._language_voices()
+        gender = settings.config["settings"]["tts"].get("ohfreeme_gender", "random")
+        if gender != "random":
+            candidates = [v for v in candidates if v.get("gender") == gender]
+        return random.choice(candidates or self.voices)
+
+    def _load_user_agents(self) -> list[str]:
+        """Read ``config/user_agents.json`` once and cache the result.
+
+        Returns:
+            The list of User-Agent strings; an empty list (after printing a
+            warning) when the file cannot be read or parsed.
+        """
+        if self._user_agents is not None:
+            return self._user_agents
+        agents_path = Path(__file__).resolve().parent.parent / "config" / "user_agents.json"
+        try:
+            with open(agents_path, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+        except (OSError, ValueError) as e:
+            print_substep(f"Warning: Could not load user agents: {e}", style="yellow")
+            loaded = []
+        if not isinstance(loaded, list):
+            loaded = []
+        # Store on the class, not the instance, so the file is read only once
+        # no matter how many engines are created.
+        type(self)._user_agents = [ua for ua in loaded if isinstance(ua, str) and ua]
+        return self._user_agents
+
+    def _pick_user_agent(self) -> str:
+        """Return a random User-Agent string, or a generic one if none loaded."""
+        agents = self._load_user_agents()
+        if agents:
+            return random.choice(agents)
+        return "Mozilla/5.0"
+
+    def _call_api(self, text: str, voice_id: str) -> bytes:
+        """Send ``text`` to the API and return the decoded audio bytes.
+
+        Args:
+            text: Text to synthesize.
+            voice_id: The ``id`` of the selected voice entry.
+
+        Raises:
+            RuntimeError: If the endpoint is not configured, the response is
+                not JSON, or the API does not report success.
+            requests.RequestException: On network errors or timeout.
+        """
+        if not CRIKK_API_URL:
+            raise RuntimeError("CRIKK_API_URL is not set; configure it in the environment or .env file")
+        payload = {
+            "text": text,
+            "voice": voice_id,
+        }
+        headers = {
+            "cache-control": "no-cache",
+            "content-type": "application/json",
+            "accept": "*/*",
+            "user-agent": self._pick_user_agent(),  # important
+            "origin": CRIKK_BASE_URL,  # important
+        }
+
+        resp = requests.post(
+            CRIKK_API_URL, json=payload, headers=headers, timeout=self.request_timeout
+        )
+        try:
+            data = resp.json()
+        except ValueError as err:
+            raise RuntimeError(
+                f"Crikk TTS failed: HTTP {resp.status_code}, response is not valid JSON"
+            ) from err
+
+        if isinstance(data, dict) and data.get("message") == "success":
+            print_substep("[Crikk debug] Received")
+            return self._extract_audio(data)
+
+        detail = data.get("message") if isinstance(data, dict) else None
+        raise RuntimeError(f"Crikk TTS failed: HTTP {resp.status_code}, message={detail!r}")
+
+    def _extract_audio(self, data: dict) -> bytes:
+        """Decode the base64 data URI stored under ``audio_data``.
+
+        Expected format: ``data:audio/mpeg;base64,<base64data>``.
+
+        Raises:
+            RuntimeError: If the field is missing, is not a base64 data URI,
+                or cannot be decoded.
+        """
+        url = data.get("audio_data")
+        if not url:
+            raise RuntimeError("Missing 'audio_data' in API response data")
+        if not (isinstance(url, str) and url.startswith("data:") and ";base64," in url):
+            # Only echo a short prefix: the value may be a very large payload.
+            raise RuntimeError(
+                f"Unexpected 'audio_data' format in API response: {str(url)[:64]!r}"
+            )
+        b64_part = url.split(";base64,", 1)[1]
+        try:
+            return base64.b64decode(b64_part)
+        except ValueError as err:
+            raise RuntimeError("Could not decode base64 audio in API response") from err
--- a/TTS/OhFreeMe.py
+++ b/TTS/OhFreeMe.py
@ -19,14 +19,10 @@ OHFREEME_BASE_URL = os.getenv("OHFREEME_BASE_URL", "")
 OHFREEME_JWT_TOKEN = os.getenv("OHFREEME_JWT_TOKEN", "")
 VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "ohfreeme_voices.json"
 MAX_RETRIES = 3
-RATE_LIMIT_WAIT = 10
+RATE_LIMIT_WAIT = 20


 def _load_voices() -> list[dict]:
-    # existing function unchanged
-
-    # existing function unchanged
-
    try:
        with open(VOICES_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
--- a/config/crikk_voices.json
+++ b/config/crikk_voices.json
@ -0,0 +1,14 @@
+[
+  {
+    "id": "vi-VN-NamMinhNeural",
+    "name": "NamMinh",
+    "gender": "male",
+    "lang": "vi"
+  },
+  {
+    "id": "vi-VN-HoaiMyNeural",
+    "name": "HoaiMy",
+    "gender": "female",
+    "lang": "vi"
+  }
+]
--- a/manual/tts_processor.py
+++ b/manual/tts_processor.py
@ -155,6 +155,7 @@ class ManualTTSProcessor:
        """
        from TTS.GTTS import GTTS
        from TTS.OhFreeMe import OhFreeMe
+        from TTS.Crikk import Crikk
        from TTS.TikTok import TikTok
        from TTS.aws_polly import AWSPolly
        from TTS.elevenlabs import elevenlabs
@ -165,6 +166,7 @@ class ManualTTSProcessor:
        providers = {
            "googletranslate": GTTS,
            "ohfreeme": OhFreeMe,
+            "crikk": Crikk,
            "awspolly": AWSPolly,
            "streamlabspolly": StreamlabsPolly,
            "tiktok": TikTok,
@ -181,7 +183,7 @@ class ManualTTSProcessor:
                f"Unknown TTS provider: {voice_choice}. Falling back to GoogleTranslate.",
                style="yellow",
            )
-            engine_class = GTTS
+            engine_class = Crikk

        print_substep(f"Using TTS engine: {engine_class.__name__}")
        return engine_class()
--- a/manual/video_builder.py
+++ b/manual/video_builder.py
@ -11,7 +11,6 @@ Uses libx264 encoder (CPU-based) by default.

 import math
 import multiprocessing
-import os
 import re
 import tempfile
 import threading
@ -140,7 +139,7 @@ class ManualVideoBuilder:
        total_duration = sum(s["audio_duration"] for s in clips)
        video_length = math.ceil(total_duration)

-        console.log(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
+        print_substep(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")

        # Ensure temp directory exists
        self.temp_dir.mkdir(parents=True, exist_ok=True)
--- a/manual_main.py
+++ b/manual_main.py
@ -43,7 +43,7 @@ MANUAL_DEFAULTS = {
    "encoder": "libx264",
    "resolution_w": 1080,
    "resolution_h": 1920,
-    "opacity": 0.9,
+    "opacity": 1,
    "background_video": "random",
    "background_audio": "random",
    "background_video_dir": "assets/backgrounds/video",
@ -103,7 +103,7 @@ _BASE_SETTINGS_DEFAULTS = {
            "background_thumbnail_font_color": "255,255,255",
        },
        "tts": {
-            "voice_choice": "ohfreeme",
+            "voice_choice": "crikk",
            "random_voice": False,
            "elevenlabs_voice_name": "Bella",
            "elevenlabs_api_key": "",