From c220409a758aeeeab1f1da20fc0af71901267d57 Mon Sep 17 00:00:00 2001
From: MinhVu2711 <vuquocminh99@gmail.com>
Date: Wed, 13 May 2026 16:50:54 +0000
Subject: [PATCH] feat(TTS): add Crikk TTS integration with voice selection and
 API handling

---
 TTS/Crikk.py             | 117 +++++++++++++++++++++++++++++++++++++++
 TTS/OhFreeMe.py          |   6 +-
 config/crikk_voices.json |  14 +++++
 manual/tts_processor.py  |   4 +-
 manual/video_builder.py  |   3 +-
 manual_main.py           |   4 +-
 6 files changed, 138 insertions(+), 10 deletions(-)
 create mode 100644 TTS/Crikk.py
 create mode 100644 config/crikk_voices.json

diff --git a/TTS/Crikk.py b/TTS/Crikk.py
new file mode 100644
index 0000000..9cea124
--- /dev/null
+++ b/TTS/Crikk.py
@@ -0,0 +1,117 @@
+import base64
+import json
+import os
+import random
+from pathlib import Path
+
+import requests
+from dotenv import load_dotenv
+
+from utils import settings
+from utils.console import print_substep
+
+# Load environment variables from .env file so the CRIKK_* lookups below can see them
+load_dotenv()
+
+CRIKK_API_URL = os.getenv("CRIKK_API_URL", "")  # endpoint the TTS request is POSTed to; "" when unset
+CRIKK_BASE_URL = os.getenv("CRIKK_BASE_URL", "")  # sent as the request's "origin" header; "" when unset
+VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "crikk_voices.json"  # voice catalogue
+
+def _load_voices() -> list[dict]:
+    """Return the voice entries parsed from VOICES_FILE, or [] (with a warning) if unreadable."""
+    try:
+        return json.loads(VOICES_FILE.read_text(encoding="utf-8"))
+    except (FileNotFoundError, json.JSONDecodeError) as err:
+        print_substep(f"Warning: Could not load voices from {VOICES_FILE}: {err}", style="yellow")
+        return []
+
+
+class Crikk:
+    """Crikk TTS engine: picks a voice, calls the HTTP API, writes the audio file."""
+
+    _user_agents = None  # class-wide cache of config/user_agents.json (None = not loaded)
+    request_timeout = 60  # seconds; requests waits forever unless a timeout is passed
+
+    def __init__(self):
+        self.max_chars = 1200
+        self.voices = _load_voices()
+
+    def _load_user_agents(self) -> list[str]:
+        """Return the cached User-Agent list, reading user_agents.json on first use."""
+        if self._user_agents is None:
+            agents_path = Path(__file__).resolve().parent.parent / "config" / "user_agents.json"
+            try:
+                with open(agents_path, "r", encoding="utf-8") as f:
+                    type(self)._user_agents = json.load(f)  # store on the class, not the instance
+            except Exception as e:  # best-effort: warn and fall back to the generic UA
+                print_substep(f"Warning: Could not load user agents: {e}", style="yellow")
+                type(self)._user_agents = []
+        return self._user_agents
+
+    def _pick_user_agent(self) -> str:
+        """Return a random loaded User-Agent, or "Mozilla/5.0" when none are available."""
+        return random.choice(self._load_user_agents() or ["Mozilla/5.0"])
+
+    def run(self, text, filepath, random_voice: bool = False):
+        """Synthesize *text* and write the resulting audio bytes to *filepath*."""
+        voice = self._pick_voice(random_voice)
+        audio_bytes = self._call_api(text, voice["id"])
+        with open(filepath, "wb") as f:
+            f.write(audio_bytes)
+
+    def _choose(self, candidates: list[dict]) -> dict:
+        """Pick randomly from *candidates*, falling back to every loaded voice."""
+        pool = candidates or self.voices
+        if not pool:  # random.choice([]) would raise an opaque IndexError
+            raise RuntimeError(f"No Crikk voices available; check {VOICES_FILE}")
+        return random.choice(pool)
+
+    def randomvoice(self) -> dict:
+        """Return a random voice for the configured language (any voice if none match)."""
+        # NOTE(review): reads the OhFreeMe "ohfreeme_lang" key; confirm the sharing is intended.
+        lang = settings.config["settings"]["tts"].get("ohfreeme_lang", "vi")
+        return self._choose([v for v in self.voices if v.get("lang") == lang])
+
+    def _pick_voice(self, random_voice: bool) -> dict:
+        """Return a voice for the configured language/gender, or randomvoice() if *random_voice*."""
+        if random_voice:
+            return self.randomvoice()
+        tts_cfg = settings.config["settings"]["tts"]
+        lang = tts_cfg.get("ohfreeme_lang", "vi")
+        gender = tts_cfg.get("ohfreeme_gender", "random")
+        candidates = [v for v in self.voices if v.get("lang") == lang]
+        if gender != "random":
+            candidates = [v for v in candidates if v.get("gender") == gender]
+        return self._choose(candidates)
+
+    def _call_api(self, text: str, voice_id: str) -> bytes:
+        """POST *text* to CRIKK_API_URL; return the audio bytes or raise RuntimeError on failure."""
+        payload = {"text": text, "voice": voice_id}
+        headers = {
+            "cache-control": "no-cache",
+            "content-type": "application/json",
+            "accept": "*/*",
+            # Marked "important" by the author; presumably the API rejects non-browser requests.
+            "user-agent": self._pick_user_agent(),
+            "origin": CRIKK_BASE_URL,
+        }
+        resp = requests.post(
+            CRIKK_API_URL, json=payload, headers=headers, timeout=self.request_timeout
+        )
+        data = resp.json()
+        if data.get("message") == "success":
+            print_substep("[Crikk debug] Received")
+            return self._extract_audio(data)
+        raise RuntimeError(f"Crikk TTS failed (HTTP {resp.status_code}): {data.get('message')!r}")
+
+    def _extract_audio(self, data: dict) -> bytes:
+        """Decode the base64 data URI under "audio_data"; raise RuntimeError if absent/malformed."""
+        # Expected format: "data:audio/mpeg;base64,<base64data>"
+        uri = data.get("audio_data")
+        if not uri:
+            raise RuntimeError("Missing 'audio_data' in API response data")
+        prefix, sep, b64_part = uri.partition(";base64,")
+        if not (sep and prefix.startswith("data:")):
+            # Only echo the head of the value: it may be a very large payload.
+            raise RuntimeError(f"Unexpected audio_data format in API response: {uri[:64]!r}")
+        return base64.b64decode(b64_part)
diff --git a/TTS/OhFreeMe.py b/TTS/OhFreeMe.py
index a3ca218..48a41bb 100644
--- a/TTS/OhFreeMe.py
+++ b/TTS/OhFreeMe.py
@@ -19,14 +19,10 @@ OHFREEME_BASE_URL = os.getenv("OHFREEME_BASE_URL", "")
 OHFREEME_JWT_TOKEN = os.getenv("OHFREEME_JWT_TOKEN", "")
 VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "ohfreeme_voices.json"
 MAX_RETRIES = 3
-RATE_LIMIT_WAIT = 10
+RATE_LIMIT_WAIT = 20
 
 
 def _load_voices() -> list[dict]:
-    # existing function unchanged
-
-    # existing function unchanged
-
     try:
         with open(VOICES_FILE, "r", encoding="utf-8") as f:
             return json.load(f)
diff --git a/config/crikk_voices.json b/config/crikk_voices.json
new file mode 100644
index 0000000..19e30ff
--- /dev/null
+++ b/config/crikk_voices.json
@@ -0,0 +1,14 @@
+[
+  {
+    "id": "vi-VN-NamMinhNeural",
+    "name": "NamMinh",
+    "gender": "male",
+    "lang": "vi"
+  },
+  {
+    "id": "vi-VN-HoaiMyNeural",
+    "name": "HoaiMy",
+    "gender": "female",
+    "lang": "vi"
+  }
+]
\ No newline at end of file
diff --git a/manual/tts_processor.py b/manual/tts_processor.py
index 8d7281e..df750fc 100644
--- a/manual/tts_processor.py
+++ b/manual/tts_processor.py
@@ -155,6 +155,7 @@ class ManualTTSProcessor:
         """
         from TTS.GTTS import GTTS
         from TTS.OhFreeMe import OhFreeMe
+        from TTS.Crikk import Crikk
         from TTS.TikTok import TikTok
         from TTS.aws_polly import AWSPolly
         from TTS.elevenlabs import elevenlabs
@@ -165,6 +166,7 @@ class ManualTTSProcessor:
         providers = {
             "googletranslate": GTTS,
             "ohfreeme": OhFreeMe,
+            "crikk": Crikk,
             "awspolly": AWSPolly,
             "streamlabspolly": StreamlabsPolly,
             "tiktok": TikTok,
@@ -181,7 +183,7 @@ class ManualTTSProcessor:
                 f"Unknown TTS provider: {voice_choice}. Falling back to GoogleTranslate.",
                 style="yellow",
             )
-            engine_class = GTTS
+            engine_class = Crikk
 
         print_substep(f"Using TTS engine: {engine_class.__name__}")
         return engine_class()
diff --git a/manual/video_builder.py b/manual/video_builder.py
index 6ad5f7e..2bad286 100644
--- a/manual/video_builder.py
+++ b/manual/video_builder.py
@@ -11,7 +11,6 @@ Uses libx264 encoder (CPU-based) by default.
 
 import math
 import multiprocessing
-import os
 import re
 import tempfile
 import threading
@@ -140,7 +139,7 @@ class ManualVideoBuilder:
         total_duration = sum(s["audio_duration"] for s in clips)
         video_length = math.ceil(total_duration)
 
-        console.log(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
+        print_substep(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
 
         # Ensure temp directory exists
         self.temp_dir.mkdir(parents=True, exist_ok=True)
diff --git a/manual_main.py b/manual_main.py
index 74dd68b..d609a9a 100644
--- a/manual_main.py
+++ b/manual_main.py
@@ -43,7 +43,7 @@ MANUAL_DEFAULTS = {
     "encoder": "libx264",
     "resolution_w": 1080,
     "resolution_h": 1920,
-    "opacity": 0.9,
+    "opacity": 1,
     "background_video": "random",
     "background_audio": "random",
     "background_video_dir": "assets/backgrounds/video",
@@ -103,7 +103,7 @@ _BASE_SETTINGS_DEFAULTS = {
             "background_thumbnail_font_color": "255,255,255",
         },
         "tts": {
-            "voice_choice": "ohfreeme",
+            "voice_choice": "crikk",
             "random_voice": False,
             "elevenlabs_voice_name": "Bella",
             "elevenlabs_api_key": "",