feat(TTS): add Crikk TTS integration with voice selection and API handling

pull/2558/head
MinhVu2711 1 month ago
parent 16de343e0b
commit c220409a75

@ -0,0 +1,117 @@
import base64
import json
import os
import random
from pathlib import Path
import requests
from dotenv import load_dotenv
from utils import settings
from utils.console import print_substep
# Load environment variables from .env file
load_dotenv()
CRIKK_API_URL = os.getenv("CRIKK_API_URL", "")
CRIKK_BASE_URL = os.getenv("CRIKK_BASE_URL", "")
VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "crikk_voices.json"
def _load_voices() -> list[dict]:
try:
with open(VOICES_FILE, "r", encoding="utf-8") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
print_substep(f"Warning: Could not load voices from {VOICES_FILE}: {e}", style="yellow")
return []
class Crikk:
# Load list of UserAgent strings for random header
_user_agents = None
def _load_user_agents(self) -> list[str]:
"""Read user_agents.json and cache result.
Returns empty list on failure and logs warning.
"""
if self._user_agents is not None:
return self._user_agents
try:
agents_path = Path(__file__).resolve().parent.parent / "config" / "user_agents.json"
with open(agents_path, "r", encoding="utf-8") as f:
self._user_agents = json.load(f)
except Exception as e:
print_substep(f"Warning: Could not load user agents: {e}", style="yellow")
self._user_agents = []
return self._user_agents
def _pick_user_agent(self) -> str:
"""Return a random UserAgent string from loaded list.
Falls back to generic UA on empty list.
"""
agents = self._load_user_agents()
if agents:
return random.choice(agents)
return "Mozilla/5.0"
def __init__(self):
self.max_chars = 1200
self.voices = _load_voices()
def run(self, text, filepath, random_voice: bool = False):
voice = self._pick_voice(random_voice)
audio_bytes = self._call_api(text, voice["id"])
with open(filepath, "wb") as f:
f.write(audio_bytes)
def randomvoice(self) -> dict:
lang = settings.config["settings"]["tts"].get("ohfreeme_lang", "vi")
filtered = [v for v in self.voices if v["lang"] == lang]
if not filtered:
filtered = self.voices
return random.choice(filtered)
def _pick_voice(self, random_voice: bool) -> dict:
if random_voice:
return self.randomvoice()
lang = settings.config["settings"]["tts"].get("ohfreeme_lang", "vi")
gender = settings.config["settings"]["tts"].get("ohfreeme_gender", "random")
candidates = [v for v in self.voices if v["lang"] == lang]
if gender != "random":
candidates = [v for v in candidates if v["gender"] == gender]
if not candidates:
candidates = self.voices
return random.choice(candidates)
def _call_api(self, text: str, voice_id: int) -> bytes:
payload = {
"text": text,
"voice": voice_id,
}
headers = {
"cache-control": "no-cache",
"content-type": "application/json",
"accept": "*/*",
"user-agent": self._pick_user_agent(), # important
"origin": CRIKK_BASE_URL, # important
}
resp = requests.post(CRIKK_API_URL, json=payload, headers=headers)
data = resp.json()
if data.get("message") == "success":
print_substep(f"[Crikk debug] Received")
return self._extract_audio(data)
raise RuntimeError(f"Crikk TTS failed")
def _extract_audio(self, data: dict) -> bytes:
# Expecting a dict with a "url" field containing a data URI
url = data.get("audio_data")
if not url:
raise RuntimeError("Missing 'url' in API response data")
# url format: "data:audio/mpeg;base64,<base64data>"
if not (url.startswith("data:") and ";base64," in url):
raise RuntimeError(f"Unexpected URL format in API response: {url}")
b64_part = url.split(";base64,", 1)[1]
return base64.b64decode(b64_part)

@ -19,14 +19,10 @@ OHFREEME_BASE_URL = os.getenv("OHFREEME_BASE_URL", "")
OHFREEME_JWT_TOKEN = os.getenv("OHFREEME_JWT_TOKEN", "")
VOICES_FILE = Path(__file__).resolve().parent.parent / "config" / "ohfreeme_voices.json"
MAX_RETRIES = 3
RATE_LIMIT_WAIT = 10
RATE_LIMIT_WAIT = 20
def _load_voices() -> list[dict]:
# existing function unchanged
# existing function unchanged
try:
with open(VOICES_FILE, "r", encoding="utf-8") as f:
return json.load(f)

@ -0,0 +1,14 @@
[
{
"id": "vi-VN-NamMinhNeural",
"name": "NamMinh",
"gender": "male",
"lang": "vi"
},
{
"id": "vi-VN-HoaiMyNeural",
"name": "HoaiMy",
"gender": "female",
"lang": "vi"
}
]

@ -155,6 +155,7 @@ class ManualTTSProcessor:
"""
from TTS.GTTS import GTTS
from TTS.OhFreeMe import OhFreeMe
from TTS.Crikk import Crikk
from TTS.TikTok import TikTok
from TTS.aws_polly import AWSPolly
from TTS.elevenlabs import elevenlabs
@ -165,6 +166,7 @@ class ManualTTSProcessor:
providers = {
"googletranslate": GTTS,
"ohfreeme": OhFreeMe,
"crikk": Crikk,
"awspolly": AWSPolly,
"streamlabspolly": StreamlabsPolly,
"tiktok": TikTok,
@ -181,7 +183,7 @@ class ManualTTSProcessor:
f"Unknown TTS provider: {voice_choice}. Falling back to GoogleTranslate.",
style="yellow",
)
engine_class = GTTS
engine_class = Crikk
print_substep(f"Using TTS engine: {engine_class.__name__}")
return engine_class()

@ -11,7 +11,6 @@ Uses libx264 encoder (CPU-based) by default.
import math
import multiprocessing
import os
import re
import tempfile
import threading
@ -140,7 +139,7 @@ class ManualVideoBuilder:
total_duration = sum(s["audio_duration"] for s in clips)
video_length = math.ceil(total_duration)
console.log(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
print_substep(f"[bold green] Video will be: {video_length} seconds long ({len(clips)} clips)")
# Ensure temp directory exists
self.temp_dir.mkdir(parents=True, exist_ok=True)

@ -43,7 +43,7 @@ MANUAL_DEFAULTS = {
"encoder": "libx264",
"resolution_w": 1080,
"resolution_h": 1920,
"opacity": 0.9,
"opacity": 1,
"background_video": "random",
"background_audio": "random",
"background_video_dir": "assets/backgrounds/video",
@ -103,7 +103,7 @@ _BASE_SETTINGS_DEFAULTS = {
"background_thumbnail_font_color": "255,255,255",
},
"tts": {
"voice_choice": "ohfreeme",
"voice_choice": "crikk",
"random_voice": False,
"elevenlabs_voice_name": "Bella",
"elevenlabs_api_key": "",

Loading…
Cancel
Save