Fix and improve TikTok TTS (#1271)

* feat: tiktok sessionId can be specified in the config.toml

* feat: tiktok sessionId can be specified in the config.toml

* Various improvements and optimizations

* Add default argument

* Remove an used variable

* Code reformatted with black

* Fixed all problems pointed out by pylint

* Update TTS/TikTok.py

* Apply suggestions from code review

Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com>

* chore: add default value for tiktok_voice

Co-authored-by: Jose Collado <jose@collado.pw>
Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com>
Co-authored-by: Callum Leslie <github@cleslie.uk>
Co-authored-by: Callum Leslie <git@cleslie.uk>
pull/1423/head
John Toniutti 2 years ago committed by GitHub
parent f2e8e67a78
commit ee6363cd1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -369,6 +369,19 @@
</div>
</div>
</div>
<div class="row mb-2">
<label for="tiktok_sessionid" class="col-4">TikTok SessionId</label>
<div class="col-8">
<div class="input-group">
<div class="input-group-text">
<i class="bi bi-mic-fill"></i>
</div>
<input value="{{ data.tiktok_sessionid }}" name="tiktok_sessionid" type="text" class="form-control"
data-toggle="tooltip"
data-original-title="TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it.">
</div>
</div>
</div>
<div class="row mb-2">
<label for="python_voice" class="col-4">Python Voice</label>
<div class="col-8">

@ -1,26 +1,28 @@
# documentation for tiktok api: https://github.com/oscie57/tiktok-voice/wiki
import base64
import random
import time
from typing import Optional, Final
import requests
from requests.adapters import HTTPAdapter, Retry
from utils import settings
# from profanity_filter import ProfanityFilter
# pf = ProfanityFilter()
# Code by @JasonLovesDoggo
# https://twitter.com/scanlime/status/1512598559769702406
__all__ = ["TikTok", "TikTokTTSException"]
nonhuman = [ # DISNEY VOICES
disney_voices: Final[tuple] = (
"en_us_ghostface", # Ghost Face
"en_us_chewbacca", # Chewbacca
"en_us_c3po", # C3PO
"en_us_stitch", # Stitch
"en_us_stormtrooper", # Stormtrooper
"en_us_rocket", # Rocket
# ENGLISH VOICES
]
human = [
"en_female_madam_leota", # Madame Leota
"en_male_ghosthost", # Ghost Host
"en_male_pirate", # pirate
)
eng_voices: Final[tuple] = (
"en_au_001", # English AU - Female
"en_au_002", # English AU - Male
"en_uk_001", # English UK - Male 1
@ -30,23 +32,28 @@ human = [
"en_us_006", # English US - Male 1
"en_us_007", # English US - Male 2
"en_us_009", # English US - Male 3
"en_us_010",
]
voices = nonhuman + human
"en_us_010", # English US - Male 4
"en_male_narration", # Narrator
"en_male_funny", # Funny
"en_female_emotional", # Peaceful
"en_male_cody", # Serious
)
noneng = [
non_eng_voices: Final[tuple] = (
# Western European voices
"fr_001", # French - Male 1
"fr_002", # French - Male 2
"de_001", # German - Female
"de_002", # German - Male
"es_002", # Spanish - Male
# AMERICA VOICES
"it_male_m18" # Italian - Male
# South american voices
"es_mx_002", # Spanish MX - Male
"br_001", # Portuguese BR - Female 1
"br_003", # Portuguese BR - Female 2
"br_004", # Portuguese BR - Female 3
"br_005", # Portuguese BR - Male
# ASIA VOICES
# asian voices
"id_001", # Indonesian - Female
"jp_001", # Japanese - Female 1
"jp_003", # Japanese - Female 2
@ -55,51 +62,97 @@ noneng = [
"kr_002", # Korean - Male 1
"kr_003", # Korean - Female
"kr_004", # Korean - Male 2
]
)
# good_voices = {'good': ['en_us_002', 'en_us_006'],
# 'ok': ['en_au_002', 'en_uk_001']} # less en_us_stormtrooper more less en_us_rocket en_us_ghostface
vocals: Final[tuple] = (
"en_female_f08_salut_damour", # Alto
"en_male_m03_lobby", # Tenor
"en_male_m03_sunshine_soon", # Sunshine Soon
"en_female_f08_warmy_breeze", # Warmy Breeze
"en_female_ht_f08_glorious", # Glorious
"en_male_sing_funny_it_goes_up", # It Goes Up
"en_male_m2_xhxs_m03_silly", # Chipmunk
"en_female_ht_f08_wonderful_world", # Dramatic
)
class TikTok: # TikTok Text-to-Speech Wrapper
class TikTok:
"""TikTok Text-to-Speech Wrapper"""
def __init__(self):
self.URI_BASE = "https://api16-normal-useast5.us.tiktokv.com/media/api/text/speech/invoke/?text_speaker="
headers = {
"User-Agent": "com.zhiliaoapp.musically/2022600030 (Linux; U; Android 7.1.2; es_ES; SM-G988N; "
"Build/NRD90M;tt-ok/3.12.13.1)",
"Cookie": f"sessionid={settings.config['settings']['tts']['tiktok_sessionid']}",
}
self.URI_BASE = "https://api16-normal-c-useast1a.tiktokv.com/media/api/text/speech/invoke/"
self.max_chars = 300
self.voices = {"human": human, "nonhuman": nonhuman, "noneng": noneng}
def run(self, text, filepath, random_voice: bool = False):
# if censor:
# req_text = pf.censor(req_text)
# pass
voice = (
self.randomvoice()
if random_voice
else (
settings.config["settings"]["tts"]["tiktok_voice"]
or random.choice(self.voices["human"])
)
)
try:
r = requests.post(
f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
)
except requests.exceptions.SSLError:
# https://stackoverflow.com/a/47475019/18516611
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
r = session.post(
f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
)
# print(r.text)
vstr = [r.json()["data"]["v_str"]][0]
b64d = base64.b64decode(vstr)
self._session = requests.Session()
# set the headers to the session, so we don't have to do it for every request
self._session.headers = headers
def run(self, text: str, filepath: str, random_voice: bool = False):
if random_voice:
voice = self.random_voice()
else:
# if tiktok_voice is not set in the config file, then use a random voice
voice = settings.config["settings"]["tts"].get("tiktok_voice", None)
# get the audio from the TikTok API
data = self.get_voices(voice=voice, text=text)
# check if there was an error in the request
status_code = data["status_code"]
if status_code != 0:
raise TikTokTTSException(status_code, data["message"])
# decode data from base64 to binary
raw_voices = data["data"]["v_str"]
decoded_voices = base64.b64decode(raw_voices)
# write voices to specified filepath
with open(filepath, "wb") as out:
out.write(b64d)
out.write(decoded_voices)
def get_voices(self, text: str, voice: Optional[str] = None) -> dict:
"""If voice is not passed, the API will try to use the most fitting voice"""
# sanitize text
text = text.replace("+", "plus").replace("&", "and").replace("r/", "")
# prepare url request
params = {"req_text": text, "speaker_map_type": 0, "aid": 1233}
if voice is not None:
params["text_speaker"] = voice
# send request
try:
response = self._session.post(self.URI_BASE, params=params)
except ConnectionError:
time.sleep(random.randrange(1, 7))
response = self._session.post(self.URI_BASE, params=params)
return response.json()
@staticmethod
def random_voice():
return random.choice(eng_voices)
class TikTokTTSException(Exception):
def __init__(self, code: int, message: str):
self._code = code
self._message = message
def __str__(self) -> str:
if self._code == 1:
return f"Code: {self._code}, reason: probably the aid value isn't correct, message: {self._message}"
if self._code == 2:
return f"Code: {self._code}, reason: the text is too long, message: {self._message}"
if self._code == 4:
return f"Code: {self._code}, reason: the speaker doesn't exist, message: {self._message}"
def randomvoice(self):
return random.choice(self.voices["human"])
return f"Code: {self._message}, reason: unknown, message: {self._message}"

@ -43,11 +43,12 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }
[settings.tts]
voice_choice = { optional = false, default = "googletranslate", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
aws_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
streamlabs_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
tiktok_voice = { optional = true, default = "en_us_006", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
python_voice = { optional = true, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
py_voice_num = { optional = true, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
voice_choice = { optional = false, default = "tiktok", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
aws_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
streamlabs_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
tiktok_voice = { optional = true, default = "en_us_001", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
tiktok_sessionid = { optional = true, example = "c76bcc3a7625abcc27b508c7db457ff1", explanation = "TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it." }
python_voice = { optional = false, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
Loading…
Cancel
Save