Fix and improve TikTok TTS (#1271)

* feat: tiktok sessionId can be specified in the config.toml

* feat: tiktok sessionId can be specified in the config.toml

* Various improvements and optimizations

* Add default argument

* Remove an used variable

* Code reformatted with black

* Fixed all problems pointed out by pylint

* Update TTS/TikTok.py

* Apply suggestions from code review

Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com>

* chore: add default value for tiktok_voice

Co-authored-by: Jose Collado <jose@collado.pw>
Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com>
Co-authored-by: Callum Leslie <github@cleslie.uk>
Co-authored-by: Callum Leslie <git@cleslie.uk>
pull/1423/head
John Toniutti 3 years ago committed by GitHub
parent f2e8e67a78
commit ee6363cd1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -369,6 +369,19 @@
</div> </div>
</div> </div>
</div> </div>
<div class="row mb-2">
<label for="tiktok_sessionid" class="col-4">TikTok SessionId</label>
<div class="col-8">
<div class="input-group">
<div class="input-group-text">
<i class="bi bi-mic-fill"></i>
</div>
<input value="{{ data.tiktok_sessionid }}" name="tiktok_sessionid" type="text" class="form-control"
data-toggle="tooltip"
data-original-title="TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it.">
</div>
</div>
</div>
<div class="row mb-2"> <div class="row mb-2">
<label for="python_voice" class="col-4">Python Voice</label> <label for="python_voice" class="col-4">Python Voice</label>
<div class="col-8"> <div class="col-8">

@ -1,26 +1,28 @@
# documentation for tiktok api: https://github.com/oscie57/tiktok-voice/wiki
import base64 import base64
import random import random
import time
from typing import Optional, Final
import requests import requests
from requests.adapters import HTTPAdapter, Retry
from utils import settings from utils import settings
# from profanity_filter import ProfanityFilter __all__ = ["TikTok", "TikTokTTSException"]
# pf = ProfanityFilter()
# Code by @JasonLovesDoggo
# https://twitter.com/scanlime/status/1512598559769702406
nonhuman = [ # DISNEY VOICES disney_voices: Final[tuple] = (
"en_us_ghostface", # Ghost Face "en_us_ghostface", # Ghost Face
"en_us_chewbacca", # Chewbacca "en_us_chewbacca", # Chewbacca
"en_us_c3po", # C3PO "en_us_c3po", # C3PO
"en_us_stitch", # Stitch "en_us_stitch", # Stitch
"en_us_stormtrooper", # Stormtrooper "en_us_stormtrooper", # Stormtrooper
"en_us_rocket", # Rocket "en_us_rocket", # Rocket
# ENGLISH VOICES "en_female_madam_leota", # Madame Leota
] "en_male_ghosthost", # Ghost Host
human = [ "en_male_pirate", # pirate
)
eng_voices: Final[tuple] = (
"en_au_001", # English AU - Female "en_au_001", # English AU - Female
"en_au_002", # English AU - Male "en_au_002", # English AU - Male
"en_uk_001", # English UK - Male 1 "en_uk_001", # English UK - Male 1
@ -30,23 +32,28 @@ human = [
"en_us_006", # English US - Male 1 "en_us_006", # English US - Male 1
"en_us_007", # English US - Male 2 "en_us_007", # English US - Male 2
"en_us_009", # English US - Male 3 "en_us_009", # English US - Male 3
"en_us_010", "en_us_010", # English US - Male 4
] "en_male_narration", # Narrator
voices = nonhuman + human "en_male_funny", # Funny
"en_female_emotional", # Peaceful
"en_male_cody", # Serious
)
noneng = [ non_eng_voices: Final[tuple] = (
# Western European voices
"fr_001", # French - Male 1 "fr_001", # French - Male 1
"fr_002", # French - Male 2 "fr_002", # French - Male 2
"de_001", # German - Female "de_001", # German - Female
"de_002", # German - Male "de_002", # German - Male
"es_002", # Spanish - Male "es_002", # Spanish - Male
# AMERICA VOICES "it_male_m18" # Italian - Male
# South american voices
"es_mx_002", # Spanish MX - Male "es_mx_002", # Spanish MX - Male
"br_001", # Portuguese BR - Female 1 "br_001", # Portuguese BR - Female 1
"br_003", # Portuguese BR - Female 2 "br_003", # Portuguese BR - Female 2
"br_004", # Portuguese BR - Female 3 "br_004", # Portuguese BR - Female 3
"br_005", # Portuguese BR - Male "br_005", # Portuguese BR - Male
# ASIA VOICES # asian voices
"id_001", # Indonesian - Female "id_001", # Indonesian - Female
"jp_001", # Japanese - Female 1 "jp_001", # Japanese - Female 1
"jp_003", # Japanese - Female 2 "jp_003", # Japanese - Female 2
@ -55,51 +62,97 @@ noneng = [
"kr_002", # Korean - Male 1 "kr_002", # Korean - Male 1
"kr_003", # Korean - Female "kr_003", # Korean - Female
"kr_004", # Korean - Male 2 "kr_004", # Korean - Male 2
] )
# good_voices = {'good': ['en_us_002', 'en_us_006'], vocals: Final[tuple] = (
# 'ok': ['en_au_002', 'en_uk_001']} # less en_us_stormtrooper more less en_us_rocket en_us_ghostface "en_female_f08_salut_damour", # Alto
"en_male_m03_lobby", # Tenor
"en_male_m03_sunshine_soon", # Sunshine Soon
"en_female_f08_warmy_breeze", # Warmy Breeze
"en_female_ht_f08_glorious", # Glorious
"en_male_sing_funny_it_goes_up", # It Goes Up
"en_male_m2_xhxs_m03_silly", # Chipmunk
"en_female_ht_f08_wonderful_world", # Dramatic
)
class TikTok: # TikTok Text-to-Speech Wrapper class TikTok:
"""TikTok Text-to-Speech Wrapper"""
def __init__(self): def __init__(self):
self.URI_BASE = "https://api16-normal-useast5.us.tiktokv.com/media/api/text/speech/invoke/?text_speaker=" headers = {
"User-Agent": "com.zhiliaoapp.musically/2022600030 (Linux; U; Android 7.1.2; es_ES; SM-G988N; "
"Build/NRD90M;tt-ok/3.12.13.1)",
"Cookie": f"sessionid={settings.config['settings']['tts']['tiktok_sessionid']}",
}
self.URI_BASE = "https://api16-normal-c-useast1a.tiktokv.com/media/api/text/speech/invoke/"
self.max_chars = 300 self.max_chars = 300
self.voices = {"human": human, "nonhuman": nonhuman, "noneng": noneng}
def run(self, text, filepath, random_voice: bool = False):
# if censor:
# req_text = pf.censor(req_text)
# pass
voice = (
self.randomvoice()
if random_voice
else (
settings.config["settings"]["tts"]["tiktok_voice"]
or random.choice(self.voices["human"])
)
)
try:
r = requests.post(
f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
)
except requests.exceptions.SSLError:
# https://stackoverflow.com/a/47475019/18516611
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
r = session.post(
f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
)
# print(r.text)
vstr = [r.json()["data"]["v_str"]][0]
b64d = base64.b64decode(vstr)
self._session = requests.Session()
# set the headers to the session, so we don't have to do it for every request
self._session.headers = headers
def run(self, text: str, filepath: str, random_voice: bool = False):
if random_voice:
voice = self.random_voice()
else:
# if tiktok_voice is not set in the config file, then use a random voice
voice = settings.config["settings"]["tts"].get("tiktok_voice", None)
# get the audio from the TikTok API
data = self.get_voices(voice=voice, text=text)
# check if there was an error in the request
status_code = data["status_code"]
if status_code != 0:
raise TikTokTTSException(status_code, data["message"])
# decode data from base64 to binary
raw_voices = data["data"]["v_str"]
decoded_voices = base64.b64decode(raw_voices)
# write voices to specified filepath
with open(filepath, "wb") as out: with open(filepath, "wb") as out:
out.write(b64d) out.write(decoded_voices)
def get_voices(self, text: str, voice: Optional[str] = None) -> dict:
"""If voice is not passed, the API will try to use the most fitting voice"""
# sanitize text
text = text.replace("+", "plus").replace("&", "and").replace("r/", "")
# prepare url request
params = {"req_text": text, "speaker_map_type": 0, "aid": 1233}
if voice is not None:
params["text_speaker"] = voice
# send request
try:
response = self._session.post(self.URI_BASE, params=params)
except ConnectionError:
time.sleep(random.randrange(1, 7))
response = self._session.post(self.URI_BASE, params=params)
return response.json()
@staticmethod
def random_voice():
return random.choice(eng_voices)
class TikTokTTSException(Exception):
def __init__(self, code: int, message: str):
self._code = code
self._message = message
def __str__(self) -> str:
if self._code == 1:
return f"Code: {self._code}, reason: probably the aid value isn't correct, message: {self._message}"
if self._code == 2:
return f"Code: {self._code}, reason: the text is too long, message: {self._message}"
if self._code == 4:
return f"Code: {self._code}, reason: the speaker doesn't exist, message: {self._message}"
def randomvoice(self): return f"Code: {self._message}, reason: unknown, message: {self._message}"
return random.choice(self.voices["human"])

@ -43,11 +43,12 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" } background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }
[settings.tts] [settings.tts]
voice_choice = { optional = false, default = "googletranslate", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." } voice_choice = { optional = false, default = "tiktok", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
aws_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" } aws_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
streamlabs_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" } streamlabs_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
tiktok_voice = { optional = true, default = "en_us_006", example = "en_us_006", explanation = "The voice used for TikTok TTS" } tiktok_voice = { optional = true, default = "en_us_001", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
python_voice = { optional = true, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" } tiktok_sessionid = { optional = true, example = "c76bcc3a7625abcc27b508c7db457ff1", explanation = "TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it." }
py_voice_num = { optional = true, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" } python_voice = { optional = false, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" } silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" } no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
Loading…
Cancel
Save