Fix and improve TikTok TTS (#1271)

* feat: tiktok sessionId can be specified in the config.toml * feat: tiktok sessionId can be specified in the config.toml * Various improvements and optimizations * Add default argument * Remove an unused variable * Code reformatted with black * Fixed all problems pointed out by pylint * Update TTS/TikTok.py * Apply suggestions from code review Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com> * chore: add default value for tiktok_voice Co-authored-by: Jose Collado <jose@collado.pw> Co-authored-by: Simon <65854503+OpenSourceSimon@users.noreply.github.com> Co-authored-by: Callum Leslie <github@cleslie.uk> Co-authored-by: Callum Leslie <git@cleslie.uk>
3 years ago · ee6363cd1e
parent f2e8e67a78
commit ee6363cd1e
3 changed files with 128 additions and 61 deletions
--- a/GUI/settings.html
+++ b/GUI/settings.html
@ -369,6 +369,19 @@
                    </div>
                </div>
            </div>
+            <!-- TikTok session id used for the TTS API request; the input id matches the label's "for" -->
+            <div class="row mb-2">
+                <label for="tiktok_sessionid" class="col-4">TikTok SessionId</label>
+                <div class="col-8">
+                    <div class="input-group">
+                        <div class="input-group-text">
+                            <i class="bi bi-mic-fill"></i>
+                        </div>
+                        <input value="{{ data.tiktok_sessionid }}" id="tiktok_sessionid" name="tiktok_sessionid" type="text" class="form-control"
+                            data-toggle="tooltip"
+                            data-original-title="TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it.">
+                    </div>
+                </div>
+            </div>
            <div class="row mb-2">
                <label for="python_voice" class="col-4">Python Voice</label>
                <div class="col-8">
--- a/TTS/TikTok.py
+++ b/TTS/TikTok.py
@ -1,26 +1,28 @@
+# documentation for tiktok api: https://github.com/oscie57/tiktok-voice/wiki
 import base64
 import random
+import time
+from typing import Optional, Final

 import requests
-from requests.adapters import HTTPAdapter, Retry

 from utils import settings

-# from profanity_filter import ProfanityFilter
-# pf = ProfanityFilter()
-# Code by @JasonLovesDoggo
-# https://twitter.com/scanlime/status/1512598559769702406
+__all__ = ["TikTok", "TikTokTTSException"]

-nonhuman = [  # DISNEY VOICES
+disney_voices: Final[tuple] = (
    "en_us_ghostface",  # Ghost Face
    "en_us_chewbacca",  # Chewbacca
    "en_us_c3po",  # C3PO
    "en_us_stitch",  # Stitch
    "en_us_stormtrooper",  # Stormtrooper
    "en_us_rocket",  # Rocket
-    # ENGLISH VOICES
-]
-human = [
+    "en_female_madam_leota",  # Madame Leota
+    "en_male_ghosthost",  # Ghost Host
+    "en_male_pirate",  # pirate
+)
+
+eng_voices: Final[tuple] = (
    "en_au_001",  # English AU - Female
    "en_au_002",  # English AU - Male
    "en_uk_001",  # English UK - Male 1
@ -30,23 +32,28 @@ human = [
    "en_us_006",  # English US - Male 1
    "en_us_007",  # English US - Male 2
    "en_us_009",  # English US - Male 3
-    "en_us_010",
-]
-voices = nonhuman + human
+    "en_us_010",  # English US - Male 4
+    "en_male_narration",  # Narrator
+    "en_male_funny",  # Funny
+    "en_female_emotional",  # Peaceful
+    "en_male_cody",  # Serious
+)

-noneng = [
+# Voice ids for languages other than English, grouped by region.
+# Every entry must end with a comma: adjacent string literals are silently concatenated.
+non_eng_voices: Final[tuple] = (
+    # Western European voices
    "fr_001",  # French - Male 1
    "fr_002",  # French - Male 2
    "de_001",  # German - Female
    "de_002",  # German - Male
    "es_002",  # Spanish - Male
-    # AMERICA VOICES
+    "it_male_m18",  # Italian - Male
+    # Latin American voices
    "es_mx_002",  # Spanish MX - Male
    "br_001",  # Portuguese BR - Female 1
    "br_003",  # Portuguese BR - Female 2
    "br_004",  # Portuguese BR - Female 3
    "br_005",  # Portuguese BR - Male
-    # ASIA VOICES
+    # Asian voices
    "id_001",  # Indonesian - Female
    "jp_001",  # Japanese - Female 1
    "jp_003",  # Japanese - Female 2
@ -55,51 +62,97 @@ noneng = [
    "kr_002",  # Korean - Male 1
    "kr_003",  # Korean - Female
    "kr_004",  # Korean - Male 2
-]
-
+)

-# good_voices = {'good': ['en_us_002', 'en_us_006'],
-#               'ok': ['en_au_002', 'en_uk_001']}  # less en_us_stormtrooper more less en_us_rocket en_us_ghostface
+vocals: Final[tuple] = (
+    "en_female_f08_salut_damour",  # Alto
+    "en_male_m03_lobby",  # Tenor
+    "en_male_m03_sunshine_soon",  # Sunshine Soon
+    "en_female_f08_warmy_breeze",  # Warmy Breeze
+    "en_female_ht_f08_glorious",  # Glorious
+    "en_male_sing_funny_it_goes_up",  # It Goes Up
+    "en_male_m2_xhxs_m03_silly",  # Chipmunk
+    "en_female_ht_f08_wonderful_world",  # Dramatic
+)


-class TikTok:  # TikTok Text-to-Speech Wrapper
+class TikTok:
+    """TikTok Text-to-Speech Wrapper"""
    def __init__(self):
-        self.URI_BASE = "https://api16-normal-useast5.us.tiktokv.com/media/api/text/speech/invoke/?text_speaker="
+        headers = {
+            "User-Agent": "com.zhiliaoapp.musically/2022600030 (Linux; U; Android 7.1.2; es_ES; SM-G988N; "
+            "Build/NRD90M;tt-ok/3.12.13.1)",
+            "Cookie": f"sessionid={settings.config['settings']['tts']['tiktok_sessionid']}",
+        }
+        
+        self.URI_BASE = "https://api16-normal-c-useast1a.tiktokv.com/media/api/text/speech/invoke/"
        self.max_chars = 300
-        self.voices = {"human": human, "nonhuman": nonhuman, "noneng": noneng}
-
-    def run(self, text, filepath, random_voice: bool = False):
-        # if censor:
-        #     req_text = pf.censor(req_text)
-        #     pass
-        voice = (
-            self.randomvoice()
-            if random_voice
-            else (
-                settings.config["settings"]["tts"]["tiktok_voice"]
-                or random.choice(self.voices["human"])
-            )
-        )
-        try:
-            r = requests.post(
-                f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
-            )
-        except requests.exceptions.SSLError:
-            # https://stackoverflow.com/a/47475019/18516611
-            session = requests.Session()
-            retry = Retry(connect=3, backoff_factor=0.5)
-            adapter = HTTPAdapter(max_retries=retry)
-            session.mount("http://", adapter)
-            session.mount("https://", adapter)
-            r = session.post(
-                f"{self.URI_BASE}{voice}&req_text={text}&speaker_map_type=0"
-            )
-        # print(r.text)
-        vstr = [r.json()["data"]["v_str"]][0]
-        b64d = base64.b64decode(vstr)

+        self._session = requests.Session()
+        # set the headers to the session, so we don't have to do it for every request
+        self._session.headers = headers
+
+    def run(self, text: str, filepath: str, random_voice: bool = False) -> None:
+        """Synthesize ``text`` through the TikTok API and write the audio to ``filepath``.
+
+        Args:
+            text: the text to convert to speech.
+            filepath: destination file; it is opened in binary write mode.
+            random_voice: when true, pick a voice with ``random_voice()`` instead of
+                reading ``tiktok_voice`` from the configuration.
+
+        Raises:
+            TikTokTTSException: if the API response carries a non-zero ``status_code``.
+        """
+        if random_voice:
+            voice = self.random_voice()
+        else:
+            # a missing or empty tiktok_voice becomes None, so no text_speaker is sent
+            # and the API chooses the voice itself
+            voice = settings.config["settings"]["tts"].get("tiktok_voice") or None
+
+        # get the audio from the TikTok API
+        data = self.get_voices(voice=voice, text=text)
+
+        # check if there was an error in the request
+        status_code = data["status_code"]
+        if status_code != 0:
+            raise TikTokTTSException(status_code, data["message"])
+
+        # decode data from base64 to binary
+        raw_voices = data["data"]["v_str"]
+        decoded_voices = base64.b64decode(raw_voices)
+
+        # write voices to specified filepath
        with open(filepath, "wb") as out:
-            out.write(b64d)
+            out.write(decoded_voices)
+
+    def get_voices(self, text: str, voice: Optional[str] = None) -> dict:
+        """Request speech for ``text`` from the TikTok API and return the decoded JSON body.
+
+        If voice is not passed, the API will try to use the most fitting voice.
+
+        Args:
+            text: the text to synthesize; "+", "&" and "r/" are rewritten before sending.
+            voice: value for the ``text_speaker`` parameter, omitted when ``None``.
+
+        Returns:
+            The response parsed with ``response.json()``.
+        """
+        # sanitize text
+        text = text.replace("+", "plus").replace("&", "and").replace("r/", "")
+
+        # prepare url request
+        params = {"req_text": text, "speaker_map_type": 0, "aid": 1233}
+
+        if voice is not None:
+            params["text_speaker"] = voice
+
+        # send request
+        try:
+            response = self._session.post(self.URI_BASE, params=params)
+        except requests.exceptions.ConnectionError:
+            # requests raises its own ConnectionError, which is not a subclass of the
+            # builtin one; wait a few seconds and retry once
+            time.sleep(random.randrange(1, 7))
+            response = self._session.post(self.URI_BASE, params=params)
+
+        return response.json()
+
+    @staticmethod
+    def random_voice() -> str:
+        """Return a voice id chosen at random from ``eng_voices``."""
+        return random.choice(eng_voices)
+
+
+class TikTokTTSException(Exception):
+    """Error built from the status code and message of a TikTok TTS API response.
+
+    Args:
+        code: the ``status_code`` value returned by the API.
+        message: the ``message`` value returned by the API.
+    """
+
+    def __init__(self, code: int, message: str):
+        # keeps ``args`` as (code, message)
+        super().__init__(code, message)
+        self._code = code
+        self._message = message
+
+    def __str__(self) -> str:
+        if self._code == 1:
+            return f"Code: {self._code}, reason: probably the aid value isn't correct, message: {self._message}"
+
+        if self._code == 2:
+            return f"Code: {self._code}, reason: the text is too long, message: {self._message}"
+
+        if self._code == 4:
+            return f"Code: {self._code}, reason: the speaker doesn't exist, message: {self._message}"

-    def randomvoice(self):
-        return random.choice(self.voices["human"])
+        # any other status: show the numeric code, not the message, in the "Code:" slot
+        return f"Code: {self._code}, reason: unknown, message: {self._message}"
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -43,11 +43,12 @@ background_thumbnail_font_size = { optional = true, type = "int", default = 96,
 background_thumbnail_font_color = { optional = true, default = "255,255,255", example = "255,255,255", explanation = "Font color in RGB format for the thumbnail text" }

 [settings.tts]
-voice_choice = { optional = false, default = "googletranslate", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
-aws_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
-streamlabs_polly_voice = { optional = true, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
-tiktok_voice = { optional = true, default = "en_us_006", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
-python_voice = { optional = true, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
-py_voice_num = { optional = true, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
+voice_choice = { optional = false, default = "tiktok", options = ["streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", ], example = "tiktok", explanation = "The voice platform used for TTS generation. This can be left blank and you will be prompted to choose at runtime." }
+aws_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
+streamlabs_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
+tiktok_voice = { optional = true, default = "en_us_001", example = "en_us_006", explanation = "The voice used for TikTok TTS" }
+tiktok_sessionid = { optional = true, example = "c76bcc3a7625abcc27b508c7db457ff1", explanation = "TikTok sessionid needed for the TTS API request. Check documentation if you don't know how to obtain it." }
+python_voice = { optional = false, default = "1", example = "1", explanation = "The index of the system tts voices (can be downloaded externally, run ptt.py to find value, start from zero)" }
+py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
 silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
 no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }