Made it so TikTok TTS can handle text longer than 300 characters

1 year ago · aebf2262c9
parent 340762e1b6
commit aebf2262c9
2 changed files with 133 additions and 123 deletions
--- a/TTS/TikTok.py
+++ b/TTS/TikTok.py
@@ -1,80 +1,79 @@
-# documentation for tiktok api: https://github.com/oscie57/tiktok-voice/wiki
-import base64
 import random
 import time
 from typing import Optional, Final
-
-import requests
+import requests, base64, re, sys
+from threading import Thread
+from playsound import playsound

 from utils import settings

-__all__ = ["TikTok", "TikTokTTSException"]
-
-disney_voices: Final[tuple] = (
-    "en_us_ghostface",  # Ghost Face
-    "en_us_chewbacca",  # Chewbacca
-    "en_us_c3po",  # C3PO
-    "en_us_stitch",  # Stitch
-    "en_us_stormtrooper",  # Stormtrooper
-    "en_us_rocket",  # Rocket
-    "en_female_madam_leota",  # Madame Leota
-    "en_male_ghosthost",  # Ghost Host
-    "en_male_pirate",  # pirate
-)
-
-eng_voices: Final[tuple] = (
-    "en_au_001",  # English AU - Female
-    "en_au_002",  # English AU - Male
-    "en_uk_001",  # English UK - Male 1
-    "en_uk_003",  # English UK - Male 2
-    "en_us_001",  # English US - Female (Int. 1)
-    "en_us_002",  # English US - Female (Int. 2)
-    "en_us_006",  # English US - Male 1
-    "en_us_007",  # English US - Male 2
-    "en_us_009",  # English US - Male 3
-    "en_us_010",  # English US - Male 4
-    "en_male_narration",  # Narrator
-    "en_male_funny",  # Funny
-    "en_female_emotional",  # Peaceful
-    "en_male_cody",  # Serious
-)
-
-non_eng_voices: Final[tuple] = (
-    # Western European voices
-    "fr_001",  # French - Male 1
-    "fr_002",  # French - Male 2
-    "de_001",  # German - Female
-    "de_002",  # German - Male
-    "es_002",  # Spanish - Male
-    "it_male_m18",  # Italian - Male
-    # South american voices
-    "es_mx_002",  # Spanish MX - Male
-    "br_001",  # Portuguese BR - Female 1
-    "br_003",  # Portuguese BR - Female 2
-    "br_004",  # Portuguese BR - Female 3
-    "br_005",  # Portuguese BR - Male
-    # asian voices
-    "id_001",  # Indonesian - Female
-    "jp_001",  # Japanese - Female 1
-    "jp_003",  # Japanese - Female 2
-    "jp_005",  # Japanese - Female 3
-    "jp_006",  # Japanese - Male
-    "kr_002",  # Korean - Male 1
-    "kr_003",  # Korean - Female
-    "kr_004",  # Korean - Male 2
-)
-
-vocals: Final[tuple] = (
-    "en_female_f08_salut_damour",  # Alto
-    "en_male_m03_lobby",  # Tenor
-    "en_male_m03_sunshine_soon",  # Sunshine Soon
-    "en_female_f08_warmy_breeze",  # Warmy Breeze
-    "en_female_ht_f08_glorious",  # Glorious
-    "en_male_sing_funny_it_goes_up",  # It Goes Up
-    "en_male_m2_xhxs_m03_silly",  # Chipmunk
-    "en_female_ht_f08_wonderful_world",  # Dramatic
-)
-
+# define the endpoint data with URLs and corresponding response keys
+ENDPOINT_DATA = [
+    {
+        "url": "https://tiktok-tts.weilnet.workers.dev/api/generation",
+        "response": "data"
+    },
+    {
+        "url": "https://countik.com/api/text/speech",
+        "response": "v_data"
+    },
+    {
+        "url": "https://gesserit.co/api/tiktok-tts",
+        "response": "base64"
+    }
+]
+
+# define available voices for text-to-speech conversion
+VOICES = [
+    # DISNEY VOICES
+    'en_us_ghostface',            # Ghost Face
+    'en_us_chewbacca',            # Chewbacca
+    'en_us_c3po',                 # C3PO
+    'en_us_stitch',               # Stitch
+    'en_us_stormtrooper',         # Stormtrooper
+    'en_us_rocket',               # Rocket
+    # ENGLISH VOICES
+    'en_au_001',                  # English AU - Female
+    'en_au_002',                  # English AU - Male
+    'en_uk_001',                  # English UK - Male 1
+    'en_uk_003',                  # English UK - Male 2
+    'en_us_001',                  # English US - Female (Int. 1)
+    'en_us_002',                  # English US - Female (Int. 2)
+    'en_us_006',                  # English US - Male 1
+    'en_us_007',                  # English US - Male 2
+    'en_us_009',                  # English US - Male 3
+    'en_us_010',                  # English US - Male 4
+    # EUROPE VOICES
+    'fr_001',                     # French - Male 1
+    'fr_002',                     # French - Male 2
+    'de_001',                     # German - Female
+    'de_002',                     # German - Male
+    'es_002',                     # Spanish - Male
+    # AMERICA VOICES
+    'es_mx_002',                  # Spanish MX - Male
+    'br_001',                     # Portuguese BR - Female 1
+    'br_003',                     # Portuguese BR - Female 2
+    'br_004',                     # Portuguese BR - Female 3
+    'br_005',                     # Portuguese BR - Male
+    # ASIA VOICES
+    'id_001',                     # Indonesian - Female
+    'jp_001',                     # Japanese - Female 1
+    'jp_003',                     # Japanese - Female 2
+    'jp_005',                     # Japanese - Female 3
+    'jp_006',                     # Japanese - Male
+    'kr_002',                     # Korean - Male 1
+    'kr_003',                     # Korean - Female
+    'kr_004',                     # Korean - Male 2
+    # SINGING VOICES
+    'en_female_f08_salut_damour',  # Alto
+    'en_male_m03_lobby',           # Tenor
+    'en_female_f08_warmy_breeze',  # Warmy Breeze
+    'en_male_m03_sunshine_soon',   # Sunshine Soon
+    # OTHER
+    'en_male_narration',           # narrator
+    'en_male_funny',               # wacky
+    'en_female_emotional',         # peaceful
+]

 class TikTok:
    """TikTok Text-to-Speech Wrapper"""
@@ -90,76 +89,86 @@ class TikTok:
        self.max_chars = 200

        self._session = requests.Session()
-        # set the headers to the session, so we don't have to do it for every request
        self._session.headers = headers

-    def run(self, text: str, filepath: str, random_voice: bool = False):
+    def run(self, text: str, filepath: str, random_voice: bool = False, play_sound: bool = False):
        if random_voice:
            voice = self.random_voice()
        else:
-            # if tiktok_voice is not set in the config file, then use a random voice
            voice = settings.config["settings"]["tts"].get("tiktok_voice", None)

-        # get the audio from the TikTok API
-        data = self.get_voices(voice=voice, text=text)
+        chunks = self._split_text(text)

-        # check if there was an error in the request
-        status_code = data["status_code"]
-        if status_code != 0:
-            raise TikTokTTSException(status_code, data["message"])
+        for entry in ENDPOINT_DATA:
+            endpoint_valid = True
+            audio_data = ["" for _ in range(len(chunks))]

-        # decode data from base64 to binary
-        try:
-            raw_voices = data["data"]["v_str"]
-        except:
-            print(
-                "The TikTok TTS returned an invalid response. Please try again later, and report this bug."
-            )
-            raise TikTokTTSException(0, "Invalid response")
-        decoded_voices = base64.b64decode(raw_voices)
+            def generate_audio_chunk(index: int, chunk: str) -> None:
+                nonlocal endpoint_valid

-        # write voices to specified filepath
-        with open(filepath, "wb") as out:
-            out.write(decoded_voices)
+                if not endpoint_valid:
+                    return

-    def get_voices(self, text: str, voice: Optional[str] = None) -> dict:
-        """If voice is not passed, the API will try to use the most fitting voice"""
-        # sanitize text
-        text = text.replace("+", "plus").replace("&", "and").replace("r/", "")
+                try:
+                    response = requests.post(
+                        entry["url"],
+                        json={
+                            "text": chunk,
+                            "voice": voice
+                        }
+                    )

-        # prepare url request
-        params = {"req_text": text, "speaker_map_type": 0, "aid": 1233}
+                    if response.status_code == 200:
+                        audio_data[index] = response.json()[entry["response"]]
+                    else:
+                        endpoint_valid = False

-        if voice is not None:
-            params["text_speaker"] = voice
+                except requests.RequestException as e:
+                    print(f"Error: {e}")
+                    sys.exit()

-        # send request
-        try:
-            response = self._session.post(self.URI_BASE, params=params)
-        except ConnectionError:
-            time.sleep(random.randrange(1, 7))
-            response = self._session.post(self.URI_BASE, params=params)
+            threads = []
+            for index, chunk in enumerate(chunks):
+                thread = Thread(target=generate_audio_chunk, args=(index, chunk))
+                threads.append(thread)
+                thread.start()

-        return response.json()
+            for thread in threads:
+                thread.join()

-    @staticmethod
-    def random_voice() -> str:
-        return random.choice(eng_voices)
+            if not endpoint_valid:
+                continue
+
+            audio_bytes = base64.b64decode("".join(audio_data))

+            with open(filepath, "wb") as file:
+                file.write(audio_bytes)
+                print(f"File '{filepath}' has been generated successfully.")

-class TikTokTTSException(Exception):
-    def __init__(self, code: int, message: str):
-        self._code = code
-        self._message = message
+            if play_sound:
+                playsound(filepath)

-    def __str__(self) -> str:
-        if self._code == 1:
-            return f"Code: {self._code}, reason: probably the aid value isn't correct, message: {self._message}"
+            break

-        if self._code == 2:
-            return f"Code: {self._code}, reason: the text is too long, message: {self._message}"
+    def _split_text(self, text: str) -> list[str]:
+        merged_chunks: list[str] = []
+        seperated_chunks: list[str] = re.findall(r'.*?[.,!?:;-]|.+', text)

-        if self._code == 4:
-            return f"Code: {self._code}, reason: the speaker doesn't exist, message: {self._message}"
+        for i, chunk in enumerate(seperated_chunks):
+            if len(chunk) > 300:
+                seperated_chunks[i:i+1] = re.findall(r'.*?[ ]|.+', chunk)

-        return f"Code: {self._message}, reason: unknown, message: {self._message}"
+        merged_chunk = ""
+        for seperated_chunk in seperated_chunks:
+            if len(merged_chunk) + len(seperated_chunk) <= 300:
+                merged_chunk += seperated_chunk
+            else:
+                merged_chunks.append(merged_chunk)
+                merged_chunk = seperated_chunk
+
+        merged_chunks.append(merged_chunk)
+        return merged_chunks
+
+    @staticmethod
+    def random_voice() -> str:
+        return random.choice(VOICES)
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,4 @@ transformers==4.29.2
 ffmpeg-python==0.2.0
 elevenlabs==0.2.17
 yt-dlp==2023.7.6
+playsound==1.2.2