diff --git a/TTS/POLLY.py b/TTS/POLLY.py
new file mode 100644
index 0000000..79a6f91
--- /dev/null
+++ b/TTS/POLLY.py
@@ -0,0 +1,151 @@
+import os
+import random
+import re
+
+import requests
+import sox
+from moviepy.audio.AudioClip import concatenate_audioclips, CompositeAudioClip
+from moviepy.audio.io.AudioFileClip import AudioFileClip
+
+voices = {
+    'neural': [
+        'Ivy',
+        'Joanna',
+        'Kendra',
+        'Kimberly',
+        'Salli',
+        'Joey',
+        'Justin',
+        'Matthew',
+        'Amy',
+        'Emma',
+        'Brian',
+    ],
+    'standard': [
+        'Ivy',
+        'Joanna',
+        'Kendra',
+        'Kimberly',
+        'Salli',
+        'Joey',
+        'Justin',
+        'Matthew',
+        'Russell',
+        'Nicole',
+        'Amy',
+        'Emma',
+        'Brian',
+        'Aditi',
+        'Raveena',
+        'Geraint',
+    ],
+}
+
+
+# valid voices https://lazypy.ro/tts/
+
+
+class POLLY:
+    """Text-to-speech backend that posts text to the Streamlabs Polly endpoint."""
+
+    def __init__(self):
+        self.url = 'https://streamlabs.com/polly/speak'
+
+    def tts(
+        self,
+        req_text: str = "Amazon Text To Speech",
+        filename: str = "title.mp3",
+        random_speaker=False,
+        censer=False,
+    ):
+        """
+        Synthesize ``req_text`` and write the resulting audio to ``filename``.
+
+        The voice is picked at random when ``random_speaker`` is true,
+        otherwise it is read from the VOICE environment variable. When the
+        service answers 'Text length is too long!' the text is split into
+        chunks, each chunk is saved next to ``filename`` with ``-0``, ``-1``,
+        ... inserted before the extension, and the chunks are then joined.
+
+        ``censer`` is accepted but not used by this backend.
+
+        Raises ValueError if no voice is configured, and RuntimeError if the
+        service reply contains no audio URL.
+        """
+        if random_speaker:
+            voice = self.randomvoice()
+        else:
+            voice = os.getenv('VOICE')
+            if not voice:
+                # Raise so callers never continue without an audio file.
+                raise ValueError(
+                    'Please set the environment variable VOICE to a valid voice. options are: {}'.format(voices)
+                )
+
+        reply = self._request(voice, req_text)
+        if 'speak_url' in reply or reply.get('error') != 'Text length is too long!':
+            self._save(reply, filename)
+            return
+
+        # NOTE(review): '.' does not match newlines and every chunk must end
+        # in '.' or at the end of the text, so spans the pattern cannot match
+        # are skipped -- confirm this is acceptable for multi-line input.
+        chunks = [
+            m.group().strip() for m in re.finditer(r" *((.{0,299})(\.|.$))", req_text)
+        ]
+        if not chunks:
+            raise RuntimeError('Polly request failed: {}'.format(reply['error']))
+
+        root, ext = os.path.splitext(filename)
+        audio_clips = []
+        for chunk_id, chunk in enumerate(chunks):
+            # Reuse the selected voice so every chunk sounds the same.
+            chunk_path = f"{root}-{chunk_id}{ext}"
+            self._save(self._request(voice, chunk), chunk_path)
+            audio_clips.append(chunk_path)
+        self._join(audio_clips, filename)
+
+    def _request(self, voice, text):
+        """POST one synthesis request and return the service's decoded JSON reply."""
+        return requests.post(self.url, data={'voice': voice, 'text': text}).json()
+
+    def _save(self, reply, path):
+        """Download the audio that ``reply['speak_url']`` points at into ``path``."""
+        if 'speak_url' not in reply:
+            # Surface the service's own message instead of silently writing nothing.
+            raise RuntimeError('Polly request failed: {}'.format(reply.get('error', reply)))
+        audio = requests.get(reply['speak_url'])
+        with open(path, 'wb') as out:
+            out.write(audio.content)
+
+    def _join(self, audio_clips, filename):
+        """Merge the chunk files listed in ``audio_clips`` into ``filename``."""
+        try:
+            if len(audio_clips) > 1:
+                cbn = sox.Combiner()
+                cbn.convert(samplerate=44100, n_channels=2)
+                cbn.build(audio_clips, filename, "concatenate")
+            else:
+                # os.replace overwrites an existing target on every platform.
+                os.replace(audio_clips[0], filename)
+        except (sox.core.SoxError, FileNotFoundError):
+            # Fall back to moviepy when sox cannot build the file, see
+            # https://github.com/JasonLovesDoggo/RedditVideoMakerBot/issues/67#issuecomment-1150466339
+            audio_concat = concatenate_audioclips(
+                [AudioFileClip(clip) for clip in audio_clips]
+            )
+            audio_composite = CompositeAudioClip([audio_concat])
+            audio_composite.write_audiofile(filename, 44100, 2, 2000, None)
+
+    def make_readable(self, text):
+        """
+        Amazon Polly fails to read some symbols properly such as '& (and)'.
+        So we normalize input text before passing it to the service
+        """
+        text = text.replace('&', 'and')
+        return text
+
+    def randomvoice(self):
+        """Return a random voice name; names in both lists are twice as likely."""
+        valid = voices['neural'] + voices['standard']
+        return random.choice(valid)
diff --git a/TTS/TikTok.py b/TTS/TikTok.py
index 8ed9f8e..79d472a 100644
--- a/TTS/TikTok.py
+++ b/TTS/TikTok.py
@@ -141,3 +141,7 @@ class TikTok:  # TikTok Text-to-Speech Wrapper
         if ok_or_good == 1:  # 1/10 chance of ok voice
             return random.choice(voices)
         return random.choice(human)  # 9/10 chance of good voice
+
+
+if __name__ == "__main__":  # manual smoke test; must not run on import
+    TikTok().tts('Hello World', '../TTS/hello.mp3')
diff --git a/TTS/swapper.py b/TTS/swapper.py
index cd18223..f4717b1 100644
--- a/TTS/swapper.py
+++ b/TTS/swapper.py
@@ -3,9 +3,10 @@ from os import getenv
 from dotenv import load_dotenv
 
 from TTS.GTTS import GTTS
+from TTS.POLLY import POLLY
 from TTS.TikTok import TikTok
 
-CHOICE_DIR = {"tiktok": TikTok, "gtts": GTTS}
+CHOICE_DIR = {"tiktok": TikTok, "gtts": GTTS, 'polly': POLLY}
 
 
 class TTS: