Remove special characters from text

pull/332/head
zechs 3 years ago
parent cf6b3a9c44
commit ca24608da8

@ -4,6 +4,24 @@ from mutagen.mp3 import MP3
from utils.console import print_step, print_substep
from rich.progress import track
import re
# Compiled once at import time: sanitize_text runs for the title and for every
# comment. The alternation order matters (whitespace-adjacent apostrophes/pipes
# are consumed together with that whitespace first), so the pattern text is
# kept exactly as it was.
_TTS_UNWANTED = re.compile(
    r"\s['|]|['|]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]"
)


def sanitize_text(reddit_obj: str) -> str:
    r"""Sanitize a piece of text before it is handed to the TTS engine.

    Despite the parameter name, ``reddit_obj`` is the text itself (a ``str``),
    not a dictionary.

    What gets replaced by a space:
    - each of the characters ``^ _ ~ @ ! & ; # : - % “ ” ‘ " * / { } [ ] ( ) \ | < > = +``
    - a straight apostrophe (or pipe) directly preceded or followed by
      whitespace, together with that whitespace

    What is kept:
    - apostrophes inside words (``don't``), and an apostrophe at the very
      start or end of the text (there is no adjacent whitespace to match)
    - sentence punctuation such as ``.``, ``,`` and ``?``

    Afterwards every run of whitespace is collapsed to a single space and
    leading/trailing whitespace is stripped.

    Args:
        reddit_obj: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing but removable
        characters and whitespace was given.
    """
    # note: not removing apostrophes that sit inside a word
    cleaned = _TTS_UNWANTED.sub(" ", reddit_obj)
    # remove extra whitespace left behind by the substitutions
    return " ".join(cleaned.split())
def save_text_to_mp3(reddit_obj):
"""Saves Text to MP3 files.
@ -17,25 +35,16 @@ def save_text_to_mp3(reddit_obj):
# Create a folder for the mp3 files.
Path("assets/mp3").mkdir(parents=True, exist_ok=True)
tts = gTTS(text=reddit_obj["thread_title"], lang="en", slow=False)
thread_title = sanitize_text(reddit_obj["thread_title"])
tts = gTTS(text=thread_title, lang="en", slow=False)
tts.save(f"assets/mp3/title.mp3")
length += MP3(f"assets/mp3/title.mp3").info.length
try:
Path(f"assets/mp3/posttext.mp3").unlink()
except OSError as e:
pass
if reddit_obj["thread_post"] != "":
tts = gTTS(text=reddit_obj["thread_post"], lang="en", slow=False)
tts.save(f"assets/mp3/posttext.mp3")
length += MP3(f"assets/mp3/posttext.mp3").info.length
for idx, comment in track(enumerate(reddit_obj["comments"]), "Saving..."):
# ! Stop creating mp3 files if the length is greater than 50 seconds. This can be longer, but this is just a good starting point
if length > 50:
break
tts = gTTS(text=comment["comment_body"], lang="en", slow=False)
tts = gTTS(text=sanitize_text(comment["comment_body"]), lang="en", slow=False)
tts.save(f"assets/mp3/{idx}.mp3")
length += MP3(f"assets/mp3/{idx}.mp3").info.length

Loading…
Cancel
Save