|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sanitize_text(text: str) -> str:
|
|
|
|
|
r"""Sanitizes the text for tts.
|
|
|
|
|
What gets removed:
|
|
|
|
|
- following characters`^_~@!&;#:-%“”‘"%*/{}[]()\|<>?=+`
|
|
|
|
|
- any http or https links
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
text (str): Text to be sanitized
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: Sanitized text
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# remove any urls from the text
|
|
|
|
|
regex_urls = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
|
|
|
|
|
|
|
|
|
|
result = re.sub(regex_urls, " ", text)
|
|
|
|
|
|
|
|
|
|
# note: not removing apostrophes
|
|
|
|
|
regex_expr = r"\s['|’]|['|’]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]"
|
|
|
|
|
result = re.sub(regex_expr, " ", result)
|
|
|
|
|
result = result.replace("+", "plus").replace("&", "and")
|
|
|
|
|
# remove extra whitespace
|
|
|
|
|
return " ".join(result.split())
|