From cb3f825014b50a2c5e1f9e41b7d2ae8f3da2f388 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 6 Jun 2022 18:11:52 -0400 Subject: [PATCH] updated voice.py with uri removal too --- utils/voice.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utils/voice.py b/utils/voice.py index 7661981..e6f3989 100644 --- a/utils/voice.py +++ b/utils/voice.py @@ -4,13 +4,18 @@ import re def sanitize_text(text): """ Sanitizes the text for tts. - What gets removed: + What gets removed: - following characters`^_~@!&;#:-%“”‘"%*/{}[]()\|<>?=+` + - any http or https links """ + # remove any urls from the text + regex_urls = r"((http|https)://[^\s]+)" + result = re.sub(regex_urls, " ", text) + # note: not removing apostrophes regex_expr = r"\s['|’]|['|’]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]" - result = re.sub(regex_expr, " ", text) + result = re.sub(regex_expr, " ", result) # remove extra whitespace return " ".join(result.split())