diff --git a/video_creation/voices.py b/video_creation/voices.py index 86bacd4..ea2d790 100644 --- a/video_creation/voices.py +++ b/video_creation/voices.py @@ -13,11 +13,16 @@ def sanitize_text(reddit_obj): What gets removed: - following characters`^_~@!&;#:-%“”‘"%*/{}[]()\|<>?=+` + - any http or https links """ + # remove any urls from the text + regex_urls = r"((http|https)://[^\s]+)" + result = re.sub(regex_urls, " ", reddit_obj) + # note: not removing apostrophes regex_expr = r"\s['|’]|['|’]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]" - result = re.sub(regex_expr, " ", reddit_obj) + result = re.sub(regex_expr, " ", result) # remove extra whitespace return " ".join(result.split())