From 8130de442438d3ecd72b73a97dd5d25db6fffa87 Mon Sep 17 00:00:00 2001
From: zechs
Date: Mon, 6 Jun 2022 14:05:16 +0530
Subject: [PATCH] Remove links from text

---
 video_creation/voices.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/video_creation/voices.py b/video_creation/voices.py
index 86bacd4..ea2d790 100644
--- a/video_creation/voices.py
+++ b/video_creation/voices.py
@@ -13,11 +13,16 @@ def sanitize_text(reddit_obj):
 
     What gets removed:
     - following characters`^_~@!&;#:-%“”‘"%*/{}[]()\|<>?=+`
+    - any http or https links
     """
 
+    # remove any urls from the text
+    regex_urls = r"((http|https)://[^\s]+)"
+    result = re.sub(regex_urls, " ", reddit_obj)
+
     # note: not removing apostrophes
     regex_expr = r"\s['|’]|['|’]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]"
-    result = re.sub(regex_expr, " ", reddit_obj)
+    result = re.sub(regex_expr, " ", result)
 
     # remove extra whitespace
     return " ".join(result.split())