Remove links from text

pull/332/head
zechs 3 years ago
parent ca24608da8
commit 8130de4424

@@ -13,11 +13,16 @@ def sanitize_text(reddit_obj):
What gets removed:
- following characters`^_~@!&;#:-%“”‘"%*/{}[]()\|<>?=+`
- any http or https links
"""
+ # remove any urls from the text
+ regex_urls = r"((http|https)://[^\s]+)"
+ result = re.sub(regex_urls, " ", reddit_obj)
# note: not removing apostrophes
regex_expr = r"\s['|]|['|]\s|[\^_~@!&;#:\-%“”‘\"%\*/{}\[\]\(\)\\|<>=+]"
- result = re.sub(regex_expr, " ", reddit_obj)
+ result = re.sub(regex_expr, " ", result)
# remove extra whitespace
return " ".join(result.split())

Loading…
Cancel
Save