You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
RedditVideoMakerBot/utils/posttextparser.py

50 lines
1.2 KiB

import os
import re
import time
from typing import List
try:
import spacy
SPACY_AVAILABLE = True
except ImportError:
SPACY_AVAILABLE = False
from utils.console import print_step
from utils.voice import sanitize_text
def _fallback_sentence_split(text: str) -> List[str]:
"""Fallback sentence splitter when spacy is not available."""
sentences = re.split(r'[.!?]+', text)
return [s.strip() for s in sentences if s.strip()]
# working good
def posttextparser(obj: str, *, tried: bool = False) -> List[str]:
    """Split a post's text into a list of sentences.

    Newlines are replaced with spaces first. Sentences come from the spacy
    ``en_core_web_sm`` model when it can be loaded; otherwise the regex-based
    ``_fallback_sentence_split`` is used. In every case a sentence is kept
    only if ``sanitize_text`` returns a truthy value for it (presumably this
    means something speakable remains -- confirm against ``utils.voice``).

    Args:
        obj: The raw post text.
        tried: Internal flag. ``True`` means a model download has already
            been attempted, so a second load failure falls back to the regex
            splitter instead of downloading again.

    Returns:
        The kept sentences, in order, with their original (unsanitized) text.
    """
    text: str = re.sub("\n", " ", obj)

    nlp = None
    if SPACY_AVAILABLE:
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            if not tried:
                # Imported here so this function carries its own dependencies.
                import subprocess
                import sys

                # Use the running interpreter rather than whatever "python"
                # is on PATH, so the model lands in the active environment.
                # run() blocks until the download finishes; a failed download
                # is handled by the retry below, hence check=False.
                subprocess.run(
                    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                    check=False,
                )
                return posttextparser(obj, tried=True)
            print_step(
                "The spacy model can't load. Falling back to regex-based sentence splitting. Install with: python -m spacy download en_core_web_sm"
            )

    if nlp is None:
        candidates = _fallback_sentence_split(text)
    else:
        candidates = [sentence.text for sentence in nlp(text).sents]

    # Apply the same filter on every path so the fallback and spacy results
    # are treated consistently.
    return [sentence for sentence in candidates if sanitize_text(sentence)]