You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/text_processing/normalization/sentence_split.py

24 lines
507 B

import re
from typing import List
4 years ago
SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)')
def split(text: str) -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations.
Parameters
----------
text : str
The input text.
Returns
-------
List[str]
Sentences.
"""
text = SENTENCE_SPLITOR.sub(r'\1\n', text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
return sentences