You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/tests/seg/test_mmseg.py

793 lines
19 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
3 years ago
import pytest
from pypinyin import pinyin, load_phrases_dict
from pypinyin.seg import mmseg
# Segmenter used by test_mmseg: a Seg over a fresh PrefixSet that is
# trained only with the words listed below.
seg_test = mmseg.Seg(mmseg.PrefixSet())
# yapf: disable
seg_test._prefix_set.train([
    'a',
    'ab',
    'abc',
    'abcd',
    'abd',
    'ac',
    'acd',
    'aff',
    'agf',
    'agfgef',
    'asdf',
    # The comma after 'bbs' was missing, so Python's implicit string
    # concatenation silently turned 'bbs' and '中国' into one word.
    'bbs',
    '中国',
    '中国人',
    '中国人民',
    '中国人民银行',
    # NOTE(review): this entry is empty in this copy of the file; it looks
    # like a word that was lost in extraction -- confirm against upstream.
    '',
    '北京',
    '天安门',
    '员工',
])
# yapf: enable
# yapf: disable
@pytest.mark.parametrize(
    'input, expect', [
        ['', []],
        ['a', ['a']],
        ['abc', ['abc']],
        ['abcefg', ['abc', 'e', 'f', 'g']],
        ['bbcabce', ['bb', 'c', 'abc', 'e']],
        ['北京', ['北京']],
        ['北京,', ['北京', ',']],
        ['北京abc', ['北京', 'abc']],
        # The pieces of a segmentation concatenate back to the input, so
        # the single-character pieces below are taken from the input text
        # (they were empty strings in the damaged copy of this file).
        ['中国人民银行行长', ['中国人民银行', '行', '长']],
        ['中国人民银行员工', ['中国人民银行', '员工']],
        [
            'abcadbasfgafgasdabcagfaff我是中国人中国人民我爱北京天安门',
            [
                'abc',
                'a',
                'd',
                'b',
                'as',
                'f',
                'g',
                'af',
                'g',
                'asd',
                'abc',
                'agf',
                'aff',
                '我',
                '是',
                '中国人',
                '中国人民',
                '我',
                '爱',
                '北京',
                '天安门',
            ],
        ],
    ]
)
# yapf: enable
def test_mmseg(input, expect):
    """Cut ``input`` with the hand-trained ``seg_test`` and compare pieces.

    :param input: text handed to ``seg_test.cut``.
    :param expect: the pieces ``cut`` is expected to yield, in order.
    """
    assert list(seg_test.cut(input)) == expect
# NOTE(review): the readings 'yī', 'yǔ' and 'dì' were empty strings in this
# copy of the file, which can never equal a real reading.  They were
# restored from the standard readings of the characters -- confirm against
# the output of pypinyin itself.
@pytest.mark.parametrize('input, default_ret, mmseg_ret', [
    [
        '一语中的啊',
        [['yī'], ['yǔ'], ['zhōng'], ['de'], ['a']],
        [['yī'], ['yǔ'], ['zhòng'], ['dì'], ['a']],
    ],
])
def test_mmseg_for_pinyin(input, default_ret, mmseg_ret):
    """Check ``pinyin`` on raw text and on text pre-cut by ``mmseg.seg``.

    :param input: the text to convert.
    :param default_ret: kept in the parameter table for reference only;
        this test does not assert against it.
    :param mmseg_ret: readings expected from both calls below.
    """
    assert pinyin(input) == mmseg_ret
    assert pinyin(mmseg.seg.cut(input)) == mmseg_ret
# NOTE(review): the reading 'jú' was an empty string in this copy of the
# file, which can never equal a real reading.  It was restored from the
# standard reading of the character -- confirm against pypinyin's output.
@pytest.mark.parametrize('input, jieba_ret, mmseg_ret', [
    [
        '了局啊',
        [['le'], ['jú'], ['a']],
        [['liǎo'], ['jú'], ['a']],
    ],
])
def test_mmseg_and_jieba_for_pinyin(input, jieba_ret, mmseg_ret):
    """Check ``pinyin`` on raw text and on text pre-cut by ``mmseg.seg``.

    :param input: the text to convert.
    :param jieba_ret: kept in the parameter table for reference only;
        this test does not assert against it.
    :param mmseg_ret: readings expected from both calls below.
    """
    assert pinyin(input) == mmseg_ret
    assert pinyin(mmseg.seg.cut(input)) == mmseg_ret
def test_retrain():
    """Check that ``mmseg.retrain`` makes a newly loaded phrase cuttable."""
    seg = mmseg.seg
    # Before the phrase is loaded the text is cut character by character.
    # The pieces must concatenate back to the input, so each one is '啊'
    # (they were empty strings in the damaged copy of this file).
    assert list(seg.cut('啊啊啊')) == ['啊', '啊', '啊']

    load_phrases_dict({'啊啊啊': [['a'], ['a'], ['a']]})
    mmseg.retrain(seg)
    # After retraining, the loaded phrase comes out as a single piece.
    assert list(seg.cut('啊啊啊')) == ['啊啊啊']

    assert list(seg.cut('男孩儿')) == ['男孩儿']
def test_phrases():
    """Check how the module-level ``mmseg.seg`` cuts phrases and leftovers.

    The pieces of a segmentation concatenate back to the input.  The
    single-character pieces in the expectations below were empty strings
    in the damaged copy of this file and have been restored from the
    corresponding input text.
    """
    seg = mmseg.seg
    assert list(seg.cut('你要重新考虑这条建议')) == \
        ['你', '要', '重新', '考', '虑', '这', '条', '建', '议']

    # NOTE(review): the last two readings are empty in this copy of the
    # file and cannot be recovered from it; they are not asserted here.
    # Restore them from the upstream test before relying on this phrase.
    load_phrases_dict({'在一起': [['zài'], [''], ['']]})
    assert list(seg.cut('在一片')) == ['在', '一片']

    # Prefix match where the suffix is a phrase.
    #
    # The head of the input is also the head of another phrase, so it
    # matches that other phrase's prefix.
    # The tail of the input is a phrase.
    # In that case the tail phrase must still be cut out as one piece.
    assert list(seg.cut('行业')) == ['行业']
    assert list(seg.cut('金融行业')) == ['金', '融', '行业']

    # The whole input is a phrase.
    assert list(seg.cut('金融寡头')) == ['金融寡头']
    assert list(seg.cut('服务行业')) == ['服务行业']
    assert list(seg.cut('人员')) == ['人员']
    assert list(seg.cut('服务人员')) == ['服务', '人员']

    assert list(seg.cut('银行')) == ['银行']
    assert list(seg.cut('浦发银行')) == ['浦', '发', '银行']

    assert list(seg.cut('')) == []

    # The whole input matches a prefix but is not itself a phrase.
    # NOTE(review): this input was empty in the damaged copy, which
    # contradicted the assertion just above.  '金' is presumably the
    # original, judging by the following cases -- confirm against upstream.
    assert list(seg.cut('金')) == ['金']
    assert list(seg.cut('金融')) == ['金', '融']
    #
    assert list(seg.cut('金融金')) == ['金', '融', '金']
    assert list(seg.cut('金融金融')) == ['金', '融', '金', '融']
    assert list(seg.cut('金融金融金融金融金融金融')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融'
    ]
    assert list(seg.cut('金融金融金融金融金融金融金')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融',
        '金'
    ]

    # Nothing matches at all.
    assert list(
        seg.cut('以其昏昏,使人昭昭')) == [
            '以', '其', '昏', '昏', ',', '使', '人', '昭', '昭'
        ]

    # Nothing matches in the prefix; the suffix is a phrase.
    assert list(seg.cut('以其昏昏行业')) == ['以', '其', '昏', '昏', '行业']

    # The prefix is a phrase.
    assert list(seg.cut('行业以其昏昏')) == ['行业', '以', '其', '昏', '昏']

    # The phrase sits in the middle.
    assert list(seg.cut('使人昭昭行业以其昏昏')) == [
        '使', '人', '昭', '昭', '行业', '以', '其', '昏', '昏'
    ]
# yapf: disable
def test_seg_long():
    """Cut three long passages with ``mmseg.seg`` and compare every piece.

    Each expected list is written as runs: ``[''] * n`` stands for ``n``
    consecutive empty-string entries, exactly as they appear in the
    one-entry-per-line form of this test.

    NOTE(review): the many empty-string entries, and the first passage
    having fewer characters than its expected list has pieces, look like
    characters lost when this copy of the file was extracted.  Compare
    with the upstream pypinyin test before relying on these values.
    """
    seg = mmseg.seg

    # Continuation lines of this literal start at column 0 on purpose:
    # everything inside the triple quotes is part of the text being cut.
    first_text = ''' 真的猛士,敢于直面惨淡的人生,敢于正视淋漓的鲜血。
这是怎样的哀痛者和幸福者然而造化又常常为庸人设计以时间的流驶来洗涤旧迹
仅使留下淡红的血色和微漠的悲哀在这淡红的血色和微漠的悲哀中又给人暂得偷生
维持着这似人非人的世界我不知道这样的世界何时是一个尽头
　　我们还在这样的世上活着我也早觉得有写一点东西的必要了离三月十八日也已有两星期
忘却的救主快要降临了罢我正有写一点东西的必要了'''
    first_expected = (
        [' '] * 4 + [''] * 17 + ['正视'] + [''] * 3 + ['鲜血', '', '\n']
        + [' '] * 4 + [''] * 15 + ['造化'] + [''] * 10 + ['时间']
        + [''] * 5 + ['洗涤'] + [''] * 3 + ['\n']
        + [' '] * 4 + ['', '使'] + [''] * 5 + ['血色'] + [''] * 12
        + ['血色'] + [''] * 16 + ['\n']
        + [' '] * 4 + [''] * 14 + ['知道', '这样'] + [''] * 3
        + ['何时', '', '一个', '尽头', '', '\n']
        + ['\u3000'] * 2 + ['我们'] + [''] * 2
        + ['这样', '', '世上', '活着']
        + [''] * 4 + ['觉得'] + [''] * 2 + ['一点', '', '西', '', '必要']
        + [''] * 12 + ['星期', '', '\n']
        + [' '] * 4 + [''] * 5 + ['快要', '降临'] + [''] * 7
        + ['一点', '', '西', '', '必要', '', '']
    )
    assert list(seg.cut(first_text)) == first_expected

    second_text = (
        '人们常常把人与自然对立起来,宣称要征服自然。殊不知在大自然面前,'
        '人类永远只是一个天真幼稚的孩童,而他却要作自然的主人!'
        '他只是大自然机体上普通的一部分,正像一株小草只是她的普通一部分一样,'
        '有什么资格与自然对立! 如果说自然的智慧是大海,那么,'
        '人类的智慧就只是大海中的一个小水滴,虽然这个水滴也映照着大海,'
        '但毕竟不是大海。可是,人们竟然不自量力 地宣称要用滴水来代替大海。'
    )
    second_expected = (
        ['人们'] + [''] * 12 + ['宣称', '', '征服'] + [''] * 3
        + ['殊不知', '', '大自然'] + [''] * 7 + ['只是', '一个'] + [''] * 20
        + ['只是', '大自然', '机体'] + [''] * 5 + ['部分', '', '正像']
        + [''] * 4 + ['只是'] + [''] * 5 + ['部分'] + [''] * 4 + ['什么']
        + [''] * 8 + [' '] + [''] * 9 + ['大海', '', '那么'] + [''] * 7
        + ['只是', '大海', '中的', '一个'] + [''] * 6 + ['这个'] + [''] * 6
        + ['大海'] + [''] * 4 + ['不是', '大海', '', '可是', '', '人们']
        + [''] * 2 + ['不自量力', ' ', '', '宣称'] + [''] * 7 + ['大海', '']
    )
    assert list(seg.cut(second_text)) == second_expected

    third_text = (
        '该负责人表示,银行保险机构具有外部性强、财务杠杆率高、'
        '信息不对称严重等特征,不同于一般工商企业,对其股东股权必须从严管理。'
        '总体上,银保监会将坚持“两个不变”。一是坚持鼓励社会资本参与银行保险机构改革、'
        '优化股东结构的积极取向不变。银保监会将继续畅通社会资本投资入股银行保险机构的渠道,'
        '优化股东结构、充实机构资本,重点引入注重机构长远发展、资本实力雄厚、管理经验'
        '丰富的战略性股东。二是坚持严惩股东违法违规行为、规范公司治理的高压态势不变。'
        '银保监会将继续深入排查整治违法违规股东股权,依法清理规范股权关系,'
        '对股东严重违法违规行为,将坚决予以惩戒,发现一起、查处一起。'
    )
    third_expected = (
        [''] * 7 + ['银行'] + [''] * 4 + ['具有'] + [''] * 7 + ['杠杆']
        + [''] * 3 + ['信息', '不对', '', '严重'] + [''] * 4
        + ['不同', '', '一般'] + [''] * 18 + ['总体'] + [''] * 10
        + ['两个', '不变'] + [''] * 8 + ['社会'] + [''] * 2
        + ['参与', '银行']
        + [''] * 7 + ['优化'] + [''] * 2 + ['结构'] + [''] * 5 + ['不变']
        + [''] * 10 + ['社会'] + [''] * 6 + ['银行'] + [''] * 5
        + ['渠道', '', '优化'] + [''] * 2 + ['结构'] + [''] * 8 + ['重点']
        + [''] * 2 + ['注重'] + [''] * 2 + ['长远', '发展'] + [''] * 33
        + ['行为'] + [''] * 8 + ['高压'] + [''] * 2 + ['不变'] + [''] * 31
        + ['关系'] + [''] * 4 + ['严重'] + [''] * 4 + ['行为'] + [''] * 4
        + ['予以'] + [''] * 3 + ['发现', '一起', '', '查处', '一起', '']
    )
    assert list(seg.cut(third_text)) == third_expected
# yapf: enable
if __name__ == '__main__':
    # Run pytest when this file is executed directly.  ``pytest.main`` is
    # the documented entry point and returns the run's exit code; raising
    # SystemExit with it makes a failing run visible to the calling shell
    # instead of always exiting with status 0.  ``pytest`` is already
    # imported at the top of the file, so no local import is needed.
    raise SystemExit(pytest.main())