diff --git a/README.md b/README.md
index e7019a897..a2d2f9a56 100644
--- a/README.md
+++ b/README.md
@@ -52,4 +52,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
 
 ## Acknowledgement
 
-We depends on many open source repos. See [References](doc/src/reference.md) for more information.
\ No newline at end of file
+We depend on many open source repos. See [References](doc/src/reference.md) for more information.
diff --git a/README_cn.md b/README_cn.md
index b9ad78908..3c1111b5e 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -50,4 +50,4 @@ DeepSpeech遵循[Apache-2.0开源协议](./LICENSE)。
 
 ## 感谢
 
-开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。
\ No newline at end of file
+开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。
diff --git a/examples/ngram_lm/local/kenlm_score_test.py b/examples/ngram_lm/local/kenlm_score_test.py
index 1c8f867bd..6268f53a1 100644
--- a/examples/ngram_lm/local/kenlm_score_test.py
+++ b/examples/ngram_lm/local/kenlm_score_test.py
@@ -1,9 +1,22 @@
-import kenlm
-import jieba
-import time
-
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import sys
+import time
+
+import jieba
+import kenlm
 
 language_model_path = sys.argv[1]
 assert os.path.exists(language_model_path)
@@ -33,7 +46,8 @@ def test_score():
     for i, v in enumerate(model.full_scores(sentence_char_split)):
         print(i, v)
         split_size += 1
-    assert split_size == len(sentence_char_split.split()) + 1, "error split size."
+    assert split_size == len(
+        sentence_char_split.split()) + 1, "error split size."
 
     print(sentence_word_split)
     print(model.score(sentence_word_split))
@@ -47,8 +61,10 @@ def test_full_scores_chars():
     print(sentence_char_split)
     # Show scores and n-gram matches
     words = ['<s>'] + list(sentence) + ['</s>']
-    for i, (prob, length, oov) in enumerate(model.full_scores(sentence_char_split)):
-        print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length:i + 2])))
+    for i, (prob, length,
+            oov) in enumerate(model.full_scores(sentence_char_split)):
+        print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length:
+                                                                 i + 2])))
         if oov:
             print('\t"{0}" is an OOV'.format(words[i + 1]))
 
@@ -67,8 +83,10 @@ def test_full_scores_words():
     print(sentence_word_split)
     # Show scores and n-gram matches
     words = ['<s>'] + sentence_word_split.split() + ['</s>']
-    for i, (prob, length, oov) in enumerate(model.full_scores(sentence_word_split)):
-        print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length:i + 2])))
+    for i, (prob, length,
+            oov) in enumerate(model.full_scores(sentence_word_split)):
+        print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length:
+                                                                 i + 2])))
         if oov:
             print('\t"{0}" is an OOV'.format(words[i + 1]))
 
@@ -80,7 +98,8 @@ def test_full_scores_words():
             print('"{0}" is an OOV'.format(w))
             oov.append(w)
     # zh_giga.no_cna_cmn.prune01244.klm is a chinese character LM
-    assert oov == ["盘点", "不怕", "网站", "❗", "️", "海淘", "向来", "便宜", "保真", "!"], 'error oov'
+    assert oov == ["盘点", "不怕", "网站", "❗", "️", "海淘", "向来", "便宜", "保真", "!"
+                   ], 'error oov'
 
 
 def test_full_scores_chars_length():
@@ -159,9 +178,10 @@ def test_ppl_sentence():
     n2 = model.perplexity(part_char_split2)
     print(n2)
 
+
 if __name__ == '__main__':
     test_score()
     test_full_scores_chars()
     test_full_scores_words()
     test_full_scores_chars_length()
-    test_ppl_sentence()
\ No newline at end of file
+    test_ppl_sentence()
diff --git a/examples/ngram_lm/local/zh_preprocess.py b/examples/ngram_lm/local/zh_preprocess.py
index 9cdcfd495..93f98624e 100644
--- a/examples/ngram_lm/local/zh_preprocess.py
+++ b/examples/ngram_lm/local/zh_preprocess.py
@@ -1,31 +1,36 @@
 #!/usr/bin/env python3
-
-from typing import List, Text
+import re
+import string
 import sys
+from typing import List
+from typing import Text
+
 import jieba
-import string
-import re
 from zhon import hanzi
 
+
 def char_token(s: Text) -> List[Text]:
     return list(s)
 
+
 def word_token(s: Text) -> List[Text]:
     return jieba.lcut(s)
 
+
 def tn(s: Text) -> Text:
     s = s.strip()
     s = s.replace('*', '')
     # rm english punctuations
-    s = re.sub(f'[re.escape(string.punctuation)]' , "", s)
+    s = re.sub(f'[{re.escape(string.punctuation)}]', "", s)
     # rm chinese punctuations
     s = re.sub(f'[{hanzi.punctuation}]', "", s)
     # text normalization
-    
+
     # rm english
     s = ''.join(re.findall(hanzi.sent, s))
     return s
 
+
 def main(infile, outfile, tokenizer=None):
     with open(infile, 'rt') as fin, open(outfile, 'wt') as fout:
         lines = fin.readlines()
@@ -36,6 +41,7 @@ def main(infile, outfile, tokenizer=None):
         fout.write(l)
         fout.write('\n')
 
+
 if __name__ == '__main__':
     if len(sys.argv) != 4:
         print(f"{sys.argv[0]} [char|word] text text_out")
@@ -52,4 +58,4 @@ if __name__ == '__main__':
     else:
         tokenizer = None
 
-    main(text, text_out, tokenizer)
\ No newline at end of file
+    main(text, text_out, tokenizer)
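
Reviewer note: the behaviour these two scripts exercise boils down to a few KenLM calls. The standalone sketch below is illustrative only and not part of the patch. It assumes sys.argv[1] points at a KenLM model (for instance the zh_giga.no_cna_cmn.prune01244.klm character LM named in the test comments); the example sentence and the normalize() helper (a trimmed-down stand-in for zh_preprocess.tn()) are hypothetical.

#!/usr/bin/env python3
# Illustrative sketch (not part of the patch): score one Chinese sentence
# with a KenLM model at character level and at jieba word level.
# Assumptions: sys.argv[1] is a KenLM .arpa/.klm model; normalize() is a
# trimmed-down version of zh_preprocess.tn(); the sentence is made up.
import re
import string
import sys

import jieba
import kenlm
from zhon import hanzi


def normalize(s):
    # Drop English punctuation, then Chinese punctuation.
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s.strip())
    return re.sub(f'[{hanzi.punctuation}]', '', s)


if __name__ == '__main__':
    model = kenlm.Model(sys.argv[1])
    sentence = normalize('中国人民银行。')
    splits = {
        'char': ' '.join(list(sentence)),
        'word': ' '.join(jieba.lcut(sentence)),
    }
    for name, text in splits.items():
        # full_scores() yields (log10 prob, ngram length, oov flag) for
        # every token plus the implicit end-of-sentence token.
        for prob, length, oov in model.full_scores(text):
            print(f'{name}\t{prob:.3f}\t{length}\toov={oov}')
        print(f'{name} perplexity: {model.perplexity(text):.2f}')

With a character-level model, the char split should generally produce fewer OOV tokens than the jieba word split; that contrast is what the OOV assertion in test_full_scores_words() relies on.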