@@ -1,15 +1,16 @@
 #!/usr/bin/env python3
 
 # modify from https://sites.google.com/site/homepageoffuyanwei/Home/remarksandexcellentdiscussion/page-2
 
+
 class Word:
-    def __init__(self,text = '',freq = 0):
+    def __init__(self, text='', freq=0):
         self.text = text
         self.freq = freq
         self.length = len(text)
 
 
 class Chunk:
-    def __init__(self,w1,w2 = None,w3 = None):
+    def __init__(self, w1, w2=None, w3=None):
         self.words = []
         self.words.append(w1)
         if w2:
@@ -44,8 +45,8 @@ class Chunk:
             sum += word.freq
         return sum
-
-class ComplexCompare:
+
+class ComplexCompare:
     def takeHightest(self, chunks, comparator):
         i = 1
         for j in range(1, len(chunks)):
@@ -59,23 +60,27 @@ class ComplexCompare:
 
     # the four functions below implement mmseg's four filtering rules, the core of the algorithm
     def mmFilter(self, chunks):
-        def comparator(a,b):
+        def comparator(a, b):
             return a.totalWordLength() - b.totalWordLength()
 
         return self.takeHightest(chunks, comparator)
 
-    def lawlFilter(self,chunks):
-        def comparator(a,b):
+    def lawlFilter(self, chunks):
+        def comparator(a, b):
             return a.averageWordLength() - b.averageWordLength()
-        return self.takeHightest(chunks,comparator)
-
-    def svmlFilter(self,chunks):
-        def comparator(a,b):
+        return self.takeHightest(chunks, comparator)
+
+    def svmlFilter(self, chunks):
+        def comparator(a, b):
             return b.standardDeviation() - a.standardDeviation()
 
         return self.takeHightest(chunks, comparator)
 
-    def logFreqFilter(self,chunks):
-        def comparator(a,b):
+    def logFreqFilter(self, chunks):
+        def comparator(a, b):
             return a.wordFrequency() - b.wordFrequency()
 
         return self.takeHightest(chunks, comparator)
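Note on the hunk above: these four comparators are MMSEG's standard disambiguation rules, applied in turn until a single chunk survives. `mmFilter` keeps the chunks with the greatest total length (maximum matching), `lawlFilter` the largest average word length, `svmlFilter` the smallest spread of word lengths, and `logFreqFilter` the highest word frequency; `takeHightest` passes every chunk tied for the best score on to the next rule. A standalone sketch of the cascade, with a simplified `Chunk` and helper that are illustrative only, not this module's API:

```python
class Chunk:
    def __init__(self, *lengths):
        self.lengths = lengths

    def total(self):
        return sum(self.lengths)

    def average(self):
        return self.total() / len(self.lengths)

    def variance(self):
        avg = self.average()
        return sum((n - avg) ** 2 for n in self.lengths) / len(self.lengths)


def keep_best(chunks, key):
    # mirror takeHightest: keep every chunk tied for the highest score
    best = max(key(c) for c in chunks)
    return [c for c in chunks if key(c) == best]


candidates = [Chunk(1, 1, 2), Chunk(2, 2), Chunk(1, 3)]
rules = [
    lambda c: c.total(),      # rule 1: maximum total word length
    lambda c: c.average(),    # rule 2: largest average word length
    lambda c: -c.variance(),  # rule 3: smallest variance of word lengths
]
for rule in rules:
    if len(candidates) == 1:
        break
    candidates = keep_best(candidates, rule)
print([c.lengths for c in candidates])  # -> [(2, 2)]
```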
@@ -83,6 +88,7 @@ class ComplexCompare:
 dictWord = {}
 maxWordLength = 0
 
+
 def loadDictChars(filepath):
     global maxWordLength
     fsock = open(filepath)
@@ -90,18 +96,22 @@ def loadDictChars(filepath):
         freq, word = line.split()
         word = word.strip()
         dictWord[word] = (len(word), int(freq))
-        maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
+        maxWordLength = len(word) if maxWordLength < len(
+            word) else maxWordLength
     fsock.close()
 
+
 def loadDictWords(filepath):
     global maxWordLength
     fsock = open(filepath)
     for line in fsock.readlines():
         word = line.strip()
         dictWord[word] = (len(word), 0)
-        maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
+        maxWordLength = len(word) if maxWordLength < len(
+            word) else maxWordLength
     fsock.close()
 
+
 # check whether the given word is in the dictionary dictWord
 def getDictWord(word):
     result = dictWord.get(word)
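The wrapped `maxWordLength` assignments above are pure line-length (E501) rewrites; behaviour is unchanged. For orientation, the loaders imply the dictionary formats: `chars.dic` holds one `frequency character` pair per line, `words.dic` one word per line with frequency fixed at 0, and `maxWordLength` tracks the longest entry to bound lookups. A small sketch with invented sample entries (the real files ship under `data/` and are not reproduced here):

```python
# Invented sample entries in the format the two loaders above expect.
chars_dic = "7000 的\n500 研\n300 究"  # loadDictChars: "<freq> <char>" per line
words_dic = "研究\n生命\n起源"          # loadDictWords: one word per line

dictWord, maxWordLength = {}, 0
for line in chars_dic.splitlines():
    freq, word = line.split()           # same parse as loadDictChars
    dictWord[word] = (len(word), int(freq))
    maxWordLength = max(maxWordLength, len(word))
for word in words_dic.splitlines():
    dictWord[word.strip()] = (len(word.strip()), 0)
    maxWordLength = max(maxWordLength, len(word.strip()))
print(maxWordLength, dictWord["研究"])  # -> 2 (2, 0)
```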
@@ -109,14 +119,15 @@ def getDictWord(word):
         return Word(word, result[1])
     return None
 
 
 # load the dictionaries
 def run():
     from os.path import join, dirname
     loadDictChars(join(dirname(__file__), 'data', 'chars.dic'))
     loadDictWords(join(dirname(__file__), 'data', 'words.dic'))
 
-class Analysis:
+
+class Analysis:
     def __init__(self, text):
         self.text = text
         self.cacheSize = 3
@@ -134,11 +145,10 @@ class Analysis:
         if not dictWord:
             run()
 
-
     def __iter__(self):
         while True:
             token = self.getNextToken()
-            if token == None:
+            if token is None:
                 raise StopIteration
             yield token
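One caveat the formatting pass leaves untouched: under PEP 479 (the default since Python 3.7), `raise StopIteration` inside a generator no longer ends iteration but escapes as a `RuntimeError`. Since the file targets python3, a plain `return` is the safe way to end the generator; a sketch of the fixed method:

```python
# PEP 479-safe variant of Analysis.__iter__: `return` ends a generator
# cleanly, while `raise StopIteration` becomes RuntimeError on 3.7+.
def __iter__(self):
    while True:
        token = self.getNextToken()
        if token is None:
            return  # instead of: raise StopIteration
        yield token
```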
@@ -146,7 +156,7 @@ class Analysis:
         return self.text[self.pos]
 
     # check whether the character is a Chinese character (excluding Chinese punctuation)
-    def isChineseChar(self,charater):
+    def isChineseChar(self, charater):
         return 0x4e00 <= ord(charater) < 0x9fa6
 
     # check whether the character is ASCII
@@ -163,8 +173,8 @@ class Analysis:
         while self.pos < self.textLength:
             if self.isChineseChar(self.getNextChar()):
                 token = self.getChineseWords()
-            else :
-                token = self.getASCIIWords()+'/'
+            else:
+                token = self.getASCIIWords() + '/'
             if len(token) > 0:
                 return token
         return None
@@ -211,7 +221,7 @@ class Analysis:
             chunks = self.complexCompare.svmlFilter(chunks)
             if len(chunks) > 1:
                 chunks = self.complexCompare.logFreqFilter(chunks)
-        if len(chunks) == 0 :
+        if len(chunks) == 0:
             return ''
 
         # in the end only one segmentation remains
@@ -242,13 +252,13 @@ class Analysis:
                         for word3 in words3:
                             # print(word3.length, word3.text)
                             if word3.length == -1:
-                                chunk = Chunk(word1,word2)
+                                chunk = Chunk(word1, word2)
                                 # print("Ture")
-                            else :
-                                chunk = Chunk(word1,word2,word3)
+                            else:
+                                chunk = Chunk(word1, word2, word3)
                             chunks.append(chunk)
                     elif self.pos == self.textLength:
-                        chunks.append(Chunk(word1,word2))
+                        chunks.append(Chunk(word1, word2))
                     self.pos -= len(word2.text)
             elif self.pos == self.textLength:
                 chunks.append(Chunk(word1))
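The hunk above sits in MMSEG's chunk enumeration: starting at the current position it builds every candidate chunk of up to three dictionary words, restoring `self.pos` after each branch, and only emits a shorter chunk when the text runs out. The same enumeration written recursively, as a standalone sketch (the toy `match_words` stands in for the module's dictionary lookup):

```python
def enumerate_chunks(text, pos, match_words, depth=3):
    # A chunk may hold up to `depth` words; shorter chunks appear only
    # when the end of the text is reached, as in the loops above.
    if depth == 0 or pos == len(text):
        return [[]]
    chunks = []
    for word in match_words(text, pos):
        for rest in enumerate_chunks(text, pos + len(word), match_words, depth - 1):
            chunks.append([word] + rest)
    return chunks


def match_words(text, pos):
    # Toy lookup: every single character is a word, plus the word "ab".
    words = [text[pos]]
    if text[pos:pos + 2] == "ab":
        words.append("ab")
    return words


print(enumerate_chunks("abc", 0, match_words))
# -> [['a', 'b', 'c'], ['ab', 'c']]
```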
@@ -268,7 +278,7 @@ class Analysis:
         words = []
         index = 0
         while self.pos < self.textLength:
-            if index >= maxWordLength :
+            if index >= maxWordLength:
                 break
             if not self.isChineseChar(self.getNextChar()):
                 break
@@ -288,18 +298,18 @@ class Analysis:
                 word.text = 'X'
             words.append(word)
 
-        self.cache[self.cacheIndex] = (self.pos,words)
+        self.cache[self.cacheIndex] = (self.pos, words)
         self.cacheIndex += 1
         if self.cacheIndex >= self.cacheSize:
             self.cacheIndex = 0
         return words
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
 
     def cuttest(text):
         #cut = Analysis(text)
-        tmp=""
+        tmp = ""
         try:
             for word in iter(Analysis(text)):
                 tmp += word
@@ -375,6 +385,8 @@ if __name__=="__main__":
     cuttest(u"好人使用了它就可以解决一些问题")
     cuttest(u"是因为和国家")
     cuttest(u"老年搜索还支持")
-    cuttest(u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
+    cuttest(
+        u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "
+    )
     cuttest("2022年12月30日是星期几?")
     cuttest("二零二二年十二月三十日是星期几?")
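For completeness, the module is driven exactly as `cuttest` shows: build an `Analysis` over a string and iterate; ASCII runs come back with a trailing `/`. A minimal sketch, assuming the module is importable as `mmseg` (the module name is an assumption), the `data/*.dic` dictionaries are present, and the PEP 479 fix noted earlier is applied:

```python
# Minimal driver mirroring cuttest(); module name `mmseg` is assumed.
from mmseg import Analysis

tmp = ""
for token in Analysis(u"研究生命起源"):
    tmp += token
print(tmp)
```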