restructure thchs30/a0

5 years ago · 00017301c6
parent c0ee57d400
commit 00017301c6
10 changed files with 188 additions and 8277 deletions
--- a/examples/thchs30/README.md
+++ b/examples/thchs30/README.md
@ -1,2 +1,7 @@
 This is an example of running MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
 To get started, `cd a0` and run `run.sh`.
 MFA 对齐所使用的字典
 MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
 phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
 word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py
--- a/examples/thchs30/a0/data/dict/syllable.lexicon
+++ b/examples/thchs30/a0/data/dict/syllable.lexicon
--- a/examples/thchs30/a0/local/data.sh
+++ b/examples/thchs30/a0/local/data.sh
@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
 mkdir -p data
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
 LEXICON_NAME=$1
 # download data, generate manifests
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/thchs30/thchs30.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/thchs30"
@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 # dump manifest to data/
 python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
 # copy files to data/dict to gen word.lexicon
 cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
 cp  ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
 # copy phone.lexicon to data/dict
 cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
 # gen word.lexicon
 python local/gen_word2phone.py  --root-dir=data/dict --output-dir=data/dict
 # reorganize dataset for MFA
 # The corpus is written to data/thchs30_corpus, so that is the directory the
 # guard has to test. Checking $EXP_DIR/thchs30_corpus (EXP_DIR is not set in
 # the visible part of this script) would not test the directory created below.
 if [ ! -d data/thchs30_corpus ]; then
    echo "reorganizing thchs30 corpus..."
    python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
    echo "reorganization done."
 fi
 echo "THCHS-30  data preparation done."
 exit 0
--- a/examples/thchs30/a0/local/gen_word2phone.py
+++ b/examples/thchs30/a0/local/gen_word2phone.py
@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
-    file1 = root_dir / "data_thchs30/lm_word/lexicon.txt"
+    file1 = root_dir / "lm_word_lexicon_1"
-    file2 = root_dir / "resource/dict/lexicon.txt"
+    file2 = root_dir / "lm_word_lexicon_2"
-    write_file = output_dir / "thchs30_cn2phone"
+    write_file = output_dir / "word.lexicon"
    with open(file1, "r") as f1:
        for line in f1:
@ -87,10 +87,8 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
    )
    parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
    parser.add_argument(
-        "--output-dir",
+        "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
-        type=str,
+    parser.add_argument("--output-dir", type=str, help="path to save outputs")
        help="path to save outputs(audio and transcriptions)")
    args = parser.parse_args()
    gen_lexicon(args.root_dir, args.output_dir)
--- a/examples/thchs30/a0/local/recorganize_thchs30.py
+++ b/examples/thchs30/a0/local/recorganize_thchs30.py
@ -1,112 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Recorganize THCHS-30 for MFA
 read manifest.train from root-dir
 Link *.wav to output-dir
 dump *.lab from manifest.train, such as: text、syllable and phone
 Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 """
 import argparse
 import os
 from pathlib import Path
 from typing import Union
 from deepspeech.frontend.utility import read_manifest
 def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink each audio file listed in ``manifest.train`` into ``output_dir``.

    Args:
        root_dir: directory holding ``manifest.train``; it is joined with
            ``/``, so it is used as a ``Path`` here.
        output_dir: directory in which the links are created. Each link is
            named after the last ``/``-separated component of the entry's
            ``feat`` path.
    """
    entries = read_manifest(root_dir / "manifest.train")
    for entry in entries:
        source = entry['feat']
        link_name = source.split("/")[-1]
        os.symlink(source, output_dir / link_name)
 def link_lexicon(root_dir: Union[str, Path],
                  output_dir: Union[str, Path],
                  script_type='phone'):
    """Symlink the lexicon matching ``script_type`` next to ``output_dir``.

    The link is created in the parent directory of ``output_dir`` and is
    named ``<script_type>.lexicon``.

    Args:
        root_dir: directory holding ``manifest.train``.
        output_dir: corpus directory; only its parent directory is used.
        script_type: ``'phone'`` links ``lm_phone/lexicon.txt`` found one
            level above the directory of the first manifest wav;
            ``'syllable'`` links ``thchs30_pinyin2phone`` and any other value
            links ``thchs30_cn2phone``, both located beside this script.
    """
    first_entry = read_manifest(root_dir / "manifest.train")[0]
    first_wav = first_entry['feat']

    if script_type == 'phone':
        # find lexicon.txt in THCHS-30: go one level up from the wav's folder
        dataset_dir = Path(
            os.path.abspath(os.path.dirname(first_wav) + os.path.sep +
                            "..")).expanduser()
        source = dataset_dir / "lm_phone" / "lexicon.txt"
    else:
        # 'syllable' and 'text' lexicons live in the dir of this py file
        script_dir = Path(os.path.split(os.path.realpath(__file__))[
            0]).expanduser()
        if script_type == 'syllable':
            source = script_dir / "thchs30_pinyin2phone"
        else:
            source = script_dir / "thchs30_cn2phone"

    target = os.path.dirname(output_dir) + "/" + script_type + ".lexicon"
    os.symlink(source, target)
 def dump_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt>.lab`` transcript per entry of ``manifest.train``.

    Args:
        root_dir: directory holding ``manifest.train``.
        output_dir: directory receiving the ``.lab`` files.
        script_type: manifest key whose value is written; expected to be
            one of {'text', 'syllable', 'phone'}.
    """
    for entry in read_manifest(root_dir / "manifest.train"):
        lab_path = output_dir / (entry['utt'] + ".lab")
        with open(lab_path, 'wt') as lab_file:
            lab_file.write(entry[script_type] + "\n")
 def reorganize_thchs30(root_dir: Union[str, Path],
                        output_dir: Union[str, Path]=None,
                        script_type='phone'):
    """Build the MFA corpus: wav links, ``.lab`` transcripts and the lexicon link.

    Args:
        root_dir: directory holding ``manifest.train``.
        output_dir: corpus directory; created (with parents) if missing.
        script_type: transcript type forwarded to ``dump_lab`` and
            ``link_lexicon``.
    """
    source_dir = Path(root_dir).expanduser()
    corpus_dir = Path(output_dir).expanduser()
    corpus_dir.mkdir(parents=True, exist_ok=True)

    link_wav(source_dir, corpus_dir)
    dump_lab(source_dir, corpus_dir, script_type)
    link_lexicon(source_dir, corpus_dir, script_type)
 if __name__ == "__main__":
    # Command-line entry point: collect the three options and run the
    # reorganization.
    cli = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    cli.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
    cli.add_argument(
        "--output-dir",
        type=str,
        help="path to save outputs(audio and transcriptions)")
    cli.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab (text'/'syllable'/'phone')")
    options = cli.parse_args()
    reorganize_thchs30(options.root_dir, options.output_dir,
                       options.script_type)
--- a/examples/thchs30/a0/local/reorganize_thchs30.py
+++ b/examples/thchs30/a0/local/reorganize_thchs30.py
@ -0,0 +1,83 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Recorganize THCHS-30 for MFA
 read manifest.train from root-dir
 Link *.wav to output-dir
 dump *.lab from manifest.train, such as: text、syllable and phone
 Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 """
 import argparse
 import os
 from pathlib import Path
 from typing import Union
 def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every wav listed in ``root_dir/wav.scp`` into ``output_dir``.

    Each non-empty line of ``wav.scp`` has the form ``<utt_id> <wav_path>``.
    The link is named after the last ``/``-separated component of
    ``<wav_path>`` and points at ``<wav_path>`` as written in the file.

    Args:
        root_dir: directory that contains ``wav.scp``.
        output_dir: existing directory in which the links are created.

    Raises:
        ValueError: if a non-empty line has no wav path after the utt id.
        FileExistsError: if the link name is taken by something that is not
            a symlink.
    """
    # Accept plain strings as the annotations promise; `str / str` would fail.
    wav_scp_path = Path(root_dir) / 'wav.scp'
    output_dir = Path(output_dir)
    with open(wav_scp_path, 'r') as rf:
        for line in rf:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of failing on the unpack below
                continue
            # split once only, so a wav path containing spaces stays intact
            _, wav_path = line.split(maxsplit=1)
            wav_name = wav_path.split("/")[-1]
            new_wav_path = output_dir / wav_name
            if os.path.islink(new_wav_path):
                # left over from a previous run: refresh it so the script
                # can be re-run; real files are never removed
                os.remove(new_wav_path)
            os.symlink(wav_path, new_wav_path)
 def write_lab(root_dir: Union[str, Path],
               output_dir: Union[str, Path],
               script_type='phone'):
    """Write one ``<utt_id>.lab`` file per line of ``root_dir/text.<script_type>``.

    Each non-empty input line is ``<utt_id> <token> <token> ...``. The tokens
    are re-joined with single spaces and written, followed by a newline, to
    ``output_dir/<utt_id>.lab``. For ``script_type == 'word'`` a space is
    additionally inserted between every pair of adjacent characters.

    Args:
        root_dir: directory that contains ``text.<script_type>``.
        output_dir: existing directory receiving the ``.lab`` files.
        script_type: one of {'word', 'syllable', 'phone'}; selects the input
            file.
    """
    # Accept plain strings as the annotations promise; `str / str` would fail.
    text_path = Path(root_dir) / ('text.' + script_type)
    output_dir = Path(output_dir)
    with open(text_path, 'r') as rf:
        for line in rf:
            fields = line.strip().split()
            if not fields:
                # a blank line has no utt id; skip it instead of crashing
                continue
            utt_id = fields[0]
            context = ' '.join(fields[1:])
            if script_type == 'word':
                # add space between chinese char
                context = ' '.join(context)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt') as wf:
                wf.write(context + "\n")
 def reorganize_thchs30(root_dir: Union[str, Path],
                        output_dir: Union[str, Path]=None,
                        script_type='phone'):
    """Build the MFA corpus directory: wav symlinks plus ``.lab`` transcripts.

    Args:
        root_dir: directory holding ``wav.scp`` and ``text.<script_type>``.
        output_dir: corpus directory, created (with parents) if missing.
            When omitted, ``root_dir/thchs30_corpus`` is used.
        script_type: transcript type forwarded to ``write_lab``.
    """
    root_dir = Path(root_dir).expanduser()
    if output_dir is None:
        # The signature advertises None as the default, but Path(None) raises
        # TypeError; fall back to a corpus directory under root_dir.
        output_dir = root_dir / "thchs30_corpus"
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)
 if __name__ == "__main__":
    # Command-line entry point: parse the two paths and the transcript type,
    # then build the MFA corpus directory.
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    # Both paths are mandatory: when one is missing, argparse now reports it
    # instead of the script failing later with a TypeError from Path(None).
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="dir that holds wav.scp and text.<script-type>")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")
    # Restrict to the transcript types for which a text.<type> file is read.
    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        choices=["word", "syllable", "phone"],
        help="type of lab ('word'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
--- a/examples/thchs30/a0/local/thchs30_cn2phone
+++ b/examples/thchs30/a0/local/thchs30_cn2phone
--- a/examples/thchs30/a0/path.sh
+++ b/examples/thchs30/a0/path.sh
@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
+# MFA is in tools
-MODEL=deepspeech2
+export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH
 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
--- a/examples/thchs30/a0/run.sh
+++ b/examples/thchs30/a0/run.sh
@ -4,33 +4,26 @@ source path.sh
 stage=0
 stop_stage=100
 EXP_DIR=exp
-# LEXICON_NAME in {'phone', 'syllable', 'text'}
+# LEXICON_NAME in {'phone', 'syllable', 'word'}
 LEXICON_NAME='phone'
-# get machine's cpu core number
+# set MFA num_jobs as half of machine's cpu core number
-NUM_JOBS=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+NUM_JOBS=$((`nproc`/2))
 NUM_JOBS=$((NUM_JOBS/2))
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 # download dataset、unzip and generate manifest 
 # gen lexicon relink gen dump
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
-    bash ./local/data.sh || exit -1
+    bash ./local/data.sh $LEXICON_NAME|| exit -1
 fi
-# reorganize dataset for MFA
+# run MFA
 if [ ! -d $EXP_DIR/thchs30_corpus ]; then
    echo "reorganizing thchs30 corpus..."
    python local/recorganize_thchs30.py --root-dir=./data --output-dir=$EXP_DIR/thchs30_corpus --script-type=$LEXICON_NAME
    echo "reorganization done."
 fi
 # MFA is in tools
 export PATH="${MAIN_ROOT}/tools/montreal-forced-aligner/bin"
 if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
    echo "Start MFA training..."
-    mfa_train_and_align $EXP_DIR/thchs30_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
+    mfa_train_and_align data/thchs30_corpus "data/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
    echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
 fi
    mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
@ -0,0 +1,63 @@
 #!/usr/bin/env python3
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """format manifest into wav.scp text.word [text.syllable text.phone]"""
 import argparse
 from pathlib import Path
 from typing import Union
 from deepspeech.frontend.utility import read_manifest
 # Output file name for each manifest key that gets dumped.
 filename = dict(
    text='text.word',
    syllable='text.syllable',
    phone='text.phone',
    feat='wav.scp')
 # Manifest keys that are dumped; every other key is ignored.
 key_whitelist = {'feat', 'text', 'syllable', 'phone'}
 def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one ``<utt> <value>`` text file per dumped key.

    For every manifest entry and every key of that entry that is in
    ``key_whitelist``, the line ``<utt> <value>`` is appended to the file
    named by ``filename[key]`` inside ``output_dir``.

    Args:
        manifest_path: path of the manifest handed to ``read_manifest``.
        output_dir: directory receiving the dumped files; created (with
            parents) if missing.
    """
    from contextlib import ExitStack

    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest_jsons = read_manifest(manifest_path)

    # ExitStack closes every opened file even when a write raises.
    with ExitStack() as stack:
        file_map = {}
        for line_json in manifest_jsons:
            for k in line_json.keys():
                if k not in key_whitelist:
                    continue
                if k not in file_map:
                    # Open on first use, so an empty manifest is a no-op and
                    # a key missing from the first entry is still handled.
                    file_map[k] = stack.enter_context(
                        open(output_dir / filename[k], 'w'))
                file_map[k].write(line_json['utt'] + ' ' + line_json[k] +
                                  '\n')
 if __name__ == "__main__":
    # Command-line entry point: dump the given manifest into --output-dir.
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both paths are mandatory: when one is missing, argparse now reports it
    # instead of the script failing later with a TypeError from Path(None).
    parser.add_argument(
        "--manifest-path",
        type=str,
        required=True,
        help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="dir to save the dumped files (wav.scp, text.word, ...)")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)