restructure thchs30/a0

pull/698/head
TianYuan 3 years ago
parent c0ee57d400
commit 00017301c6

@@ -1,2 +1,7 @@
This is an example of running MFA on the THCHS-30 dataset.
cd a0 and run run.sh to get started.
Lexicons used for MFA alignment:
The MFA dictionary format is documented at: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon is taken directly from THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon is a lexicon with pronunciation probabilities; see local/gen_word2phone.py for how it is generated.
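As a quick illustration (the entry below is hypothetical; the MFA dictionary page linked above is the authoritative format reference), a probabilistic lexicon line is "word probability phone1 phone2 ..." and can be split like this:

# minimal sketch: parse one hypothetical probabilistic lexicon line
def parse_lexicon_line(line: str):
    word, prob, *phones = line.strip().split()
    return word, float(prob), phones

word, prob, phones = parse_lexicon_line("中国 1.0 zh ong1 g uo2")
print(word, prob, phones)  # 中国 1.0 ['zh', 'ong1', 'g', 'uo2']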

@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1
# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/thchs30/thchs30.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/thchs30"
@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
# reorganize dataset for MFA
if [ ! -d data/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
echo "THCHS-30 data preparation done."
exit 0
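A small sanity check after this stage can be handy; the sketch below only lists files that the script above copies or generates (paths are relative to this example directory):

from pathlib import Path

# files/dirs produced by local/data.sh (see the steps above)
expected = [
    "data/wav.scp",
    "data/dict/phone.lexicon",
    "data/dict/word.lexicon",
    "data/dict/lm_word_lexicon_1",
    "data/dict/lm_word_lexicon_2",
    "data/thchs30_corpus",
]
for p in expected:
    print(p, "OK" if Path(p).exists() else "MISSING")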

@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "data_thchs30/lm_word/lexicon.txt"
file2 = root_dir / "resource/dict/lexicon.txt"
write_file = output_dir / "thchs30_cn2phone"
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
with open(file1, "r") as f1:
for line in f1:
@@ -87,10 +87,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
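For intuition only, the sketch below shows one way per-word pronunciation probabilities could be derived as relative frequencies; it is not necessarily the rule local/gen_word2phone.py applies (that file is the reference), and the example pairs are made up:

from collections import Counter, defaultdict

# hypothetical (word, pronunciation) pairs collected from the two source lexicons
pairs = [
    ("行", "x ing2"),
    ("行", "h ang2"),
    ("行", "x ing2"),
]

counts = defaultdict(Counter)
for word, pron in pairs:
    counts[word][pron] += 1

# one "word prob phones" line per pronunciation, probability = relative frequency
for word, prons in counts.items():
    total = sum(prons.values())
    for pron, n in prons.most_common():
        print(f"{word} {n / total:.2f} {pron}")
# 行 0.67 x ing2
# 行 0.33 h ang2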

@@ -1,112 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
wav_path = line_json['feat']
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def link_lexicon(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
line_json = manifest_jsons[0]
wav_path = line_json['feat']
if script_type == 'phone':
# find lexicon.txt in THCHS-30
grader_father = os.path.abspath(
os.path.dirname(wav_path) + os.path.sep + "..")
grader_father = Path(grader_father).expanduser()
lexicon_name = "lexicon.txt"
lexicon_father_dir = "lm_phone"
lexicon_path = grader_father / lexicon_father_dir / lexicon_name
elif script_type == 'syllable':
# find thchs30_pinyin2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_pinyin2phone"
else:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_cn2phone"
new_lexicon_name = script_type + ".lexicon"
new_lexicon_path = os.path.dirname(output_dir) + "/" + new_lexicon_name
os.symlink(lexicon_path, new_lexicon_path)
def dump_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can be one of {'text', 'syllable', 'phone'}
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
utt_id = line_json['utt']
transcript_name = utt_id + ".lab"
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
wf.write(line_json[script_type] + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
dump_lab(root_dir, output_dir, script_type)
link_lexicon(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab (text'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)

@@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
wav_scp_path = root_dir / 'wav.scp'
with open(wav_scp_path, 'r') as rf:
for line in rf:
utt, feat = line.strip().split()
wav_path = feat
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def write_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can be one of {'word', 'syllable', 'phone'}
json_name = 'text.' + script_type
json_path = root_dir / json_name
with open(json_path, 'r') as rf:
for line in rf:
line = line.strip().split()
utt_id = line[0]
context = ' '.join(line[1:])
transcript_name = utt_id + '.lab'
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
if script_type == 'word':
# add spaces between Chinese characters
context = ''.join([f + ' ' for f in context])[:-1]
wf.write(context + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
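MFA expects each utterance's audio and its .lab transcript to sit side by side in the corpus directory, which is what the reorganization above produces. A minimal check of that pairing (the corpus path matches the output-dir used in local/data.sh; everything else is illustrative):

from pathlib import Path

corpus = Path("data/thchs30_corpus")  # output-dir used in local/data.sh
missing = [w.stem for w in corpus.glob("*.wav")
           if not (corpus / (w.stem + ".lab")).exists()]
print(f"{len(missing)} wav files are missing a .lab transcript")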

File diff suppressed because it is too large.

@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH

@@ -4,33 +4,26 @@ source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'text'}
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# get machine's cpu core number
NUM_JOBS=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
NUM_JOBS=$((NUM_JOBS/2))
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS=$((`nproc`/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download the dataset, unzip it and generate the manifest
# gen lexicons, relink wavs and dump .lab files
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
bash ./local/data.sh $LEXICON_NAME || exit -1
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/recorganize_thchs30.py --root-dir=./data --output-dir=$EXP_DIR/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
# MFA is in tools
export PATH="${MAIN_ROOT}/tools/montreal-forced-aligner/bin"
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/thchs30_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
mfa_train_and_align data/thchs30_corpus "data/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = {
'text': 'text.word',
'syllable': 'text.syllable',
'phone': 'text.phone',
'feat': 'wav.scp',
}
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
first_line = manifest_jsons[0]
file_map = {}
for k in first_line.keys():
if k not in key_whitelist:
continue
file_map[k] = open(output_dir / filename[k], 'w')
for line_json in manifest_jsons:
for k in line_json.keys():
if k not in key_whitelist:
continue
file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')
for _, file in file_map.items():
file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="dump manifest to wav.scp text.word ...")
parser.add_argument("--manifest-path", type=str, help="path to manifest")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
args = parser.parse_args()
dump_manifest(args.manifest_path, args.output_dir)
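To make the mapping concrete, here is a hypothetical manifest line and the wav.scp / text.* lines it would turn into (utterance id, path and transcripts are made up; the real manifest is read with read_manifest as above):

import json

# one hypothetical manifest.train entry
line = json.dumps({
    "utt": "A02_001",
    "feat": "/path/to/data_thchs30/train/A02_001.wav",
    "text": "你 好",
    "phone": "n i3 h ao3",
})
record = json.loads(line)
# per key, dump_manifest writes "<utt> <value>" into the mapped file:
#   feat  -> wav.scp:    A02_001 /path/to/data_thchs30/train/A02_001.wav
#   text  -> text.word:  A02_001 你 好
#   phone -> text.phone: A02_001 n i3 h ao3
for key, fname in (("feat", "wav.scp"), ("text", "text.word"), ("phone", "text.phone")):
    print(f"{fname}: {record['utt']} {record[key]}")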