Merge pull request #698 from yt605155624/thchs30_MFA

Thchs30 mfa
Hui Zhang 3 years ago committed by GitHub
commit ab5411ec16

@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
 FILES = [
-    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
-                               or fn.endswith('unittest.cc'))
+    fn for fn in FILES
+    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
+        'unittest.cc'))
 ]
 LIBS = ['stdc++']

@@ -0,0 +1,42 @@
# Forced alignment on the THCHS-30 dataset
-----
This experiment force-aligns the THCHS-30 Mandarin dataset with [Montreal-Forced-Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/index.html).
THCHS-30's text annotations come at three levels:
1. Character level (word): the dataset separates words with spaces; for this experiment we instead separate individual Chinese characters with spaces.
2. Syllable level (syllable): one Mandarin pinyin syllable.
3. Phone level (phone): a pinyin syllable is made up of several phones; Mandarin initials and finals can be treated as phones. Phone sets vary across datasets: THCHS-30's differs slightly from that of the Biaobei BZNSYP dataset.
The annotations for utterance A11_0 look like this:
```
绿 是 阳春 烟 景 大块 文章 的 底色 四月 的 林 峦 更是 绿 得 鲜活 秀媚 诗意 盎然
lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4 si4 yue4 de5 lin2 luan2 geng4 shi4 lv4 de5 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2
l v4 sh ix4 ii iang2 ch un1 ii ian1 j ing3 d a4 k uai4 uu un2 zh ang1 d e5 d i3 s e4 s iy4 vv ve4 d e5 l in2 l uan2 g eng4 sh ix4 l v4 d e5 x ian1 h uo2 x iu4 m ei4 sh ix1 ii i4 aa ang4 r an2
```
## Running the experiment
---
Under `<project root>/tools`, run
```
make
```
to download the MFA binary release (this also downloads the other tools this project needs).
Then run:
```
cd a0
./run.sh
```
The script automatically downloads THCHS-30, converts it into the file layout MFA expects, and starts training. Set the `LEXICON_NAME` parameter in `run.sh` to choose the level at which to force-align (word, syllable, or phone); see the sketch below.
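For example, to align at the syllable level you could set (a sketch; since `run.sh` sources `utils/parse_options.sh`, overriding the variable from the command line should also work):
```
# edit in run.sh
LEXICON_NAME='syllable'
# or, assuming parse_options.sh picks it up, override at launch time
./run.sh --LEXICON_NAME syllable
```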
## Dictionaries used by MFA
---
For the MFA dictionary format, see [MFA official docs: Dictionary format](https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html).
`phone.lexicon` is used as-is from `THCHS-30/data_thchs30/lm_phone/lexicon.txt`.
`word.lexicon` uses a **pronunciation dictionary with probabilities** to account for Chinese heteronyms; see `local/gen_word2phone.py` for the generation rules.
`syllable.lexicon` is taken from [DNSun/thchs30-pinyin2tone](https://github.com/DNSun/thchs30-pinyin2tone).
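For illustration, a heteronym such as 乐 (le4 / yue4) would appear in `word.lexicon` as one weighted entry per pronunciation, in the format `character probability phones` (the probabilities below are made up):
```
乐 0.83 l e4
乐 0.17 vv ve4
```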
## Alignment results
---
We provide trained MFA alignments, models, and dictionaries for all three levels (`syllable.lexicon` is in `data/dict`; `phone.lexicon` and `word.lexicon` are copied or generated automatically from the original dataset when the data-preparation code runs).
**phone level:** [phone.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/phone.lexicon), [alignments](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/thchs30_alignment.tar.gz), [model](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/thchs30_model.zip)
**syllable level:** [syllable.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/syllable.lexicon), [alignments](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_alignment.tar.gz), [model](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_model.zip)
**word level:** [word.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/word.lexicon), [alignments](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_alignment.tar.gz), [model](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_model.zip)
You can then follow [MFA official docs: Align using pretrained models](https://montreal-forced-aligner.readthedocs.io/en/stable/aligning.html#align-using-pretrained-models) to force-align your own dataset with the models provided here. Note that you must use the lexicon file that matches the model, and when the text consists of Chinese characters you must separate individual **characters** (not words) with spaces.
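A minimal sketch of that workflow, assuming the MFA 1.0 binaries installed under `tools` are on your `PATH` and `my_corpus` holds paired `*.wav` / `*.lab` files (corpus and output paths are placeholders):
```
mfa_align my_corpus data/dict/phone.lexicon exp/thchs30_model.zip exp/my_alignment
```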

File diff suppressed because it is too large

@@ -0,0 +1,47 @@
#!/usr/bin/env bash
stage=-1
stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1
# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
python3 ${TARGET_DIR}/thchs30/thchs30.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/thchs30"
if [ $? -ne 0 ]; then
echo "Prepare THCHS-30 failed. Terminated."
exit 1
fi
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
mkdir -p data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
# reorganize dataset for MFA
# the corpus is written to data/thchs30_corpus (EXP_DIR is not set in this script)
if [ ! -d data/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
echo "THCHS-30 data preparation done."
exit 0
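After this script succeeds, `data/` should contain roughly the following (a sketch based on the steps above; exact contents depend on the manifest):
```
data/
├── manifest.train                                 # generated by thchs30.py
├── wav.scp  text.word  text.syllable  text.phone  # dumped from the manifest
├── dict/
│   ├── lm_word_lexicon_1  lm_word_lexicon_2       # copied word lexicons
│   ├── phone.lexicon                              # copied phone lexicon
│   └── word.lexicon                               # generated by gen_word2phone.py
└── thchs30_corpus/                                # *.wav symlinks + *.lab files for MFA
```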

@@ -0,0 +1,94 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gen Chinese characters to THCHS30-30 phone lexicon using THCHS30-30's lexicon
file1: THCHS-30/data_thchs30/lm_word/lexicon.txt
file2: THCHS-30/resource/dict/lexicon.txt
"""
import argparse
from collections import defaultdict
from pathlib import Path
from typing import Union
# key: (cn, ('ee', 'er4')), value: count
cn_phones_counter = defaultdict(int)
# key: cn, value: list of (phones, num)
cn_counter = defaultdict(list)
# key: cn, value: list of (phones, probabilities)
cn_counter_p = defaultdict(list)
def is_Chinese(ch):
if '\u4e00' <= ch <= '\u9fff':
return True
return False
def proc_line(line):
    line = line.strip()
    # guard against blank lines before indexing line[0]
    if line and is_Chinese(line[0]):
line_list = line.split()
cn_list = line_list[0]
phone_list = line_list[1:]
if len(cn_list) == len(phone_list) / 2:
new_phone_list = [(phone_list[i], phone_list[i + 1])
for i in range(0, len(phone_list), 2)]
assert len(cn_list) == len(new_phone_list)
for idx, cn in enumerate(cn_list):
phones = new_phone_list[idx]
cn_phones_counter[(cn, phones)] += 1
def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
with open(file1, "r") as f1:
for line in f1:
proc_line(line)
with open(file2, "r") as f2:
for line in f2:
proc_line(line)
for key in cn_phones_counter:
cn = key[0]
cn_counter[cn].append((key[1], cn_phones_counter[key]))
for key in cn_counter:
phone_count_list = cn_counter[key]
count_sum = sum([x[1] for x in phone_count_list])
for item in phone_count_list:
p = item[1] / count_sum
p = round(p, 2)
if p > 0:
cn_counter_p[key].append((item[0], p))
with open(write_file, "w") as wf:
for key in cn_counter_p:
phone_p_list = cn_counter_p[key]
for item in phone_p_list:
phones, p = item
wf.write(key + " " + str(p) + " " + " ".join(phones) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser.add_argument(
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
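For example (hypothetical lexicon content), given the input lines below, the script counts character/phone-pair co-occurrences across both lexicon files and writes one weighted entry per surviving pronunciation:
```
# input (lexicon format: word followed by (initial, final) pairs)
绿 l v4
绿色 l v4 s e4
# resulting word.lexicon entries
绿 1.0 l v4
色 1.0 s e4
```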

@@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
wav_scp_path = root_dir / 'wav.scp'
with open(wav_scp_path, 'r') as rf:
for line in rf:
utt, feat = line.strip().split()
wav_path = feat
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def write_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
    # script_type must be one of {'word', 'syllable', 'phone'}
json_name = 'text.' + script_type
json_path = root_dir / json_name
with open(json_path, 'r') as rf:
for line in rf:
line = line.strip().split()
utt_id = line[0]
context = ' '.join(line[1:])
transcript_name = utt_id + '.lab'
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
if script_type == 'word':
# add space between chinese char
context = ''.join([f + ' ' for f in context])[:-1]
wf.write(context + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)

@@ -0,0 +1,13 @@
export MAIN_ROOT=${PWD}/../../../
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH

@@ -0,0 +1,32 @@
#!/bin/bash
set -e
source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS=$((`nproc`/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download the dataset, unzip it and generate the manifest,
# then gen lexicons, relink wavs, gen labs, and dump the manifest (see local/data.sh)
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh $LEXICON_NAME || exit -1
fi
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi

@@ -1,7 +1,8 @@
SHELL:= /bin/bash
PYTHON:= python3.7
.PHONY: all clean
-all: virtualenv kenlm.done sox.done soxbindings.done
+all: virtualenv kenlm.done sox.done soxbindings.done mfa.done
virtualenv:
test -d venv || virtualenv -p $(PYTHON) venv
@@ -33,3 +34,8 @@ soxbindings.done:
test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git
source venv/bin/activate; cd soxbindings && python setup.py install
touch soxbindings.done
mfa.done:
test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
tar xvf montreal-forced-aligner_linux.tar.gz
touch mfa.done

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = {
'text': 'text.word',
'syllable': 'text.syllable',
'phone': 'text.phone',
'feat': 'wav.scp',
}
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
first_line = manifest_jsons[0]
file_map = {}
for k in first_line.keys():
if k not in key_whitelist:
continue
file_map[k] = open(output_dir / filename[k], 'w')
for line_json in manifest_jsons:
for k in line_json.keys():
if k not in key_whitelist:
continue
file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')
for _, file in file_map.items():
file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="dump manifest to wav.scp text.word ...")
parser.add_argument("--manifest-path", type=str, help="path to manifest")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
args = parser.parse_args()
dump_manifest(args.manifest_path, args.output_dir)
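For example, a manifest line like the following (a hypothetical entry; real manifests carry more fields, which the whitelist ignores) produces one line in each of `wav.scp`, `text.word`, `text.syllable` and `text.phone`:
```
{"utt": "A11_0", "feat": "/path/to/A11_0.wav", "text": "绿 是 阳春 ...", "syllable": "lv4 shi4 ...", "phone": "l v4 sh ix4 ..."}
```
Each output line is the utterance id followed by the corresponding field, e.g. `A11_0 /path/to/A11_0.wav` in `wav.scp`.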