commit
5681736acc
@ -0,0 +1,48 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Alignment for U2 model."""
from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.exps.u2.model import U2Tester as Tester
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments


def main_sp(config, args):
    exp = Tester(config, args)
    exp.setup()
    exp.run_align()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
@ -1,4 +1,4 @@
# ASR PostProcess
# ASR Text Backend

1. [Text Segmentation](text_front_end#text segmentation)
2. Text Corrector
@ -0,0 +1,5 @@
# Decoding

## Reference

* [Timestamps and N-Best (时间戳和N-Best)](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)
@ -0,0 +1,61 @@
# Features

### Speech Recognition

* Offline
  * [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
  * [Transformer](https://arxiv.org/abs/1706.03762)
  * [Conformer](https://arxiv.org/abs/2005.08100)

* Online
  * [U2](https://arxiv.org/pdf/2012.05481.pdf)

### Language Model

* Ngram

### Decoder

* ctc greedy (a minimal decoding sketch is given at the end of this page)
* ctc prefix beam search
* greedy
* beam search
* attention rescore

### Speech Frontend

* Audio
  * Auto Gain
* Feature
  * kaldi fbank
  * kaldi mfcc
  * linear
  * delta delta

### Speech Augmentation

* Audio
  - Volume Perturbation
  - Speed Perturbation
  - Shifting Perturbation
  - Online Bayesian normalization
  - Noise Perturbation
  - Impulse Response
* Spectrum
  - SpecAugment
  - Adaptive SpecAugment

### Tokenizer

* Chinese/English Character
* English Word
* Sentence Piece

### Word Segmentation

* [mmseg](http://technology.chtsai.org/mmseg/)

### Grapheme To Phoneme

* syllable
* phoneme
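As referenced in the decoder list above, here is a minimal, generic numpy sketch of CTC greedy (best-path) decoding. It is an illustration only, not the decoder implementation shipped with DeepSpeech.

```python
import numpy as np

def ctc_greedy_decode(log_probs: np.ndarray, blank: int = 0) -> list:
    """Best-path CTC decoding: per-frame argmax, collapse repeats, drop blanks."""
    best_path = log_probs.argmax(axis=-1)  # (T,) frame-wise argmax
    out, prev = [], None
    for t in best_path:
        if t != prev and t != blank:
            out.append(int(t))
        prev = t
    return out

# toy example: 4 frames, 3 output units (0 = blank)
probs = np.log(np.array([[0.1, 0.8, 0.1],
                         [0.1, 0.8, 0.1],
                         [0.7, 0.2, 0.1],
                         [0.1, 0.1, 0.8]]))
print(ctc_greedy_decode(probs))  # [1, 2]
```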
@ -0,0 +1,3 @@
# Useful Tools

* [Regex visualization and common regular expressions (正则可视化和常用正则表达式)](https://wangwl.net/static/projects/visualRegex/#)
@ -1,7 +1,8 @@
# Aishell-1

## CTC
| Model | Config | Test set | CER |
| --- | --- | --- | --- |
| DeepSpeech2 | conf/deepspeech2.yaml | test | 0.078977 |
| DeepSpeech2 | release 1.8.5 | test | 0.080447 |

## Deepspeech2
| Model | release | Config | Test set | CER |
| --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1 | conf/deepspeech2.yaml | test | 0.078671 |
| DeepSpeech2 | 2.0 | conf/deepspeech2.yaml | test | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | 0.080447 |
@ -1,23 +0,0 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 2 ];then
|
||||
echo "usage: ${0} ckpt_dir avg_num"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=${1}
|
||||
average_num=${2}
|
||||
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
|
||||
|
||||
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
|
||||
--dst_model ${decode_checkpoint} \
|
||||
--ckpt_dir ${ckpt_dir} \
|
||||
--num ${average_num} \
|
||||
--val_best
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in avg ckpt!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -1,23 +0,0 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 2 ]; then
|
||||
echo "usage: ${0} ckpt_dir avg_num"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=${1}
|
||||
average_num=${2}
|
||||
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
|
||||
|
||||
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
|
||||
--dst_model ${decode_checkpoint} \
|
||||
--ckpt_dir ${ckpt_dir} \
|
||||
--num ${average_num} \
|
||||
--val_best
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in avg ckpt!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -0,0 +1,2 @@
data
exp
@ -0,0 +1,85 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
# A parser for CC-CEDICT. Converts the Chinese-English dictionary into a list of
# python dictionaries with "traditional", "simplified", "pinyin", and "english" keys.
# Unlike the original script, this copy takes the dictionary path and the output path
# on the command line, and comment lines (e.g. the copyright header of cedict_ts.u8)
# are skipped automatically.
# Characters that are commonly used as surnames have two entries in CC-CEDICT; this
# program removes the surname entry if there is another entry for the character.
# If you want to keep the surnames, drop the remove_surnames() call in main().
# This code was written by Franki Allegra in February 2020.
|
||||
import json
|
||||
import sys
|
||||
|
||||
# usage: bin ccedict dump.json
|
||||
|
||||
with open(sys.argv[1], 'rt') as file:
|
||||
text = file.read()
|
||||
lines = text.split('\n')
|
||||
dict_lines = list(lines)
|
||||
|
||||
def parse_line(line):
|
||||
parsed = {}
|
||||
if line == '':
# skip empty lines without mutating dict_lines, which main() is iterating over
return 0
|
||||
if line.startswith('#'):
|
||||
return 0
|
||||
if line.startswith('%'):
|
||||
return 0
|
||||
line = line.rstrip('/')
|
||||
line = line.split('/')
|
||||
if len(line) <= 1:
|
||||
return 0
|
||||
english = line[1]
|
||||
char_and_pinyin = line[0].split('[')
|
||||
characters = char_and_pinyin[0]
|
||||
characters = characters.split()
|
||||
traditional = characters[0]
|
||||
simplified = characters[1]
|
||||
pinyin = char_and_pinyin[1]
|
||||
pinyin = pinyin.rstrip()
|
||||
pinyin = pinyin.rstrip("]")
|
||||
parsed['traditional'] = traditional
|
||||
parsed['simplified'] = simplified
|
||||
parsed['pinyin'] = pinyin
|
||||
parsed['english'] = english
|
||||
list_of_dicts.append(parsed)
|
||||
|
||||
def remove_surnames():
# start from the second-to-last entry: a surname entry is compared with the
# entry after it, so the last entry has nothing to compare against
for x in range(len(list_of_dicts) - 2, -1, -1):
if "surname " in list_of_dicts[x]['english']:
if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1]['traditional']:
list_of_dicts.pop(x)
|
||||
|
||||
def main():
|
||||
|
||||
#make each line into a dictionary
|
||||
print("Parsing dictionary . . .")
|
||||
for line in dict_lines:
|
||||
parse_line(line)
|
||||
|
||||
#remove entries for surnames from the data (optional):
|
||||
print("Removing Surnames . . .")
|
||||
remove_surnames()
|
||||
|
||||
print("Saving to database (this may take a few minutes) . . .")
|
||||
with open(sys.argv[2], 'wt') as fout:
|
||||
for one_dict in list_of_dicts:
|
||||
json_str = json.dumps(one_dict)
|
||||
fout.write(json_str + "\n")
|
||||
print('Done!')
|
||||
|
||||
|
||||
list_of_dicts = []
|
||||
parsed_dict = main()
|
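To make the field layout concrete, here is a small standalone sketch that walks one made-up but well-formed CC-CEDICT line through the same string operations `parse_line` performs; like `parse_line`, it keeps only the first English gloss.

```python
# Illustrative only; mirrors the splitting done by parse_line() above.
line = "中國 中国 [Zhong1 guo2] /China/Middle Kingdom/"
fields = line.rstrip('/').split('/')
english = fields[1]                               # 'China' (first gloss only)
char_and_pinyin = fields[0].split('[')
traditional, simplified = char_and_pinyin[0].split()
pinyin = char_and_pinyin[1].rstrip().rstrip(']')
print({'traditional': traditional, 'simplified': simplified,
       'pinyin': pinyin, 'english': english})
# {'traditional': '中國', 'simplified': '中国', 'pinyin': 'Zhong1 guo2', 'english': 'China'}
```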
@ -0,0 +1,10 @@
|
||||
export MAIN_ROOT=${PWD}/../../
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}
|
@ -0,0 +1,39 @@
|
||||
#!/bin/bash
|
||||
|
||||
# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
|
||||
# The word dictionary of this website is based on CC-CEDICT.
|
||||
# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
|
||||
# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
|
||||
# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
|
||||
# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||
|
||||
|
||||
cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
|
||||
cedict=cedict_1_0_ts_utf-8_mdbg.zip
|
||||
|
||||
mkdir -p data
|
||||
|
||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
|
||||
test -f data/${cedict} || wget -O data/${cedict} ${cedict_url}
|
||||
pushd data
|
||||
unzip ${cedict}
|
||||
popd
|
||||
|
||||
fi
|
||||
|
||||
mkdir -p exp
|
||||
|
||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
|
||||
cp data/cedict_ts.u8 exp/cedict
|
||||
python3 local/parser.py exp/cedict exp/cedict.json
|
||||
fi
|
||||
|
@ -0,0 +1,2 @@
data
exp
@ -0,0 +1,5 @@
# Download Baker dataset

The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass the CAPTCHA in a browser to download the dataset.

Download URL: https://test.data-baker.com/#/data/index/source.
@ -0,0 +1,53 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import re
|
||||
|
||||
import jieba
|
||||
from pypinyin import lazy_pinyin
|
||||
from pypinyin import Style
|
||||
|
||||
|
||||
def extract_pinyin(source, target, use_jieba=False):
|
||||
with open(source, 'rt', encoding='utf-8') as fin:
|
||||
with open(target, 'wt', encoding='utf-8') as fout:
|
||||
for i, line in enumerate(fin):
|
||||
if i % 2 == 0:
|
||||
sentence_id, raw_text = line.strip().split()
|
||||
raw_text = re.sub(r'#\d', '', raw_text)
|
||||
if use_jieba:
|
||||
raw_text = jieba.lcut(raw_text)
|
||||
syllables = lazy_pinyin(
|
||||
raw_text,
|
||||
errors='ignore',
|
||||
style=Style.TONE3,
|
||||
neutral_tone_with_five=True)
|
||||
transcription = ' '.join(syllables)
|
||||
fout.write(f'{sentence_id} {transcription}\n')
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="extract baker pinyin labels")
|
||||
parser.add_argument(
|
||||
"input", type=str, help="source file of baker's prosody label file")
|
||||
parser.add_argument(
|
||||
"output", type=str, help="target file to write pinyin lables")
|
||||
parser.add_argument(
|
||||
"--use-jieba",
|
||||
action='store_true',
|
||||
help="use jieba for word segmentation.")
|
||||
args = parser.parse_args()
|
||||
extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
|
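As a quick sanity check of the pypinyin call used above (not part of the recipe), Style.TONE3 with neutral_tone_with_five=True should yield syllables with a trailing tone digit, using 5 for the neutral tone:

```python
from pypinyin import Style, lazy_pinyin

# expected shape of the syllables written to the target file above
print(lazy_pinyin('你好吗', style=Style.TONE3, neutral_tone_with_five=True))
# -> ['ni3', 'hao3', 'ma5']
```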
@ -0,0 +1,37 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
|
||||
def extract_pinyin_lables(source, target):
|
||||
"""Extract pinyin labels from Baker's prosody labeling."""
|
||||
with open(source, 'rt', encoding='utf-8') as fin:
|
||||
with open(target, 'wt', encoding='utf-8') as fout:
|
||||
for i, line in enumerate(fin):
|
||||
if i % 2 == 0:
|
||||
sentence_id, raw_text = line.strip().split()
|
||||
fout.write(f'{sentence_id} ')
|
||||
else:
|
||||
transcription = line.strip()
|
||||
fout.write(f'{transcription}\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="extract baker pinyin labels")
|
||||
parser.add_argument(
|
||||
"input", type=str, help="source file of baker's prosody label file")
|
||||
parser.add_argument(
|
||||
"output", type=str, help="target file to write pinyin lables")
|
||||
args = parser.parse_args()
|
||||
extract_pinyin_lables(args.input, args.output)
|
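For context, a hedged sketch of the layout this script expects: Baker's prosody label file alternates a sentence line ("<sentence_id> <text with #N prosody marks>") with a pinyin line, and the script joins each pair into one output line (the sample sentence below is made up):

```python
# Hypothetical two-line excerpt in the 000001-010000.txt layout (made-up sentence).
lines = ["000001\t今天#1天气#2很好#4。", "\tjin1 tian1 tian1 qi4 hen3 hao3"]
for i, line in enumerate(lines):
    if i % 2 == 0:                       # even lines: "<sentence_id> <text>"
        sentence_id, raw_text = line.strip().split()
        print(sentence_id, end=' ')
    else:                                # odd lines: the pinyin transcription
        print(line.strip())
# prints: 000001 jin1 tian1 tian1 qi4 hen3 hao3
```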
@ -0,0 +1,100 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
from typing import List, Union
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def erized(syllable: str) -> bool:
|
||||
"""Whether the syllable contains erhua effect.
|
||||
|
||||
Example
|
||||
--------
|
||||
huar -> True
|
||||
guanr -> True
|
||||
er -> False
|
||||
"""
|
||||
# note: for pinyin, len(syllable) >=2 is always true
|
||||
# if not: there is something wrong in the data
|
||||
assert len(syllable) >= 2, f"invalid syllable {syllable}"
|
||||
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||
|
||||
|
||||
def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
|
||||
"""
|
||||
Given a sequence of syllables from human annotation (reference), which
makes sandhi explicit, and a sequence of syllables from some simple g2p
program (generated), which does not consider sandhi, return the
reference sequence while ignoring sandhi.
|
||||
|
||||
Example
|
||||
--------
|
||||
['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
|
||||
"""
|
||||
i = 0
|
||||
j = 0
|
||||
|
||||
# sandhi ignored in the result while other errors are not included
|
||||
result = []
|
||||
while i < len(reference):
|
||||
if erized(reference[i]):
|
||||
result.append(reference[i])
|
||||
i += 1
|
||||
j += 2
|
||||
# compare against generated[j]: j can differ from i once an erhua
# syllable has been consumed above
elif (reference[i][:-1] == generated[j][:-1] and
reference[i][-1] == '2' and generated[j][-1] == '3'):
result.append(generated[j])
|
||||
i += 1
|
||||
j += 1
|
||||
else:
|
||||
result.append(reference[i])
|
||||
i += 1
|
||||
j += 1
|
||||
assert j == len(
|
||||
generated
|
||||
), "length of transcriptions mismatch, There may be some characters that are ignored in the generated transcription."
|
||||
return result
|
||||
|
||||
|
||||
def convert_transcriptions(reference: Union[str, Path], generated: Union[str, Path], output: Union[str, Path]):
|
||||
with open(reference, 'rt') as f_ref:
|
||||
with open(generated, 'rt') as f_gen:
|
||||
with open(output, 'wt') as f_out:
|
||||
for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
|
||||
sentence_id, ref_transcription = ref.strip().split(' ', 1)
|
||||
_, gen_transcription = gen.strip().split(' ', 1)
|
||||
try:
|
||||
result = ignore_sandhi(ref_transcription.split(),
|
||||
gen_transcription.split())
|
||||
result = ' '.join(result)
|
||||
except Exception:
|
||||
print(
|
||||
f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
|
||||
)
|
||||
result = ref_transcription
|
||||
f_out.write(f"{sentence_id} {result}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="reference transcription but ignore sandhi.")
|
||||
parser.add_argument(
|
||||
"--reference",
|
||||
type=str,
|
||||
help="path to the reference transcription of baker dataset.")
|
||||
parser.add_argument(
|
||||
"--generated", type=str, help="path to the generated transcription.")
|
||||
parser.add_argument("--output", type=str, help="path to save result.")
|
||||
args = parser.parse_args()
|
||||
convert_transcriptions(args.reference, args.generated, args.output)
|
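For intuition, two illustrative calls to `ignore_sandhi` from the script above (hypothetical syllable sequences; the trailing digit is the tone):

```python
# assumes ignore_sandhi() from the script above is in scope
ref = ['lao2', 'hu3']        # annotation with tone sandhi applied (3 -> 2)
gen = ['lao3', 'hu3']        # naive g2p output, no sandhi
assert ignore_sandhi(ref, gen) == ['lao3', 'hu3']

ref = ['huar2']              # annotation merges the erhua into one syllable
gen = ['hua2', 'er2']        # naive g2p splits it into two
assert ignore_sandhi(ref, gen) == ['huar2']
```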
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
exp_dir="exp"
|
||||
data_dir="data"
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||
|
||||
archive=${data_dir}/"BZNSYP.rar"
|
||||
if [ ! -f ${archive} ]; then
|
||||
echo "Baker Dataset not found! Download it first to the data_dir."
|
||||
exit -1
|
||||
fi
|
||||
|
||||
MD5='c4350563bf7dc298f7dd364b2607be83'
|
||||
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
|
||||
if [ ${md5_result} != ${MD5} ]; then
|
||||
echo "MD5 mismatch! The Archive has been changed."
|
||||
exit -1
|
||||
fi
|
||||
|
||||
|
||||
label_file='ProsodyLabeling/000001-010000.txt'
|
||||
filename='000001-010000.txt'
|
||||
unrar e ${archive} ${label_file}
|
||||
cp ${filename} ${exp_dir}
|
||||
rm -f ${filename}
|
||||
|
||||
if [ ! -f ${exp_dir}/${filename} ];then
|
||||
echo "File extraction failed!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -0,0 +1,8 @@
|
||||
export MAIN_ROOT=${PWD}/../../
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
@ -0,0 +1 @@
|
||||
jieba
|
@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
source path.sh
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
|
||||
exp_dir=exp
|
||||
data_dir=data
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||
|
||||
mkdir -p ${exp_dir}
|
||||
|
||||
|
||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
|
||||
echo "stage 0: Extracting Prosody Labeling"
|
||||
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
|
||||
fi
|
||||
|
||||
# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
|
||||
filename="000001-010000.txt"
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
echo "stage 1: Processing transcriptions..."
|
||||
python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin
|
||||
|
||||
python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
|
||||
python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
|
||||
fi
|
||||
|
||||
echo "done"
|
||||
exit 0
|
@ -1,23 +0,0 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 2 ];then
|
||||
echo "usage: ${0} ckpt_dir avg_num"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=${1}
|
||||
average_num=${2}
|
||||
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
|
||||
|
||||
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
|
||||
--dst_model ${decode_checkpoint} \
|
||||
--ckpt_dir ${ckpt_dir} \
|
||||
--num ${average_num} \
|
||||
--val_best
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in avg ckpt!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -0,0 +1,7 @@
# Ngram LM

Train a Chinese character n-gram LM with [kenlm](https://github.com/kpu/kenlm).

```
bash run.sh
```
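Once `run.sh` has produced an ARPA or binary LM, it can be queried from Python with kenlm's bindings; a minimal sketch, assuming a hypothetical output path `exp/zh_char.arpa` (characters are space-separated, since this is a character-level LM):

```python
import kenlm

# hypothetical path to the LM produced by run.sh
model = kenlm.Model('exp/zh_char.arpa')
# log10 probability of a space-separated character sequence
print(model.score('今 天 天 气 很 好', bos=True, eos=True))
```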
@ -1,23 +0,0 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 2 ];then
|
||||
echo "usage: ${0} ckpt_dir avg_num"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=${1}
|
||||
average_num=${2}
|
||||
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
|
||||
|
||||
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
|
||||
--dst_model ${decode_checkpoint} \
|
||||
--ckpt_dir ${ckpt_dir} \
|
||||
--num ${average_num} \
|
||||
--val_best
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in avg ckpt!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -1,23 +0,0 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 2 ];then
|
||||
echo "usage: ${0} ckpt_dir avg_num"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=${1}
|
||||
average_num=${2}
|
||||
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
|
||||
|
||||
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
|
||||
--dst_model ${decode_checkpoint} \
|
||||
--ckpt_dir ${ckpt_dir} \
|
||||
--num ${average_num} \
|
||||
--val_best
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in avg ckpt!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
@ -0,0 +1,99 @@
|
||||
# Copyright 2014 Bernard Yue
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
__doc__ = """
|
||||
Hanzi Converter 繁簡轉換器 | 繁简转换器
|
||||
This module provides functions converting chinese text between simplified and
|
||||
traditional characters. It returns a unicode representation of the text.
|
||||
Class HanziConv is the main entry point of the module, you can import the
|
||||
class by doing:
|
||||
>>> from hanziconv import HanziConv
|
||||
"""
|
||||
|
||||
import os
|
||||
from zhon import cedict
|
||||
|
||||
class HanziConv():
|
||||
"""This class supports hanzi (漢字) convention between simplified and
|
||||
traditional format"""
|
||||
__traditional_charmap = cedict.traditional
|
||||
__simplified_charmap = cedict.simplified
|
||||
|
||||
@classmethod
|
||||
def __convert(cls, text, toTraditional=True):
|
||||
"""Convert `text` to Traditional characters if `toTraditional` is
|
||||
True, else convert to simplified characters
|
||||
:param text: data to convert
|
||||
:param toTraditional: True -- convert to traditional text
|
||||
False -- convert to simplified text
:returns: converted `text`
|
||||
"""
|
||||
if isinstance(text, bytes):
|
||||
text = text.decode('utf-8')
|
||||
|
||||
fromMap = cls.__simplified_charmap
|
||||
toMap = cls.__traditional_charmap
|
||||
if not toTraditional:
|
||||
fromMap = cls.__traditional_charmap
|
||||
toMap = cls.__simplified_charmap
|
||||
|
||||
final = []
|
||||
for c in text:
|
||||
index = fromMap.find(c)
|
||||
if index != -1:
|
||||
final.append(toMap[index])
|
||||
else:
|
||||
final.append(c)
|
||||
return ''.join(final)
|
||||
|
||||
@classmethod
|
||||
def toSimplified(cls, text):
|
||||
"""Convert `text` to simplified character string. Assuming text is
|
||||
traditional character string
|
||||
:param text: text to convert
|
||||
:returns: converted UTF-8 characters
|
||||
>>> from hanziconv import HanziConv
|
||||
>>> print(HanziConv.toSimplified('繁簡轉換器'))
|
||||
繁简转换器
|
||||
"""
|
||||
return cls.__convert(text, toTraditional=False)
|
||||
|
||||
@classmethod
|
||||
def toTraditional(cls, text):
|
||||
"""Convert `text` to traditional character string. Assuming text is
|
||||
simplified character string
|
||||
:param text: text to convert
|
||||
:returns: converted UTF-8 characters
|
||||
>>> from hanziconv import HanziConv
|
||||
>>> print(HanziConv.toTraditional('繁简转换器'))
|
||||
繁簡轉換器
|
||||
"""
|
||||
return cls.__convert(text, toTraditional=True)
|
||||
|
||||
@classmethod
|
||||
def same(cls, text1, text2):
|
||||
"""Return True if text1 and text2 meant literally the same, False
|
||||
otherwise
|
||||
:param text1: string to compare to ``text2``
|
||||
:param text2: string to compare to ``text1``
|
||||
:returns: **True** -- ``text1`` and ``text2`` are the same in meaning,
|
||||
**False** -- otherwise
|
||||
>>> from hanziconv import HanziConv
|
||||
>>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
|
||||
True
|
||||
"""
|
||||
t1 = cls.toSimplified(text1)
|
||||
t2 = cls.toSimplified(text2)
|
||||
return t1 == t2
|
@ -0,0 +1,339 @@
|
||||
# author: kuangdd
|
||||
# date: 2021/5/8
|
||||
"""
|
||||
#### style
|
||||
Pinyin format conversion.

Convert between guobiao-style pinyin (with tone marks) and the
letter-plus-tone-digit style.
|
||||
"""
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(Path(__file__).stem)
|
||||
|
||||
# 2100 = 420 * 5
|
||||
guobiao2shengyundiao_dict = {
|
||||
'a': 'a5', 'ā': 'a1', 'á': 'a2', 'ǎ': 'a3', 'à': 'a4', 'ai': 'ai5', 'āi': 'ai1', 'ái': 'ai2', 'ǎi': 'ai3',
|
||||
'ài': 'ai4', 'an': 'an5', 'ān': 'an1', 'án': 'an2', 'ǎn': 'an3', 'àn': 'an4', 'ang': 'ang5', 'āng': 'ang1',
|
||||
'áng': 'ang2', 'ǎng': 'ang3', 'àng': 'ang4', 'ao': 'ao5', 'āo': 'ao1', 'áo': 'ao2', 'ǎo': 'ao3', 'ào': 'ao4',
|
||||
'ba': 'ba5', 'bā': 'ba1', 'bá': 'ba2', 'bǎ': 'ba3', 'bà': 'ba4', 'bai': 'bai5', 'bāi': 'bai1', 'bái': 'bai2',
|
||||
'bǎi': 'bai3', 'bài': 'bai4', 'ban': 'ban5', 'bān': 'ban1', 'bán': 'ban2', 'bǎn': 'ban3', 'bàn': 'ban4',
|
||||
'bang': 'bang5', 'bāng': 'bang1', 'báng': 'bang2', 'bǎng': 'bang3', 'bàng': 'bang4', 'bao': 'bao5', 'bāo': 'bao1',
|
||||
'báo': 'bao2', 'bǎo': 'bao3', 'bào': 'bao4', 'bei': 'bei5', 'bēi': 'bei1', 'béi': 'bei2', 'běi': 'bei3',
|
||||
'bèi': 'bei4', 'ben': 'ben5', 'bēn': 'ben1', 'bén': 'ben2', 'běn': 'ben3', 'bèn': 'ben4', 'beng': 'beng5',
|
||||
'bēng': 'beng1', 'béng': 'beng2', 'běng': 'beng3', 'bèng': 'beng4', 'bi': 'bi5', 'bī': 'bi1', 'bí': 'bi2',
|
||||
'bǐ': 'bi3', 'bì': 'bi4', 'bian': 'bian5', 'biān': 'bian1', 'bián': 'bian2', 'biǎn': 'bian3', 'biàn': 'bian4',
|
||||
'biao': 'biao5', 'biāo': 'biao1', 'biáo': 'biao2', 'biǎo': 'biao3', 'biào': 'biao4', 'bie': 'bie5', 'biē': 'bie1',
|
||||
'bié': 'bie2', 'biě': 'bie3', 'biè': 'bie4', 'bin': 'bin5', 'bīn': 'bin1', 'bín': 'bin2', 'bǐn': 'bin3',
|
||||
'bìn': 'bin4', 'bing': 'bing5', 'bīng': 'bing1', 'bíng': 'bing2', 'bǐng': 'bing3', 'bìng': 'bing4', 'bo': 'bo5',
|
||||
'bō': 'bo1', 'bó': 'bo2', 'bǒ': 'bo3', 'bò': 'bo4', 'bu': 'bu5', 'bū': 'bu1', 'bú': 'bu2', 'bǔ': 'bu3', 'bù': 'bu4',
|
||||
'ca': 'ca5', 'cā': 'ca1', 'cá': 'ca2', 'cǎ': 'ca3', 'cà': 'ca4', 'cai': 'cai5', 'cāi': 'cai1', 'cái': 'cai2',
|
||||
'cǎi': 'cai3', 'cài': 'cai4', 'can': 'can5', 'cān': 'can1', 'cán': 'can2', 'cǎn': 'can3', 'càn': 'can4',
|
||||
'cang': 'cang5', 'cāng': 'cang1', 'cáng': 'cang2', 'cǎng': 'cang3', 'càng': 'cang4', 'cao': 'cao5', 'cāo': 'cao1',
|
||||
'cáo': 'cao2', 'cǎo': 'cao3', 'cào': 'cao4', 'ce': 'ce5', 'cē': 'ce1', 'cé': 'ce2', 'cě': 'ce3', 'cè': 'ce4',
|
||||
'cen': 'cen5', 'cēn': 'cen1', 'cén': 'cen2', 'cěn': 'cen3', 'cèn': 'cen4', 'ceng': 'ceng5', 'cēng': 'ceng1',
|
||||
'céng': 'ceng2', 'cěng': 'ceng3', 'cèng': 'ceng4', 'cha': 'cha5', 'chā': 'cha1', 'chá': 'cha2', 'chǎ': 'cha3',
|
||||
'chà': 'cha4', 'chai': 'chai5', 'chāi': 'chai1', 'chái': 'chai2', 'chǎi': 'chai3', 'chài': 'chai4', 'chan': 'chan5',
|
||||
'chān': 'chan1', 'chán': 'chan2', 'chǎn': 'chan3', 'chàn': 'chan4', 'chang': 'chang5', 'chāng': 'chang1',
|
||||
'cháng': 'chang2', 'chǎng': 'chang3', 'chàng': 'chang4', 'chao': 'chao5', 'chāo': 'chao1', 'cháo': 'chao2',
|
||||
'chǎo': 'chao3', 'chào': 'chao4', 'che': 'che5', 'chē': 'che1', 'ché': 'che2', 'chě': 'che3', 'chè': 'che4',
|
||||
'chen': 'chen5', 'chēn': 'chen1', 'chén': 'chen2', 'chěn': 'chen3', 'chèn': 'chen4', 'cheng': 'cheng5',
|
||||
'chēng': 'cheng1', 'chéng': 'cheng2', 'chěng': 'cheng3', 'chèng': 'cheng4', 'chi': 'chi5', 'chī': 'chi1',
|
||||
'chí': 'chi2', 'chǐ': 'chi3', 'chì': 'chi4', 'chong': 'chong5', 'chōng': 'chong1', 'chóng': 'chong2',
|
||||
'chǒng': 'chong3', 'chòng': 'chong4', 'chou': 'chou5', 'chōu': 'chou1', 'chóu': 'chou2', 'chǒu': 'chou3',
|
||||
'chòu': 'chou4', 'chu': 'chu5', 'chū': 'chu1', 'chú': 'chu2', 'chǔ': 'chu3', 'chù': 'chu4', 'chuai': 'chuai5',
|
||||
'chuāi': 'chuai1', 'chuái': 'chuai2', 'chuǎi': 'chuai3', 'chuài': 'chuai4', 'chuan': 'chuan5', 'chuān': 'chuan1',
|
||||
'chuán': 'chuan2', 'chuǎn': 'chuan3', 'chuàn': 'chuan4', 'chuang': 'chuang5', 'chuāng': 'chuang1',
|
||||
'chuáng': 'chuang2', 'chuǎng': 'chuang3', 'chuàng': 'chuang4', 'chui': 'chui5', 'chuī': 'chui1', 'chuí': 'chui2',
|
||||
'chuǐ': 'chui3', 'chuì': 'chui4', 'chun': 'chun5', 'chūn': 'chun1', 'chún': 'chun2', 'chǔn': 'chun3',
|
||||
'chùn': 'chun4', 'chuo': 'chuo5', 'chuō': 'chuo1', 'chuó': 'chuo2', 'chuǒ': 'chuo3', 'chuò': 'chuo4', 'ci': 'ci5',
|
||||
'cī': 'ci1', 'cí': 'ci2', 'cǐ': 'ci3', 'cì': 'ci4', 'cong': 'cong5', 'cōng': 'cong1', 'cóng': 'cong2',
|
||||
'cǒng': 'cong3', 'còng': 'cong4', 'cou': 'cou5', 'cōu': 'cou1', 'cóu': 'cou2', 'cǒu': 'cou3', 'còu': 'cou4',
|
||||
'cu': 'cu5', 'cū': 'cu1', 'cú': 'cu2', 'cǔ': 'cu3', 'cù': 'cu4', 'cuan': 'cuan5', 'cuān': 'cuan1', 'cuán': 'cuan2',
|
||||
'cuǎn': 'cuan3', 'cuàn': 'cuan4', 'cui': 'cui5', 'cuī': 'cui1', 'cuí': 'cui2', 'cuǐ': 'cui3', 'cuì': 'cui4',
|
||||
'cun': 'cun5', 'cūn': 'cun1', 'cún': 'cun2', 'cǔn': 'cun3', 'cùn': 'cun4', 'cuo': 'cuo5', 'cuō': 'cuo1',
|
||||
'cuó': 'cuo2', 'cuǒ': 'cuo3', 'cuò': 'cuo4', 'da': 'da5', 'dā': 'da1', 'dá': 'da2', 'dǎ': 'da3', 'dà': 'da4',
|
||||
'dai': 'dai5', 'dāi': 'dai1', 'dái': 'dai2', 'dǎi': 'dai3', 'dài': 'dai4', 'dan': 'dan5', 'dān': 'dan1',
|
||||
'dán': 'dan2', 'dǎn': 'dan3', 'dàn': 'dan4', 'dang': 'dang5', 'dāng': 'dang1', 'dáng': 'dang2', 'dǎng': 'dang3',
|
||||
'dàng': 'dang4', 'dao': 'dao5', 'dāo': 'dao1', 'dáo': 'dao2', 'dǎo': 'dao3', 'dào': 'dao4', 'de': 'de5',
|
||||
'dē': 'de1', 'dé': 'de2', 'dě': 'de3', 'dè': 'de4', 'dei': 'dei5', 'dēi': 'dei1', 'déi': 'dei2', 'děi': 'dei3',
|
||||
'dèi': 'dei4', 'den': 'den5', 'dēn': 'den1', 'dén': 'den2', 'děn': 'den3', 'dèn': 'den4', 'deng': 'deng5',
|
||||
'dēng': 'deng1', 'déng': 'deng2', 'děng': 'deng3', 'dèng': 'deng4', 'di': 'di5', 'dī': 'di1', 'dí': 'di2',
|
||||
'dǐ': 'di3', 'dì': 'di4', 'dia': 'dia5', 'diā': 'dia1', 'diá': 'dia2', 'diǎ': 'dia3', 'dià': 'dia4',
|
||||
'dian': 'dian5', 'diān': 'dian1', 'dián': 'dian2', 'diǎn': 'dian3', 'diàn': 'dian4', 'diao': 'diao5',
|
||||
'diāo': 'diao1', 'diáo': 'diao2', 'diǎo': 'diao3', 'diào': 'diao4', 'die': 'die5', 'diē': 'die1', 'dié': 'die2',
|
||||
'diě': 'die3', 'diè': 'die4', 'ding': 'ding5', 'dīng': 'ding1', 'díng': 'ding2', 'dǐng': 'ding3', 'dìng': 'ding4',
|
||||
'diu': 'diu5', 'diū': 'diu1', 'diú': 'diu2', 'diǔ': 'diu3', 'diù': 'diu4', 'dong': 'dong5', 'dōng': 'dong1',
|
||||
'dóng': 'dong2', 'dǒng': 'dong3', 'dòng': 'dong4', 'dou': 'dou5', 'dōu': 'dou1', 'dóu': 'dou2', 'dǒu': 'dou3',
|
||||
'dòu': 'dou4', 'du': 'du5', 'dū': 'du1', 'dú': 'du2', 'dǔ': 'du3', 'dù': 'du4', 'duan': 'duan5', 'duān': 'duan1',
|
||||
'duán': 'duan2', 'duǎn': 'duan3', 'duàn': 'duan4', 'dui': 'dui5', 'duī': 'dui1', 'duí': 'dui2', 'duǐ': 'dui3',
|
||||
'duì': 'dui4', 'dun': 'dun5', 'dūn': 'dun1', 'dún': 'dun2', 'dǔn': 'dun3', 'dùn': 'dun4', 'duo': 'duo5',
|
||||
'duō': 'duo1', 'duó': 'duo2', 'duǒ': 'duo3', 'duò': 'duo4', 'e': 'e5', 'ē': 'e1', 'é': 'e2', 'ě': 'e3', 'è': 'e4',
|
||||
'ei': 'ei5', 'ēi': 'ei1', 'éi': 'ei2', 'ěi': 'ei3', 'èi': 'ei4', 'en': 'en5', 'ēn': 'en1', 'én': 'en2', 'ěn': 'en3',
|
||||
'èn': 'en4', 'eng': 'eng5', 'ēng': 'eng1', 'éng': 'eng2', 'ěng': 'eng3', 'èng': 'eng4', 'er': 'er5', 'ēr': 'er1',
|
||||
'ér': 'er2', 'ěr': 'er3', 'èr': 'er4', 'fa': 'fa5', 'fā': 'fa1', 'fá': 'fa2', 'fǎ': 'fa3', 'fà': 'fa4',
|
||||
'fan': 'fan5', 'fān': 'fan1', 'fán': 'fan2', 'fǎn': 'fan3', 'fàn': 'fan4', 'fang': 'fang5', 'fāng': 'fang1',
|
||||
'fáng': 'fang2', 'fǎng': 'fang3', 'fàng': 'fang4', 'fei': 'fei5', 'fēi': 'fei1', 'féi': 'fei2', 'fěi': 'fei3',
|
||||
'fèi': 'fei4', 'fen': 'fen5', 'fēn': 'fen1', 'fén': 'fen2', 'fěn': 'fen3', 'fèn': 'fen4', 'feng': 'feng5',
|
||||
'fēng': 'feng1', 'féng': 'feng2', 'fěng': 'feng3', 'fèng': 'feng4', 'fo': 'fo5', 'fō': 'fo1', 'fó': 'fo2',
|
||||
'fǒ': 'fo3', 'fò': 'fo4', 'fou': 'fou5', 'fōu': 'fou1', 'fóu': 'fou2', 'fǒu': 'fou3', 'fòu': 'fou4', 'fu': 'fu5',
|
||||
'fū': 'fu1', 'fú': 'fu2', 'fǔ': 'fu3', 'fù': 'fu4', 'ga': 'ga5', 'gā': 'ga1', 'gá': 'ga2', 'gǎ': 'ga3', 'gà': 'ga4',
|
||||
'gai': 'gai5', 'gāi': 'gai1', 'gái': 'gai2', 'gǎi': 'gai3', 'gài': 'gai4', 'gan': 'gan5', 'gān': 'gan1',
|
||||
'gán': 'gan2', 'gǎn': 'gan3', 'gàn': 'gan4', 'gang': 'gang5', 'gāng': 'gang1', 'gáng': 'gang2', 'gǎng': 'gang3',
|
||||
'gàng': 'gang4', 'gao': 'gao5', 'gāo': 'gao1', 'gáo': 'gao2', 'gǎo': 'gao3', 'gào': 'gao4', 'ge': 'ge5',
|
||||
'gē': 'ge1', 'gé': 'ge2', 'gě': 'ge3', 'gè': 'ge4', 'gei': 'gei5', 'gēi': 'gei1', 'géi': 'gei2', 'gěi': 'gei3',
|
||||
'gèi': 'gei4', 'gen': 'gen5', 'gēn': 'gen1', 'gén': 'gen2', 'gěn': 'gen3', 'gèn': 'gen4', 'geng': 'geng5',
|
||||
'gēng': 'geng1', 'géng': 'geng2', 'gěng': 'geng3', 'gèng': 'geng4', 'gong': 'gong5', 'gōng': 'gong1',
|
||||
'góng': 'gong2', 'gǒng': 'gong3', 'gòng': 'gong4', 'gou': 'gou5', 'gōu': 'gou1', 'góu': 'gou2', 'gǒu': 'gou3',
|
||||
'gòu': 'gou4', 'gu': 'gu5', 'gū': 'gu1', 'gú': 'gu2', 'gǔ': 'gu3', 'gù': 'gu4', 'gua': 'gua5', 'guā': 'gua1',
|
||||
'guá': 'gua2', 'guǎ': 'gua3', 'guà': 'gua4', 'guai': 'guai5', 'guāi': 'guai1', 'guái': 'guai2', 'guǎi': 'guai3',
|
||||
'guài': 'guai4', 'guan': 'guan5', 'guān': 'guan1', 'guán': 'guan2', 'guǎn': 'guan3', 'guàn': 'guan4',
|
||||
'guang': 'guang5', 'guāng': 'guang1', 'guáng': 'guang2', 'guǎng': 'guang3', 'guàng': 'guang4', 'gui': 'gui5',
|
||||
'guī': 'gui1', 'guí': 'gui2', 'guǐ': 'gui3', 'guì': 'gui4', 'gun': 'gun5', 'gūn': 'gun1', 'gún': 'gun2',
|
||||
'gǔn': 'gun3', 'gùn': 'gun4', 'guo': 'guo5', 'guō': 'guo1', 'guó': 'guo2', 'guǒ': 'guo3', 'guò': 'guo4',
|
||||
'ha': 'ha5', 'hā': 'ha1', 'há': 'ha2', 'hǎ': 'ha3', 'hà': 'ha4', 'hai': 'hai5', 'hāi': 'hai1', 'hái': 'hai2',
|
||||
'hǎi': 'hai3', 'hài': 'hai4', 'han': 'han5', 'hān': 'han1', 'hán': 'han2', 'hǎn': 'han3', 'hàn': 'han4',
|
||||
'hang': 'hang5', 'hāng': 'hang1', 'háng': 'hang2', 'hǎng': 'hang3', 'hàng': 'hang4', 'hao': 'hao5', 'hāo': 'hao1',
|
||||
'háo': 'hao2', 'hǎo': 'hao3', 'hào': 'hao4', 'he': 'he5', 'hē': 'he1', 'hé': 'he2', 'hě': 'he3', 'hè': 'he4',
|
||||
'hei': 'hei5', 'hēi': 'hei1', 'héi': 'hei2', 'hěi': 'hei3', 'hèi': 'hei4', 'hen': 'hen5', 'hēn': 'hen1',
|
||||
'hén': 'hen2', 'hěn': 'hen3', 'hèn': 'hen4', 'heng': 'heng5', 'hēng': 'heng1', 'héng': 'heng2', 'hěng': 'heng3',
|
||||
'hèng': 'heng4', 'hong': 'hong5', 'hōng': 'hong1', 'hóng': 'hong2', 'hǒng': 'hong3', 'hòng': 'hong4', 'hou': 'hou5',
|
||||
'hōu': 'hou1', 'hóu': 'hou2', 'hǒu': 'hou3', 'hòu': 'hou4', 'hu': 'hu5', 'hū': 'hu1', 'hú': 'hu2', 'hǔ': 'hu3',
|
||||
'hù': 'hu4', 'hua': 'hua5', 'huā': 'hua1', 'huá': 'hua2', 'huǎ': 'hua3', 'huà': 'hua4', 'huai': 'huai5',
|
||||
'huāi': 'huai1', 'huái': 'huai2', 'huǎi': 'huai3', 'huài': 'huai4', 'huan': 'huan5', 'huān': 'huan1',
|
||||
'huán': 'huan2', 'huǎn': 'huan3', 'huàn': 'huan4', 'huang': 'huang5', 'huāng': 'huang1', 'huáng': 'huang2',
|
||||
'huǎng': 'huang3', 'huàng': 'huang4', 'hui': 'hui5', 'huī': 'hui1', 'huí': 'hui2', 'huǐ': 'hui3', 'huì': 'hui4',
|
||||
'hun': 'hun5', 'hūn': 'hun1', 'hún': 'hun2', 'hǔn': 'hun3', 'hùn': 'hun4', 'huo': 'huo5', 'huō': 'huo1',
|
||||
'huó': 'huo2', 'huǒ': 'huo3', 'huò': 'huo4', 'ji': 'ji5', 'jī': 'ji1', 'jí': 'ji2', 'jǐ': 'ji3', 'jì': 'ji4',
|
||||
'jia': 'jia5', 'jiā': 'jia1', 'jiá': 'jia2', 'jiǎ': 'jia3', 'jià': 'jia4', 'jian': 'jian5', 'jiān': 'jian1',
|
||||
'jián': 'jian2', 'jiǎn': 'jian3', 'jiàn': 'jian4', 'jiang': 'jiang5', 'jiāng': 'jiang1', 'jiáng': 'jiang2',
|
||||
'jiǎng': 'jiang3', 'jiàng': 'jiang4', 'jiao': 'jiao5', 'jiāo': 'jiao1', 'jiáo': 'jiao2', 'jiǎo': 'jiao3',
|
||||
'jiào': 'jiao4', 'jie': 'jie5', 'jiē': 'jie1', 'jié': 'jie2', 'jiě': 'jie3', 'jiè': 'jie4', 'jin': 'jin5',
|
||||
'jīn': 'jin1', 'jín': 'jin2', 'jǐn': 'jin3', 'jìn': 'jin4', 'jing': 'jing5', 'jīng': 'jing1', 'jíng': 'jing2',
|
||||
'jǐng': 'jing3', 'jìng': 'jing4', 'jiong': 'jiong5', 'jiōng': 'jiong1', 'jióng': 'jiong2', 'jiǒng': 'jiong3',
|
||||
'jiòng': 'jiong4', 'jiu': 'jiu5', 'jiū': 'jiu1', 'jiú': 'jiu2', 'jiǔ': 'jiu3', 'jiù': 'jiu4', 'ju': 'ju5',
|
||||
'jū': 'ju1', 'jú': 'ju2', 'jǔ': 'ju3', 'jù': 'ju4', 'juan': 'juan5', 'juān': 'juan1', 'juán': 'juan2',
|
||||
'juǎn': 'juan3', 'juàn': 'juan4', 'jue': 'jue5', 'juē': 'jue1', 'jué': 'jue2', 'juě': 'jue3', 'juè': 'jue4',
|
||||
'jun': 'jun5', 'jūn': 'jun1', 'jún': 'jun2', 'jǔn': 'jun3', 'jùn': 'jun4', 'ka': 'ka5', 'kā': 'ka1', 'ká': 'ka2',
|
||||
'kǎ': 'ka3', 'kà': 'ka4', 'kai': 'kai5', 'kāi': 'kai1', 'kái': 'kai2', 'kǎi': 'kai3', 'kài': 'kai4', 'kan': 'kan5',
|
||||
'kān': 'kan1', 'kán': 'kan2', 'kǎn': 'kan3', 'kàn': 'kan4', 'kang': 'kang5', 'kāng': 'kang1', 'káng': 'kang2',
|
||||
'kǎng': 'kang3', 'kàng': 'kang4', 'kao': 'kao5', 'kāo': 'kao1', 'káo': 'kao2', 'kǎo': 'kao3', 'kào': 'kao4',
|
||||
'ke': 'ke5', 'kē': 'ke1', 'ké': 'ke2', 'kě': 'ke3', 'kè': 'ke4', 'ken': 'ken5', 'kēn': 'ken1', 'kén': 'ken2',
|
||||
'kěn': 'ken3', 'kèn': 'ken4', 'keng': 'keng5', 'kēng': 'keng1', 'kéng': 'keng2', 'kěng': 'keng3', 'kèng': 'keng4',
|
||||
'kong': 'kong5', 'kōng': 'kong1', 'kóng': 'kong2', 'kǒng': 'kong3', 'kòng': 'kong4', 'kou': 'kou5', 'kōu': 'kou1',
|
||||
'kóu': 'kou2', 'kǒu': 'kou3', 'kòu': 'kou4', 'ku': 'ku5', 'kū': 'ku1', 'kú': 'ku2', 'kǔ': 'ku3', 'kù': 'ku4',
|
||||
'kua': 'kua5', 'kuā': 'kua1', 'kuá': 'kua2', 'kuǎ': 'kua3', 'kuà': 'kua4', 'kuai': 'kuai5', 'kuāi': 'kuai1',
|
||||
'kuái': 'kuai2', 'kuǎi': 'kuai3', 'kuài': 'kuai4', 'kuan': 'kuan5', 'kuān': 'kuan1', 'kuán': 'kuan2',
|
||||
'kuǎn': 'kuan3', 'kuàn': 'kuan4', 'kuang': 'kuang5', 'kuāng': 'kuang1', 'kuáng': 'kuang2', 'kuǎng': 'kuang3',
|
||||
'kuàng': 'kuang4', 'kui': 'kui5', 'kuī': 'kui1', 'kuí': 'kui2', 'kuǐ': 'kui3', 'kuì': 'kui4', 'kun': 'kun5',
|
||||
'kūn': 'kun1', 'kún': 'kun2', 'kǔn': 'kun3', 'kùn': 'kun4', 'kuo': 'kuo5', 'kuō': 'kuo1', 'kuó': 'kuo2',
|
||||
'kuǒ': 'kuo3', 'kuò': 'kuo4', 'la': 'la5', 'lā': 'la1', 'lá': 'la2', 'lǎ': 'la3', 'là': 'la4', 'lai': 'lai5',
|
||||
'lāi': 'lai1', 'lái': 'lai2', 'lǎi': 'lai3', 'lài': 'lai4', 'lan': 'lan5', 'lān': 'lan1', 'lán': 'lan2',
|
||||
'lǎn': 'lan3', 'làn': 'lan4', 'lang': 'lang5', 'lāng': 'lang1', 'láng': 'lang2', 'lǎng': 'lang3', 'làng': 'lang4',
|
||||
'lao': 'lao5', 'lāo': 'lao1', 'láo': 'lao2', 'lǎo': 'lao3', 'lào': 'lao4', 'le': 'le5', 'lē': 'le1', 'lé': 'le2',
|
||||
'lě': 'le3', 'lè': 'le4', 'lei': 'lei5', 'lēi': 'lei1', 'léi': 'lei2', 'lěi': 'lei3', 'lèi': 'lei4',
|
||||
'leng': 'leng5', 'lēng': 'leng1', 'léng': 'leng2', 'lěng': 'leng3', 'lèng': 'leng4', 'li': 'li5', 'lī': 'li1',
|
||||
'lí': 'li2', 'lǐ': 'li3', 'lì': 'li4', 'lia': 'lia5', 'liā': 'lia1', 'liá': 'lia2', 'liǎ': 'lia3', 'lià': 'lia4',
|
||||
'lian': 'lian5', 'liān': 'lian1', 'lián': 'lian2', 'liǎn': 'lian3', 'liàn': 'lian4', 'liang': 'liang5',
|
||||
'liāng': 'liang1', 'liáng': 'liang2', 'liǎng': 'liang3', 'liàng': 'liang4', 'liao': 'liao5', 'liāo': 'liao1',
|
||||
'liáo': 'liao2', 'liǎo': 'liao3', 'liào': 'liao4', 'lie': 'lie5', 'liē': 'lie1', 'lié': 'lie2', 'liě': 'lie3',
|
||||
'liè': 'lie4', 'lin': 'lin5', 'līn': 'lin1', 'lín': 'lin2', 'lǐn': 'lin3', 'lìn': 'lin4', 'ling': 'ling5',
|
||||
'līng': 'ling1', 'líng': 'ling2', 'lǐng': 'ling3', 'lìng': 'ling4', 'liu': 'liu5', 'liū': 'liu1', 'liú': 'liu2',
|
||||
'liǔ': 'liu3', 'liù': 'liu4', 'lo': 'lo5', 'lō': 'lo1', 'ló': 'lo2', 'lǒ': 'lo3', 'lò': 'lo4', 'long': 'long5',
|
||||
'lōng': 'long1', 'lóng': 'long2', 'lǒng': 'long3', 'lòng': 'long4', 'lou': 'lou5', 'lōu': 'lou1', 'lóu': 'lou2',
|
||||
'lǒu': 'lou3', 'lòu': 'lou4', 'lu': 'lu5', 'lū': 'lu1', 'lú': 'lu2', 'lǔ': 'lu3', 'lù': 'lu4', 'luan': 'luan5',
|
||||
'luān': 'luan1', 'luán': 'luan2', 'luǎn': 'luan3', 'luàn': 'luan4', 'lun': 'lun5', 'lūn': 'lun1', 'lún': 'lun2',
|
||||
'lǔn': 'lun3', 'lùn': 'lun4', 'luo': 'luo5', 'luō': 'luo1', 'luó': 'luo2', 'luǒ': 'luo3', 'luò': 'luo4',
|
||||
'lü': 'lv5', 'lǖ': 'lv1', 'lǘ': 'lv2', 'lǚ': 'lv3', 'lǜ': 'lv4', 'lüe': 'lve5', 'lüē': 'lve1', 'lüé': 'lve2',
|
||||
'lüě': 'lve3', 'lüè': 'lve4', 'ma': 'ma5', 'mā': 'ma1', 'má': 'ma2', 'mǎ': 'ma3', 'mà': 'ma4', 'mai': 'mai5',
|
||||
'māi': 'mai1', 'mái': 'mai2', 'mǎi': 'mai3', 'mài': 'mai4', 'man': 'man5', 'mān': 'man1', 'mán': 'man2',
|
||||
'mǎn': 'man3', 'màn': 'man4', 'mang': 'mang5', 'māng': 'mang1', 'máng': 'mang2', 'mǎng': 'mang3', 'màng': 'mang4',
|
||||
'mao': 'mao5', 'māo': 'mao1', 'máo': 'mao2', 'mǎo': 'mao3', 'mào': 'mao4', 'me': 'me5', 'mē': 'me1', 'mé': 'me2',
|
||||
'mě': 'me3', 'mè': 'me4', 'mei': 'mei5', 'mēi': 'mei1', 'méi': 'mei2', 'měi': 'mei3', 'mèi': 'mei4', 'men': 'men5',
|
||||
'mēn': 'men1', 'mén': 'men2', 'měn': 'men3', 'mèn': 'men4', 'meng': 'meng5', 'mēng': 'meng1', 'méng': 'meng2',
|
||||
'měng': 'meng3', 'mèng': 'meng4', 'mi': 'mi5', 'mī': 'mi1', 'mí': 'mi2', 'mǐ': 'mi3', 'mì': 'mi4', 'mian': 'mian5',
|
||||
'miān': 'mian1', 'mián': 'mian2', 'miǎn': 'mian3', 'miàn': 'mian4', 'miao': 'miao5', 'miāo': 'miao1',
|
||||
'miáo': 'miao2', 'miǎo': 'miao3', 'miào': 'miao4', 'mie': 'mie5', 'miē': 'mie1', 'mié': 'mie2', 'miě': 'mie3',
|
||||
'miè': 'mie4', 'min': 'min5', 'mīn': 'min1', 'mín': 'min2', 'mǐn': 'min3', 'mìn': 'min4', 'ming': 'ming5',
|
||||
'mīng': 'ming1', 'míng': 'ming2', 'mǐng': 'ming3', 'mìng': 'ming4', 'miu': 'miu5', 'miū': 'miu1', 'miú': 'miu2',
|
||||
'miǔ': 'miu3', 'miù': 'miu4', 'mo': 'mo5', 'mō': 'mo1', 'mó': 'mo2', 'mǒ': 'mo3', 'mò': 'mo4', 'mou': 'mou5',
|
||||
'mōu': 'mou1', 'móu': 'mou2', 'mǒu': 'mou3', 'mòu': 'mou4', 'mu': 'mu5', 'mū': 'mu1', 'mú': 'mu2', 'mǔ': 'mu3',
|
||||
'mù': 'mu4', 'na': 'na5', 'nā': 'na1', 'ná': 'na2', 'nǎ': 'na3', 'nà': 'na4', 'nai': 'nai5', 'nāi': 'nai1',
|
||||
'nái': 'nai2', 'nǎi': 'nai3', 'nài': 'nai4', 'nan': 'nan5', 'nān': 'nan1', 'nán': 'nan2', 'nǎn': 'nan3',
|
||||
'nàn': 'nan4', 'nang': 'nang5', 'nāng': 'nang1', 'náng': 'nang2', 'nǎng': 'nang3', 'nàng': 'nang4', 'nao': 'nao5',
|
||||
'nāo': 'nao1', 'náo': 'nao2', 'nǎo': 'nao3', 'nào': 'nao4', 'ne': 'ne5', 'nē': 'ne1', 'né': 'ne2', 'ně': 'ne3',
|
||||
'nè': 'ne4', 'nei': 'nei5', 'nēi': 'nei1', 'néi': 'nei2', 'něi': 'nei3', 'nèi': 'nei4', 'nen': 'nen5',
|
||||
'nēn': 'nen1', 'nén': 'nen2', 'něn': 'nen3', 'nèn': 'nen4', 'neng': 'neng5', 'nēng': 'neng1', 'néng': 'neng2',
|
||||
'něng': 'neng3', 'nèng': 'neng4', 'ni': 'ni5', 'nī': 'ni1', 'ní': 'ni2', 'nǐ': 'ni3', 'nì': 'ni4', 'nian': 'nian5',
|
||||
'niān': 'nian1', 'nián': 'nian2', 'niǎn': 'nian3', 'niàn': 'nian4', 'niang': 'niang5', 'niāng': 'niang1',
|
||||
'niáng': 'niang2', 'niǎng': 'niang3', 'niàng': 'niang4', 'niao': 'niao5', 'niāo': 'niao1', 'niáo': 'niao2',
|
||||
'niǎo': 'niao3', 'niào': 'niao4', 'nie': 'nie5', 'niē': 'nie1', 'nié': 'nie2', 'niě': 'nie3', 'niè': 'nie4',
|
||||
'nin': 'nin5', 'nīn': 'nin1', 'nín': 'nin2', 'nǐn': 'nin3', 'nìn': 'nin4', 'ning': 'ning5', 'nīng': 'ning1',
|
||||
'níng': 'ning2', 'nǐng': 'ning3', 'nìng': 'ning4', 'niu': 'niu5', 'niū': 'niu1', 'niú': 'niu2', 'niǔ': 'niu3',
|
||||
'niù': 'niu4', 'nong': 'nong5', 'nōng': 'nong1', 'nóng': 'nong2', 'nǒng': 'nong3', 'nòng': 'nong4', 'nou': 'nou5',
|
||||
'nōu': 'nou1', 'nóu': 'nou2', 'nǒu': 'nou3', 'nòu': 'nou4', 'nu': 'nu5', 'nū': 'nu1', 'nú': 'nu2', 'nǔ': 'nu3',
|
||||
'nù': 'nu4', 'nuan': 'nuan5', 'nuān': 'nuan1', 'nuán': 'nuan2', 'nuǎn': 'nuan3', 'nuàn': 'nuan4', 'nuo': 'nuo5',
|
||||
'nuō': 'nuo1', 'nuó': 'nuo2', 'nuǒ': 'nuo3', 'nuò': 'nuo4', 'nü': 'nv5', 'nǖ': 'nv1', 'nǘ': 'nv2', 'nǚ': 'nv3',
|
||||
'nǜ': 'nv4', 'nüe': 'nve5', 'nüē': 'nve1', 'nüé': 'nve2', 'nüě': 'nve3', 'nüè': 'nve4', 'o': 'o5', 'ō': 'o1',
|
||||
'ó': 'o2', 'ǒ': 'o3', 'ò': 'o4', 'ou': 'ou5', 'ōu': 'ou1', 'óu': 'ou2', 'ǒu': 'ou3', 'òu': 'ou4', 'pa': 'pa5',
|
||||
'pā': 'pa1', 'pá': 'pa2', 'pǎ': 'pa3', 'pà': 'pa4', 'pai': 'pai5', 'pāi': 'pai1', 'pái': 'pai2', 'pǎi': 'pai3',
|
||||
'pài': 'pai4', 'pan': 'pan5', 'pān': 'pan1', 'pán': 'pan2', 'pǎn': 'pan3', 'pàn': 'pan4', 'pang': 'pang5',
|
||||
'pāng': 'pang1', 'páng': 'pang2', 'pǎng': 'pang3', 'pàng': 'pang4', 'pao': 'pao5', 'pāo': 'pao1', 'páo': 'pao2',
|
||||
'pǎo': 'pao3', 'pào': 'pao4', 'pei': 'pei5', 'pēi': 'pei1', 'péi': 'pei2', 'pěi': 'pei3', 'pèi': 'pei4',
|
||||
'pen': 'pen5', 'pēn': 'pen1', 'pén': 'pen2', 'pěn': 'pen3', 'pèn': 'pen4', 'peng': 'peng5', 'pēng': 'peng1',
|
||||
'péng': 'peng2', 'pěng': 'peng3', 'pèng': 'peng4', 'pi': 'pi5', 'pī': 'pi1', 'pí': 'pi2', 'pǐ': 'pi3', 'pì': 'pi4',
|
||||
'pian': 'pian5', 'piān': 'pian1', 'pián': 'pian2', 'piǎn': 'pian3', 'piàn': 'pian4', 'piao': 'piao5',
|
||||
'piāo': 'piao1', 'piáo': 'piao2', 'piǎo': 'piao3', 'piào': 'piao4', 'pie': 'pie5', 'piē': 'pie1', 'pié': 'pie2',
|
||||
'piě': 'pie3', 'piè': 'pie4', 'pin': 'pin5', 'pīn': 'pin1', 'pín': 'pin2', 'pǐn': 'pin3', 'pìn': 'pin4',
|
||||
'ping': 'ping5', 'pīng': 'ping1', 'píng': 'ping2', 'pǐng': 'ping3', 'pìng': 'ping4', 'po': 'po5', 'pō': 'po1',
|
||||
'pó': 'po2', 'pǒ': 'po3', 'pò': 'po4', 'pou': 'pou5', 'pōu': 'pou1', 'póu': 'pou2', 'pǒu': 'pou3', 'pòu': 'pou4',
|
||||
'pu': 'pu5', 'pū': 'pu1', 'pú': 'pu2', 'pǔ': 'pu3', 'pù': 'pu4', 'qi': 'qi5', 'qī': 'qi1', 'qí': 'qi2', 'qǐ': 'qi3',
|
||||
'qì': 'qi4', 'qia': 'qia5', 'qiā': 'qia1', 'qiá': 'qia2', 'qiǎ': 'qia3', 'qià': 'qia4', 'qian': 'qian5',
|
||||
'qiān': 'qian1', 'qián': 'qian2', 'qiǎn': 'qian3', 'qiàn': 'qian4', 'qiang': 'qiang5', 'qiāng': 'qiang1',
|
||||
'qiáng': 'qiang2', 'qiǎng': 'qiang3', 'qiàng': 'qiang4', 'qiao': 'qiao5', 'qiāo': 'qiao1', 'qiáo': 'qiao2',
|
||||
'qiǎo': 'qiao3', 'qiào': 'qiao4', 'qie': 'qie5', 'qiē': 'qie1', 'qié': 'qie2', 'qiě': 'qie3', 'qiè': 'qie4',
|
||||
'qin': 'qin5', 'qīn': 'qin1', 'qín': 'qin2', 'qǐn': 'qin3', 'qìn': 'qin4', 'qing': 'qing5', 'qīng': 'qing1',
|
||||
'qíng': 'qing2', 'qǐng': 'qing3', 'qìng': 'qing4', 'qiong': 'qiong5', 'qiōng': 'qiong1', 'qióng': 'qiong2',
|
||||
'qiǒng': 'qiong3', 'qiòng': 'qiong4', 'qiu': 'qiu5', 'qiū': 'qiu1', 'qiú': 'qiu2', 'qiǔ': 'qiu3', 'qiù': 'qiu4',
|
||||
'qu': 'qu5', 'qū': 'qu1', 'qú': 'qu2', 'qǔ': 'qu3', 'qù': 'qu4', 'quan': 'quan5', 'quān': 'quan1', 'quán': 'quan2',
|
||||
'quǎn': 'quan3', 'quàn': 'quan4', 'que': 'que5', 'quē': 'que1', 'qué': 'que2', 'quě': 'que3', 'què': 'que4',
|
||||
'qun': 'qun5', 'qūn': 'qun1', 'qún': 'qun2', 'qǔn': 'qun3', 'qùn': 'qun4', 'ran': 'ran5', 'rān': 'ran1',
|
||||
'rán': 'ran2', 'rǎn': 'ran3', 'ràn': 'ran4', 'rang': 'rang5', 'rāng': 'rang1', 'ráng': 'rang2', 'rǎng': 'rang3',
|
||||
'ràng': 'rang4', 'rao': 'rao5', 'rāo': 'rao1', 'ráo': 'rao2', 'rǎo': 'rao3', 'rào': 'rao4', 're': 're5',
|
||||
'rē': 're1', 'ré': 're2', 'rě': 're3', 'rè': 're4', 'ren': 'ren5', 'rēn': 'ren1', 'rén': 'ren2', 'rěn': 'ren3',
|
||||
'rèn': 'ren4', 'reng': 'reng5', 'rēng': 'reng1', 'réng': 'reng2', 'rěng': 'reng3', 'rèng': 'reng4', 'ri': 'ri5',
|
||||
'rī': 'ri1', 'rí': 'ri2', 'rǐ': 'ri3', 'rì': 'ri4', 'rong': 'rong5', 'rōng': 'rong1', 'róng': 'rong2',
|
||||
'rǒng': 'rong3', 'ròng': 'rong4', 'rou': 'rou5', 'rōu': 'rou1', 'róu': 'rou2', 'rǒu': 'rou3', 'ròu': 'rou4',
|
||||
'ru': 'ru5', 'rū': 'ru1', 'rú': 'ru2', 'rǔ': 'ru3', 'rù': 'ru4', 'ruan': 'ruan5', 'ruān': 'ruan1', 'ruán': 'ruan2',
|
||||
'ruǎn': 'ruan3', 'ruàn': 'ruan4', 'rui': 'rui5', 'ruī': 'rui1', 'ruí': 'rui2', 'ruǐ': 'rui3', 'ruì': 'rui4',
|
||||
'run': 'run5', 'rūn': 'run1', 'rún': 'run2', 'rǔn': 'run3', 'rùn': 'run4', 'ruo': 'ruo5', 'ruō': 'ruo1',
|
||||
'ruó': 'ruo2', 'ruǒ': 'ruo3', 'ruò': 'ruo4', 'sa': 'sa5', 'sā': 'sa1', 'sá': 'sa2', 'sǎ': 'sa3', 'sà': 'sa4',
|
||||
'sai': 'sai5', 'sāi': 'sai1', 'sái': 'sai2', 'sǎi': 'sai3', 'sài': 'sai4', 'san': 'san5', 'sān': 'san1',
|
||||
'sán': 'san2', 'sǎn': 'san3', 'sàn': 'san4', 'sang': 'sang5', 'sāng': 'sang1', 'sáng': 'sang2', 'sǎng': 'sang3',
|
||||
'sàng': 'sang4', 'sao': 'sao5', 'sāo': 'sao1', 'sáo': 'sao2', 'sǎo': 'sao3', 'sào': 'sao4', 'se': 'se5',
|
||||
'sē': 'se1', 'sé': 'se2', 'sě': 'se3', 'sè': 'se4', 'sen': 'sen5', 'sēn': 'sen1', 'sén': 'sen2', 'sěn': 'sen3',
|
||||
'sèn': 'sen4', 'seng': 'seng5', 'sēng': 'seng1', 'séng': 'seng2', 'sěng': 'seng3', 'sèng': 'seng4', 'sha': 'sha5',
|
||||
'shā': 'sha1', 'shá': 'sha2', 'shǎ': 'sha3', 'shà': 'sha4', 'shai': 'shai5', 'shāi': 'shai1', 'shái': 'shai2',
|
||||
'shǎi': 'shai3', 'shài': 'shai4', 'shan': 'shan5', 'shān': 'shan1', 'shán': 'shan2', 'shǎn': 'shan3',
|
||||
'shàn': 'shan4', 'shang': 'shang5', 'shāng': 'shang1', 'sháng': 'shang2', 'shǎng': 'shang3', 'shàng': 'shang4',
|
||||
'shao': 'shao5', 'shāo': 'shao1', 'sháo': 'shao2', 'shǎo': 'shao3', 'shào': 'shao4', 'she': 'she5', 'shē': 'she1',
|
||||
'shé': 'she2', 'shě': 'she3', 'shè': 'she4', 'shei': 'shei5', 'shēi': 'shei1', 'shéi': 'shei2', 'shěi': 'shei3',
|
||||
'shèi': 'shei4', 'shen': 'shen5', 'shēn': 'shen1', 'shén': 'shen2', 'shěn': 'shen3', 'shèn': 'shen4',
|
||||
'sheng': 'sheng5', 'shēng': 'sheng1', 'shéng': 'sheng2', 'shěng': 'sheng3', 'shèng': 'sheng4', 'shi': 'shi5',
|
||||
'shī': 'shi1', 'shí': 'shi2', 'shǐ': 'shi3', 'shì': 'shi4', 'shou': 'shou5', 'shōu': 'shou1', 'shóu': 'shou2',
|
||||
'shǒu': 'shou3', 'shòu': 'shou4', 'shu': 'shu5', 'shū': 'shu1', 'shú': 'shu2', 'shǔ': 'shu3', 'shù': 'shu4',
|
||||
'shua': 'shua5', 'shuā': 'shua1', 'shuá': 'shua2', 'shuǎ': 'shua3', 'shuà': 'shua4', 'shuai': 'shuai5',
|
||||
'shuāi': 'shuai1', 'shuái': 'shuai2', 'shuǎi': 'shuai3', 'shuài': 'shuai4', 'shuan': 'shuan5', 'shuān': 'shuan1',
|
||||
'shuán': 'shuan2', 'shuǎn': 'shuan3', 'shuàn': 'shuan4', 'shuang': 'shuang5', 'shuāng': 'shuang1',
|
||||
'shuáng': 'shuang2', 'shuǎng': 'shuang3', 'shuàng': 'shuang4', 'shui': 'shui5', 'shuī': 'shui1', 'shuí': 'shui2',
|
||||
'shuǐ': 'shui3', 'shuì': 'shui4', 'shun': 'shun5', 'shūn': 'shun1', 'shún': 'shun2', 'shǔn': 'shun3',
|
||||
'shùn': 'shun4', 'shuo': 'shuo5', 'shuō': 'shuo1', 'shuó': 'shuo2', 'shuǒ': 'shuo3', 'shuò': 'shuo4', 'si': 'si5',
|
||||
'sī': 'si1', 'sí': 'si2', 'sǐ': 'si3', 'sì': 'si4', 'song': 'song5', 'sōng': 'song1', 'sóng': 'song2',
|
||||
'sǒng': 'song3', 'sòng': 'song4', 'sou': 'sou5', 'sōu': 'sou1', 'sóu': 'sou2', 'sǒu': 'sou3', 'sòu': 'sou4',
|
||||
'su': 'su5', 'sū': 'su1', 'sú': 'su2', 'sǔ': 'su3', 'sù': 'su4', 'suan': 'suan5', 'suān': 'suan1', 'suán': 'suan2',
|
||||
'suǎn': 'suan3', 'suàn': 'suan4', 'sui': 'sui5', 'suī': 'sui1', 'suí': 'sui2', 'suǐ': 'sui3', 'suì': 'sui4',
|
||||
'sun': 'sun5', 'sūn': 'sun1', 'sún': 'sun2', 'sǔn': 'sun3', 'sùn': 'sun4', 'suo': 'suo5', 'suō': 'suo1',
|
||||
'suó': 'suo2', 'suǒ': 'suo3', 'suò': 'suo4', 'ta': 'ta5', 'tā': 'ta1', 'tá': 'ta2', 'tǎ': 'ta3', 'tà': 'ta4',
|
||||
'tai': 'tai5', 'tāi': 'tai1', 'tái': 'tai2', 'tǎi': 'tai3', 'tài': 'tai4', 'tan': 'tan5', 'tān': 'tan1',
|
||||
'tán': 'tan2', 'tǎn': 'tan3', 'tàn': 'tan4', 'tang': 'tang5', 'tāng': 'tang1', 'táng': 'tang2', 'tǎng': 'tang3',
|
||||
'tàng': 'tang4', 'tao': 'tao5', 'tāo': 'tao1', 'táo': 'tao2', 'tǎo': 'tao3', 'tào': 'tao4', 'te': 'te5',
|
||||
'tē': 'te1', 'té': 'te2', 'tě': 'te3', 'tè': 'te4', 'teng': 'teng5', 'tēng': 'teng1', 'téng': 'teng2',
|
||||
'těng': 'teng3', 'tèng': 'teng4', 'ti': 'ti5', 'tī': 'ti1', 'tí': 'ti2', 'tǐ': 'ti3', 'tì': 'ti4', 'tian': 'tian5',
'tiān': 'tian1', 'tián': 'tian2', 'tiǎn': 'tian3', 'tiàn': 'tian4', 'tiao': 'tiao5', 'tiāo': 'tiao1',
'tiáo': 'tiao2', 'tiǎo': 'tiao3', 'tiào': 'tiao4', 'tie': 'tie5', 'tiē': 'tie1', 'tié': 'tie2', 'tiě': 'tie3',
'tiè': 'tie4', 'ting': 'ting5', 'tīng': 'ting1', 'tíng': 'ting2', 'tǐng': 'ting3', 'tìng': 'ting4', 'tong': 'tong5',
'tōng': 'tong1', 'tóng': 'tong2', 'tǒng': 'tong3', 'tòng': 'tong4', 'tou': 'tou5', 'tōu': 'tou1', 'tóu': 'tou2',
'tǒu': 'tou3', 'tòu': 'tou4', 'tu': 'tu5', 'tū': 'tu1', 'tú': 'tu2', 'tǔ': 'tu3', 'tù': 'tu4', 'tuan': 'tuan5',
'tuān': 'tuan1', 'tuán': 'tuan2', 'tuǎn': 'tuan3', 'tuàn': 'tuan4', 'tui': 'tui5', 'tuī': 'tui1', 'tuí': 'tui2',
'tuǐ': 'tui3', 'tuì': 'tui4', 'tun': 'tun5', 'tūn': 'tun1', 'tún': 'tun2', 'tǔn': 'tun3', 'tùn': 'tun4',
'tuo': 'tuo5', 'tuō': 'tuo1', 'tuó': 'tuo2', 'tuǒ': 'tuo3', 'tuò': 'tuo4', 'wa': 'wa5', 'wā': 'wa1', 'wá': 'wa2',
'wǎ': 'wa3', 'wà': 'wa4', 'wai': 'wai5', 'wāi': 'wai1', 'wái': 'wai2', 'wǎi': 'wai3', 'wài': 'wai4', 'wan': 'wan5',
'wān': 'wan1', 'wán': 'wan2', 'wǎn': 'wan3', 'wàn': 'wan4', 'wang': 'wang5', 'wāng': 'wang1', 'wáng': 'wang2',
'wǎng': 'wang3', 'wàng': 'wang4', 'wei': 'wei5', 'wēi': 'wei1', 'wéi': 'wei2', 'wěi': 'wei3', 'wèi': 'wei4',
'wen': 'wen5', 'wēn': 'wen1', 'wén': 'wen2', 'wěn': 'wen3', 'wèn': 'wen4', 'weng': 'weng5', 'wēng': 'weng1',
'wéng': 'weng2', 'wěng': 'weng3', 'wèng': 'weng4', 'wo': 'wo5', 'wō': 'wo1', 'wó': 'wo2', 'wǒ': 'wo3', 'wò': 'wo4',
'wu': 'wu5', 'wū': 'wu1', 'wú': 'wu2', 'wǔ': 'wu3', 'wù': 'wu4', 'xi': 'xi5', 'xī': 'xi1', 'xí': 'xi2', 'xǐ': 'xi3',
'xì': 'xi4', 'xia': 'xia5', 'xiā': 'xia1', 'xiá': 'xia2', 'xiǎ': 'xia3', 'xià': 'xia4', 'xian': 'xian5',
'xiān': 'xian1', 'xián': 'xian2', 'xiǎn': 'xian3', 'xiàn': 'xian4', 'xiang': 'xiang5', 'xiāng': 'xiang1',
'xiáng': 'xiang2', 'xiǎng': 'xiang3', 'xiàng': 'xiang4', 'xiao': 'xiao5', 'xiāo': 'xiao1', 'xiáo': 'xiao2',
'xiǎo': 'xiao3', 'xiào': 'xiao4', 'xie': 'xie5', 'xiē': 'xie1', 'xié': 'xie2', 'xiě': 'xie3', 'xiè': 'xie4',
'xin': 'xin5', 'xīn': 'xin1', 'xín': 'xin2', 'xǐn': 'xin3', 'xìn': 'xin4', 'xing': 'xing5', 'xīng': 'xing1',
'xíng': 'xing2', 'xǐng': 'xing3', 'xìng': 'xing4', 'xiong': 'xiong5', 'xiōng': 'xiong1', 'xióng': 'xiong2',
'xiǒng': 'xiong3', 'xiòng': 'xiong4', 'xiu': 'xiu5', 'xiū': 'xiu1', 'xiú': 'xiu2', 'xiǔ': 'xiu3', 'xiù': 'xiu4',
'xu': 'xu5', 'xū': 'xu1', 'xú': 'xu2', 'xǔ': 'xu3', 'xù': 'xu4', 'xuan': 'xuan5', 'xuān': 'xuan1', 'xuán': 'xuan2',
'xuǎn': 'xuan3', 'xuàn': 'xuan4', 'xue': 'xue5', 'xuē': 'xue1', 'xué': 'xue2', 'xuě': 'xue3', 'xuè': 'xue4',
'xun': 'xun5', 'xūn': 'xun1', 'xún': 'xun2', 'xǔn': 'xun3', 'xùn': 'xun4', 'ya': 'ya5', 'yā': 'ya1', 'yá': 'ya2',
'yǎ': 'ya3', 'yà': 'ya4', 'yan': 'yan5', 'yān': 'yan1', 'yán': 'yan2', 'yǎn': 'yan3', 'yàn': 'yan4',
'yang': 'yang5', 'yāng': 'yang1', 'yáng': 'yang2', 'yǎng': 'yang3', 'yàng': 'yang4', 'yao': 'yao5', 'yāo': 'yao1',
'yáo': 'yao2', 'yǎo': 'yao3', 'yào': 'yao4', 'ye': 'ye5', 'yē': 'ye1', 'yé': 'ye2', 'yě': 'ye3', 'yè': 'ye4',
'yi': 'yi5', 'yī': 'yi1', 'yí': 'yi2', 'yǐ': 'yi3', 'yì': 'yi4', 'yin': 'yin5', 'yīn': 'yin1', 'yín': 'yin2',
'yǐn': 'yin3', 'yìn': 'yin4', 'ying': 'ying5', 'yīng': 'ying1', 'yíng': 'ying2', 'yǐng': 'ying3', 'yìng': 'ying4',
'yo': 'yo5', 'yō': 'yo1', 'yó': 'yo2', 'yǒ': 'yo3', 'yò': 'yo4', 'yong': 'yong5', 'yōng': 'yong1', 'yóng': 'yong2',
'yǒng': 'yong3', 'yòng': 'yong4', 'you': 'you5', 'yōu': 'you1', 'yóu': 'you2', 'yǒu': 'you3', 'yòu': 'you4',
'yu': 'yu5', 'yū': 'yu1', 'yú': 'yu2', 'yǔ': 'yu3', 'yù': 'yu4', 'yuan': 'yuan5', 'yuān': 'yuan1', 'yuán': 'yuan2',
'yuǎn': 'yuan3', 'yuàn': 'yuan4', 'yue': 'yue5', 'yuē': 'yue1', 'yué': 'yue2', 'yuě': 'yue3', 'yuè': 'yue4',
'yun': 'yun5', 'yūn': 'yun1', 'yún': 'yun2', 'yǔn': 'yun3', 'yùn': 'yun4', 'za': 'za5', 'zā': 'za1', 'zá': 'za2',
'zǎ': 'za3', 'zà': 'za4', 'zai': 'zai5', 'zāi': 'zai1', 'zái': 'zai2', 'zǎi': 'zai3', 'zài': 'zai4', 'zan': 'zan5',
'zān': 'zan1', 'zán': 'zan2', 'zǎn': 'zan3', 'zàn': 'zan4', 'zang': 'zang5', 'zāng': 'zang1', 'záng': 'zang2',
'zǎng': 'zang3', 'zàng': 'zang4', 'zao': 'zao5', 'zāo': 'zao1', 'záo': 'zao2', 'zǎo': 'zao3', 'zào': 'zao4',
'ze': 'ze5', 'zē': 'ze1', 'zé': 'ze2', 'zě': 'ze3', 'zè': 'ze4', 'zei': 'zei5', 'zēi': 'zei1', 'zéi': 'zei2',
'zěi': 'zei3', 'zèi': 'zei4', 'zen': 'zen5', 'zēn': 'zen1', 'zén': 'zen2', 'zěn': 'zen3', 'zèn': 'zen4',
'zeng': 'zeng5', 'zēng': 'zeng1', 'zéng': 'zeng2', 'zěng': 'zeng3', 'zèng': 'zeng4', 'zha': 'zha5', 'zhā': 'zha1',
'zhá': 'zha2', 'zhǎ': 'zha3', 'zhà': 'zha4', 'zhai': 'zhai5', 'zhāi': 'zhai1', 'zhái': 'zhai2', 'zhǎi': 'zhai3',
'zhài': 'zhai4', 'zhan': 'zhan5', 'zhān': 'zhan1', 'zhán': 'zhan2', 'zhǎn': 'zhan3', 'zhàn': 'zhan4',
'zhang': 'zhang5', 'zhāng': 'zhang1', 'zháng': 'zhang2', 'zhǎng': 'zhang3', 'zhàng': 'zhang4', 'zhao': 'zhao5',
'zhāo': 'zhao1', 'zháo': 'zhao2', 'zhǎo': 'zhao3', 'zhào': 'zhao4', 'zhe': 'zhe5', 'zhē': 'zhe1', 'zhé': 'zhe2',
'zhě': 'zhe3', 'zhè': 'zhe4', 'zhen': 'zhen5', 'zhēn': 'zhen1', 'zhén': 'zhen2', 'zhěn': 'zhen3', 'zhèn': 'zhen4',
'zheng': 'zheng5', 'zhēng': 'zheng1', 'zhéng': 'zheng2', 'zhěng': 'zheng3', 'zhèng': 'zheng4', 'zhi': 'zhi5',
'zhī': 'zhi1', 'zhí': 'zhi2', 'zhǐ': 'zhi3', 'zhì': 'zhi4', 'zhong': 'zhong5', 'zhōng': 'zhong1', 'zhóng': 'zhong2',
'zhǒng': 'zhong3', 'zhòng': 'zhong4', 'zhou': 'zhou5', 'zhōu': 'zhou1', 'zhóu': 'zhou2', 'zhǒu': 'zhou3',
'zhòu': 'zhou4', 'zhu': 'zhu5', 'zhū': 'zhu1', 'zhú': 'zhu2', 'zhǔ': 'zhu3', 'zhù': 'zhu4', 'zhua': 'zhua5',
'zhuā': 'zhua1', 'zhuá': 'zhua2', 'zhuǎ': 'zhua3', 'zhuà': 'zhua4', 'zhuai': 'zhuai5', 'zhuāi': 'zhuai1',
'zhuái': 'zhuai2', 'zhuǎi': 'zhuai3', 'zhuài': 'zhuai4', 'zhuan': 'zhuan5', 'zhuān': 'zhuan1', 'zhuán': 'zhuan2',
'zhuǎn': 'zhuan3', 'zhuàn': 'zhuan4', 'zhuang': 'zhuang5', 'zhuāng': 'zhuang1', 'zhuáng': 'zhuang2',
'zhuǎng': 'zhuang3', 'zhuàng': 'zhuang4', 'zhui': 'zhui5', 'zhuī': 'zhui1', 'zhuí': 'zhui2', 'zhuǐ': 'zhui3',
'zhuì': 'zhui4', 'zhun': 'zhun5', 'zhūn': 'zhun1', 'zhún': 'zhun2', 'zhǔn': 'zhun3', 'zhùn': 'zhun4',
'zhuo': 'zhuo5', 'zhuō': 'zhuo1', 'zhuó': 'zhuo2', 'zhuǒ': 'zhuo3', 'zhuò': 'zhuo4', 'zi': 'zi5', 'zī': 'zi1',
'zí': 'zi2', 'zǐ': 'zi3', 'zì': 'zi4', 'zong': 'zong5', 'zōng': 'zong1', 'zóng': 'zong2', 'zǒng': 'zong3',
'zòng': 'zong4', 'zou': 'zou5', 'zōu': 'zou1', 'zóu': 'zou2', 'zǒu': 'zou3', 'zòu': 'zou4', 'zu': 'zu5',
'zū': 'zu1', 'zú': 'zu2', 'zǔ': 'zu3', 'zù': 'zu4', 'zuan': 'zuan5', 'zuān': 'zuan1', 'zuán': 'zuan2',
'zuǎn': 'zuan3', 'zuàn': 'zuan4', 'zui': 'zui5', 'zuī': 'zui1', 'zuí': 'zui2', 'zuǐ': 'zui3', 'zuì': 'zui4',
'zun': 'zun5', 'zūn': 'zun1', 'zún': 'zun2', 'zǔn': 'zun3', 'zùn': 'zun4', 'zuo': 'zuo5', 'zuō': 'zuo1',
'zuó': 'zuo2', 'zuǒ': 'zuo3', 'zuò': 'zuo4', 'zhei': 'zhei5', 'zhēi': 'zhei1', 'zhéi': 'zhei2', 'zhěi': 'zhei3',
'zhèi': 'zhei4', 'kei': 'kei5', 'kēi': 'kei1', 'kéi': 'kei2', 'kěi': 'kei3', 'kèi': 'kei4', 'tei': 'tei5',
'tēi': 'tei1', 'téi': 'tei2', 'těi': 'tei3', 'tèi': 'tei4', 'len': 'len5', 'lēn': 'len1', 'lén': 'len2',
'lěn': 'len3', 'lèn': 'len4', 'nun': 'nun5', 'nūn': 'nun1', 'nún': 'nun2', 'nǔn': 'nun3', 'nùn': 'nun4',
'nia': 'nia5', 'niā': 'nia1', 'niá': 'nia2', 'niǎ': 'nia3', 'nià': 'nia4', 'rua': 'rua5', 'ruā': 'rua1',
'ruá': 'rua2', 'ruǎ': 'rua3', 'ruà': 'rua4', 'fiao': 'fiao5', 'fiāo': 'fiao1', 'fiáo': 'fiao2', 'fiǎo': 'fiao3',
'fiào': 'fiao4', 'cei': 'cei5', 'cēi': 'cei1', 'céi': 'cei2', 'cěi': 'cei3', 'cèi': 'cei4', 'wong': 'wong5',
'wōng': 'wong1', 'wóng': 'wong2', 'wǒng': 'wong3', 'wòng': 'wong4', 'din': 'din5', 'dīn': 'din1', 'dín': 'din2',
'dǐn': 'din3', 'dìn': 'din4', 'chua': 'chua5', 'chuā': 'chua1', 'chuá': 'chua2', 'chuǎ': 'chua3', 'chuà': 'chua4',
'n': 'n5', 'n1': 'n1', 'ń': 'n2', 'ň': 'n3', 'ǹ': 'n4', 'ng': 'ng5', 'ng1': 'ng1', 'ńg': 'ng2', 'ňg': 'ng3',
'ǹg': 'ng4'}

shengyundiao2guobiao_dict = {v: k for k, v in guobiao2shengyundiao_dict.items()}


def guobiao2shengyundiao(pinyin_list):
    """Convert guobiao-style (tone-marked) pinyin into initial/final plus tone-number style pinyin."""
    out = []
    for pin in pinyin_list:
        out.append(guobiao2shengyundiao_dict.get(pin))
    return out


def shengyundiao2guobiao(pinyin_list):
    """Convert initial/final plus tone-number style pinyin into guobiao-style (tone-marked) pinyin."""
    out = []
    for pin in pinyin_list:
        out.append(shengyundiao2guobiao_dict.get(pin))
    return out


if __name__ == "__main__":
    logger.info(__file__)

    out = shengyundiao2guobiao('ni2 hao3 a5'.split())
    assert out == ['ní', 'hǎo', 'a']
    out = guobiao2shengyundiao(out)
    assert out == ['ni2', 'hao3', 'a5']
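# Illustrative note (not part of the original file): both converters use dict.get(),
# so a syllable missing from the mapping comes back as None instead of raising, e.g.
#   guobiao2shengyundiao(['ní', 'blah'])  ->  ['ni2', None]
# Callers that need strict behavior may want to check the returned list for None.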
@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
@ -0,0 +1,116 @@
"""
### english

from https://github.com/keithito/tacotron

Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""
import re
import random
from . import cleaners
from .symbols import symbols

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
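# Illustrative example (not part of the original file): _curly_re peels off one
# brace-delimited ARPAbet group at a time, e.g.
#   _curly_re.match('Turn left on {HH AW1 S} Street.').groups()
#   -> ('Turn left on ', 'HH AW1 S', ' Street.')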


def get_arpabet(word, dictionary):
    word_arpabet = dictionary.lookup(word)
    if word_arpabet is not None:
        return "{" + word_arpabet[0] + "}"
    else:
        return word


def text_to_sequence(text, cleaner_names, dictionary=None, p_arpabet=1.0):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
      dictionary: arpabet class with arpabet dictionary
      p_arpabet: probability of replacing a word with its ARPAbet pronunciation
        when a dictionary is given

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []

    space = _symbols_to_sequence(' ')
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            clean_text = _clean_text(text, cleaner_names)
            if dictionary is not None:
                clean_text = [get_arpabet(w, dictionary)
                              if random.random() < p_arpabet else w
                              for w in clean_text.split(" ")]

                for i in range(len(clean_text)):
                    t = clean_text[i]
                    if t.startswith("{"):
                        sequence += _arpabet_to_sequence(t[1:-1])
                    else:
                        sequence += _symbols_to_sequence(t)
                    sequence += space
            else:
                sequence += _symbols_to_sequence(clean_text)
            break

        clean_text = _clean_text(text, cleaner_names)
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # remove trailing space
    sequence = sequence[:-1] if sequence[-1] == space[0] else sequence
    return sequence
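# Illustrative usage (not part of the original file): with the default symbol set and
# no pronunciation dictionary,
#   text_to_sequence('Turn left on {HH AW1 S} Street.', ['english_cleaners'])
# cleans and encodes the plain-text spans character by character, and encodes the
# braced span as the ARPAbet symbols '@HH', '@AW1', '@S'.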


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = []
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result.append(s)
    result = ''.join(result)
    return result.replace('}{', ' ')


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(['@' + s for s in text.split()])


def _should_keep_symbol(s):
    # Use value comparison here; `is not` on string literals relies on interning
    # and raises a SyntaxWarning on recent Python versions.
    return s in _symbol_to_id and s not in ('_', '~')
@ -0,0 +1,91 @@
'''
### english

from https://github.com/keithito/tacotron

Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from .numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
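# Illustrative example (not part of the original file):
#   expand_abbreviations('Dr. Smith met Col. Mustard')  ->  'doctor Smith met colonel Mustard'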


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
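# A minimal, hedged usage sketch (not part of the upstream file). This module uses a
# relative import, so run it as a package module, e.g. `python -m <package>.cleaners`
# (package name assumed). For english_cleaners the last line should print roughly:
#   mister muller bought sixteen apples for two dollars, fifty cents.
if __name__ == "__main__":
    sample = 'Mr. Müller bought 16 apples for $2.50.'
    print(basic_cleaners(sample))
    print(transliteration_cleaners(sample))
    print(english_cleaners(sample))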
File diff suppressed because it is too large
@ -0,0 +1,65 @@
""" from https://github.com/keithito/tacotron """

import re


valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        return self._entries.get(word.upper())
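    # Illustrative usage (hypothetical path, not part of the original file):
    #   d = CMUDict('/path/to/cmudict-0.7b')
    #   d.lookup('hello')   # -> ['HH AH0 L OW1']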


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            # CMUdict separates the word from its pronunciation with two spaces.
            parts = line.split('  ')
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(' ')
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
@ -0,0 +1,71 @@
""" from https://github.com/keithito/tacotron """

import inflect
import re


_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
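# A minimal, hedged usage sketch (not part of the upstream file); the exact wording of
# the expansions depends on the installed inflect version.
if __name__ == "__main__":
    print(normalize_numbers('$3.50 for 16 pies'))        # e.g. 'three dollars, fifty cents for sixteen pies'
    print(normalize_numbers('He finished 2nd in 1999'))  # e.g. 'He finished second in nineteen ninety-nine'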
@ -0,0 +1,21 @@
""" from https://github.com/keithito/tacotron """

'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been
run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
from . import cmudict

_punctuation = '!\'",.:;? '
_math = '#%&*+-/[]()'
_special = '_@©°½—₩€$'
_accented = 'áçéêëñöøćž'
_numbers = '0123456789'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = list(_punctuation + _math + _special + _accented + _numbers + _letters) + _arpabet
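# Illustrative note (not part of the original file): the numeric ID of a symbol is simply
# its index in `symbols`, so reordering or editing this list changes every ID produced by
# text_to_sequence and makes checkpoints trained with the old symbol set incompatible.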
@ -0,0 +1,4 @@
jieba
inflect
unidecode
tqdm
@ -1,14 +0,0 @@
=======
Credits
=======

Author and Maintainer
---------------------

* Thomas Roten <https://github.com/tsroten>

Contributors
------------

None yet. Why not be the first?
@ -1,88 +0,0 @@
Changes
=======

v0.1.0 (2013-05-05)
-------------------

* Initial release

v0.1.1 (2013-05-05)
-------------------

* Adds zhon.cedict package to setup.py

v0.2.0 (2013-05-07)
-------------------

* Allows for mapping between simplified and traditional.
* Adds logging to build_string().
* Adds constants for numbered Pinyin and accented Pinyin.

v0.2.1 (2013-05-07)
-------------------

* Fixes typo in README.rst.

v.1.0.0 (2014-01-25)
--------------------

* Complete rewrite that refactors code, renames constants, and improves Pinyin
  support.

v.1.1.0 (2014-01-28)
--------------------

* Adds ``zhon.pinyin.punctuation`` constant.
* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and
  ``zhon.pinyin.accented_sentence`` constants.
* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and
  ``zhon.pinyin.numbered_sentence`` constants.
* Fixes some README.rst typos.
* Clarifies information regarding Traditional and Simplified character
  constants in README.rst.
* Adds constant short names to README.rst.

v.1.1.1 (2014-01-29)
--------------------

* Adds documentation.
* Adds ``zhon.cedict.all`` constant.
* Removes duplicate code ranges from ``zhon.hanzi.characters``.
* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of
  a string containing code ranges.
* Removes duplicate letters in ``zhon.pinyin.consonants``.
* Refactors Pinyin vowels/consonant code.
* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16.
* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``.
* Fixes various typos.
* Removes numbers from Pinyin word constants. Fixes #15.
* Adds lowercase and uppercase constants to ``zhon.pinyin``.
* Fixes a bug with ``zhon.pinyin.sentence``.
* Adds ``sent`` alias for ``zhon.pinyin.sentence``.

v.1.1.2 (2014-01-31)
--------------------

* Fixes bug with ``zhon.cedict.all``.

v.1.1.3 (2014-02-12)
--------------------

* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17.
* Fixes r-suffix bug. Fixes #18.

v.1.1.4 (2015-01-25)
--------------------

* Removes duplicate module declarations in documentation.
* Moves tests inside zhon package.
* Adds travis config file.
* Adds Python 3.4 tests to travis and tox.
* Fixes flake8 warnings.
* Adds distutil fallback import statement to setup.py.
* Adds missing hanzi punctuation. Fixes #19.

v.1.1.5 (2016-05-23)
--------------------

* Add missing Zhuyin characters. Fixes #23.
Some files were not shown because too many files have changed in this diff