commit
d05ae8eeb0
@@ -1,8 +1,9 @@
# Aishell-1

## Deepspeech2

| Model | release | Config | Test set | CER |
| --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1 | conf/deepspeech2.yaml | test | 0.078671 |
| DeepSpeech2 | 2.0 | conf/deepspeech2.yaml | test | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | 0.080447 |

| Model | release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | - | 0.080447 |
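CER in these tables is the character error rate: the character-level edit distance between the decoded text and the reference transcript, divided by the reference length. A minimal sketch of that computation (not the project's scoring utility; names here are illustrative):

```python
def char_error_rate(reference: str, hypothesis: str) -> float:
    """Levenshtein distance over characters divided by reference length."""
    ref, hyp = list(reference), list(hypothesis)
    # dp[i][j] = edit distance between ref[:i] and hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution
    return dp[len(ref)][len(hyp)] / max(len(ref), 1)


print(char_error_rate("今天天气很好", "今天天汽很好"))  # 1 substitution / 6 chars ≈ 0.167
```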
@@ -1,23 +0,0 @@
#!/usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@@ -0,0 +1,2 @@
data
exp
@@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from:
# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
#
# A parser for CC-CEDICT. It converts the Chinese-English dictionary into a list
# of Python dictionaries with "traditional", "simplified", "pinyin" and "english"
# keys, and dumps one JSON object per line to the output file.
# The input and output paths are given on the command line (see usage below).
# Before starting, open the CEDICT text file and delete the copyright information
# at the top; otherwise the program will try to parse it and you will get an error.
# Characters that are commonly used as surnames have two entries in CC-CEDICT.
# This program removes the surname entry if there is another entry for the same
# character. If you want to keep the surnames, skip the remove_surnames() call in main().
# Original parser written by Franki Allegra in February 2020.
import json
import sys

# usage: python3 parser.py <cedict_ts.u8> <dump.json>

with open(sys.argv[1], 'rt') as file:
    text = file.read()
    lines = text.split('\n')
    dict_lines = list(lines)


def parse_line(line):
    """Parse one CC-CEDICT entry, e.g. '中國 中国 [Zhong1 guo2] /China/'."""
    parsed = {}
    if line == '':
        dict_lines.remove(line)
        return 0
    if line.startswith('#'):
        return 0
    if line.startswith('%'):
        return 0
    line = line.rstrip('/')
    line = line.split('/')
    if len(line) <= 1:
        return 0
    english = line[1]
    char_and_pinyin = line[0].split('[')
    characters = char_and_pinyin[0]
    characters = characters.split()
    traditional = characters[0]
    simplified = characters[1]
    pinyin = char_and_pinyin[1]
    pinyin = pinyin.rstrip()
    pinyin = pinyin.rstrip("]")
    parsed['traditional'] = traditional
    parsed['simplified'] = simplified
    parsed['pinyin'] = pinyin
    parsed['english'] = english
    list_of_dicts.append(parsed)


def remove_surnames():
    for x in range(len(list_of_dicts) - 1, -1, -1):
        if "surname " in list_of_dicts[x]['english']:
            # drop the surname entry only if the next entry is the same character
            if x + 1 < len(list_of_dicts) and list_of_dicts[x][
                    'traditional'] == list_of_dicts[x + 1]['traditional']:
                list_of_dicts.pop(x)


def main():
    # make each line into a dictionary
    print("Parsing dictionary . . .")
    for line in dict_lines:
        parse_line(line)

    # remove entries for surnames from the data (optional)
    print("Removing Surnames . . .")
    remove_surnames()

    print("Saving to database (this may take a few minutes) . . .")
    with open(sys.argv[2], 'wt') as fout:
        for one_dict in list_of_dicts:
            json_str = json.dumps(one_dict)
            fout.write(json_str + "\n")
    print('Done!')


list_of_dicts = []
parsed_dict = main()
@@ -0,0 +1,10 @@
export MAIN_ROOT=${PWD}/../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}
@@ -0,0 +1,39 @@
#!/bin/bash

# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
# The word dictionary of that website is based on CC-CEDICT.
# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997, with the
# aim of providing a complete, downloadable Chinese-to-English dictionary with pinyin pronunciation for the Chinese characters.
# The website allows you to easily add new entries or correct existing entries in CC-CEDICT.
# Submitted entries are checked and processed frequently and released for download in CEDICT format on that page.

set -e
source path.sh

stage=-1
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
cedict=cedict_1_0_ts_utf-8_mdbg.zip

mkdir -p data

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
    # stage -1: download and unpack the CC-CEDICT archive into data/
    test -f data/${cedict} || wget -O data/${cedict} ${cedict_url}
    pushd data
    unzip ${cedict}
    popd
fi

mkdir -p exp

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # stage 0: parse the dictionary into one JSON object per line
    cp data/cedict_ts.u8 exp/cedict
    python3 local/parser.py exp/cedict exp/cedict.json
fi
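After stage 0, exp/cedict.json holds one JSON object per line, as written by local/parser.py. A minimal sketch of reading the dump back in Python; the lookup table built at the end is illustrative, not part of the recipe:

```python
import json

entries = []
with open("exp/cedict.json", "rt", encoding="utf-8") as fin:
    for line in fin:
        line = line.strip()
        if line:
            entries.append(json.loads(line))

# Build a simplified-character lookup table (illustrative only).
by_simplified = {}
for entry in entries:
    by_simplified.setdefault(entry["simplified"], []).append(entry)

print(len(entries), "entries loaded")
```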
@@ -0,0 +1,2 @@
data
exp
@@ -0,0 +1,5 @@
# Download Baker dataset

The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.

Download URL: https://test.data-baker.com/#/data/index/source.
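Once the archive is in data/, the download can be sanity-checked before running the recipe. A minimal sketch, assuming the archive is named BZNSYP.rar and reusing the MD5 value that local/prepare_dataset.sh checks against:

```python
import hashlib

EXPECTED_MD5 = "c4350563bf7dc298f7dd364b2607be83"  # value used by local/prepare_dataset.sh


def md5sum(path: str, chunk_size: int = 1 << 20) -> str:
    """Compute the MD5 of a file without loading it all into memory."""
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


if md5sum("data/BZNSYP.rar") != EXPECTED_MD5:
    raise SystemExit("MD5 mismatch: the archive is incomplete or has been changed.")
print("Archive looks good.")
```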
@@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    """Convert the Chinese text lines of Baker's prosody label file to pinyin."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # even lines: "<sentence_id> <Chinese text with #N prosody marks>"
                    sentence_id, raw_text = line.strip().split()
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id} {transcription}\n')
                else:
                    # odd lines carry the hand-labeled pinyin; skip them here
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
@@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_labels(source, target):
    """Extract the hand-labeled pinyin lines from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # even lines: "<sentence_id> <Chinese text>"
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id} ')
                else:
                    # odd lines: the annotated pinyin transcription
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)
@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List, Union


def erized(syllable: str) -> bool:
    """Whether the syllable carries an erhua suffix.

    Syllables are tone-numbered (Style.TONE3), e.g.

    huar4 -> True
    guanr1 -> True
    er2 -> False
    """
    # note: for pinyin, len(syllable) >= 2 is always true;
    # if not, there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """
    Given a sequence of syllables from human annotation (reference), which
    makes tone sandhi explicit, and a sequence of syllables from a simple g2p
    program (generated), which does not consider sandhi, return the reference
    sequence with sandhi ignored.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    """
    i = 0
    j = 0

    # sandhi is ignored in the result while other errors are kept
    result = []
    while i < len(reference):
        if erized(reference[i]):
            # an erized reference syllable corresponds to two generated syllables
            result.append(reference[i])
            i += 1
            j += 2
        elif reference[i][:-1] == generated[j][:-1] and reference[i][
                -1] == '2' and generated[j][-1] == '3':
            # third-tone sandhi: the annotation says tone 2, the g2p output says tone 3
            result.append(generated[j])
            i += 1
            j += 1
        else:
            result.append(reference[i])
            i += 1
            j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch; some characters may have been ignored in the generated transcription."
    return result


def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    with open(reference, 'rt') as f_ref:
        with open(generated, 'rt') as f_gen:
            with open(output, 'wt') as f_out:
                for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
                    sentence_id, ref_transcription = ref.strip().split(' ', 1)
                    _, gen_transcription = gen.strip().split(' ', 1)
                    try:
                        result = ignore_sandhi(ref_transcription.split(),
                                               gen_transcription.split())
                        result = ' '.join(result)
                    except Exception:
                        print(
                            f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Using the reference."
                        )
                        result = ref_transcription
                    f_out.write(f"{sentence_id} {result}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="convert the reference transcription, ignoring sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated", type=str, help="path to the generated transcription.")
    parser.add_argument("--output", type=str, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)
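The alignment rules above (an erized reference syllable consuming two generated syllables, and tone-2/tone-3 sandhi being forgiven) can be exercised directly. A short usage sketch; the module name is illustrative, since the diff does not show the script's file name:

```python
# Hypothetical usage; "sandhi_utils" stands in for whichever module name
# the script above is saved under.
from sandhi_utils import erized, ignore_sandhi

# erhua detection on tone-numbered syllables
assert erized("huar4") and not erized("er2")

# Third-tone sandhi: annotation 'lao2' vs. g2p 'lao3' is not counted as an error.
assert ignore_sandhi(['lao2', 'hu3'], ['lao3', 'hu3']) == ['lao3', 'hu3']

# An erized reference syllable corresponds to two syllables from the g2p output.
assert ignore_sandhi(['huar4'], ['hua1', 'er2']) == ['huar4']
```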
@@ -0,0 +1,33 @@
#!/bin/bash

exp_dir="exp"
data_dir="data"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

archive=${data_dir}/"BZNSYP.rar"
if [ ! -f ${archive} ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
if [ ${md5_result} != ${MD5} ]; then
    echo "MD5 mismatch! The archive has been changed."
    exit -1
fi

# extract only the prosody labeling file from the archive
label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e ${archive} ${label_file}
cp ${filename} ${exp_dir}
rm -f ${filename}

if [ ! -f ${exp_dir}/${filename} ];then
    echo "File extraction failed!"
    exit 1
fi

exit 0
@@ -0,0 +1,8 @@
export MAIN_ROOT=${PWD}/../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@@ -0,0 +1 @@
jieba
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}


if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    echo "stage 0: Extracting Prosody Labeling"
    bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
fi

# convert the Chinese transcription into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin

    python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi

echo "done"
exit 0
@@ -1,23 +0,0 @@
#!/usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@@ -1,23 +0,0 @@
#!/usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@@ -1,23 +0,0 @@
#!/usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@@ -1,23 +0,0 @@
#!/usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@@ -0,0 +1,99 @@
# Copyright 2014 Bernard Yue
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

__doc__ = """
Hanzi Converter 繁簡轉換器 | 繁简转换器

This module provides functions for converting Chinese text between simplified
and traditional characters. It returns a unicode representation of the text.

Class HanziConv is the main entry point of the module; you can import the
class by doing:

    >>> from hanziconv import HanziConv
"""

import os
from zhon import cedict


class HanziConv():
    """This class supports hanzi (漢字) conversion between simplified and
    traditional format"""
    __traditional_charmap = cedict.traditional
    __simplified_charmap = cedict.simplified

    @classmethod
    def __convert(cls, text, toTraditional=True):
        """Convert `text` to traditional characters if `toTraditional` is
        True, else convert to simplified characters.

        :param text: data to convert
        :param toTraditional: True -- convert to traditional text
                              False -- convert to simplified text
        :returns: converted `text`
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        fromMap = cls.__simplified_charmap
        toMap = cls.__traditional_charmap
        if not toTraditional:
            fromMap = cls.__traditional_charmap
            toMap = cls.__simplified_charmap

        final = []
        for c in text:
            index = fromMap.find(c)
            if index != -1:
                final.append(toMap[index])
            else:
                final.append(c)
        return ''.join(final)

    @classmethod
    def toSimplified(cls, text):
        """Convert `text` to a simplified character string, assuming text is
        a traditional character string.

        :param text: text to convert
        :returns: converted UTF-8 characters

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toSimplified('繁簡轉換器'))
        繁简转换器
        """
        return cls.__convert(text, toTraditional=False)

    @classmethod
    def toTraditional(cls, text):
        """Convert `text` to a traditional character string, assuming text is
        a simplified character string.

        :param text: text to convert
        :returns: converted UTF-8 characters

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toTraditional('繁简转换器'))
        繁簡轉換器
        """
        return cls.__convert(text, toTraditional=True)

    @classmethod
    def same(cls, text1, text2):
        """Return True if text1 and text2 mean literally the same, False
        otherwise.

        :param text1: string to compare to ``text2``
        :param text2: string to compare to ``text1``
        :returns: **True** -- ``text1`` and ``text2`` are the same in meaning,
                  **False** -- otherwise

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
        True
        """
        t1 = cls.toSimplified(text1)
        t2 = cls.toSimplified(text2)
        return t1 == t2
@@ -0,0 +1,339 @@
# author: kuangdd
# date: 2021/5/8
"""
#### style
Pinyin format conversion.

Converts between GB-style (tone-marked) pinyin and letter-plus-digit style pinyin.
"""
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(Path(__file__).stem)

# 2100 entries = 420 syllables * 5 tones
guobiao2shengyundiao_dict = {
|
||||
'a': 'a5', 'ā': 'a1', 'á': 'a2', 'ǎ': 'a3', 'à': 'a4', 'ai': 'ai5', 'āi': 'ai1', 'ái': 'ai2', 'ǎi': 'ai3',
|
||||
'ài': 'ai4', 'an': 'an5', 'ān': 'an1', 'án': 'an2', 'ǎn': 'an3', 'àn': 'an4', 'ang': 'ang5', 'āng': 'ang1',
|
||||
'áng': 'ang2', 'ǎng': 'ang3', 'àng': 'ang4', 'ao': 'ao5', 'āo': 'ao1', 'áo': 'ao2', 'ǎo': 'ao3', 'ào': 'ao4',
|
||||
'ba': 'ba5', 'bā': 'ba1', 'bá': 'ba2', 'bǎ': 'ba3', 'bà': 'ba4', 'bai': 'bai5', 'bāi': 'bai1', 'bái': 'bai2',
|
||||
'bǎi': 'bai3', 'bài': 'bai4', 'ban': 'ban5', 'bān': 'ban1', 'bán': 'ban2', 'bǎn': 'ban3', 'bàn': 'ban4',
|
||||
'bang': 'bang5', 'bāng': 'bang1', 'báng': 'bang2', 'bǎng': 'bang3', 'bàng': 'bang4', 'bao': 'bao5', 'bāo': 'bao1',
|
||||
'báo': 'bao2', 'bǎo': 'bao3', 'bào': 'bao4', 'bei': 'bei5', 'bēi': 'bei1', 'béi': 'bei2', 'běi': 'bei3',
|
||||
'bèi': 'bei4', 'ben': 'ben5', 'bēn': 'ben1', 'bén': 'ben2', 'běn': 'ben3', 'bèn': 'ben4', 'beng': 'beng5',
|
||||
'bēng': 'beng1', 'béng': 'beng2', 'běng': 'beng3', 'bèng': 'beng4', 'bi': 'bi5', 'bī': 'bi1', 'bí': 'bi2',
|
||||
'bǐ': 'bi3', 'bì': 'bi4', 'bian': 'bian5', 'biān': 'bian1', 'bián': 'bian2', 'biǎn': 'bian3', 'biàn': 'bian4',
|
||||
'biao': 'biao5', 'biāo': 'biao1', 'biáo': 'biao2', 'biǎo': 'biao3', 'biào': 'biao4', 'bie': 'bie5', 'biē': 'bie1',
|
||||
'bié': 'bie2', 'biě': 'bie3', 'biè': 'bie4', 'bin': 'bin5', 'bīn': 'bin1', 'bín': 'bin2', 'bǐn': 'bin3',
|
||||
'bìn': 'bin4', 'bing': 'bing5', 'bīng': 'bing1', 'bíng': 'bing2', 'bǐng': 'bing3', 'bìng': 'bing4', 'bo': 'bo5',
|
||||
'bō': 'bo1', 'bó': 'bo2', 'bǒ': 'bo3', 'bò': 'bo4', 'bu': 'bu5', 'bū': 'bu1', 'bú': 'bu2', 'bǔ': 'bu3', 'bù': 'bu4',
|
||||
'ca': 'ca5', 'cā': 'ca1', 'cá': 'ca2', 'cǎ': 'ca3', 'cà': 'ca4', 'cai': 'cai5', 'cāi': 'cai1', 'cái': 'cai2',
|
||||
'cǎi': 'cai3', 'cài': 'cai4', 'can': 'can5', 'cān': 'can1', 'cán': 'can2', 'cǎn': 'can3', 'càn': 'can4',
|
||||
'cang': 'cang5', 'cāng': 'cang1', 'cáng': 'cang2', 'cǎng': 'cang3', 'càng': 'cang4', 'cao': 'cao5', 'cāo': 'cao1',
|
||||
'cáo': 'cao2', 'cǎo': 'cao3', 'cào': 'cao4', 'ce': 'ce5', 'cē': 'ce1', 'cé': 'ce2', 'cě': 'ce3', 'cè': 'ce4',
|
||||
'cen': 'cen5', 'cēn': 'cen1', 'cén': 'cen2', 'cěn': 'cen3', 'cèn': 'cen4', 'ceng': 'ceng5', 'cēng': 'ceng1',
|
||||
'céng': 'ceng2', 'cěng': 'ceng3', 'cèng': 'ceng4', 'cha': 'cha5', 'chā': 'cha1', 'chá': 'cha2', 'chǎ': 'cha3',
|
||||
'chà': 'cha4', 'chai': 'chai5', 'chāi': 'chai1', 'chái': 'chai2', 'chǎi': 'chai3', 'chài': 'chai4', 'chan': 'chan5',
|
||||
'chān': 'chan1', 'chán': 'chan2', 'chǎn': 'chan3', 'chàn': 'chan4', 'chang': 'chang5', 'chāng': 'chang1',
|
||||
'cháng': 'chang2', 'chǎng': 'chang3', 'chàng': 'chang4', 'chao': 'chao5', 'chāo': 'chao1', 'cháo': 'chao2',
|
||||
'chǎo': 'chao3', 'chào': 'chao4', 'che': 'che5', 'chē': 'che1', 'ché': 'che2', 'chě': 'che3', 'chè': 'che4',
|
||||
'chen': 'chen5', 'chēn': 'chen1', 'chén': 'chen2', 'chěn': 'chen3', 'chèn': 'chen4', 'cheng': 'cheng5',
|
||||
'chēng': 'cheng1', 'chéng': 'cheng2', 'chěng': 'cheng3', 'chèng': 'cheng4', 'chi': 'chi5', 'chī': 'chi1',
|
||||
'chí': 'chi2', 'chǐ': 'chi3', 'chì': 'chi4', 'chong': 'chong5', 'chōng': 'chong1', 'chóng': 'chong2',
|
||||
'chǒng': 'chong3', 'chòng': 'chong4', 'chou': 'chou5', 'chōu': 'chou1', 'chóu': 'chou2', 'chǒu': 'chou3',
|
||||
'chòu': 'chou4', 'chu': 'chu5', 'chū': 'chu1', 'chú': 'chu2', 'chǔ': 'chu3', 'chù': 'chu4', 'chuai': 'chuai5',
|
||||
'chuāi': 'chuai1', 'chuái': 'chuai2', 'chuǎi': 'chuai3', 'chuài': 'chuai4', 'chuan': 'chuan5', 'chuān': 'chuan1',
|
||||
'chuán': 'chuan2', 'chuǎn': 'chuan3', 'chuàn': 'chuan4', 'chuang': 'chuang5', 'chuāng': 'chuang1',
|
||||
'chuáng': 'chuang2', 'chuǎng': 'chuang3', 'chuàng': 'chuang4', 'chui': 'chui5', 'chuī': 'chui1', 'chuí': 'chui2',
|
||||
'chuǐ': 'chui3', 'chuì': 'chui4', 'chun': 'chun5', 'chūn': 'chun1', 'chún': 'chun2', 'chǔn': 'chun3',
|
||||
'chùn': 'chun4', 'chuo': 'chuo5', 'chuō': 'chuo1', 'chuó': 'chuo2', 'chuǒ': 'chuo3', 'chuò': 'chuo4', 'ci': 'ci5',
|
||||
'cī': 'ci1', 'cí': 'ci2', 'cǐ': 'ci3', 'cì': 'ci4', 'cong': 'cong5', 'cōng': 'cong1', 'cóng': 'cong2',
|
||||
'cǒng': 'cong3', 'còng': 'cong4', 'cou': 'cou5', 'cōu': 'cou1', 'cóu': 'cou2', 'cǒu': 'cou3', 'còu': 'cou4',
|
||||
'cu': 'cu5', 'cū': 'cu1', 'cú': 'cu2', 'cǔ': 'cu3', 'cù': 'cu4', 'cuan': 'cuan5', 'cuān': 'cuan1', 'cuán': 'cuan2',
|
||||
'cuǎn': 'cuan3', 'cuàn': 'cuan4', 'cui': 'cui5', 'cuī': 'cui1', 'cuí': 'cui2', 'cuǐ': 'cui3', 'cuì': 'cui4',
|
||||
'cun': 'cun5', 'cūn': 'cun1', 'cún': 'cun2', 'cǔn': 'cun3', 'cùn': 'cun4', 'cuo': 'cuo5', 'cuō': 'cuo1',
|
||||
'cuó': 'cuo2', 'cuǒ': 'cuo3', 'cuò': 'cuo4', 'da': 'da5', 'dā': 'da1', 'dá': 'da2', 'dǎ': 'da3', 'dà': 'da4',
|
||||
'dai': 'dai5', 'dāi': 'dai1', 'dái': 'dai2', 'dǎi': 'dai3', 'dài': 'dai4', 'dan': 'dan5', 'dān': 'dan1',
|
||||
'dán': 'dan2', 'dǎn': 'dan3', 'dàn': 'dan4', 'dang': 'dang5', 'dāng': 'dang1', 'dáng': 'dang2', 'dǎng': 'dang3',
|
||||
'dàng': 'dang4', 'dao': 'dao5', 'dāo': 'dao1', 'dáo': 'dao2', 'dǎo': 'dao3', 'dào': 'dao4', 'de': 'de5',
|
||||
'dē': 'de1', 'dé': 'de2', 'dě': 'de3', 'dè': 'de4', 'dei': 'dei5', 'dēi': 'dei1', 'déi': 'dei2', 'děi': 'dei3',
|
||||
'dèi': 'dei4', 'den': 'den5', 'dēn': 'den1', 'dén': 'den2', 'děn': 'den3', 'dèn': 'den4', 'deng': 'deng5',
|
||||
'dēng': 'deng1', 'déng': 'deng2', 'děng': 'deng3', 'dèng': 'deng4', 'di': 'di5', 'dī': 'di1', 'dí': 'di2',
|
||||
'dǐ': 'di3', 'dì': 'di4', 'dia': 'dia5', 'diā': 'dia1', 'diá': 'dia2', 'diǎ': 'dia3', 'dià': 'dia4',
|
||||
'dian': 'dian5', 'diān': 'dian1', 'dián': 'dian2', 'diǎn': 'dian3', 'diàn': 'dian4', 'diao': 'diao5',
|
||||
'diāo': 'diao1', 'diáo': 'diao2', 'diǎo': 'diao3', 'diào': 'diao4', 'die': 'die5', 'diē': 'die1', 'dié': 'die2',
|
||||
'diě': 'die3', 'diè': 'die4', 'ding': 'ding5', 'dīng': 'ding1', 'díng': 'ding2', 'dǐng': 'ding3', 'dìng': 'ding4',
|
||||
'diu': 'diu5', 'diū': 'diu1', 'diú': 'diu2', 'diǔ': 'diu3', 'diù': 'diu4', 'dong': 'dong5', 'dōng': 'dong1',
|
||||
'dóng': 'dong2', 'dǒng': 'dong3', 'dòng': 'dong4', 'dou': 'dou5', 'dōu': 'dou1', 'dóu': 'dou2', 'dǒu': 'dou3',
|
||||
'dòu': 'dou4', 'du': 'du5', 'dū': 'du1', 'dú': 'du2', 'dǔ': 'du3', 'dù': 'du4', 'duan': 'duan5', 'duān': 'duan1',
|
||||
'duán': 'duan2', 'duǎn': 'duan3', 'duàn': 'duan4', 'dui': 'dui5', 'duī': 'dui1', 'duí': 'dui2', 'duǐ': 'dui3',
|
||||
'duì': 'dui4', 'dun': 'dun5', 'dūn': 'dun1', 'dún': 'dun2', 'dǔn': 'dun3', 'dùn': 'dun4', 'duo': 'duo5',
|
||||
'duō': 'duo1', 'duó': 'duo2', 'duǒ': 'duo3', 'duò': 'duo4', 'e': 'e5', 'ē': 'e1', 'é': 'e2', 'ě': 'e3', 'è': 'e4',
|
||||
'ei': 'ei5', 'ēi': 'ei1', 'éi': 'ei2', 'ěi': 'ei3', 'èi': 'ei4', 'en': 'en5', 'ēn': 'en1', 'én': 'en2', 'ěn': 'en3',
|
||||
'èn': 'en4', 'eng': 'eng5', 'ēng': 'eng1', 'éng': 'eng2', 'ěng': 'eng3', 'èng': 'eng4', 'er': 'er5', 'ēr': 'er1',
|
||||
'ér': 'er2', 'ěr': 'er3', 'èr': 'er4', 'fa': 'fa5', 'fā': 'fa1', 'fá': 'fa2', 'fǎ': 'fa3', 'fà': 'fa4',
|
||||
'fan': 'fan5', 'fān': 'fan1', 'fán': 'fan2', 'fǎn': 'fan3', 'fàn': 'fan4', 'fang': 'fang5', 'fāng': 'fang1',
|
||||
'fáng': 'fang2', 'fǎng': 'fang3', 'fàng': 'fang4', 'fei': 'fei5', 'fēi': 'fei1', 'féi': 'fei2', 'fěi': 'fei3',
|
||||
'fèi': 'fei4', 'fen': 'fen5', 'fēn': 'fen1', 'fén': 'fen2', 'fěn': 'fen3', 'fèn': 'fen4', 'feng': 'feng5',
|
||||
'fēng': 'feng1', 'féng': 'feng2', 'fěng': 'feng3', 'fèng': 'feng4', 'fo': 'fo5', 'fō': 'fo1', 'fó': 'fo2',
|
||||
'fǒ': 'fo3', 'fò': 'fo4', 'fou': 'fou5', 'fōu': 'fou1', 'fóu': 'fou2', 'fǒu': 'fou3', 'fòu': 'fou4', 'fu': 'fu5',
|
||||
'fū': 'fu1', 'fú': 'fu2', 'fǔ': 'fu3', 'fù': 'fu4', 'ga': 'ga5', 'gā': 'ga1', 'gá': 'ga2', 'gǎ': 'ga3', 'gà': 'ga4',
|
||||
'gai': 'gai5', 'gāi': 'gai1', 'gái': 'gai2', 'gǎi': 'gai3', 'gài': 'gai4', 'gan': 'gan5', 'gān': 'gan1',
|
||||
'gán': 'gan2', 'gǎn': 'gan3', 'gàn': 'gan4', 'gang': 'gang5', 'gāng': 'gang1', 'gáng': 'gang2', 'gǎng': 'gang3',
|
||||
'gàng': 'gang4', 'gao': 'gao5', 'gāo': 'gao1', 'gáo': 'gao2', 'gǎo': 'gao3', 'gào': 'gao4', 'ge': 'ge5',
|
||||
'gē': 'ge1', 'gé': 'ge2', 'gě': 'ge3', 'gè': 'ge4', 'gei': 'gei5', 'gēi': 'gei1', 'géi': 'gei2', 'gěi': 'gei3',
|
||||
'gèi': 'gei4', 'gen': 'gen5', 'gēn': 'gen1', 'gén': 'gen2', 'gěn': 'gen3', 'gèn': 'gen4', 'geng': 'geng5',
|
||||
'gēng': 'geng1', 'géng': 'geng2', 'gěng': 'geng3', 'gèng': 'geng4', 'gong': 'gong5', 'gōng': 'gong1',
|
||||
'góng': 'gong2', 'gǒng': 'gong3', 'gòng': 'gong4', 'gou': 'gou5', 'gōu': 'gou1', 'góu': 'gou2', 'gǒu': 'gou3',
|
||||
'gòu': 'gou4', 'gu': 'gu5', 'gū': 'gu1', 'gú': 'gu2', 'gǔ': 'gu3', 'gù': 'gu4', 'gua': 'gua5', 'guā': 'gua1',
|
||||
'guá': 'gua2', 'guǎ': 'gua3', 'guà': 'gua4', 'guai': 'guai5', 'guāi': 'guai1', 'guái': 'guai2', 'guǎi': 'guai3',
|
||||
'guài': 'guai4', 'guan': 'guan5', 'guān': 'guan1', 'guán': 'guan2', 'guǎn': 'guan3', 'guàn': 'guan4',
|
||||
'guang': 'guang5', 'guāng': 'guang1', 'guáng': 'guang2', 'guǎng': 'guang3', 'guàng': 'guang4', 'gui': 'gui5',
|
||||
'guī': 'gui1', 'guí': 'gui2', 'guǐ': 'gui3', 'guì': 'gui4', 'gun': 'gun5', 'gūn': 'gun1', 'gún': 'gun2',
|
||||
'gǔn': 'gun3', 'gùn': 'gun4', 'guo': 'guo5', 'guō': 'guo1', 'guó': 'guo2', 'guǒ': 'guo3', 'guò': 'guo4',
|
||||
'ha': 'ha5', 'hā': 'ha1', 'há': 'ha2', 'hǎ': 'ha3', 'hà': 'ha4', 'hai': 'hai5', 'hāi': 'hai1', 'hái': 'hai2',
|
||||
'hǎi': 'hai3', 'hài': 'hai4', 'han': 'han5', 'hān': 'han1', 'hán': 'han2', 'hǎn': 'han3', 'hàn': 'han4',
|
||||
'hang': 'hang5', 'hāng': 'hang1', 'háng': 'hang2', 'hǎng': 'hang3', 'hàng': 'hang4', 'hao': 'hao5', 'hāo': 'hao1',
|
||||
'háo': 'hao2', 'hǎo': 'hao3', 'hào': 'hao4', 'he': 'he5', 'hē': 'he1', 'hé': 'he2', 'hě': 'he3', 'hè': 'he4',
|
||||
'hei': 'hei5', 'hēi': 'hei1', 'héi': 'hei2', 'hěi': 'hei3', 'hèi': 'hei4', 'hen': 'hen5', 'hēn': 'hen1',
|
||||
'hén': 'hen2', 'hěn': 'hen3', 'hèn': 'hen4', 'heng': 'heng5', 'hēng': 'heng1', 'héng': 'heng2', 'hěng': 'heng3',
|
||||
'hèng': 'heng4', 'hong': 'hong5', 'hōng': 'hong1', 'hóng': 'hong2', 'hǒng': 'hong3', 'hòng': 'hong4', 'hou': 'hou5',
|
||||
'hōu': 'hou1', 'hóu': 'hou2', 'hǒu': 'hou3', 'hòu': 'hou4', 'hu': 'hu5', 'hū': 'hu1', 'hú': 'hu2', 'hǔ': 'hu3',
|
||||
'hù': 'hu4', 'hua': 'hua5', 'huā': 'hua1', 'huá': 'hua2', 'huǎ': 'hua3', 'huà': 'hua4', 'huai': 'huai5',
|
||||
'huāi': 'huai1', 'huái': 'huai2', 'huǎi': 'huai3', 'huài': 'huai4', 'huan': 'huan5', 'huān': 'huan1',
|
||||
'huán': 'huan2', 'huǎn': 'huan3', 'huàn': 'huan4', 'huang': 'huang5', 'huāng': 'huang1', 'huáng': 'huang2',
|
||||
'huǎng': 'huang3', 'huàng': 'huang4', 'hui': 'hui5', 'huī': 'hui1', 'huí': 'hui2', 'huǐ': 'hui3', 'huì': 'hui4',
|
||||
'hun': 'hun5', 'hūn': 'hun1', 'hún': 'hun2', 'hǔn': 'hun3', 'hùn': 'hun4', 'huo': 'huo5', 'huō': 'huo1',
|
||||
'huó': 'huo2', 'huǒ': 'huo3', 'huò': 'huo4', 'ji': 'ji5', 'jī': 'ji1', 'jí': 'ji2', 'jǐ': 'ji3', 'jì': 'ji4',
|
||||
'jia': 'jia5', 'jiā': 'jia1', 'jiá': 'jia2', 'jiǎ': 'jia3', 'jià': 'jia4', 'jian': 'jian5', 'jiān': 'jian1',
|
||||
'jián': 'jian2', 'jiǎn': 'jian3', 'jiàn': 'jian4', 'jiang': 'jiang5', 'jiāng': 'jiang1', 'jiáng': 'jiang2',
|
||||
'jiǎng': 'jiang3', 'jiàng': 'jiang4', 'jiao': 'jiao5', 'jiāo': 'jiao1', 'jiáo': 'jiao2', 'jiǎo': 'jiao3',
|
||||
'jiào': 'jiao4', 'jie': 'jie5', 'jiē': 'jie1', 'jié': 'jie2', 'jiě': 'jie3', 'jiè': 'jie4', 'jin': 'jin5',
|
||||
'jīn': 'jin1', 'jín': 'jin2', 'jǐn': 'jin3', 'jìn': 'jin4', 'jing': 'jing5', 'jīng': 'jing1', 'jíng': 'jing2',
|
||||
'jǐng': 'jing3', 'jìng': 'jing4', 'jiong': 'jiong5', 'jiōng': 'jiong1', 'jióng': 'jiong2', 'jiǒng': 'jiong3',
|
||||
'jiòng': 'jiong4', 'jiu': 'jiu5', 'jiū': 'jiu1', 'jiú': 'jiu2', 'jiǔ': 'jiu3', 'jiù': 'jiu4', 'ju': 'ju5',
|
||||
'jū': 'ju1', 'jú': 'ju2', 'jǔ': 'ju3', 'jù': 'ju4', 'juan': 'juan5', 'juān': 'juan1', 'juán': 'juan2',
|
||||
'juǎn': 'juan3', 'juàn': 'juan4', 'jue': 'jue5', 'juē': 'jue1', 'jué': 'jue2', 'juě': 'jue3', 'juè': 'jue4',
|
||||
'jun': 'jun5', 'jūn': 'jun1', 'jún': 'jun2', 'jǔn': 'jun3', 'jùn': 'jun4', 'ka': 'ka5', 'kā': 'ka1', 'ká': 'ka2',
|
||||
'kǎ': 'ka3', 'kà': 'ka4', 'kai': 'kai5', 'kāi': 'kai1', 'kái': 'kai2', 'kǎi': 'kai3', 'kài': 'kai4', 'kan': 'kan5',
|
||||
'kān': 'kan1', 'kán': 'kan2', 'kǎn': 'kan3', 'kàn': 'kan4', 'kang': 'kang5', 'kāng': 'kang1', 'káng': 'kang2',
|
||||
'kǎng': 'kang3', 'kàng': 'kang4', 'kao': 'kao5', 'kāo': 'kao1', 'káo': 'kao2', 'kǎo': 'kao3', 'kào': 'kao4',
|
||||
'ke': 'ke5', 'kē': 'ke1', 'ké': 'ke2', 'kě': 'ke3', 'kè': 'ke4', 'ken': 'ken5', 'kēn': 'ken1', 'kén': 'ken2',
|
||||
'kěn': 'ken3', 'kèn': 'ken4', 'keng': 'keng5', 'kēng': 'keng1', 'kéng': 'keng2', 'kěng': 'keng3', 'kèng': 'keng4',
|
||||
'kong': 'kong5', 'kōng': 'kong1', 'kóng': 'kong2', 'kǒng': 'kong3', 'kòng': 'kong4', 'kou': 'kou5', 'kōu': 'kou1',
|
||||
'kóu': 'kou2', 'kǒu': 'kou3', 'kòu': 'kou4', 'ku': 'ku5', 'kū': 'ku1', 'kú': 'ku2', 'kǔ': 'ku3', 'kù': 'ku4',
|
||||
'kua': 'kua5', 'kuā': 'kua1', 'kuá': 'kua2', 'kuǎ': 'kua3', 'kuà': 'kua4', 'kuai': 'kuai5', 'kuāi': 'kuai1',
|
||||
'kuái': 'kuai2', 'kuǎi': 'kuai3', 'kuài': 'kuai4', 'kuan': 'kuan5', 'kuān': 'kuan1', 'kuán': 'kuan2',
|
||||
'kuǎn': 'kuan3', 'kuàn': 'kuan4', 'kuang': 'kuang5', 'kuāng': 'kuang1', 'kuáng': 'kuang2', 'kuǎng': 'kuang3',
|
||||
'kuàng': 'kuang4', 'kui': 'kui5', 'kuī': 'kui1', 'kuí': 'kui2', 'kuǐ': 'kui3', 'kuì': 'kui4', 'kun': 'kun5',
|
||||
'kūn': 'kun1', 'kún': 'kun2', 'kǔn': 'kun3', 'kùn': 'kun4', 'kuo': 'kuo5', 'kuō': 'kuo1', 'kuó': 'kuo2',
|
||||
'kuǒ': 'kuo3', 'kuò': 'kuo4', 'la': 'la5', 'lā': 'la1', 'lá': 'la2', 'lǎ': 'la3', 'là': 'la4', 'lai': 'lai5',
|
||||
'lāi': 'lai1', 'lái': 'lai2', 'lǎi': 'lai3', 'lài': 'lai4', 'lan': 'lan5', 'lān': 'lan1', 'lán': 'lan2',
|
||||
'lǎn': 'lan3', 'làn': 'lan4', 'lang': 'lang5', 'lāng': 'lang1', 'láng': 'lang2', 'lǎng': 'lang3', 'làng': 'lang4',
|
||||
'lao': 'lao5', 'lāo': 'lao1', 'láo': 'lao2', 'lǎo': 'lao3', 'lào': 'lao4', 'le': 'le5', 'lē': 'le1', 'lé': 'le2',
|
||||
'lě': 'le3', 'lè': 'le4', 'lei': 'lei5', 'lēi': 'lei1', 'léi': 'lei2', 'lěi': 'lei3', 'lèi': 'lei4',
|
||||
'leng': 'leng5', 'lēng': 'leng1', 'léng': 'leng2', 'lěng': 'leng3', 'lèng': 'leng4', 'li': 'li5', 'lī': 'li1',
|
||||
'lí': 'li2', 'lǐ': 'li3', 'lì': 'li4', 'lia': 'lia5', 'liā': 'lia1', 'liá': 'lia2', 'liǎ': 'lia3', 'lià': 'lia4',
|
||||
'lian': 'lian5', 'liān': 'lian1', 'lián': 'lian2', 'liǎn': 'lian3', 'liàn': 'lian4', 'liang': 'liang5',
|
||||
'liāng': 'liang1', 'liáng': 'liang2', 'liǎng': 'liang3', 'liàng': 'liang4', 'liao': 'liao5', 'liāo': 'liao1',
|
||||
'liáo': 'liao2', 'liǎo': 'liao3', 'liào': 'liao4', 'lie': 'lie5', 'liē': 'lie1', 'lié': 'lie2', 'liě': 'lie3',
|
||||
'liè': 'lie4', 'lin': 'lin5', 'līn': 'lin1', 'lín': 'lin2', 'lǐn': 'lin3', 'lìn': 'lin4', 'ling': 'ling5',
|
||||
'līng': 'ling1', 'líng': 'ling2', 'lǐng': 'ling3', 'lìng': 'ling4', 'liu': 'liu5', 'liū': 'liu1', 'liú': 'liu2',
|
||||
'liǔ': 'liu3', 'liù': 'liu4', 'lo': 'lo5', 'lō': 'lo1', 'ló': 'lo2', 'lǒ': 'lo3', 'lò': 'lo4', 'long': 'long5',
|
||||
'lōng': 'long1', 'lóng': 'long2', 'lǒng': 'long3', 'lòng': 'long4', 'lou': 'lou5', 'lōu': 'lou1', 'lóu': 'lou2',
|
||||
'lǒu': 'lou3', 'lòu': 'lou4', 'lu': 'lu5', 'lū': 'lu1', 'lú': 'lu2', 'lǔ': 'lu3', 'lù': 'lu4', 'luan': 'luan5',
|
||||
'luān': 'luan1', 'luán': 'luan2', 'luǎn': 'luan3', 'luàn': 'luan4', 'lun': 'lun5', 'lūn': 'lun1', 'lún': 'lun2',
|
||||
'lǔn': 'lun3', 'lùn': 'lun4', 'luo': 'luo5', 'luō': 'luo1', 'luó': 'luo2', 'luǒ': 'luo3', 'luò': 'luo4',
|
||||
'lü': 'lv5', 'lǖ': 'lv1', 'lǘ': 'lv2', 'lǚ': 'lv3', 'lǜ': 'lv4', 'lüe': 'lve5', 'lüē': 'lve1', 'lüé': 'lve2',
|
||||
'lüě': 'lve3', 'lüè': 'lve4', 'ma': 'ma5', 'mā': 'ma1', 'má': 'ma2', 'mǎ': 'ma3', 'mà': 'ma4', 'mai': 'mai5',
|
||||
'māi': 'mai1', 'mái': 'mai2', 'mǎi': 'mai3', 'mài': 'mai4', 'man': 'man5', 'mān': 'man1', 'mán': 'man2',
|
||||
'mǎn': 'man3', 'màn': 'man4', 'mang': 'mang5', 'māng': 'mang1', 'máng': 'mang2', 'mǎng': 'mang3', 'màng': 'mang4',
|
||||
'mao': 'mao5', 'māo': 'mao1', 'máo': 'mao2', 'mǎo': 'mao3', 'mào': 'mao4', 'me': 'me5', 'mē': 'me1', 'mé': 'me2',
|
||||
'mě': 'me3', 'mè': 'me4', 'mei': 'mei5', 'mēi': 'mei1', 'méi': 'mei2', 'měi': 'mei3', 'mèi': 'mei4', 'men': 'men5',
|
||||
'mēn': 'men1', 'mén': 'men2', 'měn': 'men3', 'mèn': 'men4', 'meng': 'meng5', 'mēng': 'meng1', 'méng': 'meng2',
|
||||
'měng': 'meng3', 'mèng': 'meng4', 'mi': 'mi5', 'mī': 'mi1', 'mí': 'mi2', 'mǐ': 'mi3', 'mì': 'mi4', 'mian': 'mian5',
|
||||
'miān': 'mian1', 'mián': 'mian2', 'miǎn': 'mian3', 'miàn': 'mian4', 'miao': 'miao5', 'miāo': 'miao1',
|
||||
'miáo': 'miao2', 'miǎo': 'miao3', 'miào': 'miao4', 'mie': 'mie5', 'miē': 'mie1', 'mié': 'mie2', 'miě': 'mie3',
|
||||
'miè': 'mie4', 'min': 'min5', 'mīn': 'min1', 'mín': 'min2', 'mǐn': 'min3', 'mìn': 'min4', 'ming': 'ming5',
|
||||
'mīng': 'ming1', 'míng': 'ming2', 'mǐng': 'ming3', 'mìng': 'ming4', 'miu': 'miu5', 'miū': 'miu1', 'miú': 'miu2',
|
||||
'miǔ': 'miu3', 'miù': 'miu4', 'mo': 'mo5', 'mō': 'mo1', 'mó': 'mo2', 'mǒ': 'mo3', 'mò': 'mo4', 'mou': 'mou5',
|
||||
'mōu': 'mou1', 'móu': 'mou2', 'mǒu': 'mou3', 'mòu': 'mou4', 'mu': 'mu5', 'mū': 'mu1', 'mú': 'mu2', 'mǔ': 'mu3',
|
||||
'mù': 'mu4', 'na': 'na5', 'nā': 'na1', 'ná': 'na2', 'nǎ': 'na3', 'nà': 'na4', 'nai': 'nai5', 'nāi': 'nai1',
|
||||
'nái': 'nai2', 'nǎi': 'nai3', 'nài': 'nai4', 'nan': 'nan5', 'nān': 'nan1', 'nán': 'nan2', 'nǎn': 'nan3',
|
||||
'nàn': 'nan4', 'nang': 'nang5', 'nāng': 'nang1', 'náng': 'nang2', 'nǎng': 'nang3', 'nàng': 'nang4', 'nao': 'nao5',
|
||||
'nāo': 'nao1', 'náo': 'nao2', 'nǎo': 'nao3', 'nào': 'nao4', 'ne': 'ne5', 'nē': 'ne1', 'né': 'ne2', 'ně': 'ne3',
|
||||
'nè': 'ne4', 'nei': 'nei5', 'nēi': 'nei1', 'néi': 'nei2', 'něi': 'nei3', 'nèi': 'nei4', 'nen': 'nen5',
|
||||
'nēn': 'nen1', 'nén': 'nen2', 'něn': 'nen3', 'nèn': 'nen4', 'neng': 'neng5', 'nēng': 'neng1', 'néng': 'neng2',
|
||||
'něng': 'neng3', 'nèng': 'neng4', 'ni': 'ni5', 'nī': 'ni1', 'ní': 'ni2', 'nǐ': 'ni3', 'nì': 'ni4', 'nian': 'nian5',
|
||||
'niān': 'nian1', 'nián': 'nian2', 'niǎn': 'nian3', 'niàn': 'nian4', 'niang': 'niang5', 'niāng': 'niang1',
|
||||
'niáng': 'niang2', 'niǎng': 'niang3', 'niàng': 'niang4', 'niao': 'niao5', 'niāo': 'niao1', 'niáo': 'niao2',
|
||||
'niǎo': 'niao3', 'niào': 'niao4', 'nie': 'nie5', 'niē': 'nie1', 'nié': 'nie2', 'niě': 'nie3', 'niè': 'nie4',
|
||||
'nin': 'nin5', 'nīn': 'nin1', 'nín': 'nin2', 'nǐn': 'nin3', 'nìn': 'nin4', 'ning': 'ning5', 'nīng': 'ning1',
|
||||
'níng': 'ning2', 'nǐng': 'ning3', 'nìng': 'ning4', 'niu': 'niu5', 'niū': 'niu1', 'niú': 'niu2', 'niǔ': 'niu3',
|
||||
'niù': 'niu4', 'nong': 'nong5', 'nōng': 'nong1', 'nóng': 'nong2', 'nǒng': 'nong3', 'nòng': 'nong4', 'nou': 'nou5',
|
||||
'nōu': 'nou1', 'nóu': 'nou2', 'nǒu': 'nou3', 'nòu': 'nou4', 'nu': 'nu5', 'nū': 'nu1', 'nú': 'nu2', 'nǔ': 'nu3',
|
||||
'nù': 'nu4', 'nuan': 'nuan5', 'nuān': 'nuan1', 'nuán': 'nuan2', 'nuǎn': 'nuan3', 'nuàn': 'nuan4', 'nuo': 'nuo5',
|
||||
'nuō': 'nuo1', 'nuó': 'nuo2', 'nuǒ': 'nuo3', 'nuò': 'nuo4', 'nü': 'nv5', 'nǖ': 'nv1', 'nǘ': 'nv2', 'nǚ': 'nv3',
|
||||
'nǜ': 'nv4', 'nüe': 'nve5', 'nüē': 'nve1', 'nüé': 'nve2', 'nüě': 'nve3', 'nüè': 'nve4', 'o': 'o5', 'ō': 'o1',
|
||||
'ó': 'o2', 'ǒ': 'o3', 'ò': 'o4', 'ou': 'ou5', 'ōu': 'ou1', 'óu': 'ou2', 'ǒu': 'ou3', 'òu': 'ou4', 'pa': 'pa5',
|
||||
'pā': 'pa1', 'pá': 'pa2', 'pǎ': 'pa3', 'pà': 'pa4', 'pai': 'pai5', 'pāi': 'pai1', 'pái': 'pai2', 'pǎi': 'pai3',
|
||||
'pài': 'pai4', 'pan': 'pan5', 'pān': 'pan1', 'pán': 'pan2', 'pǎn': 'pan3', 'pàn': 'pan4', 'pang': 'pang5',
|
||||
'pāng': 'pang1', 'páng': 'pang2', 'pǎng': 'pang3', 'pàng': 'pang4', 'pao': 'pao5', 'pāo': 'pao1', 'páo': 'pao2',
|
||||
'pǎo': 'pao3', 'pào': 'pao4', 'pei': 'pei5', 'pēi': 'pei1', 'péi': 'pei2', 'pěi': 'pei3', 'pèi': 'pei4',
|
||||
'pen': 'pen5', 'pēn': 'pen1', 'pén': 'pen2', 'pěn': 'pen3', 'pèn': 'pen4', 'peng': 'peng5', 'pēng': 'peng1',
|
||||
'péng': 'peng2', 'pěng': 'peng3', 'pèng': 'peng4', 'pi': 'pi5', 'pī': 'pi1', 'pí': 'pi2', 'pǐ': 'pi3', 'pì': 'pi4',
|
||||
'pian': 'pian5', 'piān': 'pian1', 'pián': 'pian2', 'piǎn': 'pian3', 'piàn': 'pian4', 'piao': 'piao5',
|
||||
'piāo': 'piao1', 'piáo': 'piao2', 'piǎo': 'piao3', 'piào': 'piao4', 'pie': 'pie5', 'piē': 'pie1', 'pié': 'pie2',
|
||||
'piě': 'pie3', 'piè': 'pie4', 'pin': 'pin5', 'pīn': 'pin1', 'pín': 'pin2', 'pǐn': 'pin3', 'pìn': 'pin4',
|
||||
'ping': 'ping5', 'pīng': 'ping1', 'píng': 'ping2', 'pǐng': 'ping3', 'pìng': 'ping4', 'po': 'po5', 'pō': 'po1',
|
||||
'pó': 'po2', 'pǒ': 'po3', 'pò': 'po4', 'pou': 'pou5', 'pōu': 'pou1', 'póu': 'pou2', 'pǒu': 'pou3', 'pòu': 'pou4',
|
||||
'pu': 'pu5', 'pū': 'pu1', 'pú': 'pu2', 'pǔ': 'pu3', 'pù': 'pu4', 'qi': 'qi5', 'qī': 'qi1', 'qí': 'qi2', 'qǐ': 'qi3',
|
||||
'qì': 'qi4', 'qia': 'qia5', 'qiā': 'qia1', 'qiá': 'qia2', 'qiǎ': 'qia3', 'qià': 'qia4', 'qian': 'qian5',
|
||||
'qiān': 'qian1', 'qián': 'qian2', 'qiǎn': 'qian3', 'qiàn': 'qian4', 'qiang': 'qiang5', 'qiāng': 'qiang1',
|
||||
'qiáng': 'qiang2', 'qiǎng': 'qiang3', 'qiàng': 'qiang4', 'qiao': 'qiao5', 'qiāo': 'qiao1', 'qiáo': 'qiao2',
|
||||
'qiǎo': 'qiao3', 'qiào': 'qiao4', 'qie': 'qie5', 'qiē': 'qie1', 'qié': 'qie2', 'qiě': 'qie3', 'qiè': 'qie4',
|
||||
'qin': 'qin5', 'qīn': 'qin1', 'qín': 'qin2', 'qǐn': 'qin3', 'qìn': 'qin4', 'qing': 'qing5', 'qīng': 'qing1',
|
||||
'qíng': 'qing2', 'qǐng': 'qing3', 'qìng': 'qing4', 'qiong': 'qiong5', 'qiōng': 'qiong1', 'qióng': 'qiong2',
|
||||
'qiǒng': 'qiong3', 'qiòng': 'qiong4', 'qiu': 'qiu5', 'qiū': 'qiu1', 'qiú': 'qiu2', 'qiǔ': 'qiu3', 'qiù': 'qiu4',
|
||||
'qu': 'qu5', 'qū': 'qu1', 'qú': 'qu2', 'qǔ': 'qu3', 'qù': 'qu4', 'quan': 'quan5', 'quān': 'quan1', 'quán': 'quan2',
|
||||
'quǎn': 'quan3', 'quàn': 'quan4', 'que': 'que5', 'quē': 'que1', 'qué': 'que2', 'quě': 'que3', 'què': 'que4',
|
||||
'qun': 'qun5', 'qūn': 'qun1', 'qún': 'qun2', 'qǔn': 'qun3', 'qùn': 'qun4', 'ran': 'ran5', 'rān': 'ran1',
|
||||
'rán': 'ran2', 'rǎn': 'ran3', 'ràn': 'ran4', 'rang': 'rang5', 'rāng': 'rang1', 'ráng': 'rang2', 'rǎng': 'rang3',
|
||||
'ràng': 'rang4', 'rao': 'rao5', 'rāo': 'rao1', 'ráo': 'rao2', 'rǎo': 'rao3', 'rào': 'rao4', 're': 're5',
|
||||
'rē': 're1', 'ré': 're2', 'rě': 're3', 'rè': 're4', 'ren': 'ren5', 'rēn': 'ren1', 'rén': 'ren2', 'rěn': 'ren3',
|
||||
'rèn': 'ren4', 'reng': 'reng5', 'rēng': 'reng1', 'réng': 'reng2', 'rěng': 'reng3', 'rèng': 'reng4', 'ri': 'ri5',
|
||||
'rī': 'ri1', 'rí': 'ri2', 'rǐ': 'ri3', 'rì': 'ri4', 'rong': 'rong5', 'rōng': 'rong1', 'róng': 'rong2',
|
||||
'rǒng': 'rong3', 'ròng': 'rong4', 'rou': 'rou5', 'rōu': 'rou1', 'róu': 'rou2', 'rǒu': 'rou3', 'ròu': 'rou4',
|
||||
'ru': 'ru5', 'rū': 'ru1', 'rú': 'ru2', 'rǔ': 'ru3', 'rù': 'ru4', 'ruan': 'ruan5', 'ruān': 'ruan1', 'ruán': 'ruan2',
|
||||
'ruǎn': 'ruan3', 'ruàn': 'ruan4', 'rui': 'rui5', 'ruī': 'rui1', 'ruí': 'rui2', 'ruǐ': 'rui3', 'ruì': 'rui4',
|
||||
'run': 'run5', 'rūn': 'run1', 'rún': 'run2', 'rǔn': 'run3', 'rùn': 'run4', 'ruo': 'ruo5', 'ruō': 'ruo1',
|
||||
'ruó': 'ruo2', 'ruǒ': 'ruo3', 'ruò': 'ruo4', 'sa': 'sa5', 'sā': 'sa1', 'sá': 'sa2', 'sǎ': 'sa3', 'sà': 'sa4',
|
||||
'sai': 'sai5', 'sāi': 'sai1', 'sái': 'sai2', 'sǎi': 'sai3', 'sài': 'sai4', 'san': 'san5', 'sān': 'san1',
|
||||
'sán': 'san2', 'sǎn': 'san3', 'sàn': 'san4', 'sang': 'sang5', 'sāng': 'sang1', 'sáng': 'sang2', 'sǎng': 'sang3',
|
||||
'sàng': 'sang4', 'sao': 'sao5', 'sāo': 'sao1', 'sáo': 'sao2', 'sǎo': 'sao3', 'sào': 'sao4', 'se': 'se5',
|
||||
'sē': 'se1', 'sé': 'se2', 'sě': 'se3', 'sè': 'se4', 'sen': 'sen5', 'sēn': 'sen1', 'sén': 'sen2', 'sěn': 'sen3',
|
||||
'sèn': 'sen4', 'seng': 'seng5', 'sēng': 'seng1', 'séng': 'seng2', 'sěng': 'seng3', 'sèng': 'seng4', 'sha': 'sha5',
|
||||
'shā': 'sha1', 'shá': 'sha2', 'shǎ': 'sha3', 'shà': 'sha4', 'shai': 'shai5', 'shāi': 'shai1', 'shái': 'shai2',
|
||||
'shǎi': 'shai3', 'shài': 'shai4', 'shan': 'shan5', 'shān': 'shan1', 'shán': 'shan2', 'shǎn': 'shan3',
|
||||
'shàn': 'shan4', 'shang': 'shang5', 'shāng': 'shang1', 'sháng': 'shang2', 'shǎng': 'shang3', 'shàng': 'shang4',
|
||||
'shao': 'shao5', 'shāo': 'shao1', 'sháo': 'shao2', 'shǎo': 'shao3', 'shào': 'shao4', 'she': 'she5', 'shē': 'she1',
|
||||
'shé': 'she2', 'shě': 'she3', 'shè': 'she4', 'shei': 'shei5', 'shēi': 'shei1', 'shéi': 'shei2', 'shěi': 'shei3',
|
||||
'shèi': 'shei4', 'shen': 'shen5', 'shēn': 'shen1', 'shén': 'shen2', 'shěn': 'shen3', 'shèn': 'shen4',
|
||||
'sheng': 'sheng5', 'shēng': 'sheng1', 'shéng': 'sheng2', 'shěng': 'sheng3', 'shèng': 'sheng4', 'shi': 'shi5',
|
||||
'shī': 'shi1', 'shí': 'shi2', 'shǐ': 'shi3', 'shì': 'shi4', 'shou': 'shou5', 'shōu': 'shou1', 'shóu': 'shou2',
|
||||
'shǒu': 'shou3', 'shòu': 'shou4', 'shu': 'shu5', 'shū': 'shu1', 'shú': 'shu2', 'shǔ': 'shu3', 'shù': 'shu4',
|
||||
'shua': 'shua5', 'shuā': 'shua1', 'shuá': 'shua2', 'shuǎ': 'shua3', 'shuà': 'shua4', 'shuai': 'shuai5',
|
||||
'shuāi': 'shuai1', 'shuái': 'shuai2', 'shuǎi': 'shuai3', 'shuài': 'shuai4', 'shuan': 'shuan5', 'shuān': 'shuan1',
|
||||
'shuán': 'shuan2', 'shuǎn': 'shuan3', 'shuàn': 'shuan4', 'shuang': 'shuang5', 'shuāng': 'shuang1',
|
||||
'shuáng': 'shuang2', 'shuǎng': 'shuang3', 'shuàng': 'shuang4', 'shui': 'shui5', 'shuī': 'shui1', 'shuí': 'shui2',
|
||||
'shuǐ': 'shui3', 'shuì': 'shui4', 'shun': 'shun5', 'shūn': 'shun1', 'shún': 'shun2', 'shǔn': 'shun3',
|
||||
'shùn': 'shun4', 'shuo': 'shuo5', 'shuō': 'shuo1', 'shuó': 'shuo2', 'shuǒ': 'shuo3', 'shuò': 'shuo4', 'si': 'si5',
|
||||
'sī': 'si1', 'sí': 'si2', 'sǐ': 'si3', 'sì': 'si4', 'song': 'song5', 'sōng': 'song1', 'sóng': 'song2',
|
||||
'sǒng': 'song3', 'sòng': 'song4', 'sou': 'sou5', 'sōu': 'sou1', 'sóu': 'sou2', 'sǒu': 'sou3', 'sòu': 'sou4',
|
||||
'su': 'su5', 'sū': 'su1', 'sú': 'su2', 'sǔ': 'su3', 'sù': 'su4', 'suan': 'suan5', 'suān': 'suan1', 'suán': 'suan2',
|
||||
'suǎn': 'suan3', 'suàn': 'suan4', 'sui': 'sui5', 'suī': 'sui1', 'suí': 'sui2', 'suǐ': 'sui3', 'suì': 'sui4',
|
||||
'sun': 'sun5', 'sūn': 'sun1', 'sún': 'sun2', 'sǔn': 'sun3', 'sùn': 'sun4', 'suo': 'suo5', 'suō': 'suo1',
|
||||
'suó': 'suo2', 'suǒ': 'suo3', 'suò': 'suo4', 'ta': 'ta5', 'tā': 'ta1', 'tá': 'ta2', 'tǎ': 'ta3', 'tà': 'ta4',
|
||||
'tai': 'tai5', 'tāi': 'tai1', 'tái': 'tai2', 'tǎi': 'tai3', 'tài': 'tai4', 'tan': 'tan5', 'tān': 'tan1',
|
||||
'tán': 'tan2', 'tǎn': 'tan3', 'tàn': 'tan4', 'tang': 'tang5', 'tāng': 'tang1', 'táng': 'tang2', 'tǎng': 'tang3',
|
||||
'tàng': 'tang4', 'tao': 'tao5', 'tāo': 'tao1', 'táo': 'tao2', 'tǎo': 'tao3', 'tào': 'tao4', 'te': 'te5',
|
||||
'tē': 'te1', 'té': 'te2', 'tě': 'te3', 'tè': 'te4', 'teng': 'teng5', 'tēng': 'teng1', 'téng': 'teng2',
|
||||
'těng': 'teng3', 'tèng': 'teng4', 'ti': 'ti5', 'tī': 'ti1', 'tí': 'ti2', 'tǐ': 'ti3', 'tì': 'ti4', 'tian': 'tian5',
|
||||
'tiān': 'tian1', 'tián': 'tian2', 'tiǎn': 'tian3', 'tiàn': 'tian4', 'tiao': 'tiao5', 'tiāo': 'tiao1',
|
||||
'tiáo': 'tiao2', 'tiǎo': 'tiao3', 'tiào': 'tiao4', 'tie': 'tie5', 'tiē': 'tie1', 'tié': 'tie2', 'tiě': 'tie3',
|
||||
'tiè': 'tie4', 'ting': 'ting5', 'tīng': 'ting1', 'tíng': 'ting2', 'tǐng': 'ting3', 'tìng': 'ting4', 'tong': 'tong5',
|
||||
'tōng': 'tong1', 'tóng': 'tong2', 'tǒng': 'tong3', 'tòng': 'tong4', 'tou': 'tou5', 'tōu': 'tou1', 'tóu': 'tou2',
|
||||
'tǒu': 'tou3', 'tòu': 'tou4', 'tu': 'tu5', 'tū': 'tu1', 'tú': 'tu2', 'tǔ': 'tu3', 'tù': 'tu4', 'tuan': 'tuan5',
|
||||
'tuān': 'tuan1', 'tuán': 'tuan2', 'tuǎn': 'tuan3', 'tuàn': 'tuan4', 'tui': 'tui5', 'tuī': 'tui1', 'tuí': 'tui2',
|
||||
'tuǐ': 'tui3', 'tuì': 'tui4', 'tun': 'tun5', 'tūn': 'tun1', 'tún': 'tun2', 'tǔn': 'tun3', 'tùn': 'tun4',
|
||||
'tuo': 'tuo5', 'tuō': 'tuo1', 'tuó': 'tuo2', 'tuǒ': 'tuo3', 'tuò': 'tuo4', 'wa': 'wa5', 'wā': 'wa1', 'wá': 'wa2',
|
||||
'wǎ': 'wa3', 'wà': 'wa4', 'wai': 'wai5', 'wāi': 'wai1', 'wái': 'wai2', 'wǎi': 'wai3', 'wài': 'wai4', 'wan': 'wan5',
|
||||
'wān': 'wan1', 'wán': 'wan2', 'wǎn': 'wan3', 'wàn': 'wan4', 'wang': 'wang5', 'wāng': 'wang1', 'wáng': 'wang2',
|
||||
'wǎng': 'wang3', 'wàng': 'wang4', 'wei': 'wei5', 'wēi': 'wei1', 'wéi': 'wei2', 'wěi': 'wei3', 'wèi': 'wei4',
|
||||
'wen': 'wen5', 'wēn': 'wen1', 'wén': 'wen2', 'wěn': 'wen3', 'wèn': 'wen4', 'weng': 'weng5', 'wēng': 'weng1',
|
||||
'wéng': 'weng2', 'wěng': 'weng3', 'wèng': 'weng4', 'wo': 'wo5', 'wō': 'wo1', 'wó': 'wo2', 'wǒ': 'wo3', 'wò': 'wo4',
|
||||
'wu': 'wu5', 'wū': 'wu1', 'wú': 'wu2', 'wǔ': 'wu3', 'wù': 'wu4', 'xi': 'xi5', 'xī': 'xi1', 'xí': 'xi2', 'xǐ': 'xi3',
|
||||
'xì': 'xi4', 'xia': 'xia5', 'xiā': 'xia1', 'xiá': 'xia2', 'xiǎ': 'xia3', 'xià': 'xia4', 'xian': 'xian5',
|
||||
'xiān': 'xian1', 'xián': 'xian2', 'xiǎn': 'xian3', 'xiàn': 'xian4', 'xiang': 'xiang5', 'xiāng': 'xiang1',
|
||||
'xiáng': 'xiang2', 'xiǎng': 'xiang3', 'xiàng': 'xiang4', 'xiao': 'xiao5', 'xiāo': 'xiao1', 'xiáo': 'xiao2',
|
||||
'xiǎo': 'xiao3', 'xiào': 'xiao4', 'xie': 'xie5', 'xiē': 'xie1', 'xié': 'xie2', 'xiě': 'xie3', 'xiè': 'xie4',
|
||||
'xin': 'xin5', 'xīn': 'xin1', 'xín': 'xin2', 'xǐn': 'xin3', 'xìn': 'xin4', 'xing': 'xing5', 'xīng': 'xing1',
|
||||
'xíng': 'xing2', 'xǐng': 'xing3', 'xìng': 'xing4', 'xiong': 'xiong5', 'xiōng': 'xiong1', 'xióng': 'xiong2',
|
||||
'xiǒng': 'xiong3', 'xiòng': 'xiong4', 'xiu': 'xiu5', 'xiū': 'xiu1', 'xiú': 'xiu2', 'xiǔ': 'xiu3', 'xiù': 'xiu4',
|
||||
'xu': 'xu5', 'xū': 'xu1', 'xú': 'xu2', 'xǔ': 'xu3', 'xù': 'xu4', 'xuan': 'xuan5', 'xuān': 'xuan1', 'xuán': 'xuan2',
|
||||
'xuǎn': 'xuan3', 'xuàn': 'xuan4', 'xue': 'xue5', 'xuē': 'xue1', 'xué': 'xue2', 'xuě': 'xue3', 'xuè': 'xue4',
|
||||
'xun': 'xun5', 'xūn': 'xun1', 'xún': 'xun2', 'xǔn': 'xun3', 'xùn': 'xun4', 'ya': 'ya5', 'yā': 'ya1', 'yá': 'ya2',
|
||||
'yǎ': 'ya3', 'yà': 'ya4', 'yan': 'yan5', 'yān': 'yan1', 'yán': 'yan2', 'yǎn': 'yan3', 'yàn': 'yan4',
|
||||
'yang': 'yang5', 'yāng': 'yang1', 'yáng': 'yang2', 'yǎng': 'yang3', 'yàng': 'yang4', 'yao': 'yao5', 'yāo': 'yao1',
|
||||
'yáo': 'yao2', 'yǎo': 'yao3', 'yào': 'yao4', 'ye': 'ye5', 'yē': 'ye1', 'yé': 'ye2', 'yě': 'ye3', 'yè': 'ye4',
|
||||
'yi': 'yi5', 'yī': 'yi1', 'yí': 'yi2', 'yǐ': 'yi3', 'yì': 'yi4', 'yin': 'yin5', 'yīn': 'yin1', 'yín': 'yin2',
|
||||
'yǐn': 'yin3', 'yìn': 'yin4', 'ying': 'ying5', 'yīng': 'ying1', 'yíng': 'ying2', 'yǐng': 'ying3', 'yìng': 'ying4',
|
||||
'yo': 'yo5', 'yō': 'yo1', 'yó': 'yo2', 'yǒ': 'yo3', 'yò': 'yo4', 'yong': 'yong5', 'yōng': 'yong1', 'yóng': 'yong2',
'yǒng': 'yong3', 'yòng': 'yong4', 'you': 'you5', 'yōu': 'you1', 'yóu': 'you2', 'yǒu': 'you3', 'yòu': 'you4',
'yu': 'yu5', 'yū': 'yu1', 'yú': 'yu2', 'yǔ': 'yu3', 'yù': 'yu4', 'yuan': 'yuan5', 'yuān': 'yuan1', 'yuán': 'yuan2',
'yuǎn': 'yuan3', 'yuàn': 'yuan4', 'yue': 'yue5', 'yuē': 'yue1', 'yué': 'yue2', 'yuě': 'yue3', 'yuè': 'yue4',
'yun': 'yun5', 'yūn': 'yun1', 'yún': 'yun2', 'yǔn': 'yun3', 'yùn': 'yun4', 'za': 'za5', 'zā': 'za1', 'zá': 'za2',
'zǎ': 'za3', 'zà': 'za4', 'zai': 'zai5', 'zāi': 'zai1', 'zái': 'zai2', 'zǎi': 'zai3', 'zài': 'zai4', 'zan': 'zan5',
'zān': 'zan1', 'zán': 'zan2', 'zǎn': 'zan3', 'zàn': 'zan4', 'zang': 'zang5', 'zāng': 'zang1', 'záng': 'zang2',
'zǎng': 'zang3', 'zàng': 'zang4', 'zao': 'zao5', 'zāo': 'zao1', 'záo': 'zao2', 'zǎo': 'zao3', 'zào': 'zao4',
'ze': 'ze5', 'zē': 'ze1', 'zé': 'ze2', 'zě': 'ze3', 'zè': 'ze4', 'zei': 'zei5', 'zēi': 'zei1', 'zéi': 'zei2',
'zěi': 'zei3', 'zèi': 'zei4', 'zen': 'zen5', 'zēn': 'zen1', 'zén': 'zen2', 'zěn': 'zen3', 'zèn': 'zen4',
'zeng': 'zeng5', 'zēng': 'zeng1', 'zéng': 'zeng2', 'zěng': 'zeng3', 'zèng': 'zeng4', 'zha': 'zha5', 'zhā': 'zha1',
'zhá': 'zha2', 'zhǎ': 'zha3', 'zhà': 'zha4', 'zhai': 'zhai5', 'zhāi': 'zhai1', 'zhái': 'zhai2', 'zhǎi': 'zhai3',
'zhài': 'zhai4', 'zhan': 'zhan5', 'zhān': 'zhan1', 'zhán': 'zhan2', 'zhǎn': 'zhan3', 'zhàn': 'zhan4',
'zhang': 'zhang5', 'zhāng': 'zhang1', 'zháng': 'zhang2', 'zhǎng': 'zhang3', 'zhàng': 'zhang4', 'zhao': 'zhao5',
'zhāo': 'zhao1', 'zháo': 'zhao2', 'zhǎo': 'zhao3', 'zhào': 'zhao4', 'zhe': 'zhe5', 'zhē': 'zhe1', 'zhé': 'zhe2',
'zhě': 'zhe3', 'zhè': 'zhe4', 'zhen': 'zhen5', 'zhēn': 'zhen1', 'zhén': 'zhen2', 'zhěn': 'zhen3', 'zhèn': 'zhen4',
'zheng': 'zheng5', 'zhēng': 'zheng1', 'zhéng': 'zheng2', 'zhěng': 'zheng3', 'zhèng': 'zheng4', 'zhi': 'zhi5',
'zhī': 'zhi1', 'zhí': 'zhi2', 'zhǐ': 'zhi3', 'zhì': 'zhi4', 'zhong': 'zhong5', 'zhōng': 'zhong1', 'zhóng': 'zhong2',
'zhǒng': 'zhong3', 'zhòng': 'zhong4', 'zhou': 'zhou5', 'zhōu': 'zhou1', 'zhóu': 'zhou2', 'zhǒu': 'zhou3',
'zhòu': 'zhou4', 'zhu': 'zhu5', 'zhū': 'zhu1', 'zhú': 'zhu2', 'zhǔ': 'zhu3', 'zhù': 'zhu4', 'zhua': 'zhua5',
'zhuā': 'zhua1', 'zhuá': 'zhua2', 'zhuǎ': 'zhua3', 'zhuà': 'zhua4', 'zhuai': 'zhuai5', 'zhuāi': 'zhuai1',
'zhuái': 'zhuai2', 'zhuǎi': 'zhuai3', 'zhuài': 'zhuai4', 'zhuan': 'zhuan5', 'zhuān': 'zhuan1', 'zhuán': 'zhuan2',
'zhuǎn': 'zhuan3', 'zhuàn': 'zhuan4', 'zhuang': 'zhuang5', 'zhuāng': 'zhuang1', 'zhuáng': 'zhuang2',
'zhuǎng': 'zhuang3', 'zhuàng': 'zhuang4', 'zhui': 'zhui5', 'zhuī': 'zhui1', 'zhuí': 'zhui2', 'zhuǐ': 'zhui3',
'zhuì': 'zhui4', 'zhun': 'zhun5', 'zhūn': 'zhun1', 'zhún': 'zhun2', 'zhǔn': 'zhun3', 'zhùn': 'zhun4',
'zhuo': 'zhuo5', 'zhuō': 'zhuo1', 'zhuó': 'zhuo2', 'zhuǒ': 'zhuo3', 'zhuò': 'zhuo4', 'zi': 'zi5', 'zī': 'zi1',
'zí': 'zi2', 'zǐ': 'zi3', 'zì': 'zi4', 'zong': 'zong5', 'zōng': 'zong1', 'zóng': 'zong2', 'zǒng': 'zong3',
'zòng': 'zong4', 'zou': 'zou5', 'zōu': 'zou1', 'zóu': 'zou2', 'zǒu': 'zou3', 'zòu': 'zou4', 'zu': 'zu5',
'zū': 'zu1', 'zú': 'zu2', 'zǔ': 'zu3', 'zù': 'zu4', 'zuan': 'zuan5', 'zuān': 'zuan1', 'zuán': 'zuan2',
'zuǎn': 'zuan3', 'zuàn': 'zuan4', 'zui': 'zui5', 'zuī': 'zui1', 'zuí': 'zui2', 'zuǐ': 'zui3', 'zuì': 'zui4',
'zun': 'zun5', 'zūn': 'zun1', 'zún': 'zun2', 'zǔn': 'zun3', 'zùn': 'zun4', 'zuo': 'zuo5', 'zuō': 'zuo1',
'zuó': 'zuo2', 'zuǒ': 'zuo3', 'zuò': 'zuo4', 'zhei': 'zhei5', 'zhēi': 'zhei1', 'zhéi': 'zhei2', 'zhěi': 'zhei3',
'zhèi': 'zhei4', 'kei': 'kei5', 'kēi': 'kei1', 'kéi': 'kei2', 'kěi': 'kei3', 'kèi': 'kei4', 'tei': 'tei5',
'tēi': 'tei1', 'téi': 'tei2', 'těi': 'tei3', 'tèi': 'tei4', 'len': 'len5', 'lēn': 'len1', 'lén': 'len2',
'lěn': 'len3', 'lèn': 'len4', 'nun': 'nun5', 'nūn': 'nun1', 'nún': 'nun2', 'nǔn': 'nun3', 'nùn': 'nun4',
'nia': 'nia5', 'niā': 'nia1', 'niá': 'nia2', 'niǎ': 'nia3', 'nià': 'nia4', 'rua': 'rua5', 'ruā': 'rua1',
'ruá': 'rua2', 'ruǎ': 'rua3', 'ruà': 'rua4', 'fiao': 'fiao5', 'fiāo': 'fiao1', 'fiáo': 'fiao2', 'fiǎo': 'fiao3',
'fiào': 'fiao4', 'cei': 'cei5', 'cēi': 'cei1', 'céi': 'cei2', 'cěi': 'cei3', 'cèi': 'cei4', 'wong': 'wong5',
'wōng': 'wong1', 'wóng': 'wong2', 'wǒng': 'wong3', 'wòng': 'wong4', 'din': 'din5', 'dīn': 'din1', 'dín': 'din2',
'dǐn': 'din3', 'dìn': 'din4', 'chua': 'chua5', 'chuā': 'chua1', 'chuá': 'chua2', 'chuǎ': 'chua3', 'chuà': 'chua4',
'n': 'n5', 'n1': 'n1', 'ń': 'n2', 'ň': 'n3', 'ǹ': 'n4', 'ng': 'ng5', 'ng1': 'ng1', 'ńg': 'ng2', 'ňg': 'ng3',
'ǹg': 'ng4'}

shengyundiao2guobiao_dict = {v: k for k, v in guobiao2shengyundiao_dict.items()}


def guobiao2shengyundiao(pinyin_list):
    """Convert guobiao-style (tone-marked) pinyin to initial/final plus tone-number style pinyin."""
    out = []
    for pin in pinyin_list:
        out.append(guobiao2shengyundiao_dict.get(pin))
    return out


def shengyundiao2guobiao(pinyin_list):
    """Convert initial/final plus tone-number style pinyin to guobiao-style (tone-marked) pinyin."""
    out = []
    for pin in pinyin_list:
        out.append(shengyundiao2guobiao_dict.get(pin))
    return out


if __name__ == "__main__":
    logger.info(__file__)
    out = shengyundiao2guobiao('ni2 hao3 a5'.split())
    assert out == ['ní', 'hǎo', 'a']
    out = guobiao2shengyundiao(out)
    assert out == ['ni2', 'hao3', 'a5']
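A quick round-trip sketch of the two converters above (illustrative only; it assumes the module is importable and only uses syllables that appear in the mapping table, since unmapped syllables come back as None from the dict.get lookup):

# Illustrative usage sketch of the converters defined above.
tones = guobiao2shengyundiao(['zhōng', 'yǔ'])   # -> ['zhong1', 'yu3']
marks = shengyundiao2guobiao(tones)             # -> ['zhōng', 'yǔ']
missing = guobiao2shengyundiao(['xyz'])         # -> [None]; syllables outside the table are not converted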
@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
@ -0,0 +1,116 @@
"""
### english

from https://github.com/keithito/tacotron
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""
import re
import random

from . import cleaners
from .symbols import symbols

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')


def get_arpabet(word, dictionary):
    word_arpabet = dictionary.lookup(word)
    if word_arpabet is not None:
        return "{" + word_arpabet[0] + "}"
    else:
        return word


def text_to_sequence(text, cleaner_names, dictionary=None, p_arpabet=1.0):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
      dictionary: arpabet class with arpabet dictionary

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []

    space = _symbols_to_sequence(' ')
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            clean_text = _clean_text(text, cleaner_names)
            if dictionary is not None:
                clean_text = [get_arpabet(w, dictionary)
                              if random.random() < p_arpabet else w
                              for w in clean_text.split(" ")]

                for i in range(len(clean_text)):
                    t = clean_text[i]
                    if t.startswith("{"):
                        sequence += _arpabet_to_sequence(t[1:-1])
                    else:
                        sequence += _symbols_to_sequence(t)
                    sequence += space
            else:
                sequence += _symbols_to_sequence(clean_text)
            break

        clean_text = _clean_text(text, cleaner_names)
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # remove trailing space
    sequence = sequence[:-1] if sequence[-1] == space[0] else sequence
    return sequence


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = []
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result.append(s)
    result = ''.join(result)
    return result.replace('}{', ' ')


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(['@' + s for s in text.split()])


def _should_keep_symbol(s):
    # Compare by value; identity checks against string literals are unreliable.
    return s in _symbol_to_id and s != '_' and s != '~'
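A minimal usage sketch for the module above (it assumes the file is importable as a package named `text`, which is only an illustrative name, together with the cleaners, symbols, and numbers modules from this same diff):

# Illustrative only: encode a sentence, then decode it back.
from text import text_to_sequence, sequence_to_text   # hypothetical package/module name

ids = text_to_sequence('Hello, {HH AH0 L OW1} world!', ['basic_cleaners'])
print(ids)                    # a list of integer symbol IDs; the braced span is treated as ARPAbet
print(sequence_to_text(ids))  # roughly: 'hello, {HH AH0 L OW1} world!'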
@ -0,0 +1,91 @@
'''
### english

from https://github.com/keithito/tacotron
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from .numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
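A small illustrative check of the three pipelines above (standalone sketch; the sample string and the outputs in the comments are approximate, since unidecode and inflect versions can differ slightly):

# Illustrative only: the three cleaning pipelines applied to the same string.
s = 'Dr. Müller bought 16 books for $3.50!'
print(basic_cleaners(s))            # lowercased, whitespace collapsed; accents and digits kept
print(transliteration_cleaners(s))  # roughly: 'dr. muller bought 16 books for $3.50!'
print(english_cleaners(s))          # roughly: 'doctor muller bought sixteen books for three dollars, fifty cents!'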
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,65 @@
""" from https://github.com/keithito/tacotron """

import re


valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        return self._entries.get(word.upper())


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            parts = line.split('  ')  # CMUdict separates word and pronunciation with two spaces
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(' ')
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
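A self-contained sketch of the wrapper above, feeding it a file-like object instead of a path (the two dictionary entries are made up for illustration and use the two-space word/pronunciation separator that CMUdict files use):

# Illustrative only: CMUDict accepts any file-like object with CMUdict-formatted lines.
import io

fake_dict = io.StringIO('HELLO  HH AH0 L OW1\nWORLD  W ER1 L D\n')
cmu = CMUDict(fake_dict)
print(len(cmu))             # 2
print(cmu.lookup('hello'))  # ['HH AH0 L OW1']
print(cmu.lookup('zzz'))    # None (not in the dictionary)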
@ -0,0 +1,71 @@
""" from https://github.com/keithito/tacotron """

import inflect
import re


_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
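A few illustrative calls to normalize_numbers (the outputs in the comments are approximate and depend on the installed inflect version):

# Illustrative only: the normalization passes are applied in the order defined above.
print(normalize_numbers('I owe you $3.50.'))           # roughly: 'I owe you three dollars, fifty cents.'
print(normalize_numbers('Born in 2008, not 1999.'))    # roughly: 'Born in two thousand eight, not nineteen ninety-nine.'
print(normalize_numbers('He came 2nd out of 1,000.'))  # roughly: 'He came second out of one thousand.'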
@ -0,0 +1,21 @@
""" from https://github.com/keithito/tacotron """

'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
from . import cmudict

_punctuation = '!\'",.:;? '
_math = '#%&*+-/[]()'
_special = '_@©°½—₩€$'
_accented = 'áçéêëñöøćž'
_numbers = '0123456789'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = list(_punctuation + _math + _special + _accented + _numbers + _letters) + _arpabet
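A tiny sketch of how the exported list is typically consumed (the exact IDs depend on the symbol order above, so the printed values are illustrative):

# Illustrative only: build the symbol <-> ID maps the same way text/__init__.py does.
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
print(len(symbols))           # punctuation + math + special + accented + digits + letters + ARPAbet
print(_symbol_to_id['a'])     # plain characters come first in the list
print(_symbol_to_id['@AH0'])  # ARPAbet entries carry the '@' prefix so they cannot clash with letters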
@ -0,0 +1,4 @@
jieba
inflect
unidecode
tqdm
@ -1,14 +0,0 @@
=======
Credits
=======

Author and Maintainer
---------------------

* Thomas Roten <https://github.com/tsroten>

Contributors
------------

None yet. Why not be the first?
@ -1,88 +0,0 @@
Changes
=======

v0.1.0 (2013-05-05)
-------------------

* Initial release

v0.1.1 (2013-05-05)
-------------------

* Adds zhon.cedict package to setup.py

v0.2.0 (2013-05-07)
-------------------

* Allows for mapping between simplified and traditional.
* Adds logging to build_string().
* Adds constants for numbered Pinyin and accented Pinyin.

v0.2.1 (2013-05-07)
-------------------

* Fixes typo in README.rst.

v.1.0.0 (2014-01-25)
--------------------

* Complete rewrite that refactors code, renames constants, and improves Pinyin
  support.

v.1.1.0 (2014-01-28)
--------------------

* Adds ``zhon.pinyin.punctuation`` constant.
* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and
  ``zhon.pinyin.accented_sentence`` constants.
* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and
  ``zhon.pinyin.numbered_sentence`` constants.
* Fixes some README.rst typos.
* Clarifies information regarding Traditional and Simplified character
  constants in README.rst.
* Adds constant short names to README.rst.

v.1.1.1 (2014-01-29)
--------------------

* Adds documentation.
* Adds ``zhon.cedict.all`` constant.
* Removes duplicate code ranges from ``zhon.hanzi.characters``.
* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of
  a string containing code ranges.
* Removes duplicate letters in ``zhon.pinyin.consonants``.
* Refactors Pinyin vowels/consonant code.
* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16.
* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``.
* Fixes various typos.
* Removes numbers from Pinyin word constants. Fixes #15.
* Adds lowercase and uppercase constants to ``zhon.pinyin``.
* Fixes a bug with ``zhon.pinyin.sentence``.
* Adds ``sent`` alias for ``zhon.pinyin.sentence``.

v.1.1.2 (2014-01-31)
--------------------

* Fixes bug with ``zhon.cedict.all``.

v.1.1.3 (2014-02-12)
--------------------

* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17.
* Fixes r-suffix bug. Fixes #18.

v.1.1.4 (2015-01-25)
--------------------

* Removes duplicate module declarations in documentation.
* Moves tests inside zhon package.
* Adds travis config file.
* Adds Python 3.4 tests to travis and tox.
* Fixes flake8 warnings.
* Adds distutil fallback import statement to setup.py.
* Adds missing hanzi punctuation. Fixes #19.

v.1.1.5 (2016-05-23)
--------------------

* Add missing Zhuyin characters. Fixes #23.
@ -1,107 +0,0 @@
============
Contributing
============

Contributions are welcome, and they are greatly appreciated! Every
little bit helps, and credit will always be given.

You can contribute in many ways:

Types of Contributions
----------------------

Report Bugs
~~~~~~~~~~~

Report bugs at https://github.com/tsroten/zhon/issues.

If you are reporting a bug, please include:

* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* Detailed steps to reproduce the bug.

Fix Bugs
~~~~~~~~

Look through the GitHub issues for bugs. Anything tagged with "bug"
is open to whoever wants to implement it.

Implement Features
~~~~~~~~~~~~~~~~~~

Look through the GitHub issues for features. Anything tagged with "feature"
is open to whoever wants to implement it.

Write Documentation
~~~~~~~~~~~~~~~~~~~

Zhon could always use more documentation, whether as part of the
official Zhon docs, in docstrings, or even on the web in blog posts,
articles, and such.

Submit Feedback
~~~~~~~~~~~~~~~

The best way to send feedback is to file an issue at https://github.com/tsroten/zhon/issues.

If you are proposing a feature:

* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
  are welcome :)

Get Started!
------------

Ready to contribute? Here's how to set up `zhon` for local development.

1. Fork the `zhon` repo on GitHub.
2. Clone your fork locally::

    $ git clone git@github.com:your_name_here/zhon.git

3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::

    $ mkvirtualenv zhon
    $ cd zhon/
    $ python setup.py develop

4. Create a branch for local development::

    $ git checkout -b name-of-your-bugfix-or-feature

   Now you can make your changes locally.

5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::

    $ flake8 zhon
    $ python setup.py test
    $ tox

   To get flake8 and tox, just pip install them into your virtualenv.

   You can ignore the flake8 errors regarding `zhon.cedict` files. Rather than include hundreds of newline characters in each file, we are ignoring those errors.

6. Commit your changes and push your branch to GitHub::

    $ git add .
    $ git commit -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature

7. Submit a pull request through the GitHub website.

Pull Request Guidelines
-----------------------

Before you submit a pull request, check that it meets these guidelines:

1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
   your new functionality into a function with a docstring, and add the
   feature to the list in README.rst.
3. The pull request should work for Python 2.7, 3.3, and 3.4. Check
   https://travis-ci.org/tsroten/zhon/pull_requests
   and make sure that the tests pass for all supported Python versions.
4. If you want to receive credit, add your name to `AUTHORS.rst`.
@ -1,7 +0,0 @@
Copyright (c) 2013-2014 Thomas Roten

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.