Merge pull request #960 from PaddlePaddle/paddlespeech
[paddlespeech] merge deepspeech, parakeet and text_processing into paddlespeech
commit 58b24aa49f
@ -1,37 +0,0 @@
#!/bin/bash

setup_env(){
    cd tools && make && cd -
}

install(){
    if [ -f "setup.sh" ]; then
        bash setup.sh
        #export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi
}

print_env(){
    cat /etc/lsb-release
    gcc -v
    g++ -v
}

abort(){
    echo "Run install failed" 1>&2
    echo "Please check your code" 1>&2
    exit 1
}

# report failure if the script exits before reaching the end
trap 'abort' 0
set -e

print_env
setup_env
source tools/venv/bin/activate
install

# reset the exit trap: reaching this point means success
trap : 0
@ -1,23 +0,0 @@
#!/bin/bash

function abort(){
    echo "Your commit does not fit the PaddlePaddle code style" 1>&2
    echo "Please use pre-commit scripts to auto-format your code" 1>&2
    exit 1
}

trap 'abort' 0
set -e

source tools/venv/bin/activate

python3 --version

if ! pre-commit run -a ; then
    ls -lh
    git diff --exit-code
    exit 1
fi

trap : 0
@ -1,54 +0,0 @@
#!/bin/bash

abort(){
    echo "Run unittest failed" 1>&2
    echo "Please check your code" 1>&2
    exit 1
}

unittest(){
    cd "$1" > /dev/null
    if [ -f "setup.sh" ]; then
        bash setup.sh
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi
    # run unittest discovery in every tests/ directory, skipping the venv
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} -n1 bash -c \
        'python3 -m unittest discover -v -s {}'
    cd - > /dev/null
}

coverage(){
    cd "$1" > /dev/null

    if [ -f "setup.sh" ]; then
        bash setup.sh
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi

    # run unittest discovery under coverage; coverage run expects a script
    # or a module, so discovery is invoked via -m unittest
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} -n1 bash -c \
        'python3 -m coverage run --branch -m unittest discover -v -s {}'
    python3 -m coverage report -m
    python3 -m coverage html
    cd - > /dev/null
}

trap 'abort' 0
set -e

source tools/venv/bin/activate
#pip3 install pytest
#unittest .
coverage .

trap : 0
@ -1,35 +0,0 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@ -1,2 +0,0 @@
data
exp
@ -1,3 +0,0 @@
# G2P

* zh - Chinese G2P (grapheme-to-phoneme)
@ -1,53 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    """Re-generate pinyin labels from Baker's prosody labeling with pypinyin."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                # even lines hold "<sentence_id> <raw_text>"; odd lines hold
                # the human-annotated pinyin, which this script re-generates
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    # strip prosody markers like #1 ... #4
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id} {transcription}\n')
                else:
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
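As a sanity check on the pypinyin call above, here is a minimal sketch of what Style.TONE3 with neutral_tone_with_five=True produces (not part of this diff; the exact output can vary with the pypinyin version and its dictionary):

from pypinyin import lazy_pinyin, Style

# Style.TONE3 appends the tone number to each syllable.
print(lazy_pinyin('中心', style=Style.TONE3))  # ['zhong1', 'xin1']

# neutral_tone_with_five=True writes neutral-tone syllables with a "5".
print(lazy_pinyin('你们', style=Style.TONE3, neutral_tone_with_five=True))  # ['ni3', 'men5']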
@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_labels(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                # even lines hold "<sentence_id> <text>"; odd lines hold
                # the annotated pinyin transcription
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id} ')
                else:
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)
@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List
from typing import Union


def erized(syllable: str) -> bool:
    """Whether the syllable contains an erhua effect.

    Example
    --------
    huar -> True
    guanr -> True
    er -> False
    """
    # note: for pinyin, len(syllable) >= 2 always holds;
    # if not, there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """
    Given a sequence of syllables from human annotation (reference),
    which makes sandhi explicit, and a sequence of syllables from a
    simple g2p program (generated), which does not consider sandhi,
    return the reference sequence with sandhi ignored.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    """
    i = 0  # index into reference
    j = 0  # index into generated

    # sandhi is ignored in the result while other differences are kept
    result = []
    while i < len(reference):
        if erized(reference[i]):
            # an erized syllable in the reference corresponds to two
            # syllables in the generated sequence (e.g. "huar" vs "hua er")
            result.append(reference[i])
            i += 1
            j += 2
        elif reference[i][:-1] == generated[j][:-1] and reference[i][
                -1] == '2' and generated[j][-1] == '3':
            # a 3rd tone annotated as 2nd tone is sandhi; prefer the
            # generated (un-sandhied) syllable
            result.append(generated[j])
            i += 1
            j += 1
        else:
            result.append(reference[i])
            i += 1
            j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch; some characters may have been ignored in the generated transcription."
    return result


def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    with open(reference, 'rt') as f_ref:
        with open(generated, 'rt') as f_gen:
            with open(output, 'wt') as f_out:
                for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
                    sentence_id, ref_transcription = ref.strip().split(' ', 1)
                    _, gen_transcription = gen.strip().split(' ', 1)
                    try:
                        result = ignore_sandhi(ref_transcription.split(),
                                               gen_transcription.split())
                        result = ' '.join(result)
                    except Exception:
                        print(f"sentence_id: {sentence_id}: annotation error "
                              "in the reference or generated transcription; "
                              "using the reference.")
                        result = ref_transcription
                    f_out.write(f"{sentence_id} {result}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="clean the reference transcription by ignoring sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        help="path to the reference transcription of the baker dataset.")
    parser.add_argument(
        "--generated", type=str, help="path to the generated transcription.")
    parser.add_argument("--output", type=str, help="path to save the result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)
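A quick worked example of the merge performed by ignore_sandhi (illustrative only; the syllable lists are hypothetical annotations, not dataset entries):

# The reference annotates the 3-3 -> 2-3 tone sandhi ('lao2'); the naive
# g2p output keeps the citation tone ('lao3'), and the merge prefers it.
print(ignore_sandhi(['lao2', 'hu3'], ['lao3', 'hu3']))  # ['lao3', 'hu3']

# An erized reference syllable consumes two generated syllables ("huar"
# vs "hua" + "er"), which is why j advances by 2 in that branch.
print(ignore_sandhi(['huar2'], ['hua2', 'er2']))  # ['huar2']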
@ -1,33 +0,0 @@
#!/bin/bash

exp_dir="exp"
data_dir="data"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

archive="${data_dir}/BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker dataset not found! Download it to ${data_dir} first."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum "${archive}" | awk -F' ' '{print $1}')
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The archive may have been changed."
    exit -1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ]; then
    echo "File extraction failed!"
    exit 1
fi

exit 0
@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@ -1 +0,0 @@
jieba
@ -1,37 +0,0 @@
#!/usr/bin/env bash

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data=data

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    mkdir -p ${data}
    test -e ${data}/BZNSYP.rar || { echo "Please download BZNSYP.rar and put it in ${data}"; exit -1; }
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
    echo "stage 0: Extracting Prosody Labeling"
    bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi

# convert the Chinese transcription into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin

    python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi

echo "done"
exit 0
@ -1 +0,0 @@
exp
@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

from text_processing import normalization

parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

# read and write explicitly as UTF-8 since the input is Chinese text
with open(args.input, 'rt', encoding='utf-8') as fin:
    with open(args.output, 'wt', encoding='utf-8') as fout:
        for sent in fin:
            sent = normalization.normalize_sentence(sent.strip())
            fout.write(sent)
            fout.write('\n')
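For a quick interactive check, the same API can be called on a single string (a sketch, assuming text_processing is importable via the PYTHONPATH set in path.sh; the exact normalized form depends on the rules in text_processing.normalization):

from text_processing import normalization

# digits, dates, etc. are expected to be expanded into Chinese characters
print(normalization.normalize_sentence("今天是2021年10月1日"))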
@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
@ -1,26 +0,0 @@
#!/usr/bin/env bash
source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing"
    python3 local/test_normalization.py ${data_dir}/${filename} ${exp_dir}/normalized.txt
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text saved at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0