Merge pull request #960 from PaddlePaddle/paddlespeech
[paddlespeech] merge deepspeech, parakeet and text_processing into paddlespeech
commit 58b24aa49f
@ -1,37 +0,0 @@
#!/bin/bash

setup_env(){
    cd tools && make && cd -
}

install(){
    if [ -f "setup.sh" ]; then
        bash setup.sh
        #export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi
}

print_env(){
    cat /etc/lsb-release
    gcc -v
    g++ -v
}

abort(){
    echo "Run install failed" 1>&2
    echo "Please check your code" 1>&2
    exit 1
}

trap 'abort' 0
set -e

print_env
setup_env
source tools/venv/bin/activate
install

trap : 0
@ -1,23 +0,0 @@
#!/bin/bash

function abort(){
    echo "Your commit does not fit the PaddlePaddle code style" 1>&2
    echo "Please use pre-commit scripts to auto-format your code" 1>&2
    exit 1
}

trap 'abort' 0
set -e

source tools/venv/bin/activate

python3 --version

if ! pre-commit run -a ; then
    ls -lh
    git diff --exit-code
    exit 1
fi

trap : 0
@ -1,54 +0,0 @@
#!/bin/bash

abort(){
    echo "Run unittest failed" 1>&2
    echo "Please check your code" 1>&2
    exit 1
}

unittest(){
    cd $1 > /dev/null
    if [ -f "setup.sh" ]; then
        bash setup.sh
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} -n1 bash -c \
        'python3 -m unittest discover -v -s {}'
    cd - > /dev/null
}

coverage(){
    cd $1 > /dev/null
    if [ -f "setup.sh" ]; then
        bash setup.sh
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    if [ $? != 0 ]; then
        exit 1
    fi
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} -n1 bash -c \
        'python3 -m coverage run --branch {}'
    python3 -m coverage report -m
    python3 -m coverage html
    cd - > /dev/null
}

trap 'abort' 0
set -e

source tools/venv/bin/activate
#pip3 install pytest
#unittest .
coverage .

trap : 0
@ -1,35 +0,0 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
    set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
    echo.
    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
    echo.installed, then set the SPHINXBUILD environment variable to point
    echo.to the full path of the 'sphinx-build' executable. Alternatively you
    echo.may add the Sphinx directory to PATH.
    echo.
    echo.If you don't have Sphinx installed, grab it from
    echo.http://sphinx-doc.org/
    exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@ -1,2 +0,0 @@
data
exp
@ -1,3 +0,0 @@
# G2P

* zh - Chinese G2P
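For readers new to the recipe, the conversion these scripts perform is ordinary pypinyin G2P, optionally after jieba word segmentation. A minimal sketch, not part of this diff; the sample sentence is only an example input:

import jieba
from pypinyin import Style, lazy_pinyin

text = "卡尔普陪外孙玩滑梯"      # example sentence
words = jieba.lcut(text)          # optional word segmentation (what --use-jieba enables)
syllables = lazy_pinyin(
    words,
    errors='ignore',
    style=Style.TONE3,            # numeric tone marks, e.g. "ka3"
    neutral_tone_with_five=True)  # neutral tone written as 5
print(' '.join(syllables))        # space-separated TONE3 syllables, e.g. "ka3 er3 pu3 ..."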
@ -1,53 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id} {transcription}\n')
                else:
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_lables(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id} ')
                else:
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_lables(args.input, args.output)
@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List
from typing import Union


def erized(syllable: str) -> bool:
    """Whether the syllable contains erhua effect.

    Example
    --------
    huar -> True
    guanr -> True
    er -> False
    """
    # note: for pinyin, len(syllable) >= 2 is always true
    # if not: there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """
    Given a sequence of syllables from human annotation (reference),
    which makes sandhi explicit, and a sequence of syllables from some
    simple g2p program (generated), which does not consider sandhi,
    return the reference sequence with sandhi ignored.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    """
    i = 0
    j = 0

    # sandhi ignored in the result while other errors are not included
    result = []
    while i < len(reference):
        if erized(reference[i]):
            result.append(reference[i])
            i += 1
            j += 2
        elif reference[i][:-1] == generated[i][:-1] and reference[i][
                -1] == '2' and generated[i][-1] == '3':
            result.append(generated[i])
            i += 1
            j += 1
        else:
            result.append(reference[i])
            i += 1
            j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch, there may be some characters that are ignored in the generated transcription."
    return result


def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    with open(reference, 'rt') as f_ref:
        with open(generated, 'rt') as f_gen:
            with open(output, 'wt') as f_out:
                for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
                    sentence_id, ref_transcription = ref.strip().split(' ', 1)
                    _, gen_transcription = gen.strip().split(' ', 1)
                    try:
                        result = ignore_sandhi(ref_transcription.split(),
                                               gen_transcription.split())
                        result = ' '.join(result)
                    except Exception:
                        print(
                            f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
                        )
                        result = ref_transcription
                    f_out.write(f"{sentence_id} {result}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="reference transcription but ignore sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated", type=str, help="path to the generated transcription.")
    parser.add_argument("--output", type=str, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)
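For a quick sense of the sandhi-tolerant comparison implemented above, a usage sketch reusing the example from the ignore_sandhi docstring (assumes the functions above are in scope):

ref = ['lao2', 'hu3']   # human annotation, with tone sandhi applied
gen = ['lao3', 'hu3']   # naive g2p output, no sandhi
print(ignore_sandhi(ref, gen))   # ['lao3', 'hu3'] - the 2 vs 3 tone difference is not counted as an error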
@ -1,33 +0,0 @@
#!/bin/bash

exp_dir="exp"
data_dir="data"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

archive=${data_dir}/"BZNSYP.rar"
if [ ! -f ${archive} ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
if [ ${md5_result} != ${MD5} ]; then
    echo "MD5 mismatch! The Archive has been changed."
    exit -1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e ${archive} ${label_file}
cp ${filename} ${exp_dir}
rm -f ${filename}

if [ ! -f ${exp_dir}/${filename} ]; then
    echo "File extraction failed!"
    exit
fi

exit 0
@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@ -1 +0,0 @@
jieba
@ -1,37 +0,0 @@
#!/usr/bin/env bash

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data=data

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    mkdir -p ${data}
    test -e ${data}/BZNSYP.rar || { echo "Please download BZNSYP.rar and put it in "${data}; exit -1; }
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
    echo "stage 0: Extracting Prosody Labeling"
    bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin

    python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi

echo "done"
exit 0
@ -1 +0,0 @@
exp
@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

from text_processing import normalization

parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

with open(args.input, 'rt') as fin:
    with open(args.output, 'wt') as fout:
        for sent in fin:
            sent = normalization.normalize_sentence(sent.strip())
            fout.write(sent)
            fout.write('\n')
@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
@ -1,26 +0,0 @@
#!/usr/bin/env bash
source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing"
    python3 local/test_normalization.py ${data_dir}/${filename} ${exp_dir}/normalized.txt
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text saved at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0
Some files were not shown because too many files have changed in this diff.