Merge pull request #960 from PaddlePaddle/paddlespeech

[paddlespeech] merge deepspeech, parakeet and text_processing into paddlespeech
pull/963/head
Hui Zhang 3 years ago committed by GitHub
commit 58b24aa49f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -41,13 +41,13 @@ pull_request_rules:
remove: ["conflicts"]
- name: "auto add label=S2T"
conditions:
- files~=^deepspeech/
- files~=^paddlespeech/s2t/
actions:
label:
add: ["S2T"]
- name: "auto add label=T2S"
conditions:
- files~=^parakeet/
- files~=^paddlespeech/t2s/
actions:
label:
add: ["T2S"]
@ -59,7 +59,7 @@ pull_request_rules:
add: ["Audio"]
- name: "auto add label=TextProcess"
conditions:
- files~=^text_processing/
- files~=^paddlespeech/text/
actions:
label:
add: ["TextProcess"]

@ -1,37 +0,0 @@
#!/bin/bash
# Build the contents of tools/ with make, then return to the previous directory.
setup_env() {
    cd tools \
        && make \
        && cd -
}
# Run setup.sh from the current directory when it exists; leave the script
# with status 1 if it fails. Nothing happens when setup.sh is absent.
install() {
    if [ ! -f "setup.sh" ]; then
        return 0
    fi
    if ! bash setup.sh; then
        exit 1
    fi
    #export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
}
# Print the distribution release file and the versions of the C/C++ compilers.
print_env() {
    cat /etc/lsb-release
    local compiler
    for compiler in gcc g++; do
        "${compiler}" -v
    done
}
# EXIT-trap handler: report the failed install on stderr and exit with status 1.
abort() {
    {
        echo "Run install failed"
        echo "Please check your code"
    } 1>&2
    exit 1
}
# Treat every exit as a failure until the success path clears the trap below.
trap 'abort' 0
# Stop at the first command that fails.
set -e
print_env
setup_env
# Run the install step inside the virtualenv located at tools/venv.
source tools/venv/bin/activate
install
# All steps succeeded: replace the EXIT handler with a no-op.
trap : 0

@ -1,23 +0,0 @@
#!/bin/bash
# EXIT-trap handler: report the code-style failure on stderr and exit with status 1.
abort() {
    {
        echo "Your commit not fit PaddlePaddle code style"
        echo "Please use pre-commit scripts to auto-format your code"
    } 1>&2
    exit 1
}
# Report a style failure on any exit until the trap is cleared at the end.
trap 'abort' 0
# Stop at the first command that fails.
set -e
source tools/venv/bin/activate
python3 --version
# Run every pre-commit hook on all files. On failure, list the directory and
# show the working-tree diff before failing the job.
# NOTE(review): with `set -e` active, `git diff --exit-code` already ends the
# script when a diff exists, so the `exit 1` below is only reached otherwise.
if ! pre-commit run -a ; then
ls -lh
git diff --exit-code
exit 1
fi
# All checks passed: replace the EXIT handler with a no-op.
trap : 0

@ -1,54 +0,0 @@
#!/bin/bash
# EXIT-trap handler: report the unittest failure on stderr and exit with status 1.
abort() {
    {
        echo "Run unittest failed"
        echo "Please check your code"
    } 1>&2
    exit 1
}
# Run `python3 -m unittest discover` in every directory named `tests` below
# the given root, skipping the ./tools/venv tree.
# Arguments:
#   $1 - directory to enter before searching for test directories
unittest() {
    cd "$1" > /dev/null
    if [ -f "setup.sh" ]; then
        # Test setup.sh's own status: `$?` read after the closing `fi` would
        # hold the status of the `export` below, never that of setup.sh.
        if ! bash setup.sh; then
            exit 1
        fi
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    # Hand each directory to python3 as a real argument instead of splicing it
    # into a `bash -c` string, so unusual path names are not re-parsed by a
    # shell. `-n1` is dropped because xargs treats it as exclusive with `-I`.
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} python3 -m unittest discover -v -s {}
    cd - > /dev/null
}
# Run `python3 -m coverage run --branch` on every directory named `tests`
# below the given root (skipping ./tools/venv), then print the coverage
# report and generate the HTML report.
# Arguments:
#   $1 - directory to enter before searching for test directories
coverage() {
    cd "$1" > /dev/null
    if [ -f "setup.sh" ]; then
        # Test setup.sh's own status: `$?` read after the closing `fi` would
        # hold the status of the `export` below, never that of setup.sh.
        if ! bash setup.sh; then
            exit 1
        fi
        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
    # Hand each directory to python3 as a real argument instead of splicing it
    # into a `bash -c` string, so unusual path names are not re-parsed by a
    # shell. `-n1` is dropped because xargs treats it as exclusive with `-I`.
    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
        xargs -0 -I{} python3 -m coverage run --branch {}
    python3 -m coverage report -m
    python3 -m coverage html
    cd - > /dev/null
}
# Report a unittest failure on any exit until the trap is cleared at the end.
trap 'abort' 0
# Stop at the first command that fails.
set -e
source tools/venv/bin/activate
#pip3 install pytest
#unittest .
# Only the coverage run is active; the plain unittest run above is disabled.
coverage .
# Everything succeeded: replace the EXIT handler with a no-op.
trap : 0

@ -17,7 +17,6 @@ from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14

@ -17,7 +17,6 @@ import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable

@ -16,11 +16,10 @@ import os
import numpy as np
from paddle import inference
from scipy.special import softmax
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()

@ -16,7 +16,6 @@ import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14

@ -17,7 +17,6 @@ import numpy as np
import paddle
import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram

@ -16,7 +16,6 @@ import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger

@ -15,7 +15,6 @@ from typing import List
import numpy as np
from numpy import ndarray as array
from paddleaudio.backends import depth_convert
from paddleaudio.utils import ParameterError

@ -20,9 +20,8 @@ import numpy as np
import scipy
from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided
from scipy.signal import get_window
from paddleaudio.utils import ParameterError
from scipy.signal import get_window
__all__ = [
'stft',

@ -18,6 +18,7 @@ version = '0.1.0a'
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="paddleaudio",
version=version,
@ -35,8 +36,12 @@ setuptools.setup(
],
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
'soundfile >= 0.9.0'
'numpy >= 1.15.0',
'scipy >= 1.0.0',
'resampy >= 0.2.2',
'soundfile >= 0.9.0',
'colorlog',
'pathos',
],
extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
} # for dev only, install: pip install -e .[dev]

@ -13,9 +13,8 @@
# limitations under the License.
import librosa
import numpy as np
import pytest
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'

@ -13,9 +13,8 @@
# limitations under the License.
import librosa
import numpy as np
import pytest
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")

@ -1,35 +0,0 @@
@ECHO OFF
REM Run from the directory that contains this script; restored by popd at :end.
pushd %~dp0
REM Command file for Sphinx documentation
REM Fall back to the sphinx-build found on PATH when SPHINXBUILD is not set.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
REM Without a builder argument, show the Sphinx help instead.
if "%1" == "" goto help
REM Probe the command with its output discarded; errorlevel 9009 or higher is
REM handled as "sphinx-build was not found".
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Forward the requested builder (%1) to sphinx-build in make mode (-M).
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

@ -61,7 +61,7 @@ For feature extraction, three methods are implemented, which are linear (FFT wit
Currently, the released deepspeech2 online model use the linear feature extraction method.
```
The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
vi paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
```
### Encoder
@ -69,7 +69,7 @@ The encoder is composed of two 2D convolution subsampling layers and a number of
The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
```
### Decoder
@ -78,9 +78,9 @@ To got the character possibilities of each frame, the feature representation of
The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
vi paddlespeech/s2t/modules/ctc.py
```
### Training Process
@ -169,7 +169,7 @@ For data preparation and decoder, the deepspeech2 offline model is same with the
The code of encoder and decoder for deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
vi paddlespeech/s2t/models/ds2/deepspeech2.py
```
The training and testing processes of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.

@ -28,8 +28,8 @@ autodoc_mock_imports = ["soundfile", "librosa"]
# -- Project information -----------------------------------------------------
project = 'paddle speech'
copyright = '2021, Deepspeech-developers'
author = 'Deepspeech-developers'
copyright = '2021, paddlespeech-developers'
author = 'paddlespeech-developers'
# The full version, including alpha/beta/rc tags
release = '2.1'

@ -1,7 +1,7 @@
Welcome to paddle Deepspeech documentation !
Welcome to paddle PaddleSpeech documentation !
==============================================
**Deepspeech** is a Speech toolkits implemented by paddlepaddle.
**PaddleSpeech** is a Speech toolkits implemented by paddlepaddle.
Contents

@ -33,9 +33,9 @@ make install
```bash
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pushd tools; make; popd
pushd tools; make virtualenv.done; popd
source tools/venv/bin/activate
bash setup.sh
pip install -e .
```
- Source the venv before running experiments.

@ -67,7 +67,7 @@ There are two common ways to define a model which consists of several modules.
```
When a model is a complicated and made up of several components, each of which has a separate functionality, and can be replaced by other components with the same functionality, we prefer to define it in this way.
In the directory structure of PaddleSpeech TTS, modules with high reusability are placed in `parakeet.modules`, but models for specific tasks are placed in `parakeet.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
In the directory structure of PaddleSpeech TTS, modules with high reusability are placed in `paddlespeech.t2s.modules`, but models for specific tasks are placed in `paddlespeech.t2s.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
## PaddleSpeech TTS's Data Components
Another critical component for a deep learning project is data.
@ -93,7 +93,7 @@ Then we need to select a format for saving metadata to the hard disk. There are
Meanwhile, `cache` is added here, and a multi-process Manager is used to share memory between multiple processes. When `num_workers` is used, it is guaranteed that each sub process will not cache a copy.
The implementation of `DataTable` can be found in `parakeet/datasets/data_table.py`.
The implementation of `DataTable` can be found in `paddlespeech/t2s/datasets/data_table.py`.
```python
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
@ -179,9 +179,9 @@ We think this method is a little ugly. We prefer to return the necessary informa
It takes advantage of the globality of Python's module level variables and the effect of context manager.
There is a module level variable in `parakeet/training/reporter.py` `OBSERVATIONS`which is a `Dict` to store key-value.
There is a module level variable in `paddlespeech/t2s/training/reporter.py` `OBSERVATIONS`which is a `Dict` to store key-value.
```python
# parakeet/training/reporter.py
# paddlespeech/t2s/training/reporter.py
@contextlib.contextmanager
def scope(observations):

@ -102,9 +102,9 @@ import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.modules.normalizer import ZScore
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
# examples/fastspeech2/baker/frontend.py
from frontend import Frontend
@ -161,9 +161,9 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")

@ -11,4 +11,4 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -12,7 +12,7 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# srilm

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -39,7 +39,7 @@ There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is ve
We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format that MFA needs: the text files have the same names as the wavs and use the suffix `.lab`.
We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/parakeet/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=voice_cloning/tacotron2_ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -12,4 +12,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=speedyspeech
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=multi_band_melgan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}

@ -12,4 +12,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -12,4 +12,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -12,7 +12,7 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2_kaldi
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=transformer_tts
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=waveflow
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}

@ -21,7 +21,7 @@ from paddle import nn
from paddle.fluid import core
from paddle.nn import functional as F
from deepspeech.utils.log import Log
from paddlespeech.s2t.utils.log import Log
#TODO(Hui Zhang): remove fluid import
logger = Log(__name__).getlog()

@ -14,9 +14,9 @@
"""Evaluation for DeepSpeech2 model."""
from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
def main_sp(config, args):

@ -19,11 +19,11 @@ from paddle import nn
from src_deepspeech2x.models.ds2.rnn import RNNStack
from yacs.config import CfgNode
from deepspeech.models.ds2.conv import ConvStack
from deepspeech.modules.ctc import CTCDecoder
from deepspeech.utils import layer_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
from paddlespeech.s2t.models.ds2.conv import ConvStack
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']

@ -18,9 +18,9 @@ from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from deepspeech.modules.activation import brelu
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.utils.log import Log
from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['RNNStack']

@ -26,19 +26,19 @@ from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel
from src_deepspeech2x.models.ds2 import DeepSpeech2Model
from yacs.config import CfgNode
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.trainer import Trainer
from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
from paddlespeech.s2t.training.trainer import Trainer
from paddlespeech.s2t.utils import error_rate
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()

@ -1,2 +0,0 @@
data
exp

@ -1,3 +0,0 @@
# G2P
* zh - Chinese G2P

@ -1,93 +0,0 @@
# G2P
* WS
jieba
* G2P
pypinyin
* Tone sandhi
simple
We recommend using [Parakeet](https://github.com/PaddlePaddle/Parakeet) [TextFrontEnd](https://github.com/PaddlePaddle/Parakeet/blob/develop/parakeet/frontend/__init__.py) to do G2P.
The phoneme set should be changed, you can reference `examples/thchs30/a0/data/dict/syllable.lexicon`.
## Download Baker dataset
[Baker](https://test.data-baker.com/#/data/index/source) dataset has to be downloaded manually and moved to './data',
because you will have to pass the `CAPTCHA` from a browser to download the dataset.
## RUN
```
. path.sh
./run.sh
```
## Result
```
exp/
|-- 000001-010000.txt
|-- ref.pinyin
|-- trans.jieba.pinyin
`-- trans.pinyin
0 directories, 4 files
```
```
4f5a368441eb16aaf43dc1972f8b63dd exp/000001-010000.txt
01707896391c2de9b6fc4a39654be942 exp/ref.pinyin
43380ef160f65a23a3a0544700aa49b8 exp/trans.jieba.pinyin
8e6ff1fc22d8e8584082e804e8bcdeb7 exp/trans.pinyin
```
```
==> exp/000001-010000.txt <==
000001 卡尔普#2陪外孙#1玩滑梯#4。
ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 假语村言#2别再#1拥抱我#4。
jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 宝马#1配挂#1跛骡鞍#3貂蝉#1怨枕#2董翁榻#4。
bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 邓小平#2与#1撒切尔#2会晤#4。
deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 老虎#1幼崽#2与#1宠物犬#1玩耍#4。
lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
==> exp/ref.pinyin <==
000001 ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu2 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan2 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi2 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.jieba.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
```

@ -1,53 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
def extract_pinyin(source, target, use_jieba=False):
    """Write a pinyin transcription for each sentence of a prosody label file.

    Only the even-numbered lines (counting from 0) of ``source`` are used;
    each must split on whitespace into exactly a sentence id and its text.
    Marks of the form ``#<digit>`` are removed from the text, the text is
    optionally segmented with jieba, and the TONE3-style syllables returned
    by ``lazy_pinyin`` are written to ``target`` as
    ``<sentence_id> <syllable> <syllable> ...``, one sentence per line.

    Args:
        source: Path of the UTF-8 label file to read.
        target: Path of the UTF-8 file to write.
        use_jieba: Segment the text with ``jieba.lcut`` before conversion.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for line_no, line in enumerate(fin):
            if line_no % 2:
                # odd lines are not used by this script
                continue
            sentence_id, text = line.strip().split()
            text = re.sub(r'#\d', '', text)
            units = jieba.lcut(text) if use_jieba else text
            syllables = lazy_pinyin(
                units,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)
            transcription = ' '.join(syllables)
            fout.write(f'{sentence_id} {transcription}\n')
if __name__ == "__main__":
    # Command line: <input> <output> [--use-jieba]
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)

@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def extract_pinyin_lables(source, target):
    """Join each sentence id with the pinyin line that follows it.

    ``source`` is read as pairs of lines: an even-numbered line (counting
    from 0) must split on whitespace into exactly a sentence id and a text,
    and the following odd-numbered line holds the transcription. ``target``
    receives ``<sentence_id> <transcription>``, one pair per line.

    Args:
        source: Path of the UTF-8 label file to read.
        target: Path of the UTF-8 file to write.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for line_no, row in enumerate(fin):
            stripped = row.strip()
            if line_no % 2:
                # transcription line: finishes the record started just before
                fout.write(f'{stripped}\n')
                continue
            # id line: only the id is kept, the raw text is discarded
            sentence_id, _ = stripped.split()
            fout.write(f'{sentence_id} ')
if __name__ == "__main__":
    # Command line: <input> <output>
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_lables(args.input, args.output)

@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List
from typing import Union
def erized(syllable: str) -> bool:
    """Whether a tone-numbered syllable contains the erhua effect.

    The check looks at the character just before the last one, so the
    syllable has to end with its tone digit for erhua to be detected.

    Example
    --------
    huar1 -> True
    guanr1 -> True
    er2 -> False

    Raises:
        AssertionError: If the syllable is shorter than two characters.
    """
    # note: for pinyin, len(syllable) >=2 is always true
    # if not: there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """Return the reference syllables with tone sandhi undone.

    ``reference`` comes from human annotation, which makes sandhi explicit;
    ``generated`` comes from a simple g2p program that does not consider
    sandhi. Wherever the two agree except that the reference has tone 2 and
    the generated syllable has tone 3, the generated syllable is used; every
    other reference syllable is kept unchanged.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']

    Args:
        reference: Tone-numbered syllables from the annotation.
        generated: Tone-numbered syllables from the g2p program.

    Returns:
        A list with one syllable per reference syllable.

    Raises:
        AssertionError: If the two sequences do not line up in length.
        IndexError: If ``generated`` runs out before ``reference`` does.
    """
    # position in `generated`; it advances independently of the reference
    j = 0
    # sandhi ignored in the result while other errors are not included
    result = []
    for ref in reference:
        if erized(ref):
            # an erhua syllable in the reference is expected to take up two
            # syllables of the generated sequence
            result.append(ref)
            j += 2
            continue
        # compare against generated[j], not the reference index: after an
        # erhua syllable the two positions no longer coincide
        gen = generated[j]
        if ref[:-1] == gen[:-1] and ref[-1] == '2' and gen[-1] == '3':
            result.append(gen)
        else:
            result.append(ref)
        j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch, There may be some characters that are ignored in the generated transcription."
    return result
def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    """Write the reference transcriptions with tone sandhi ignored.

    The reference and generated files are read in lockstep, one sentence per
    line in the form ``<sentence_id> <syllables...>``. Each pair is passed
    to ``ignore_sandhi``; when that fails for a sentence, a message is
    printed and the reference transcription is written unchanged.

    Args:
        reference: Path to the reference transcription file.
        generated: Path to the generated transcription file.
        output: Path of the file to write.

    Raises:
        ValueError: If a line does not contain a sentence id followed by a
            transcription.
    """
    # Explicit UTF-8 keeps this script consistent with the other G2P scripts
    # and independent of the locale.
    with open(reference, 'rt', encoding='utf-8') as f_ref, \
            open(generated, 'rt', encoding='utf-8') as f_gen, \
            open(output, 'wt', encoding='utf-8') as f_out:
        for ref, gen in zip(f_ref, f_gen):
            sentence_id, ref_transcription = ref.strip().split(' ', 1)
            _, gen_transcription = gen.strip().split(' ', 1)
            try:
                result = ' '.join(
                    ignore_sandhi(ref_transcription.split(),
                                  gen_transcription.split()))
            except Exception:
                # deliberate best effort: a sentence that cannot be aligned
                # falls back to its reference transcription
                print(
                    f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
                )
                result = ref_transcription
            f_out.write(f"{sentence_id} {result}\n")
if __name__ == "__main__":
    # All three paths are needed; marking them required makes argparse report
    # a missing one instead of passing None on to open().
    parser = argparse.ArgumentParser(
        description="reference transcription but ignore sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        required=True,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated",
        type=str,
        required=True,
        help="path to the generated transcription.")
    parser.add_argument(
        "--output", type=str, required=True, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)

@ -1,33 +0,0 @@
#!/bin/bash
# Verify the Baker archive (BZNSYP.rar) and extract its prosody labeling file
# into the experiment directory.
#
# exp_dir and data_dir are defaults set before parse_options.sh is sourced.
# NOTE(review): parse_options.sh presumably lets --exp-dir / --data-dir
# override them (run.sh passes both) -- confirm against that helper.

exp_dir="exp"
data_dir="data"

source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

archive="${data_dir}/BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
# The first whitespace-separated field of md5sum's output is the digest.
md5_result=$(md5sum "${archive}" | awk '{print $1}')
# Quoted so that an empty digest counts as a mismatch instead of breaking
# the test expression.
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The Archive has been changed."
    exit -1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
# `unrar e` extracts without the archive's directory structure, so the file
# lands in the current directory before being copied to exp_dir.
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ]; then
    echo "File extraction failed!"
    # A bare `exit` would return the status of the echo above (0) and report
    # the failure as success.
    exit -1
fi

exit 0

@ -1,8 +0,0 @@
# Resolve MAIN_ROOT to the directory four levels above the current working directory.
export MAIN_ROOT=`realpath ${PWD}/../../../../`
# Put MAIN_ROOT and its utils/ directory at the front of the command search path.
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Put MAIN_ROOT at the front of Python's module search path.
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

@ -1,37 +0,0 @@
#!/usr/bin/env bash
# Staged pipeline: check for the Baker archive (stage -1), extract the
# prosody labeling (stage 0), then produce the pinyin transcriptions
# (stage 1). A stage runs when stage <= N <= stop_stage.
source path.sh
stage=-1
stop_stage=100
exp_dir=exp
data=data
# NOTE(review): parse_options.sh presumably lets command-line options
# override the defaults above -- confirm against that helper.
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
mkdir -p ${exp_dir}
# stage -1: the archive must already be present in the data directory.
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
mkdir -p ${data}
test -e ${data}/BZNSYP.rar || { echo "Please download BZNSYP.rar and put it in "${data}; exit -1; }
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "stage 0: Extracting Prosody Labeling"
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi
# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"
# stage 1 writes three files: ref.pinyin, trans.pinyin and trans.jieba.pinyin.
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
echo "stage 1: Processing transcriptions..."
python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin
python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi
echo "done"
exit 0

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -15,8 +15,8 @@ import argparse
import re
from pathlib import Path
from parakeet.frontend.zh_frontend import Frontend as zhFrontend
from parakeet.utils.error_rate import word_errors
from paddlespeech.t2s.frontend.zh_frontend import Frontend as zhFrontend
from paddlespeech.t2s.utils.error_rate import word_errors
SILENCE_TOKENS = {"sp", "sil", "sp1", "spl"}

@ -15,8 +15,8 @@ import argparse
import re
from pathlib import Path
from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer
from parakeet.utils.error_rate import char_errors
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
from paddlespeech.t2s.utils.error_rate import char_errors
# delete english characters

@ -1,36 +0,0 @@
# Regular expression based text normalization for Chinese
For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
## Run
```
. path.sh
bash run.sh
```
## Results
```
exp/
`-- normalized.txt
0 directories, 1 file
```
```
aff31f8aa08e2a7360228c9ce5886b98 exp/normalized.txt
```
```
今天的最低气温达到零下十度.
只要有四分之三十三的人同意,就可以通过决议。
一九四五年五月二日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。
四月十六日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。
如果剩下的百分之三十点六是过去,那么还有百分之六十九点四.
事情发生在二零二零年三月三十一日的上午八点.
警方正在找一支点二二口径的手枪。
欢迎致电中国联通,北京二零二二年冬奥会官方合作伙伴为您服务
充值缴费请按一,查询话费及余量请按二,跳过本次提醒请按井号键。
快速解除流量封顶请按星号键腾讯王卡产品介绍、使用说明、特权及活动请按九查询话费、套餐余量、积分及活动返款请按一手机上网流量开通及取消请按二查询本机号码及本号所使用套餐请按四密码修改及重置请按五紧急开机请按六挂失请按七查询充值记录请按八其它自助服务及人工服务请按零
```

@ -1,26 +0,0 @@
今天的最低气温达到-10°C.
只要有33/4的人同意就可以通过决议。
1945年5月2日苏联士兵在德国国会大厦上升起了胜利旗象征着攻占柏林并战胜了纳粹德国。
4月16日清晨的战斗以炮击揭幕数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地炮击持续了数天之久。
如果剩下的30.6%是过去那么还有69.4%.
事情发生在2020/03/31的上午8:00.
警方正在找一支.22口径的手枪。
欢迎致电中国联通北京2022年冬奥会官方合作伙伴为您服务
充值缴费请按1查询话费及余量请按2跳过本次提醒请按井号键。
快速解除流量封顶请按星号键腾讯王卡产品介绍、使用说明、特权及活动请按9查询话费、套餐余量、积分及活动返款请按1手机上网流量开通及取消请按2查询本机号码及本号所使用套餐请按4密码修改及重置请按5紧急开机请按6挂失请按7查询充值记录请按8其它自助服务及人工服务请按0
智能客服助理快速查话费、查流量请按9了解北京联通业务请按1宽带IPTV新装、查询请按2障碍报修请按3充值缴费请按4投诉建议请按5政企业务请按7人工服务请按0for english severice press star key
您的帐户当前可用余额为63.89元本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
您的帐户当前可用余额为负15.5元本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
尊敬的客户您目前的话费余额为负14.60元已低于10元为保证您的通信畅通请及时缴纳费用。
您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。
您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您?
您的账户当前可用余额为负36.00元本月消费36.00元。
请问你是电话13985608526的机主吗
如您对处理结果不满意可拨打中国联通集团投诉电话10015进行投诉按本地通话费收费返回自助服务请按井号键
“26314”号VIP客服代表为您服务。
尊敬的5G用户欢迎您致电中国联通
首先是应用了M1芯片的iPad Pro新款的iPad Pro支持5G这也是苹果的第二款5G产品线。
除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。
屏幕方面iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏支持HDR10、杜比视界还支持杜比全景声。
iPad Pro的秒控键盘这次也推出白色版本。
售价方面11英寸版本售价799美元起12.9英寸售价1099美元起。

@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from text_processing import normalization
# Command-line script: read the input file line by line, normalize each
# sentence with normalization.normalize_sentence and write the results to the
# output file, one sentence per line.
parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

# Both files are opened explicitly as UTF-8. Without an explicit encoding,
# open() uses the locale's preferred encoding, which is not guaranteed to be
# UTF-8 (for example under LC_ALL=C) and can then fail on Chinese text.
with open(args.input, 'rt', encoding='utf-8') as fin, \
        open(args.output, 'wt', encoding='utf-8') as fout:
    for sent in fin:
        # Surrounding whitespace (including the newline) is stripped before
        # normalization; a single newline is written back after each result.
        sent = normalization.normalize_sentence(sent.strip())
        fout.write(sent)
        fout.write('\n')

@ -1,8 +0,0 @@
# Environment setup meant to be sourced by the scripts of this example.
#
# MAIN_ROOT is resolved as the directory three levels above the current
# working directory, so this file must be sourced from inside the example
# directory. The path is quoted so that a working directory containing
# whitespace is passed to realpath as a single argument.
export MAIN_ROOT="$(realpath "${PWD}/../../../")"

# Make the executables in MAIN_ROOT and MAIN_ROOT/utils callable by name.
export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8

# Let Python import packages from MAIN_ROOT and MAIN_ROOT/third_party.
# The stray '#' that used to follow ${PYTHONPATH} was removed: in the middle
# of a word it does not start a comment, so it was appended literally to the
# last PYTHONPATH entry.
export PYTHONPATH="${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}"

@ -1,26 +0,0 @@
#!/usr/bin/env bash
# Recipe driver with a single stage (1): run local/test_normalization.py on
# ${data_dir}/${filename} and write the result to ${exp_dir}/normalized.txt.

source path.sh

# Defaults. NOTE(review): parse_options.sh presumably lets command-line flags
# such as --stage override these variables (Kaldi-style) -- confirm.
filename="sentences.txt"
data_dir=data
exp_dir=exp
stop_stage=100
stage=-1

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing "

    raw_text=${data_dir}/${filename}
    normalized_text=${exp_dir}/normalized.txt
    python3 local/test_normalization.py ${raw_text} ${normalized_text}

    # Only report the output location when the file actually exists.
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text save at ${exp_dir}/normalized.txt"
    fi

    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0

@ -12,4 +12,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -12,4 +12,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -11,4 +11,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -11,4 +11,4 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save