diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..b31d98631
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,42 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+For support and discussions, please use our [GitHub Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions).
+
+If you've found a bug, please create an issue with the following information:
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Environment (please complete the following information):**
+ - OS: [e.g. Ubuntu]
+ - GCC/G++ Version [e.g. 8.3]
+ - Python Version [e.g. 3.7]
+ - PaddlePaddle Version [e.g. 2.0.0]
+ - Model Version [e.g. 2.0.0]
+ - GPU/Driver Information [e.g. Tesla V100-SXM2-32GB/440.64.00]
+ - CUDA/CUDNN Version [e.g. cuda-10.2]
+ - MKL Version
+ - TensorRT Version
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..94d507035
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,24 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[Feature request]"
+labels: feature request
+assignees: ''
+
+---
+
+For support and discussions, please use our [GitHub Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions).
+
+If you have a feature request, please create an issue with the following information:
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/README.md b/README.md
index 98d523890..cc2543e7c 100644
--- a/README.md
+++ b/README.md
@@ -18,30 +18,16 @@
 * python>=3.7
 * paddlepaddle>=2.0.0
 
-- Run the setup script for the remaining dependencies
-
-```bash
-git clone https://github.com/PaddlePaddle/DeepSpeech.git
-cd DeepSpeech
-pushd tools; make; popd
-source tools/venv/bin/activate
-bash setup.sh
-```
-
-- Source venv before do experiment.
-
-```bash
-source tools/venv/bin/activate
-```
+Please see [install](docs/install.md).
 
 ## Getting Started
 
-Please see [Getting Started](docs/geting_started.md) and [tiny egs](examples/tiny/README.md).
+Please see [Getting Started](docs/getting_started.md) and [tiny egs](examples/tiny/README.md).
 
 ## More Information
 
 * [Install](docs/install.md)
-* [Getting Started](docs/geting_stared.md)
+* [Getting Started](docs/getting_started.md)
 * [Data Prepration](docs/data_preparation.md)
 * [Data Augmentation](docs/augmentation.md)
 * [Ngram LM](docs/ngram_lm.md)
@@ -53,7 +39,7 @@ Please see [Getting Started](docs/geting_started.md) and [tiny egs](examples/tin
 
 ## Questions and Help
 
-You are welcome to submit questions and bug reports in [Github Issues](https://github.com/PaddlePaddle/DeepSpeech/issues). You are also welcome to contribute to this project.
+You are welcome to submit questions in [Github Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and bug reports in [Github Issues](https://github.com/PaddlePaddle/DeepSpeech/issues). You are also welcome to contribute to this project.
 
 ## License
diff --git a/README_cn.md b/README_cn.md
index 713e16ebd..d322749b4 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -14,33 +14,20 @@
 * [Baidu's Deep Speech2](http://proceedings.mlr.press/v48/amodei16.pdf)
 
 ## 安装
+
 * python>=3.7
 * paddlepaddle>=2.0.0
 
-- 安装依赖
-
-```bash
-git clone https://github.com/PaddlePaddle/DeepSpeech.git
-cd DeepSpeech
-pushd tools; make; popd
-source tools/venv/bin/activate
-bash setup.sh
-```
-
-- 开始实验前要source环境.
-
-```bash
-source tools/venv/bin/activate
-```
+参看 [安装](docs/install.md)。
 
 ## 开始
 
-请查看 [Getting Started](docs/geting_started.md) 和 [tiny egs](examples/tiny/README.md)。
+请查看 [Getting Started](docs/getting_started.md) 和 [tiny egs](examples/tiny/README.md)。
 
 ## 更多信息
 
 * [安装](docs/install.md)
-* [开始](docs/geting_stared.md)
+* [开始](docs/getting_started.md)
 * [数据处理](docs/data_preparation.md)
 * [数据增强](docs/augmentation.md)
 * [语言模型](docs/ngram_lm.md)
@@ -51,7 +38,7 @@
 
 ## 问题和帮助
 
-欢迎您在[Github问题](https://github.com/PaddlePaddle/models/issues)中提交问题和bug。也欢迎您为这个项目做出贡献。
+欢迎您在[Github讨论](https://github.com/PaddlePaddle/DeepSpeech/discussions)中提交问题，在[Github问题](https://github.com/PaddlePaddle/DeepSpeech/issues)中反馈bug。也欢迎您为这个项目做出贡献。
 
 ## License
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index e6779be63..390d21d2b 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -39,7 +39,6 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.dataset import ManifestDataset
 
-from deepspeech.modules.loss import CTCLoss
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.models.deepspeech2 import DeepSpeech2InferModel
 
@@ -63,8 +62,6 @@ class DeepSpeech2Trainer(Trainer):
 
         losses_np = {
             'train_loss': float(loss),
-            'train_loss_div_batchsize':
-            float(loss) / self.config.data.batch_size
         }
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
@@ -90,8 +87,6 @@ class DeepSpeech2Trainer(Trainer):
 
             loss = self.model(*batch)
             valid_losses['val_loss'].append(float(loss))
-            valid_losses['val_loss_div_batchsize'].append(
-                float(loss) / self.config.data.batch_size)
 
         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py
index ffe678a69..4e66a75f8 100644
--- a/deepspeech/models/deepspeech2.py
+++ b/deepspeech/models/deepspeech2.py
@@ -170,7 +170,8 @@ class DeepSpeech2Model(nn.Layer):
             odim=dict_size + 1,  # <blank> is append after vocab
             blank_id=dict_size,  # last token is <blank>
             dropout_rate=0.0,
-            reduction=True)
+            reduction=True,  # sum
+            batch_average=True)  # sum / batch_size
 
     def forward(self, audio, text, audio_len, text_len):
         """Compute Model loss
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
index 66737f599..74b21d395 100644
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@@ -36,14 +36,16 @@ class CTCDecoder(nn.Layer):
                  odim,
                  blank_id=0,
                  dropout_rate: float=0.0,
-                 reduction: bool=True):
+                 reduction: bool=True,
+                 batch_average: bool=False):
         """CTC decoder
 
         Args:
             enc_n_units ([int]): encoder output dimention
             vocab_size ([int]): text vocabulary size
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
-            reduction (bool): reduce the CTC loss into a scalar
+            reduction (bool): reduce the CTC loss into a scalar; True selects 'sum' reduction, False selects 'none'
+            batch_average (bool): average the summed loss over the batch dimension.
         """
         assert check_argument_types()
         super().__init__()
@@ -53,7 +55,10 @@ class CTCDecoder(nn.Layer):
         self.dropout_rate = dropout_rate
         self.ctc_lo = nn.Linear(enc_n_units, self.odim)
         reduction_type = "sum" if reduction else "none"
-        self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type)
+        self.criterion = CTCLoss(
+            blank=self.blank_id,
+            reduction=reduction_type,
+            batch_average=batch_average)
 
         # CTCDecoder LM Score handle
         self._ext_scorer = None
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index 9e1d34a89..04a594eed 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -25,32 +25,33 @@
 __all__ = ['CTCLoss', "LabelSmoothingLoss"]
 
 
 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum'):
+    def __init__(self, blank=0, reduction='sum', batch_average=False):
         super().__init__()
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
+        self.batch_average = batch_average
 
     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
 
         Args:
-            logits ([paddle.Tensor]): [description]
-            ys_pad ([paddle.Tensor]): [description]
-            hlens ([paddle.Tensor]): [description]
-            ys_lens ([paddle.Tensor]): [description]
+            logits ([paddle.Tensor]): [B, Tmax, D]
+            ys_pad ([paddle.Tensor]): [B, Tmax]
+            hlens ([paddle.Tensor]): [B]
+            ys_lens ([paddle.Tensor]): [B]
 
         Returns:
             [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}.
         """
+        B = paddle.shape(logits)[0]
         # warp-ctc need logits, and do softmax on logits by itself
         # warp-ctc need activation with shape [T, B, V + 1]
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         loss = self.loss(logits, ys_pad, hlens, ys_lens)
-
-        # wenet do batch-size average, deepspeech2 not do this
-        # Batch-size average
-        # loss = loss / paddle.shape(logits)[1]
+        if self.batch_average:
+            # Batch-size average
+            loss = loss / B
         return loss
 
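To make the new `batch_average` flag concrete, here is a minimal sketch that drives the `CTCLoss` wrapper above with random data (the shapes, vocabulary size, and label lengths are hypothetical): `reduction='sum'` produces a loss that grows with the number of utterances, while `batch_average=True` simply divides that sum by the batch dimension `B`.

```python
import paddle

from deepspeech.modules.loss import CTCLoss

# hypothetical sizes: 4 utterances, 50 frames, 29 vocab tokens + <blank>
B, T, V = 4, 50, 30
logits = paddle.randn([B, T, V])                                 # [B, Tmax, D]
ys_pad = paddle.randint(0, V - 1, shape=[B, 10], dtype='int32')  # [B, Umax], no <blank>
hlens = paddle.full([B], T, dtype='int64')                       # [B]
ys_lens = paddle.full([B], 10, dtype='int64')                    # [B]

summed = CTCLoss(blank=V - 1, reduction='sum', batch_average=False)
averaged = CTCLoss(blank=V - 1, reduction='sum', batch_average=True)

loss_sum = summed(logits, ys_pad, hlens, ys_lens)
loss_avg = averaged(logits, ys_pad, hlens, ys_lens)

# the averaged loss equals the summed loss divided by B, so its scale
# (and the gradient magnitude) no longer depends on the batch size
print(float(loss_sum), float(loss_avg), float(loss_sum) / B)
```

This is also why the ad-hoc `train_loss_div_batchsize` / `val_loss_div_batchsize` entries could be dropped from the trainer logging above: the division now happens inside the loss itself.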
diff --git a/docs/geting_started.md b/docs/getting_started.md
similarity index 100%
rename from docs/geting_started.md
rename to docs/getting_started.md
diff --git a/docs/ngram_lm.md b/docs/ngram_lm.md
index 48c557ce9..a12e6bc52 100644
--- a/docs/ngram_lm.md
+++ b/docs/ngram_lm.md
@@ -1,6 +1,8 @@
 # Prepare Language Model
 
-A language model is required to improve the decoder's performance. We have prepared two language models (with lossy compression) for users to download and try. One is for English and the other is for Mandarin. Users can simply run this to download the preprared language models:
+A language model is required to improve the decoder's performance. We have prepared two language models (with lossy compression) for users to download and try. One is for English and the other is for Mandarin. The bash script that downloads an LM is each example's `local/download_lm_*.sh`.
+
+For example, users can simply run this to download the prepared Mandarin language model:
 
 ```bash
 cd examples/aishell
@@ -8,7 +10,9 @@
 source path.sh
 bash local/download_lm_ch.sh
 ```
 
-If you wish to train your own better language model, please refer to [KenLM](https://github.com/kpu/kenlm) for tutorials. Here we provide some tips to show how we preparing our English and Mandarin language models. You can take it as a reference when you train your own.
+If you wish to train a better language model of your own, please refer to [KenLM](https://github.com/kpu/kenlm) for tutorials.
+Here we provide some tips to show how we prepared our English and Mandarin language models.
+You can use them as a reference when you train your own.
 
 ## English LM
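As a rough sketch of that KenLM workflow, the commands below build a 5-gram ARPA model and binarize it into the trie format used by the released models; `corpus.txt`, the order, and the pruning thresholds are illustrative placeholders rather than the exact settings behind `zh_giga.no_cna_cmn.prune01244.klm` or `common_crawl_00.prune01111.trie.klm`.

```bash
# estimate a pruned 5-gram ARPA model from a preprocessed corpus
# (one normalized sentence per line), then binarize it for fast loading
lmplz -o 5 --prune 0 1 1 1 1 <corpus.txt >lm.arpa
build_binary trie lm.arpa lm.klm
```

The resulting `.klm` file is what `lang_model_path` points to in the decoding sections of the example configs further down.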
diff --git a/examples/aishell/.gitignore b/examples/aishell/.gitignore
index 389676a70..3c13afe8a 100644
--- a/examples/aishell/.gitignore
+++ b/examples/aishell/.gitignore
@@ -2,3 +2,4 @@ data
 ckpt*
 demo_cache
 *.log
+log
diff --git a/examples/aishell/README.md b/examples/aishell/README.md
index 6d67d19a9..ded740d10 100644
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@@ -1,7 +1,7 @@
 # Aishell-1
 
 ## CTC
-| Model | Config | Test set | CER |
-| --- | --- | --- | --- |
-| DeepSpeech2 | conf/deepspeech2.yaml | test | 0.078977 |
-| DeepSpeech2 | release 1.8.5 | test | 0.080447 |
+| Model | Config | Test Set | CER | Valid Loss |
+| --- | --- | --- | --- | --- |
+| DeepSpeech2 | conf/deepspeech2.yaml | test | 0.077249 | 7.036566 |
+| DeepSpeech2 | release 1.8.5 | test | 0.087004 | 8.575452 |
diff --git a/examples/aishell/conf/deepspeech2.yaml b/examples/aishell/conf/deepspeech2.yaml
index 821c183e5..a50a7ecf5 100644
--- a/examples/aishell/conf/deepspeech2.yaml
+++ b/examples/aishell/conf/deepspeech2.yaml
@@ -29,8 +29,8 @@ model:
   use_gru: True
   share_rnn_weights: False
 training:
-  n_epoch: 30
-  lr: 5e-4
+  n_epoch: 50
+  lr: 2e-3
   lr_decay: 0.83
   weight_decay: 1e-06
   global_grad_clip: 5.0
@@ -39,7 +39,7 @@ decoding:
   error_rate_type: cer
   decoding_method: ctc_beam_search
   lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-  alpha: 2.6
+  alpha: 1.9
   beta: 5.0
   beam_size: 300
   cutoff_prob: 0.99
diff --git a/examples/aishell/local/infer.sh b/examples/aishell/local/infer.sh
index 41ccabf80..8c6a4dca2 100644
--- a/examples/aishell/local/infer.sh
+++ b/examples/aishell/local/infer.sh
@@ -1,6 +1,6 @@
 #! /usr/bin/env bash
 
-if [[ $# != 1 ]];
+if [[ $# != 1 ]]; then
     echo "usage: $0 ckpt-path"
     exit -1
 fi
diff --git a/examples/aishell/local/train.sh b/examples/aishell/local/train.sh
index c286566a8..245ed2172 100644
--- a/examples/aishell/local/train.sh
+++ b/examples/aishell/local/train.sh
@@ -2,7 +2,7 @@
 
 # train model
 # if you wish to resume from an exists model, uncomment --init_from_pretrained_model
-export FLAGS_sync_nccl_allreduce=0
+#export FLAGS_sync_nccl_allreduce=0
 
 ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
 echo "using $ngpu gpus..."
diff --git a/examples/aishell/run.sh b/examples/aishell/run.sh
index 8beb6bf0f..2e215a999 100644
--- a/examples/aishell/run.sh
+++ b/examples/aishell/run.sh
@@ -7,7 +7,7 @@ source path.sh
 bash ./local/data.sh
 
 # train model
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh baseline
 
 # test model
 CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
@@ -16,4 +16,4 @@ CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
 CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
 
 # export model
-bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
\ No newline at end of file
+bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 1e694df1c..d553faecf 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -1,7 +1,7 @@
 # LibriSpeech
 
 ## CTC
-| Model | Config | Test set | WER |
-| --- | --- | --- | --- |
-| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
-| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |
+| Model | Config | Test Set | WER | Valid Loss |
+| --- | --- | --- | --- | --- |
+| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.069357 | 15.078561 |
+| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 | 15.351633 |
diff --git a/examples/librispeech/conf/deepspeech2.yaml b/examples/librispeech/conf/deepspeech2.yaml
index 15fd4cbe3..3368374b0 100644
--- a/examples/librispeech/conf/deepspeech2.yaml
+++ b/examples/librispeech/conf/deepspeech2.yaml
@@ -29,8 +29,8 @@ model:
   use_gru: False
   share_rnn_weights: True
 training:
-  n_epoch: 20
-  lr: 5e-4
+  n_epoch: 50
+  lr: 1e-3
   lr_decay: 0.83
   weight_decay: 1e-06
   global_grad_clip: 5.0
@@ -39,7 +39,7 @@ decoding:
   error_rate_type: wer
   decoding_method: ctc_beam_search
   lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
+  alpha: 1.9
   beta: 0.3
   beam_size: 500
   cutoff_prob: 1.0
diff --git a/examples/librispeech/local/infer.sh b/examples/librispeech/local/infer.sh
index 6fc8d39fc..98b3b016a 100644
--- a/examples/librispeech/local/infer.sh
+++ b/examples/librispeech/local/infer.sh
@@ -1,6 +1,6 @@
 #! /usr/bin/env bash
 
-if [[ $# != 1 ]];
+if [[ $# != 1 ]]; then
     echo "usage: $0 ckpt-path"
     exit -1
 fi
diff --git a/examples/librispeech/local/train.sh b/examples/librispeech/local/train.sh
index 507947e9e..cbccb1896 100644
--- a/examples/librispeech/local/train.sh
+++ b/examples/librispeech/local/train.sh
@@ -1,8 +1,9 @@
 #! /usr/bin/env bash
 
-export FLAGS_sync_nccl_allreduce=0
+#export FLAGS_sync_nccl_allreduce=0
+
 # https://github.com/PaddlePaddle/Paddle/pull/28484
-export NCCL_SHM_DISABLE=1
+#export NCCL_SHM_DISABLE=1
 
 ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
 echo "using $ngpu gpus..."
@@ -11,7 +12,7 @@ python3 -u ${BIN_DIR}/train.py \
 --device 'gpu' \
 --nproc ${ngpu} \
 --config conf/deepspeech2.yaml \
---output ckpt
+--output ckpt-${1}
 
 if [ $? -ne 0 ]; then
     echo "Failed in training!"
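A quick usage sketch for the new positional argument of `local/train.sh` (the run name `my_run` is hypothetical, and this assumes the LibriSpeech recipe is driven the same way as the aishell `run.sh` above, which passes `baseline`):

```bash
cd examples/librispeech
source path.sh

# local/train.sh expands its first argument into --output ckpt-my_run,
# so different experiments keep their checkpoints in separate directories
CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh my_run
```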
diff --git a/examples/tiny/local/infer.sh b/examples/tiny/local/infer.sh
index 1243c0d08..b36f9000a 100644
--- a/examples/tiny/local/infer.sh
+++ b/examples/tiny/local/infer.sh
@@ -1,6 +1,6 @@
 #! /usr/bin/env bash
 
-if [[ $# != 1 ]];
+if [[ $# != 1 ]]; then
     echo "usage: $0 ckpt-path"
     exit -1
 fi
diff --git a/examples/tiny/local/test.sh b/examples/tiny/local/test.sh
index a0f200799..8c8c278c6 100644
--- a/examples/tiny/local/test.sh
+++ b/examples/tiny/local/test.sh
@@ -6,7 +6,6 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
diff --git a/examples/tiny/local/train.sh b/examples/tiny/local/train.sh
index 369ccc924..af62ae55f 100644
--- a/examples/tiny/local/train.sh
+++ b/examples/tiny/local/train.sh
@@ -2,7 +2,6 @@
 
 export FLAGS_sync_nccl_allreduce=0
 
-CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/train.py \
 --device 'gpu' \
 --nproc 1 \
diff --git a/setup.sh b/setup.sh
index c681583b8..f8ba6d98f 100644
--- a/setup.sh
+++ b/setup.sh
@@ -1,13 +1,19 @@
 #! /usr/bin/env bash
 
+source utils/log.sh
+
+
 SUDO='sudo'
 if [ $(id -u) -eq 0 ]; then
     SUDO=''
 fi
 
-if [ -e /etc/lsb-release ];then
+if [ -e /etc/lsb-release ]; then
     #${SUDO} apt-get update
     ${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
+else
+    error_msg "Please use Ubuntu, or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev manually."
+    exit -1
 fi
 
 # install python dependencies
@@ -15,17 +21,17 @@ if [ -f "requirements.txt" ]; then
     pip3 install -r requirements.txt
 fi
 if [ $? != 0 ]; then
-    echo "Install python dependencies failed !!!"
+    error_msg "Install python dependencies failed !!!"
    exit 1
 fi
 
 # install package libsndfile
 python3 -c "import soundfile"
 if [ $? != 0 ]; then
-    echo "Install package libsndfile into default system path."
+    info_msg "Install package libsndfile into default system path."
     wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
     if [ $? != 0 ]; then
-        echo "Download libsndfile-1.0.28.tar.gz failed !!!"
+        error_msg "Download libsndfile-1.0.28.tar.gz failed !!!"
         exit 1
     fi
     tar -zxvf libsndfile-1.0.28.tar.gz
@@ -43,6 +49,10 @@
     sh setup.sh
     cd - > /dev/null
 fi
 
+python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
+if [ $? != 0 ]; then
+    error_msg "Please check why the decoder installation failed!"
+    exit -1
+fi
-
-echo "Install all dependencies successfully."
+info_msg "Install all dependencies successfully."
diff --git a/utils/log.sh b/utils/log.sh
new file mode 100644
index 000000000..84591b076
--- /dev/null
+++ b/utils/log.sh
@@ -0,0 +1,11 @@
+_HDR_FMT="%.23s %s[%s]: "
+_ERR_MSG_FMT="ERROR: ${_HDR_FMT}%s\n"
+_INFO_MSG_FMT="INFO: ${_HDR_FMT}%s\n"
+
+error_msg() {
+  printf "$_ERR_MSG_FMT" $(date +%F.%T.%N) ${BASH_SOURCE[1]##*/} ${BASH_LINENO[0]} "${@}"
+}
+
+info_msg() {
+  printf "$_INFO_MSG_FMT" $(date +%F.%T.%N) ${BASH_SOURCE[1]##*/} ${BASH_LINENO[0]} "${@}"
+}
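A small usage sketch for the new logging helpers in `utils/log.sh` (the wrapped `pip3` step simply mirrors setup.sh; adjust the relative `source` path when calling from inside an example directory). Every message is prefixed with a timestamp, the calling script's name, and the line number of the call:

```bash
#! /usr/bin/env bash
source utils/log.sh

info_msg "checking python dependencies..."
if ! pip3 install -r requirements.txt; then
    error_msg "Install python dependencies failed !!!"
    exit 1
fi
```

With the header format above, a failure reported from this hypothetical script would print something like `ERROR: 2021-03-01.12:00:00.000 myscript.sh[6]: Install python dependencies failed !!!`.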