Merge pull request #1095 from KPatr1ck/demo

[Demo]Add tts demo.
4 years ago · a6e0a69da8
parent 963e906f56 662b10dbed
commit a6e0a69da8
15 changed files with 207 additions and 152 deletions
--- a/demos/audio_tagging/README.md
+++ b/demos/audio_tagging/README.md
@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech
 ### 3. Usage
 - Command Line(Recommended)
  ```bash
-  paddlespeech cls --input ~/cat.wav --topk 10
+  paddlespeech cls --input ./cat.wav --topk 10
  ```
  Usage:
  ```bash
--- a/demos/audio_tagging/run.sh
+++ b/demos/audio_tagging/run.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+paddlespeech cls --input ./cat.wav --topk 10
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.
 ### 3. Usage
 - Command Line(Recommended)
  ```bash
-  paddlespeech asr --input ~/zh.wav
+  paddlespeech asr --input ./zh.wav
  ```
  Usage:
  ```bash
--- a/demos/speech_recognition/run.sh
+++ b/demos/speech_recognition/run.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+paddlespeech asr --input ./zh.wav
--- a/demos/speech_translation/README.md
+++ b/demos/speech_translation/README.md
@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.
 ### 3. Usage
 - Command Line(Recommended)
  ```bash
-  paddlespeech st --input ~/en.wav
+  paddlespeech st --input ./en.wav
  ```
  Usage:
  ```bash
--- a/demos/speech_translation/run.sh
+++ b/demos/speech_translation/run.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+paddlespeech st --input ./en.wav
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@ -0,0 +1,102 @@
+# TTS(Text To Speech)
+
+## Introduction
+Text-to-speech (TTS) is a natural language modeling process that requires changing units of text into units of speech for audio presentation. 
+
+This demo is an implementation to generate audio from the given text. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```bash
+pip install paddlespeech
+```
+
+### 2. Prepare Input
+The input of this demo should be text in the specified language, passed via an argument.
+
+
+### 3. Usage
+- Command Line(Recommended)
+  ```bash
+  paddlespeech tts --input 今天的天气不错啊
+  ```
+  Usage:
+  ```bash
+  paddlespeech tts --help
+  ```
+  Arguments:
+  - `input`(required): Input text to synthesize.
+  - `am`: Acoustic model type of tts task. Default: `fastspeech2_csmsc`.
+  - `am_config`: Config of acoustic model. Use default config when it is None. Default: `None`.
+  - `am_ckpt`: Acoustic model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `am_stat`: Mean and standard deviation used to normalize spectrogram when training acoustic model. Default: `None`.
+  - `phones_dict`: Phone vocabulary file. Default: `None`.
+  - `tones_dict`: Tone vocabulary file. Default: `None`.
+  - `speaker_dict`: Speaker id map file. Default: `None`.
+  - `spk_id`: Speaker id for multi speaker acoustic model. Default: `0`.
+  - `voc`: Vocoder type of tts task. Default: `pwgan_csmsc`.
+  - `voc_config`: Config of vocoder. Use default config when it is None. Default: `None`.
+  - `voc_ckpt`: Vocoder checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `voc_stat`: Mean and standard deviation used to normalize spectrogram when training vocoder. Default: `None`.
+  - `lang`: Language of tts task. Default: `zh`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+  - `output`: Output wave filepath. Default: `output.wav`.
+
+  Output:
+  ```bash
+  [2021-12-09 20:49:58,955] [    INFO] [log.py] [L57] - Wave file has been generated: output.wav
+  ```
+
+- Python API
+  ```python
+  import paddle
+  from paddlespeech.cli import TTSExecutor
+
+  tts_executor = TTSExecutor()
+  wav_file = tts_executor(
+      text='今天的天气不错啊',
+      output='output.wav',
+      am='fastspeech2_csmsc',
+      am_config=None,
+      am_ckpt=None,
+      am_stat=None,
+      spk_id=0,
+      phones_dict=None,
+      tones_dict=None,
+      speaker_dict=None,
+      voc='pwgan_csmsc',
+      voc_config=None,
+      voc_ckpt=None,
+      voc_stat=None,
+      lang='zh',
+      device=paddle.get_device())
+  print('Wave file has been generated: {}'.format(wav_file))
+  ```
+
+  Output:
+  ```bash
+  Wave file has been generated: output.wav
+  ```
+
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used from the command line and the Python API:
+
+- Acoustic model
+  | Model | Language
+  | :--- | :---: |
+  | speedyspeech_csmsc| zh
+  | fastspeech2_csmsc| zh
+  | fastspeech2_aishell3| zh
+  | fastspeech2_ljspeech| en
+  | fastspeech2_vctk| en
+
+- Vocoder
+  | Model | Language
+  | :--- | :---: |
+  | pwgan_csmsc| zh
+  | pwgan_aishell3| zh
+  | pwgan_ljspeech| en
+  | pwgan_vctk| en
+  | mb_melgan_csmsc| zh
--- a/demos/text_to_speech/run.sh
+++ b/demos/text_to_speech/run.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+
+paddlespeech tts --input 今天的天气不错啊
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@ -27,9 +27,9 @@ import yaml
 from yacs.config import CfgNode

 from ..executor import BaseExecutor
+from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
-from ..utils import logger
 from ..utils import MODEL_HOME
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.transform.transformation import Transformation
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@ -20,14 +20,14 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram

 from ..executor import BaseExecutor
+from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
-from ..utils import logger
 from ..utils import MODEL_HOME
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import

 __all__ = ['CLSExecutor']
--- a/paddlespeech/cli/download.py
+++ b/paddlespeech/cli/download.py
@ -20,49 +20,21 @@ import os
 import os.path as osp
 import shutil
 import subprocess
-import sys
 import tarfile
 import time
 import zipfile

 import requests
+from tqdm import tqdm

-try:
-    from tqdm import tqdm
-except:
+from .log import logger

-    class tqdm(object):
-        def __init__(self, total=None):
-            self.total = total
-            self.n = 0
-
-        def update(self, n):
-            self.n += n
-            if self.total is None:
-                sys.stderr.write("\r{0:.1f} bytes".format(self.n))
-            else:
-                sys.stderr.write(
-                    "\r{0:.1f}%".format(100 * self.n / float(self.total)))
-            sys.stderr.flush()
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            sys.stderr.write('\n')
-
-
-import logging
-logger = logging.getLogger(__name__)
-
-__all__ = ['get_weights_path_from_url']
-
-WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
+__all__ = ['get_path_from_url']

 DOWNLOAD_RETRY_LIMIT = 3


-def is_url(path):
+def _is_url(path):
    """
    Whether path is URL.
    Args:
@ -71,25 +43,6 @@ def is_url(path):
    return path.startswith('http://') or path.startswith('https://')


-def get_weights_path_from_url(url, md5sum=None):
-    """Get weights path from WEIGHT_HOME, if not exists,
-    download it from url.
-    Args:
-        url (str): download url
-        md5sum (str): md5 sum of download package
-    
-    Returns:
-        str: a local path to save downloaded weights.
-    Examples:
-        .. code-block:: python
-            from paddle.utils.download import get_weights_path_from_url
-            resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams'
-            local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url)
-    """
-    path = get_path_from_url(url, WEIGHTS_HOME, md5sum)
-    return path
-
-
 def _map_path(url, root_dir):
    # parse path after download under root_dir
    fname = osp.split(url)[-1]
@ -135,7 +88,7 @@ def get_path_from_url(url,

    from paddle.fluid.dygraph.parallel import ParallelEnv

-    assert is_url(url), "downloading from {} not a url".format(url)
+    assert _is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)
    # Mainly used to solve the problem of downloading data from different 
--- a/paddlespeech/cli/log.py
+++ b/paddlespeech/cli/log.py
@ -0,0 +1,60 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+
+__all__ = [
+    'logger',
+]
+
+
+class Logger(object):
+    def __init__(self, name: str=None):
+        name = 'PaddleSpeech' if not name else name
+        self.logger = logging.getLogger(name)
+
+        log_config = {
+            'DEBUG': 10,
+            'INFO': 20,
+            'TRAIN': 21,
+            'EVAL': 22,
+            'WARNING': 30,
+            'ERROR': 40,
+            'CRITICAL': 50,
+            'EXCEPTION': 100,
+        }
+        for key, level in log_config.items():
+            logging.addLevelName(level, key)
+            if key == 'EXCEPTION':
+                self.__dict__[key.lower()] = self.logger.exception
+            else:
+                self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                               level)
+
+        self.format = logging.Formatter(
+            fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s'
+        )
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.propagate = False
+
+    def __call__(self, log_level: str, msg: str):
+        self.logger.log(log_level, msg)
+
+
+logger = Logger()
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@ -26,9 +26,9 @@ from kaldiio import WriteHelper
 from yacs.config import CfgNode

 from ..executor import BaseExecutor
+from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
-from ..utils import logger
 from ..utils import MODEL_HOME
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@ -25,9 +25,9 @@ import yaml
 from yacs.config import CfgNode

 from ..executor import BaseExecutor
+from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
-from ..utils import logger
 from ..utils import MODEL_HOME
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.t2s.frontend import English
@ -236,6 +236,7 @@ class TTSExecutor(BaseExecutor):
        self.parser.add_argument(
            "--am_stat",
            type=str,
+            default=None,
            help="mean and standard deviation used to normalize spectrogram when training acoustic model."
        )
        self.parser.add_argument(
@ -282,6 +283,7 @@ class TTSExecutor(BaseExecutor):
        self.parser.add_argument(
            "--voc_stat",
            type=str,
+            default=None,
            help="mean and standard deviation used to normalize spectrogram when training voc."
        )
        # other
@ -535,7 +537,7 @@ class TTSExecutor(BaseExecutor):
        wav = self.voc_inference(mel)
        self._outputs['wav'] = wav

-    def postprocess(self, output: str='output.wav'):
+    def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]:
        """
        Output postprocess and return results.
        This method get model output from self._outputs and convert it into human-readable results.
@ -543,6 +545,7 @@ class TTSExecutor(BaseExecutor):
        Returns:
            Union[str, os.PathLike]: Human-readable results such as texts and audio files.
        """
+        output = os.path.abspath(os.path.expanduser(output))
        sf.write(
            output, self._outputs['wav'].numpy(), samplerate=self.am_config.fs)
        return output
@ -593,7 +596,7 @@ class TTSExecutor(BaseExecutor):
                lang=lang,
                device=device,
                output=output)
-            logger.info('TTS Result Saved in: {}'.format(res))
+            logger.info('Wave file has been generated: {}'.format(res))
            return True
        except Exception as e:
            logger.exception(e)
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@ -11,15 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import functools
-import hashlib
-import logging
 import os
 import tarfile
 import zipfile
 from typing import Any
 from typing import Dict
-from typing import List

 from paddle.framework import load

@ -31,7 +27,6 @@ __all__ = [
    'get_command',
    'download_and_decompress',
    'load_state_dict_from_url',
-    'logger',
 ]


@ -59,38 +54,27 @@ def get_command(name: str) -> Any:
    return com['_entry']


-def _md5check(filepath: os.PathLike, md5sum: str) -> bool:
-    logger.info("File {} md5 checking...".format(filepath))
-    md5 = hashlib.md5()
-    with open(filepath, 'rb') as f:
-        for chunk in iter(lambda: f.read(4096), b""):
-            md5.update(chunk)
-    calc_md5sum = md5.hexdigest()
-
-    if calc_md5sum != md5sum:
-        logger.info("File {} md5 check failed, {}(calc) != "
-                    "{}(base)".format(filepath, calc_md5sum, md5sum))
-        return False
-    else:
-        logger.info("File {} md5 check passed.".format(filepath))
-        return True
-
-
 def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
    file_dir = os.path.dirname(filepath)
+    is_zip_file = False
    if tarfile.is_tarfile(filepath):
        files = tarfile.open(filepath, "r:*")
        file_list = files.getnames()
    elif zipfile.is_zipfile(filepath):
        files = zipfile.ZipFile(filepath, 'r')
        file_list = files.namelist()
+        is_zip_file = True
    else:
        return file_dir
-    if _is_a_single_file(file_list):
+
+    if download._is_a_single_file(file_list):
        rootpath = file_list[0]
        uncompressed_path = os.path.join(file_dir, rootpath)
-    elif _is_a_single_dir(file_list):
-        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
+    elif download._is_a_single_dir(file_list):
+        if is_zip_file:
+            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
+        else:
+            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
        uncompressed_path = os.path.join(file_dir, rootpath)
    else:
        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
@ -100,28 +84,6 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
    return uncompressed_path


-def _is_a_single_file(file_list: List[os.PathLike]) -> bool:
-    if len(file_list) == 1 and file_list[0].find(os.sep) < -1:
-        return True
-    return False
-
-
-def _is_a_single_dir(file_list: List[os.PathLike]) -> bool:
-    new_file_list = []
-    for file_path in file_list:
-        if '/' in file_path:
-            file_path = file_path.replace('/', os.sep)
-        elif '\\' in file_path:
-            file_path = file_path.replace('\\', os.sep)
-        new_file_list.append(file_path)
-
-    file_name = new_file_list[0].split(os.sep)[0]
-    for i in range(1, len(new_file_list)):
-        if file_name != new_file_list[i].split(os.sep)[0]:
-            return False
-    return True
-
-
 def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
    """
    Download archieves and decompress to specific path.
@ -133,7 +95,8 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
        'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys()))

    filepath = os.path.join(path, os.path.basename(archive['url']))
-    if os.path.isfile(filepath) and _md5check(filepath, archive['md5']):
+    if os.path.isfile(filepath) and download._md5check(filepath,
+                                                       archive['md5']):
        uncompress_path = _get_uncompress_path(filepath)
        if not os.path.isdir(uncompress_path):
            download._decompress(filepath)
@ -183,44 +146,3 @@ def _get_sub_home(directory):

 PPSPEECH_HOME = _get_paddlespcceh_home()
 MODEL_HOME = _get_sub_home('models')
-
-
-class Logger(object):
-    def __init__(self, name: str=None):
-        name = 'PaddleSpeech' if not name else name
-        self.logger = logging.getLogger(name)
-
-        log_config = {
-            'DEBUG': 10,
-            'INFO': 20,
-            'TRAIN': 21,
-            'EVAL': 22,
-            'WARNING': 30,
-            'ERROR': 40,
-            'CRITICAL': 50,
-            'EXCEPTION': 100,
-        }
-        for key, level in log_config.items():
-            logging.addLevelName(level, key)
-            if key == 'EXCEPTION':
-                self.__dict__[key.lower()] = self.logger.exception
-            else:
-                self.__dict__[key.lower()] = functools.partial(self.__call__,
-                                                               level)
-
-        self.format = logging.Formatter(
-            fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s'
-        )
-
-        self.handler = logging.StreamHandler()
-        self.handler.setFormatter(self.format)
-
-        self.logger.addHandler(self.handler)
-        self.logger.setLevel(logging.DEBUG)
-        self.logger.propagate = False
-
-    def __call__(self, log_level: str, msg: str):
-        self.logger.log(log_level, msg)
-
-
-logger = Logger()