Merge branch 'develop' into ctc_grad_norm

pull/923/head
Hui Zhang 4 years ago committed by GitHub
commit aa2dca8b4e

.gitignore

@ -11,6 +11,10 @@
*.npz
*.done
*.whl
*.egg-info
build
docs/build/
tools/venv
tools/kenlm

@ -0,0 +1,30 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/src/conf.py
# Build documentation with MkDocs
#mkdocs:
# configuration: mkdocs.yml
# Optionally build your docs in additional formats such as PDF
formats: []
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- method: pip
path: .
extra_requirements:
- doc
- requirements: docs/requirements.txt

@ -1,4 +1,4 @@
# PaddlePaddle Speech to Any toolkit
# PaddlePaddle Speech toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
@ -9,7 +9,7 @@
## Features
See [feature list](docs/src/feature_list.md) for more information.
See [feature list](docs/source/asr/feature_list.md) for more information.
## Setup
@ -18,20 +18,20 @@ All tested under:
* python>=3.7
* paddlepaddle==2.1.2
Please see [install](docs/src/install.md).
Please see [install](docs/source/asr/install.md).
## Getting Started
Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
Please see [Getting Started](docs/source/asr/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
## More Information
* [Data Preparation](docs/src/data_preparation.md)
* [Data Augmentation](docs/src/augmentation.md)
* [Ngram LM](docs/src/ngram_lm.md)
* [Benchmark](docs/src/benchmark.md)
* [Released Model](docs/src/released_model.md)
* [Data Preparation](docs/source/asr/data_preparation.md)
* [Data Augmentation](docs/source/asr/augmentation.md)
* [Ngram LM](docs/source/asr/ngram_lm.md)
* [Benchmark](docs/source/asr/benchmark.md)
* [Released Model](docs/source/asr/released_model.md)
## Questions and Help
@ -45,4 +45,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We depend on many open-source repos. See [References](docs/src/reference.md) for more information.
We depend on many open-source repos. See [References](docs/source/asr/reference.md) for more information.

@ -355,6 +355,8 @@ if not hasattr(paddle.Tensor, 'tolist'):
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn.functional #############
# hack loss
def ctc_loss(logits,
labels,
@ -389,3 +391,152 @@ logger.debug(
"override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
)
F.ctc_loss = ctc_loss
########### hack paddle.nn #############
from paddle.nn import Layer
from typing import Optional
from typing import Mapping
from typing import Iterable
from typing import Tuple
from typing import Iterator
from collections import OrderedDict, abc as container_abcs
class LayerDict(paddle.nn.Layer):
r"""Holds submodules in a dictionary.
:class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary,
but modules it contains are properly registered, and will be visible by all
:class:`~paddle.nn.Layer` methods.
:class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects
* the order of insertion, and
* in :meth:`~paddle.nn.LayerDict.update`, the order of the merged
``OrderedDict``, ``dict`` (started from Python 3.6) or another
:class:`~paddle.nn.LayerDict` (the argument to
:meth:`~paddle.nn.LayerDict.update`).
Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping
types (e.g., Python's plain ``dict`` before Python version 3.6) does not
preserve the order of the merged mapping.
Args:
modules (iterable, optional): a mapping (dictionary) of (string: module)
or an iterable of key-value pairs of type (string, module)
Example::
class MyModule(nn.Layer):
def __init__(self):
super(MyModule, self).__init__()
self.choices = nn.LayerDict({
'conv': nn.Conv2d(10, 10, 3),
'pool': nn.MaxPool2d(3)
})
self.activations = nn.LayerDict([
['lrelu', nn.LeakyReLU()],
['prelu', nn.PReLU()]
])
def forward(self, x, choice, act):
x = self.choices[choice](x)
x = self.activations[act](x)
return x
"""
def __init__(self, modules: Optional[Mapping[str, Layer]] = None) -> None:
super(LayerDict, self).__init__()
if modules is not None:
self.update(modules)
def __getitem__(self, key: str) -> Layer:
return self._modules[key]
def __setitem__(self, key: str, module: Layer) -> None:
self.add_module(key, module)
def __delitem__(self, key: str) -> None:
del self._modules[key]
def __len__(self) -> int:
return len(self._modules)
def __iter__(self) -> Iterator[str]:
return iter(self._modules)
def __contains__(self, key: str) -> bool:
return key in self._modules
def clear(self) -> None:
"""Remove all items from the LayerDict.
"""
self._modules.clear()
def pop(self, key: str) -> Layer:
r"""Remove key from the LayerDict and return its module.
Args:
key (string): key to pop from the LayerDict
"""
v = self[key]
del self[key]
return v
def keys(self) -> Iterable[str]:
r"""Return an iterable of the LayerDict keys.
"""
return self._modules.keys()
def items(self) -> Iterable[Tuple[str, Layer]]:
r"""Return an iterable of the LayerDict key/value pairs.
"""
return self._modules.items()
def values(self) -> Iterable[Layer]:
r"""Return an iterable of the LayerDict values.
"""
return self._modules.values()
def update(self, modules: Mapping[str, Layer]) -> None:
r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a
mapping or an iterable, overwriting existing keys.
.. note::
If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or
an iterable of key-value pairs, the order of new elements in it is preserved.
Args:
modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`,
or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`)
"""
if not isinstance(modules, container_abcs.Iterable):
raise TypeError("LayerDict.update should be called with an "
"iterable of key/value pairs, but got " +
type(modules).__name__)
if isinstance(modules, (OrderedDict, LayerDict, container_abcs.Mapping)):
for key, module in modules.items():
self[key] = module
else:
# modules here can be a list with two items
for j, m in enumerate(modules):
if not isinstance(m, container_abcs.Iterable):
raise TypeError("LayerDict update sequence element "
"#" + str(j) + " should be Iterable; is" +
type(m).__name__)
if not len(m) == 2:
raise ValueError("LayerDict update sequence element "
"#" + str(j) + " has length " + str(len(m)) +
"; 2 is required")
# modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
# that's too cumbersome to type correctly with overloads, so we add an ignore here
self[m[0]] = m[1] # type: ignore[assignment]
# remove forward altogether to fall back on Module's _forward_unimplemented
if not hasattr(paddle.nn, 'LayerDict'):
logger.debug(
"register user LayerDict to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'LayerDict', LayerDict)

@ -0,0 +1,528 @@
"""Beam search module."""
from itertools import chain
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union
import paddle
from .utils import end_detect
from .scorers.scorer_interface import PartialScorerInterface
from .scorers.scorer_interface import ScorerInterface
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class Hypothesis(NamedTuple):
"""Hypothesis data type."""
yseq: paddle.Tensor # (T,)
score: Union[float, paddle.Tensor] = 0
scores: Dict[str, Union[float, paddle.Tensor]] = dict()
states: Dict[str, Any] = dict()
def asdict(self) -> dict:
"""Convert data to JSON-friendly dict."""
return self._replace(
yseq=self.yseq.tolist(),
score=float(self.score),
scores={k: float(v) for k, v in self.scores.items()},
)._asdict()
class BeamSearch(paddle.nn.Layer):
"""Beam search implementation."""
def __init__(
self,
scorers: Dict[str, ScorerInterface],
weights: Dict[str, float],
beam_size: int,
vocab_size: int,
sos: int,
eos: int,
token_list: List[str] = None,
pre_beam_ratio: float = 1.5,
pre_beam_score_key: str = None,
):
"""Initialize beam search.
Args:
scorers (dict[str, ScorerInterface]): Dict of decoder modules
e.g., Decoder, CTCPrefixScorer, LM
The scorer will be ignored if it is `None`
weights (dict[str, float]): Dict of weights for each scorers
The scorer will be ignored if its weight is 0
beam_size (int): The number of hypotheses kept during search
vocab_size (int): The number of vocabulary
sos (int): Start of sequence id
eos (int): End of sequence id
token_list (list[str]): List of tokens for debug log
pre_beam_score_key (str): key of scores to perform pre-beam search
pre_beam_ratio (float): beam size in the pre-beam search
will be `int(pre_beam_ratio * beam_size)`
"""
super().__init__()
# set scorers
self.weights = weights
self.scorers = dict() # all = full + partial
self.full_scorers = dict() # full tokens
self.part_scorers = dict() # partial tokens
# this module dict is required for recursive cast
# `self.to(device, dtype)` in `recog.py`
self.nn_dict = paddle.nn.LayerDict() # nn.Layer
for k, v in scorers.items():
w = weights.get(k, 0)
if w == 0 or v is None:
continue
assert isinstance(
v, ScorerInterface
), f"{k} ({type(v)}) does not implement ScorerInterface"
self.scorers[k] = v
if isinstance(v, PartialScorerInterface):
self.part_scorers[k] = v
else:
self.full_scorers[k] = v
if isinstance(v, paddle.nn.Layer):
self.nn_dict[k] = v
# set configurations
self.sos = sos
self.eos = eos
self.token_list = token_list
# pre_beam_size > beam_size
self.pre_beam_size = int(pre_beam_ratio * beam_size)
self.beam_size = beam_size
self.n_vocab = vocab_size
if (
pre_beam_score_key is not None
and pre_beam_score_key != "full"
and pre_beam_score_key not in self.full_scorers
):
raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
# selected `key` scorer to do pre beam search
self.pre_beam_score_key = pre_beam_score_key
# do pre-beam only when needed: the key is valid and there are part_scorers
self.do_pre_beam = (
self.pre_beam_score_key is not None
and self.pre_beam_size < self.n_vocab
and len(self.part_scorers) > 0
)
def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
"""Get an initial hypothesis data.
Args:
x (paddle.Tensor): The encoder output feature, (T, D)
Returns:
Hypothesis: The initial hypothesis.
"""
init_states = dict()
init_scores = dict()
for k, d in self.scorers.items():
init_states[k] = d.init_state(x)
init_scores[k] = 0.0
return [
Hypothesis(
yseq=paddle.to_tensor([self.sos], place=x.place),
score=0.0,
scores=init_scores,
states=init_states,
)
]
@staticmethod
def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
"""Append new token to prefix tokens.
Args:
xs (paddle.Tensor): The prefix token, (T,)
x (int): The new token to append
Returns:
paddle.Tensor: (T+1,), New tensor contains: xs + [x] with xs.dtype and xs.device
"""
x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
return paddle.cat((xs, x))
def score_full(
self, hyp: Hypothesis, x: paddle.Tensor
) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
"""Score new hypothesis by `self.full_scorers`.
Args:
hyp (Hypothesis): Hypothesis with prefix tokens to score
x (paddle.Tensor): Corresponding input feature, (T, D)
Returns:
Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
score dict of `hyp` that has string keys of `self.full_scorers`
and tensor score values of shape: `(self.n_vocab,)`,
and state dict that has string keys
and state values of `self.full_scorers`
"""
scores = dict()
states = dict()
for k, d in self.full_scorers.items():
# scores[k] shape (self.n_vocab,)
scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
return scores, states
def score_partial(
self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
"""Score new hypothesis by `self.part_scorers`.
Args:
hyp (Hypothesis): Hypothesis with prefix tokens to score
ids (paddle.Tensor): 1D tensor of new partial tokens to score,
len(ids) < n_vocab
x (paddle.Tensor): Corresponding input feature, (T, D)
Returns:
Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
score dict of `hyp` that has string keys of `self.part_scorers`
and tensor score values of shape: `(len(ids),)`,
and state dict that has string keys
and state values of `self.part_scorers`
"""
scores = dict()
states = dict()
for k, d in self.part_scorers.items():
# scores[k] shape (len(ids),)
scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
return scores, states
def beam(
self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute topk full token ids and partial token ids.
Args:
weighted_scores (paddle.Tensor): The weighted sum scores for each tokens.
Its shape is `(self.n_vocab,)`.
ids (paddle.Tensor): The partial token ids(Global) to compute topk.
Returns:
Tuple[paddle.Tensor, paddle.Tensor]:
The topk full token ids and partial token ids.
Their shapes are `(self.beam_size,)`.
i.e. (global ids, global relative local ids).
"""
# no pre beam performed, `ids` equal to `weighted_scores`
if weighted_scores.size(0) == ids.size(0):
top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab
return top_ids, top_ids
# mask pruned in pre-beam not to select in topk
tmp = weighted_scores[ids]
weighted_scores[:] = -float("inf")
weighted_scores[ids] = tmp
# top_ids is not equal to local_ids, since the shape of ids differs
top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab
local_ids = weighted_scores[ids].topk(self.beam_size)[1] # index in len(ids)
return top_ids, local_ids
@staticmethod
def merge_scores(
prev_scores: Dict[str, float],
next_full_scores: Dict[str, paddle.Tensor],
full_idx: int,
next_part_scores: Dict[str, paddle.Tensor],
part_idx: int,
) -> Dict[str, paddle.Tensor]:
"""Merge scores for new hypothesis.
Args:
prev_scores (Dict[str, float]):
The previous hypothesis scores by `self.scorers`
next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
full_idx (int): The next token id for `next_full_scores`
next_part_scores (Dict[str, paddle.Tensor]):
scores of partial tokens by `self.part_scorers`
part_idx (int): The new token id for `next_part_scores`
Returns:
Dict[str, paddle.Tensor]: The new score dict.
Its keys are names of `self.full_scorers` and `self.part_scorers`.
Its values are scalar tensors by the scorers.
"""
new_scores = dict()
for k, v in next_full_scores.items():
new_scores[k] = prev_scores[k] + v[full_idx]
for k, v in next_part_scores.items():
new_scores[k] = prev_scores[k] + v[part_idx]
return new_scores
def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
"""Merge states for new hypothesis.
Args:
states: states of `self.full_scorers`
part_states: states of `self.part_scorers`
part_idx (int): The new token id for `part_scores`
Returns:
Dict[str, paddle.Tensor]: The new score dict.
Its keys are names of `self.full_scorers` and `self.part_scorers`.
Its values are states of the scorers.
"""
new_states = dict()
for k, v in states.items():
new_states[k] = v
for k, d in self.part_scorers.items():
new_states[k] = d.select_state(part_states[k], part_idx)
return new_states
def search(
self, running_hyps: List[Hypothesis], x: paddle.Tensor
) -> List[Hypothesis]:
"""Search new tokens for running hypotheses and encoded speech x.
Args:
running_hyps (List[Hypothesis]): Running hypotheses on beam
x (paddle.Tensor): Encoded speech feature (T, D)
Returns:
List[Hypothesis]: Best sorted hypotheses
"""
best_hyps = []
part_ids = paddle.arange(self.n_vocab) # no pre-beam
for hyp in running_hyps:
# scoring
weighted_scores = paddle.zeros(self.n_vocab, dtype=x.dtype)
scores, states = self.score_full(hyp, x)
for k in self.full_scorers:
weighted_scores += self.weights[k] * scores[k]
# partial scoring
if self.do_pre_beam:
pre_beam_scores = (
weighted_scores
if self.pre_beam_score_key == "full"
else scores[self.pre_beam_score_key]
)
part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
part_scores, part_states = self.score_partial(hyp, part_ids, x)
for k in self.part_scorers:
weighted_scores[part_ids] += self.weights[k] * part_scores[k]
# add previous hyp score
weighted_scores += hyp.score
# update hyps
for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
# `part_j` is `j` relative id in `part_scores`
# will be (2 x beam at most)
best_hyps.append(
Hypothesis(
score=weighted_scores[j],
yseq=self.append_token(hyp.yseq, j),
scores=self.merge_scores(
hyp.scores, scores, j, part_scores, part_j
),
states=self.merge_states(states, part_states, part_j),
)
)
# sort and prune 2 x beam -> beam
best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
: min(len(best_hyps), self.beam_size)
]
return best_hyps
def forward(
self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
) -> List[Hypothesis]:
"""Perform beam search.
Args:
x (paddle.Tensor): Encoded speech feature (T, D)
maxlenratio (float): Input length ratio to obtain max output length.
If maxlenratio=0.0 (default), it uses an end-detect function
to automatically find maximum hypothesis lengths.
If maxlenratio<0.0, its absolute value is interpreted
as a constant max output length.
minlenratio (float): Input length ratio to obtain min output length.
Returns:
list[Hypothesis]: N-best decoding results
"""
# set length bounds
if maxlenratio == 0:
maxlen = x.shape[0]
elif maxlenratio < 0:
maxlen = -1 * int(maxlenratio)
else:
maxlen = max(1, int(maxlenratio * x.size(0)))
minlen = int(minlenratio * x.size(0))
logger.info("decoder input length: " + str(x.shape[0]))
logger.info("max output length: " + str(maxlen))
logger.info("min output length: " + str(minlen))
# main loop of prefix search
running_hyps = self.init_hyp(x)
ended_hyps = []
for i in range(maxlen):
logger.debug("position " + str(i))
best = self.search(running_hyps, x)
# post process of one iteration
running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
# end detection
if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
logger.info(f"end detected at {i}")
break
if len(running_hyps) == 0:
logger.info("no hypothesis. Finish decoding.")
break
else:
logger.debug(f"remained hypotheses: {len(running_hyps)}")
nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
# check the number of hypotheses reaching to eos
if len(nbest_hyps) == 0:
logger.warning(
"there is no N-best results, perform recognition "
"again with smaller minlenratio."
)
return (
[]
if minlenratio < 0.1
else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
)
# report the best result
best = nbest_hyps[0]
for k, v in best.scores.items():
logger.info(
f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
)
logger.info(f"total log probability: {best.score:.2f}")
logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
if self.token_list is not None:
logger.info(
"best hypo: "
+ "".join([self.token_list[x] for x in best.yseq[1:-1]])
+ "\n"
)
return nbest_hyps
def post_process(
self,
i: int,
maxlen: int,
maxlenratio: float,
running_hyps: List[Hypothesis],
ended_hyps: List[Hypothesis],
) -> List[Hypothesis]:
"""Perform post-processing of beam search iterations.
Args:
i (int): The length of hypothesis tokens.
maxlen (int): The maximum length of tokens in beam search.
maxlenratio (int): The maximum length ratio in beam search.
running_hyps (List[Hypothesis]): The running hypotheses in beam search.
ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
Returns:
List[Hypothesis]: The new running hypotheses.
"""
logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
if self.token_list is not None:
logger.debug(
"best hypo: "
+ "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
)
# add eos in the final loop to avoid that there are no ended hyps
if i == maxlen - 1:
logger.info("adding <eos> in the last position in the loop")
running_hyps = [
h._replace(yseq=self.append_token(h.yseq, self.eos))
for h in running_hyps
]
# add ended hypotheses to the final list, and remove them from the current hypotheses
# (this can be a problem: the number of hyps may drop below the beam size)
remained_hyps = []
for hyp in running_hyps:
if hyp.yseq[-1] == self.eos:
# e.g., Word LM needs to add final <eos> score
for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
s = d.final_score(hyp.states[k])
hyp.scores[k] += s
hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
ended_hyps.append(hyp)
else:
remained_hyps.append(hyp)
return remained_hyps
def beam_search(
x: paddle.Tensor,
sos: int,
eos: int,
beam_size: int,
vocab_size: int,
scorers: Dict[str, ScorerInterface],
weights: Dict[str, float],
token_list: List[str] = None,
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
pre_beam_ratio: float = 1.5,
pre_beam_score_key: str = "full",
) -> list:
"""Perform beam search with scorers.
Args:
x (paddle.Tensor): Encoded speech feature (T, D)
sos (int): Start of sequence id
eos (int): End of sequence id
beam_size (int): The number of hypotheses kept during search
vocab_size (int): The number of vocabulary
scorers (dict[str, ScorerInterface]): Dict of decoder modules
e.g., Decoder, CTCPrefixScorer, LM
The scorer will be ignored if it is `None`
weights (dict[str, float]): Dict of weights for each scorers
The scorer will be ignored if its weight is 0
token_list (list[str]): List of tokens for debug log
maxlenratio (float): Input length ratio to obtain max output length.
If maxlenratio=0.0 (default), it uses an end-detect function
to automatically find maximum hypothesis lengths.
minlenratio (float): Input length ratio to obtain min output length.
pre_beam_score_key (str): key of scores to perform pre-beam search
pre_beam_ratio (float): beam size in the pre-beam search
will be `int(pre_beam_ratio * beam_size)`
Returns:
List[Dict]: N-best decoding results
"""
ret = BeamSearch(
scorers,
weights,
beam_size=beam_size,
vocab_size=vocab_size,
pre_beam_ratio=pre_beam_ratio,
pre_beam_score_key=pre_beam_score_key,
sos=sos,
eos=eos,
token_list=token_list,
).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
return [h.asdict() for h in ret]
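For reference, the pruning step in `beam()` above can be sketched in plain NumPy (an illustrative rewrite, not the repo's paddle code): scores outside the pre-beam candidate set are masked to `-inf`, so the global top-k can only pick from the pre-selected ids, while the local ids are ranked within the candidate set.

```python
import numpy as np

def toy_beam(weighted_scores, ids, beam_size):
    """Select top-k token ids globally and relative to the pre-beam ids."""
    if len(ids) == len(weighted_scores):
        top = np.argsort(-weighted_scores)[:beam_size]
        return top, top
    masked = np.full_like(weighted_scores, -np.inf)
    masked[ids] = weighted_scores[ids]          # keep only pre-beam candidates
    top_ids = np.argsort(-masked)[:beam_size]   # indices in the full vocabulary
    local_ids = np.argsort(-weighted_scores[ids])[:beam_size]  # indices into `ids`
    return top_ids, local_ids

scores = np.array([0.1, 0.9, 0.4, 0.7, 0.2])
print(toy_beam(scores, ids=np.array([1, 2, 3]), beam_size=2))  # (array([1, 3]), array([0, 2]))
```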

@ -53,7 +53,8 @@ std::string ctc_greedy_decoder(
std::string best_path_result;
for (size_t i = 0; i < idx_vec.size(); ++i) {
if (idx_vec[i] != blank_id) {
best_path_result += vocabulary[idx_vec[i]];
std::string ch = vocabulary[idx_vec[i]];
best_path_result += (ch == kSPACE) ? tSPACE : ch;
}
}
return best_path_result;

@ -74,7 +74,8 @@ std::vector<std::pair<double, std::string>> get_beam_search_result(
// convert index to string
std::string output_str;
for (size_t j = 0; j < output.size(); j++) {
output_str += vocabulary[output[j]];
std::string ch = vocabulary[output[j]];
output_str += (ch == kSPACE) ? tSPACE : ch;
}
std::pair<double, std::string> output_pair(
-space_prefixes[i]->approx_ctc, output_str);

@ -21,6 +21,7 @@
#include "path_trie.h"
const std::string kSPACE = "<space>";
const std::string tSPACE = " ";
const float NUM_FLT_INF = std::numeric_limits<float>::max();
const float NUM_FLT_MIN = std::numeric_limits<float>::min();

@ -10,7 +10,7 @@ fi
if [ ! -d openfst-1.6.3 ]; then
echo "Download and extract openfst ..."
wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz
wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz --no-check-certificate
tar -xzvf openfst-1.6.3.tar.gz
echo -e "\n"
fi

@ -15,8 +15,8 @@
import numpy as np
import paddle
from .ctc_prefix_score import CTCPrefixScore
from .ctc_prefix_score import CTCPrefixScorePD
from .ctc_prefix_score import CTCPrefixScorer
from .ctc_prefix_score import CTCPrefixScorerPD
from .scorer_interface import BatchPartialScorerInterface

@ -6,7 +6,7 @@ import paddle
import six
class CTCPrefixScorePD():
class CTCPrefixScorerPD():
"""Batch processing of CTCPrefixScore
which is based on Algorithm 2 in WATANABE et al.
@ -267,7 +267,7 @@ class CTCPrefixScorePD():
return (r_prev_new, s_prev, f_min_prev, f_max_prev)
class CTCPrefixScore():
class CTCPrefixScorer():
"""Compute CTC label sequence scores
which is based on Algorithm 2 in WATANABE et al.

@ -145,9 +145,11 @@ class PartialScorerInterface(ScorerInterface):
and receives pre-pruned next tokens to score because it is too heavy to score
all the tokens.
It scores a subset of the tokens, not all of them.
Examples:
* Prefix search for connectionist-temporal-classification models
* :class:`espnet.nets.scorers.ctc.CTCPrefixScorer`
* :class:`decoders.scorers.ctc.CTCPrefixScorer`
"""

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_export()

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_test()

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = ExportTester(config, args)
with exp.eval():
exp.setup()
exp.run_test()

@ -56,11 +56,6 @@ class DeepSpeech2Tester_hub():
cutoff_prob=cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch)
#replace the '<space>' with ' '
result_transcripts = [
self._text_featurizer.detokenize(sentence)
for sentence in result_transcripts
]
return result_transcripts

@ -341,11 +341,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cutoff_prob=cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch)
#replace the <space> with ' '
result_transcripts = [
self._text_featurizer.detokenize(sentence)
for sentence in result_transcripts
]
self.autolog.times.stamp()
self.autolog.times.stamp()

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_align()

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_export()

@ -24,6 +24,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_test()

@ -0,0 +1,187 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
import os
import sys
import paddle
import soundfile
from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.u2 import U2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.training.trainer import Trainer
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog()
# TODO(hui zhang): dynamic load
class U2Tester_Hub(Trainer):
def __init__(self, config, args):
# super().__init__(config, args)
self.args = args
self.config = config
self.audio_file = args.audio_file
self.collate_fn_test = SpeechCollator.from_config(config)
self._text_featurizer = TextFeaturizer(
unit_type=config.collator.unit_type,
vocab_filepath=None,
spm_model_prefix=config.collator.spm_model_prefix)
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
model_conf.input_dim = self.collate_fn_test.feature_size
model_conf.output_dim = self.collate_fn_test.vocab_size
model = U2Model.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
self.model = model
logger.info("Setup model")
@mp_tools.rank_zero_only
@paddle.no_grad()
def test(self):
self.model.eval()
cfg = self.config.decoding
audio_file = self.audio_file
collate_fn_test = self.collate_fn_test
audio, _ = collate_fn_test.process_utterance(
audio_file=audio_file, transcript="Hello")
audio_len = audio.shape[0]
audio = paddle.to_tensor(audio, dtype='float32')
audio_len = paddle.to_tensor(audio_len)
audio = paddle.unsqueeze(audio, axis=0)
vocab_list = collate_fn_test.vocab_list
text_feature = self.collate_fn_test.text_feature
result_transcripts = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
beam_beta=cfg.beta,
beam_size=cfg.beam_size,
cutoff_prob=cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch,
ctc_weight=cfg.ctc_weight,
decoding_chunk_size=cfg.decoding_chunk_size,
num_decoding_left_chunks=cfg.num_decoding_left_chunks,
simulate_streaming=cfg.simulate_streaming)
logger.info("The result_transcripts: " + result_transcripts[0][0])
def run_test(self):
self.resume()
try:
self.test()
except KeyboardInterrupt:
sys.exit(-1)
def setup(self):
"""Setup the experiment.
"""
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
#self.setup_output_dir()
#self.setup_checkpointer()
#self.setup_dataloader()
self.setup_model()
self.iteration = 0
self.epoch = 0
def resume(self):
"""Resume from the checkpoint at checkpoints in the output
directory or load a specified checkpoint.
"""
params_path = self.args.checkpoint_path + ".pdparams"
model_dict = paddle.load(params_path)
self.model.set_state_dict(model_dict)
def check(audio_file):
logger.info("checking the audio file format......")
try:
sig, sample_rate = soundfile.read(audio_file)
except Exception as e:
logger.error(str(e))
logger.error(
"can not open the wav file, please check the audio file format")
sys.exit(-1)
logger.info("The sample rate is %d" % sample_rate)
assert (sample_rate == 16000)
logger.info("The audio file format is right")
def main_sp(config, args):
exp = U2Tester_Hub(config, args)
with exp.eval():
exp.setup()
exp.run_test()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
print_arguments(args, globals())
if not os.path.isfile(args.audio_file):
print("Please input the right audio file path")
sys.exit(-1)
check(args.audio_file)
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats('test.profile')

@ -444,7 +444,7 @@ class U2Tester(U2Trainer):
start_time = time.time()
text_feature = self.test_loader.collate_fn.text_feature
target_transcripts = self.ordid2token(texts, texts_len)
result_transcripts = self.model.decode(
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
@ -462,14 +462,19 @@ class U2Tester(U2Trainer):
simulate_streaming=cfg.simulate_streaming)
decode_time = time.time() - start_time
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
for utt, target, result, rec_tids in zip(
utts, target_transcripts, result_transcripts, result_tokenids):
errors, len_ref = errors_func(target, result)
errors_sum += errors
len_refs += len_ref
num_ins += 1
if fout:
fout.write({"utt": utt, "ref": target, "hyp": result})
fout.write({
"utt": utt,
"refs": [target],
"hyps": [result],
"hyps_tokenid": [rec_tids],
})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")

@ -86,7 +86,8 @@ class U2Trainer(Trainer):
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev)
collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers, )
# test dataset, return raw text
config.data.manifest = config.data.test_manifest

@ -29,8 +29,8 @@ model_test_alias = {
def main_sp(config, args):
class_obj = dynamic_import(args.model_name, model_test_alias)
exp = class_obj(config, args)
with exp.eval():
exp.setup()
if args.run_mode == 'test':
exp.run_test()
elif args.run_mode == 'export':

@ -265,7 +265,7 @@ class U2Trainer(Trainer):
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=None,
n_iter_processes=1,
n_iter_processes=config.collator.num_workers,
subsampling_factor=1,
num_encs=1)
@ -390,6 +390,10 @@ class U2Tester(U2Trainer):
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
@ -413,15 +417,11 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
target_transcripts = self.id2token(texts, texts_len, text_feature)
result_transcripts = self.model.decode(
target_transcripts = self.id2token(texts, texts_len, self.text_feature)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
text_feature=self.text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
@ -436,14 +436,19 @@ class U2Tester(U2Trainer):
simulate_streaming=cfg.simulate_streaming)
decode_time = time.time() - start_time
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
for i, (utt, target, result, rec_tids) in enumerate(zip(
utts, target_transcripts, result_transcripts, result_tokenids)):
errors, len_ref = errors_func(target, result)
errors_sum += errors
len_refs += len_ref
num_ins += 1
if fout:
fout.write({"utt": utt, "ref": target, "hyp": result})
fout.write({
"utt": utt,
"refs": [target],
"hyps": [result],
"hyps_tokenid": [rec_tids],
})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")

@ -20,6 +20,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_export()

@ -24,6 +24,7 @@ from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_test()

@ -140,7 +140,7 @@ class TextFeaturizer():
Returns:
str: text string.
"""
tokens = tokens.replace(SPACE, " ")
tokens = [t.replace(SPACE, " ") for t in tokens]
return "".join(tokens)
def word_tokenize(self, text):
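As a quick illustration of the fixed `detokenize` behaviour above (a minimal sketch; `SPACE` is assumed to be the `<space>` token, matching `kSPACE` in the C++ decoder constants):

```python
SPACE = "<space>"  # assumed token constant, as used by the featurizer/decoder

tokens = ["hello", SPACE, "world"]
text = "".join(t.replace(SPACE, " ") for t in tokens)
print(text)  # hello world
```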

@ -354,7 +354,7 @@ def make_batchset(
:param int batch_frames_out: maximum number of output frames in a minibatch.
:param int batch_frames_inout: maximum number of input+output frames in a minibatch.
:param str count: strategy to count maximum size of batch.
For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
For choices, see io.batchfy.BATCH_COUNT_CHOICES
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size

@ -32,7 +32,7 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
logger = Log(__name__).getlog()
def tokenids(text, keep_transcription_text):
def _tokenids(text, keep_transcription_text):
# for training text is token ids
tokens = text # token ids
@ -93,6 +93,8 @@ class SpeechCollatorBase():
a user-defined shape) within one batch.
"""
self.keep_transcription_text = keep_transcription_text
self.train_mode = not keep_transcription_text
self.stride_ms = stride_ms
self.window_ms = window_ms
self.feat_dim = feat_dim
@ -192,6 +194,7 @@ class SpeechCollatorBase():
texts = []
text_lens = []
utts = []
tids = [] # tokenids
for idx, item in enumerate(batch):
utts.append(item['utt'])
@ -203,7 +206,7 @@ class SpeechCollatorBase():
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
tokens = tokenids(text, self.keep_transcription_text)
tokens = _tokenids(text, self.keep_transcription_text)
texts.append(tokens)
text_lens.append(tokens.shape[0])
@ -351,7 +354,7 @@ class TripletSpeechCollator(SpeechCollator):
tokens = [[], []]
for idx, text in enumerate([translation, transcription]):
tokens[idx] = tokenids(text, self.keep_transcription_text)
tokens[idx] = _tokenids(text, self.keep_transcription_text)
translation_text.append(tokens[0])
translation_text_lens.append(tokens[0].shape[0])

@ -142,6 +142,15 @@ class BatchDataLoader():
collate_fn=batch_collate,
num_workers=self.n_iter_processes, )
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()
def __repr__(self):
echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
echo += f"train_mode: {self.train_mode}, "
@ -159,12 +168,3 @@ class BatchDataLoader():
echo += f"num_workers: {self.n_iter_processes}, "
echo += f"file: {self.json_file}"
return echo
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()

@ -219,10 +219,10 @@ class DeepSpeech2Model(nn.Layer):
The model built from pretrained result.
"""
model = cls(
#feat_size=dataloader.collate_fn.feature_size,
feat_size=dataloader.dataset.feature_size,
#dict_size=dataloader.collate_fn.vocab_size,
dict_size=dataloader.dataset.vocab_size,
feat_size=dataloader.collate_fn.feature_size,
#feat_size=dataloader.dataset.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
#dict_size=dataloader.dataset.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,

@ -809,7 +809,8 @@ class U2BaseModel(nn.Layer):
raise ValueError(f"Not support decoding method: {decoding_method}")
res = [text_feature.defeaturize(hyp) for hyp in hyps]
return res
res_tokenids = [hyp for hyp in hyps]
return res, res_tokenids
class U2Model(U2BaseModel):

@ -23,9 +23,9 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
try:
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder # noqa: F401
from deepspeech.decoders.swig_wrapper import Scorer # noqa: F401
from deepspeech.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
from deepspeech.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401
from deepspeech.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401
except Exception as e:
logger.info("ctcdecoder not installed!")

@ -42,7 +42,7 @@ def all_version():
"paddle_commit": paddle.version.commit,
"soundfile": soundfile.__version__,
}
logger.info(f"Deps Module Version:{pformat(vers.items())}")
logger.info(f"Deps Module Version:{pformat(list(vers.items()))}")
@contextmanager

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

(15 binary image files added, ranging from 19 KiB to 361 KiB; previews not shown.)

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

@ -14,10 +14,11 @@ In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<img src="../../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
</p>
### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English characters plus " ' ", space, \<blank\>, \<unk\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set plus three additional characters: \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indexes are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
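A hypothetical sketch of the indexing convention described above (the actual vocabulary-building script is not part of this diff):

```python
# illustrative only: <blank>=0, <unk>=1, <eos>=last index
chars = ["a", "b", "c"]  # characters collected from the training transcripts
vocab = ["<blank>", "<unk>"] + chars + ["<eos>"]
token2id = {tok: idx for idx, tok in enumerate(vocab)}
assert token2id["<blank>"] == 0 and token2id["<unk>"] == 1
assert token2id["<eos>"] == len(vocab) - 1
```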
@ -161,12 +162,13 @@ The deepspeech2 offline model is similar to the deepspeech2 online model. The
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<img src="../../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
</p>
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
The code of encoder and decoder for deepspeech2 offline model is in:

@ -0,0 +1,81 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import recommonmark.parser
import sphinx_rtd_theme
# -- Project information -----------------------------------------------------
project = 'paddle speech'
copyright = '2021, Deepspeech-developers'
author = 'Deepspeech-developers'
# The full version, including alpha/beta/rc tags
release = '2.1'
# -- General configuration ---------------------------------------------------
source_parsers = {
'.md': recommonmark.parser.CommonMarkParser,
}
source_suffix = ['.rst', '.md']
master_doc = 'index'
pygments_style = 'sphinx'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx_rtd_theme',
'sphinx.ext.mathjax',
'sphinx.ext.autosummary',
'numpydoc',
'myst_parser',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
'_build',
]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
smartquotes = False
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# -- Extension configuration -------------------------------------------------
# numpydoc_show_class_members = False

@ -0,0 +1,47 @@
Welcome to paddle Deepspeech documentation!
==============================================
**Deepspeech** is a speech toolkit implemented with PaddlePaddle.
Contents
--------
.. toctree::
:maxdepth: 1
:caption: Introduction
asr/deepspeech_architecture
.. toctree::
:maxdepth: 1
:caption: Getting_started
asr/install
asr/getting_started
.. toctree::
:maxdepth: 1
:caption: More Information
asr/data_preparation
asr/augmentation
asr/feature_list
asr/ngram_lm
.. toctree::
:maxdepth: 1
:caption: Released_model
asr/released_model
.. toctree::
:maxdepth: 1
:caption: Acknowledgement
asr/reference

@ -0,0 +1,130 @@
# Parakeet
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.
<div align="center">
<img src="../../images/logo.png" width=300 /> <br>
</div>
## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).
## Overview
In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here include Text FrontEnd, end-to-end Acoustic models and Vocoders:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
- Vocoders
- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- Voice Cloning
- [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)
## Setup
It's difficult to install some of the dependent libraries for this repo on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux`.
Make sure the library `libsndfile1` is installed, e.g., on Ubuntu.
```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
### Install Parakeet
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```
If some python dependent packages cannot be installed successfully, you can run the following script first.
(replace `python3.6` with your own python version)
```bash
sudo apt install -y python3.6-dev
```
See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.
## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:
- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)
## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio samples.
## Released Model
### Acoustic Model
#### FastSpeech2/FastPitch
1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
3. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
#### SpeedySpeech
1. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
#### TransformerTTS
1. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
#### Tacotron2
1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
### Vocoder
#### WaveFlow
1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
#### Parallel WaveGAN
1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
2. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
### Voice Cloning
#### Tacotron2_AISHELL3
1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
#### GE2E
1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
## License
Parakeet is provided under the [Apache-2.0 license](LICENSE).

@ -0,0 +1,333 @@
# Advanced Usage
This section covers how to extend Parakeet by implementing your own models and experiments, and elaborates some guidelines on implementation.
For a general deep learning experiment, there are several parts to deal with:
1. Preprocess the data according to the needs of the model, and iterate the dataset by batch.
2. Define the model, optimizer and other components.
3. Write out the training process (generally including forward / backward calculation, parameter update, log recording, visualization, periodic evaluation, etc.).
4. Configure and run the experiment.
## Parakeet's Model Components
In order to balance the reusability and function of models, we divide models into several types according to their characteristics.
For the commonly used modules that can be used as part of other larger models, we try to implement them as simply and universally as possible, because they will be reused. Modules with trainable parameters are generally implemented as subclasses of `paddle.nn.Layer`. Modules without trainable parameters can be implemented directly as functions whose inputs and outputs are `paddle.Tensor`s.
Models for a specific task are implemented as subclasses of `paddle.nn.Layer`. Models could be simple, like a single layer RNN. For complicated models, it is recommended to split the model into different components.
For a seq-to-seq model, it's natural to split it into encoder and decoder. For a model composed of several similar layers, it's natural to extract the sublayer as a separate layer.
There are two common ways to define a model which consists of several modules.
1. Define a module given its specifications. Here is an example with a multilayer perceptron.
```python
import paddle
from paddle import nn

class MLP(nn.Layer):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        return self.linear2(paddle.tanh(self.linear1(x)))

module = MLP(16, 32, 4)  # initialize a module
```
When the module is intended to be a generic and reusable layer that can be integrated into a larger model, we prefer to define it in this way.
For considerations of readability and usability, we strongly recommend **NOT** packing specifications into a single object. Here's an example below.
```python
class MLP(nn.Layer):
    def __init__(self, hparams):
        super().__init__()
        self.linear1 = nn.Linear(hparams.input_size, hparams.hidden_size)
        self.linear2 = nn.Linear(hparams.hidden_size, hparams.output_size)

    def forward(self, x):
        return self.linear2(paddle.tanh(self.linear1(x)))
```
For a module defined in this way, it's harder for the user to initialize an instance. Users have to read the code to check what attributes are used.
Also, code in this style tends to be abused by passing a huge config object to initialize every module used in an experiment, even though each module may not need the whole configuration.
We prefer to be explicit.
2. Define a module as a combination given its components. Here is an example for a sequence-to-sequence model.
```python
class Seq2Seq(nn.Layer):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        encoder_output = self.encoder(x)
        output = self.decoder(encoder_output)
        return output

encoder = Encoder(...)
decoder = Decoder(...)
# compose the two components
model = Seq2Seq(encoder, decoder)
```
When a model is complicated and made up of several components, each of which has a separate functionality and can be replaced by other components with the same functionality, we prefer to define it in this way.
In the directory structure of Parakeet, modules with high reusability are placed in `parakeet.modules`, but models for specific tasks are placed in `parakeet.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
## Parakeet's Data Components
Another critical component for a deep learning project is data.
Parakeet uses the following methods for training data:
1. Preprocess the data.
2. Load the preprocessed data for training.
Previously, we wrote the preprocessing in the `__getitem__` method of the Dataset, which runs whenever a batch of samples is accessed, but we encountered some problems:
1. Efficiency. Even though Paddle can load data asynchronously, when the batch size is large, each sample still needs to be preprocessed and assembled into batches, which takes a lot of time and may even seriously slow down the training process.
2. Data filtering. Some filtering conditions depend on features of the processed sample, for example, filtering out samples whose text is too short. If the text length is only known after `__getitem__`, the entire dataset has to be loaded every time you filter! In addition, if you do not pre-filter, a small exception (such as text that is too short) in `__getitem__` will break the entire data flow, because `collate_fn` presupposes that every sample can be fetched normally. Even if special flags such as `None` are used to mark failed samples and they are skipped before `collate_fn`, the batch size will change.
Therefore, it is not realistic to put preprocessing entirely on `__getitem__`. We use the method mentioned above instead.
During preprocessing, we can do filtering. We can also save more intermediate features, such as text length and audio length, which can be used for subsequent filtering. Following the conventions of the TTS field, data is stored in multiple files, and the processed results are stored in `npy` format.
We store metadata in a list-like structure and keep file paths in it, so that we are not restricted by where the files are actually stored. Besides file paths, other metadata can also be stored, for example the path of the text, the path of the audio, the path of the spectrum, the number of frames, the number of sampling points, and so on.
A path can be opened in multiple ways, such as `sf.read`, `np.load`, etc., so it's best to make the reading method a parameter instead of guessing it from the file extension. Letting users pass it in means they can define their own way to parse the data.
So we borrowed from the design of `DataFrame`, but our construction method is simpler: all it needs is a `list of dicts`, where each dict represents a record, which makes it convenient to interact with formats such as `json` and `yaml`. For each selected field, we only need to give a parser (called a `converter` in the interface), and that's it.
Then we need to select a format for saving metadata to disk. Storing the list of records in `json` wraps it in square brackets, which is not convenient for streaming reads and writes, so we use `jsonlines`. We don't use `yaml` because it occupies too many lines when storing a list of records.
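For example, a metadata file can be written and streamed back with the `jsonlines` package. This is only a minimal sketch; the field names below are illustrative, not Parakeet's actual schema.

```python
import jsonlines

# illustrative records: one JSON object per line
records = [
    {"utt_id": "009901", "mel_path": "mels/009901.npy", "num_frames": 312},
    {"utt_id": "009902", "mel_path": "mels/009902.npy", "num_frames": 278},
]
with jsonlines.open("metadata.jsonl", mode="w") as writer:
    writer.write_all(records)

# stream the records back without loading the whole file at once
with jsonlines.open("metadata.jsonl") as reader:
    metadata = [record for record in reader]
```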
Meanwhile, a `cache` is added here, and a multiprocessing `Manager` is used to share memory between processes, so that when `num_workers` is used, each worker process does not build its own copy of the cache.
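A minimal sketch of this idea (not Parakeet's actual implementation) could look like the following, where a `Manager` list holds converted examples so that `DataLoader` workers share one cache:

```python
from multiprocessing import Manager

class CachedDataset:
    # Sketch: keep converted examples in a Manager list so that DataLoader
    # worker processes share one cache instead of each building its own copy.
    def __init__(self, metadata, convert_fn, use_cache=False):
        self.metadata = metadata
        self.convert_fn = convert_fn
        self.use_cache = use_cache
        if use_cache:
            self._manager = Manager()
            self.caches = self._manager.list([None] * len(metadata))

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        if self.use_cache and self.caches[idx] is not None:
            return self.caches[idx]
        example = self.convert_fn(self.metadata[idx])  # the expensive part
        if self.use_cache:
            self.caches[idx] = example
        return example
```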
The implementation of `DataTable` can be found in `parakeet/datasets/data_table.py`.
```python
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
Parameters
----------
data : List[Dict[str, Any]]
Metadata, a list of meta datum, each of which is composed of
several fields
fields : List[str], optional
Fields to use, if not specified, all the fields in the data are
used, by default None
converters : Dict[str, Callable], optional
Converters used to process each field, by default None
use_cache : bool, optional
Whether to use cache, by default False
Raises
------
ValueError
If there is some field that does not exist in data.
ValueError
If there is some field in converters that does not exist in fields.
"""
def __init__(self,
data: List[Dict[str, Any]],
fields: List[str]=None,
converters: Dict[str, Callable]=None,
use_cache: bool=False):
```
Its `__getitem__` method parses each field with its own converter and then composes a dictionary to return.
```python
def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
"""Convert a meta datum to an example by applying the corresponding
converters to each fields requested.
Parameters
----------
meta_datum : Dict[str, Any]
Meta datum
Returns
-------
Dict[str, Any]
Converted example
"""
example = {}
for field in self.fields:
converter = self.converters.get(field, None)
meta_datum_field = meta_datum[field]
if converter is not None:
converted_field = converter(meta_datum_field)
else:
converted_field = meta_datum_field
example[field] = converted_field
return example
```
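Putting it together, a `DataTable` can be constructed from a list of metadata dicts plus converters. The sketch below is illustrative: the field names and paths are made up, but the constructor arguments follow the signature shown above.

```python
import numpy as np
from parakeet.datasets.data_table import DataTable

# illustrative metadata records
metadata = [
    {"utt_id": "009901", "speech": "mels/009901.npy", "num_frames": 312},
    {"utt_id": "009902", "speech": "mels/009902.npy", "num_frames": 278},
]

dataset = DataTable(
    data=metadata,
    fields=["utt_id", "speech"],      # only the fields we need
    converters={"speech": np.load},   # parse the stored path into an array
    use_cache=True)

example = dataset[0]  # {"utt_id": "009901", "speech": <np.ndarray>}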
## Parakeet's Training Components
A typical training process includes the following processes:
1. Iterate the dataset.
2. Process batch data.
3. Neural network forward/backward calculation.
4. Parameter update.
5. Evaluate the model on the validation dataset when certain conditions are met.
6. Write logs, visualize, and in some cases save necessary intermediate results.
7. Save the state of the model and optimizer.
Here, we mainly introduce the training-related components of Parakeet and why we designed them this way.
### Global Reporter
When training and modifying deep learning models, logging is often needed, and it has even become key to debugging and improving models. We usually use various visualization tools, such as `visualdl` in `paddle`, `tensorboard` in `tensorflow`, `visdom`, `wandb`, etc. Besides, `logging` and `print` are usually used for different purposes.
Among these tools, `print` is the simplest: it has no concept of a `logger` and `handler` as in `logging`, or a `summary writer` and `logdir` as in `tensorboard`, and no `global_step` is needed when printing. It's light enough to appear anywhere in the code, and it prints to the common stdout. Of course, its customizability is limited; for example, it is no longer intuitive when printing dictionaries or more complex objects. And it's fleeting: people need to use redirection to save the information.
For TTS model development, we would like a more universal multimedia stdout. This is essentially a tool similar to `tensorboard`, which allows many multimedia forms, but it needs a `summary writer` to use and a `step` when writing information. If the data are images or audio, some format control parameters are also needed.
This destroys the modular design to a certain extent. For example, suppose my model is composed of multiple sublayers and I want to record some important information in the forward method of some of them. To do that, I would need to pass the `summary writer` down to these sublayers, but a sublayer's job is computation; it should not carry extra concerns, and it's hard to accept that the initialization of an `nn.Linear` takes an optional `visualizer` argument. And for a computation module, **HOW** can it know the global step? These are things that belong to the training process!
Therefore, a more common approach is not to put log-writing code in the layer definition, but to return the values of interest, collect them during training, and write them to the `summary writer`. However, this means the return values have to be modified: the `summary writer` acts as a broadcaster at the training level, and each module transmits information to it by changing its return values.
We think this method is a little ugly. We prefer to return only the necessary information rather than change the return values to accommodate visualization and recording. When you need to report some information, you should be able to report it without difficulty. So we imitate the design of `chainer` and use a global reporter.
It takes advantage of Python's module-level global variables and context managers.
There is a module-level variable `OBSERVATIONS` in `parakeet/training/reporter.py`, which is a `Dict` that stores key-value pairs.
```python
# parakeet/training/reporter.py
import contextlib

OBSERVATIONS = None  # the currently active observation dict (None outside any scope)

@contextlib.contextmanager
def scope(observations):
    # make `observations` the target to report to.
    # it is basically a dictionary that stores temporary observations
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield
    finally:
        OBSERVATIONS = old
```
The context manager `scope` above switches the dictionary bound to the name `OBSERVATIONS`. A `getter` function is then defined to get the dictionary currently bound to `OBSERVATIONS`.
```python
def get_observations():
global OBSERVATIONS
return OBSERVATIONS
```
Then we define a function that gets the current `OBSERVATIONS` and writes a key-value pair into it.
```python
def report(name, value):
# a simple function to report named value
    # you can use it everywhere, it will get the default target and write to it
# you can think of it as std.out
observations = get_observations()
if observations is None:
return
else:
observations[name] = value
```
The following test code shows how it is used:
- use `first` as the current `OBSERVATION` and write `first_begin=1`,
- then open the second `OBSERVATION` and write `second_begin=2`,
- then open the third `OBSERVATION` and write `third_begin=3`,
- exit the third `OBSERVATION`; we go back to the second `OBSERVATION` automatically,
- write something in the second `OBSERVATION`, then exit it; we go back to the first `OBSERVATION` automatically.
```python
def test_reporter_scope():
first = {}
second = {}
third = {}
with scope(first):
report("first_begin", 1)
with scope(second):
report("second_begin", 2)
with scope(third):
report("third_begin", 3)
report("third_end", 4)
report("seconf_end", 5)
report("first_end", 6)
assert first == {'first_begin': 1, 'first_end': 6}
assert second == {'second_begin': 2, 'second_end': 5}
assert third == {'third_begin': 3, 'third_end': 4}
```
In this way, when we write modular components, we can directly call `report`. The caller decides where to report: it prepares an observation dict, opens a `scope` with it, and calls the component within this `scope`.
The `Trainer` in Parakeet reports information in this way.
```python
while True:
self.observation = {}
# set observation as the report target
# you can use report freely in Updater.update()
# updating parameters and state
with scope(self.observation):
update() # training for a step is defined here
```
### Updater: Model Training Process
In order to maintain the purity of function and the reusability of code, we abstract the model code into a subclass of `paddle.nn.Layer`, and write the core computing functions in it.
We tend to write the forward process of training in `forward()`, but only up to the prediction result, not the loss, so that the module can be called by a larger module.
However, when we compose an experiment, we need to add some other things, such as the training process, evaluation process, checkpoint saving, visualization and the like. In this process, we encounter things that only exist in training, such as the `optimizer`, `learning rate scheduler`, `visualizer`, etc. These things are not part of the model; they should **NOT** be written in the model code.
We made an abstraction for these intermediate processes, that is, `Updater`, which takes the `model`, `optimizer`, and `data stream` as input, and whose function is training. Since training methods may differ between models, we tend to write a corresponding `Updater` for each model. But unlike the final training script, a certain degree of encapsulation remains: details like periodic saving, visualization and evaluation are extracted, and only the most basic function, training the model, is retained.
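As a rough illustration (not the actual Parakeet `Updater` API), an updater for a hypothetical model could look like the sketch below; the batch fields and loss are made up for the example.

```python
import paddle
from parakeet.training.reporter import report  # the report() function described above

class MyModelUpdater:
    # Sketch of an updater: it owns the model, optimizer and data stream,
    # and update() trains the model for exactly one step.
    def __init__(self, model, optimizer, dataloader):
        self.model = model
        self.optimizer = optimizer
        self.iterator = iter(dataloader)

    def update(self):
        batch = next(self.iterator)
        loss = self.compute_loss(batch)   # model-specific forward + loss
        self.optimizer.clear_grad()
        loss.backward()
        self.optimizer.step()
        report("train/loss", float(loss))

    def compute_loss(self, batch):
        # hypothetical fields; a real updater computes its model's own losses
        mel_pred = self.model(batch["text"])
        return paddle.nn.functional.l1_loss(mel_pred, batch["speech"])
```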
### Visualizer
Because we use the observation dict as the communication mechanism, we can simply write whatever is in the observation into the `visualizer`.
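For instance, with `visualdl` the trainer could flush the observation dict after each step. This is only a sketch with placeholder values, not the exact trainer code.

```python
from visualdl import LogWriter

visualizer = LogWriter(logdir="exp/default")

# placeholders standing in for the trainer's state after one step
observation = {"train/loss": 0.32, "train/grad_norm": 1.7}
global_step = 100

# write every reported key-value pair as a scalar
for name, value in observation.items():
    visualizer.add_scalar(tag=name, value=value, step=global_step)
```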
## Parakeet's Configuration Components
Deep learning experiments often have many options to configure. These configurations can be roughly divided into several categories.
1. Data source and data processing mode configuration.
2. Save path configuration of experimental results.
3. Data preprocessing mode configuration.
4. Model structure and hyperparameter configuration.
5. Training process configuration.
It's common to change the running configuration to compare results. To keep track of running configurations, we use `yaml` configuration files.
Also, we want to interact with command line options. Some options that usually change with the running environment are provided by command line arguments. In addition, we want to override an option in the config file without editing it.
Taking these requirements into consideration, we use [yacs](https://github.com/rbgirshick/yacs) as the config management tool. Other tools like [omegaconf](https://github.com/omry/omegaconf) are also powerful and have similar functionality.
In each example provided, there is a `config.py`; the default config is defined in `conf/default.yaml`. If you want the default config, import `config.py` and call `get_cfg_defaults()`. It can then be updated with a `yaml` config file or command line arguments if needed.
For details about how to use yacs in experiments, see [yacs](https://github.com/rbgirshick/yacs).
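A minimal `config.py` in this style might look like the sketch below; the option names are illustrative, since each example defines its own.

```python
# config.py (sketch; the actual options differ per example)
from yacs.config import CfgNode as CN

_C = CN()
_C.fs = 24000              # sampling rate
_C.n_mels = 80             # number of mel bands

_C.model = CN()
_C.model.hidden_size = 256

_C.training = CN()
_C.training.max_epoch = 200

def get_cfg_defaults():
    # return a clone so callers can modify it freely
    return _C.clone()

if __name__ == "__main__":
    cfg = get_cfg_defaults()
    # update from a yaml file and/or command line style key-value pairs
    # cfg.merge_from_file("conf/default.yaml")
    cfg.merge_from_list(["training.max_epoch", 100])
    print(cfg)
```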
The following is the basic `ArgumentParser`:
1. `--config` is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment.
2. `--train-metadata` is the path to the training data.
3. `--output-dir` is the directory to save the training results. (If there are checkpoints in `checkpoints/` under `--output-dir`, the newest checkpoint is reloaded by default before training.)
4. `--device` and `--nprocs` determine the operation mode: `--device` specifies the type of device to run on, `cpu` or `gpu`, and `--nprocs` is the number of training processes. If `nprocs` > 1, multi-process parallel training is used. (Note: currently only multi-GPU multi-process training is supported.)
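A sketch of such a parser, covering only the common options listed above (each example adds its own experiment-specific options):

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train a Parakeet model.")
    parser.add_argument("--config", type=str,
                        help="yaml config file used to override the default config")
    parser.add_argument("--train-metadata", type=str,
                        help="path to the training metadata")
    parser.add_argument("--output-dir", type=str,
                        help="directory to save checkpoints, logs and config")
    parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"],
                        help="device to run on")
    parser.add_argument("--nprocs", type=int, default=1,
                        help="number of training processes (GPU only when > 1)")
    return parser.parse_args()
```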
Developers can refer to the examples in `Parakeet/examples` to write the default configuration file when adding new experiments.
## Parakeet's Experiment template
The experimental codes in Parakeet are generally organized as follows:
```text
├── conf
│   └── default.yaml (default config)
├── README.md (help information)
├── batch_fn.py (organize metadata into batch)
├── config.py (code to read default config)
├── *_updater.py (Updater of a specific model)
├── preprocess.py (data preprocessing code)
├── preprocess.sh (script to call preprocess.py)
├── synthesis.py (synthesis from metadata)
├── synthesis.sh (script to call synthesis.py)
├── synthesis_e2e.py (synthesis from raw text)
├── synthesis_e2e.sh (script to call synthesis_e2e.py)
├── train.py (train code)
└── run.sh (script to call train.py)
```
We add a named argument `--output-dir` to each training script to specify the output directory. The directory structure is as follows; it's best for developers to follow this specification:
```text
exp/default/
├── checkpoints/
│ ├── records.jsonl (record file)
│ └── snapshot_iter_*.pdz (checkpoint files)
├── config.yaml (config file of this experiment)
├── vdlrecords.*.log (visualdl record file)
├── worker_*.log (text logging, one file per process)
├── validation/ (output dir during training, information_iter_*/ is the output of each step, if necessary)
├── inference/ (output dir of exported static graph model, which is only used in the final stage of training, if implemented)
└── test/ (output dir of synthesis results)
```
You can view the examples we provide in `Parakeet/examples`. These experiments are provided to users as examples which can be run directly. Users are welcome to add new models and experiments and contribute code to Parakeet.

@@ -0,0 +1,115 @@
# Basic Usage
This section shows how to use pretrained models provided by parakeet and make inference with them.
Pretrained models in v0.4 are provided in an archive. Extract it to get a folder like this:
```
checkpoint_name/
├──default.yaml
├──snapshot_iter_76000.pdz
├──speech_stats.npy
└──phone_id_map.txt
```
`default.yaml` stores the config used to train the model.
`snapshot_iter_N.pdz` is the checkpoint file, where `N` is the number of steps the model has been trained for.
`*_stats.npy` is the statistics file of a feature that was normalized before training.
`phone_id_map.txt` is the map of phonemes to phoneme_ids.
The example code below shows how to use the models for prediction.
## Acoustic Models (text to spectrogram)
The code below shows how to use a `FastSpeech2` model. After loading the pretrained model, use it and a normalizer object to construct a prediction object, then use `fastspeech2_inference(phone_ids)` to generate spectrograms, which can be further used to synthesize raw audio with a vocoder.
```python
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.modules.normalizer import ZScore
# Parakeet/examples/fastspeech2/baker/frontend.py
from frontend import Frontend
# load the pretrained model
checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
with open(checkpoint_dir / "default.yaml") as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
    paddle.load(str(checkpoint_dir / "snapshot_iter_76000.pdz"))["main_params"])
model.eval()
# load stats file
stat = np.load(checkpoint_dir / "speech_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
# construct a prediction object
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
# load Chinese Frontend
frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
# text to spectrogram
sentence = "你好吗?"
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
# The output of Chinese text frontend is segmented
for part_phone_ids in phone_ids:
with paddle.no_grad():
temp_mel = fastspeech2_inference(part_phone_ids)
if flags == 0:
mel = temp_mel
flags = 1
else:
mel = paddle.concat([mel, temp_mel])
```
## Vocoder (spectrogram to wave)
The code below shows how to use a `Parallel WaveGAN` model. Like the example above, after loading the pretrained model, use it and a normalizer object to construct a prediction object, then use `pwg_inference(mel)` to generate raw audio (in wav format).
```python
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
with open(checkpoint_dir / "pwg_default.yaml") as f:
pwg_config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**pwg_config["generator_params"])
# `args.pwg_params` is the path to the generator parameter file from the archive
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
# load stats file
stat = np.load(checkpoint_dir / "pwg_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
# construct a prediction object
pwg_inference = PWGInference(pwg_normalizer, vocoder)
# spectrogram to wave
wav = pwg_inference(mel)
sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
```

@@ -0,0 +1,108 @@
# Chinese Rule Based Text Frontend
A TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in Parakeet; see the example in `Parakeet/examples/text_frontend/`.
A text frontend module mainly includes:
- Text Segmentation
- Text Normalization (TN)
- Word Segmentation (mainly in Chinese)
- Part-of-Speech
- Prosody
- G2P (Grapheme-to-Phoneme, include Polyphone and Tone Sandhi, etc.)
- Linguistic Features/Characters/Phonemes
```text
• text: 90 后为中华人民共和国成立 70 周年准备了大礼
• Text Normalization: 九零后为中华人民共和国成立七十周年准备了大礼
• Word Segmentation: 九零后/为/中华人民/共和国/成立/七十/周年/准备/了/大礼
• G2P:
jiu3 ling2 hou4 wei4 zhong1 hua2 ren2 min2 gong4 he2 guo2 ...
• Prosody (prosodic words #1, prosodic phrases #2, intonation phrases #3, sentence #4):
九零后#1为中华人民#1共和国#2成立七十周年#3准备了大礼#4
```
Among them, Text Normalization and G2P are the most important modules. We mainly introduce them here.
## Text Normalization
### Supported NSW (Non-Standard-Word) Normalization
|NSW type|raw|normalized|
|:--|:-|:-|
|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
|date|她出生于86年8月18日她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
|money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
## Grapheme-to-Phoneme
In Chinese, G2P is a very complex module, which mainly includes **polyphone** and **tone sandhi**.
We use [g2pM](https://github.com/kakaobrain/g2pM) and [pypinyin](https://github.com/mozillazg/python-pinyin) as the default g2p tools. They can solve the problem of polyphones to a certain extent. In the future, we intend to use a trainable language model (for example, [BERT](https://arxiv.org/abs/1810.04805)) for polyphones.
However, g2pM and pypinyin do not perform well in tone sandhi, we use rules to solve this problem, which requires relevant linguistic knowledge.
The **tone sandhi** in Chinese mainly includes:
- soft tone sandhi (轻声变调)
- "一" "不" tone sandhi ("一" "不" 变调)
- three tone sandhi (三声变调)
For ease of understanding, we list the tone sandhi rules in Chinese here.
### 1. 轻声变调
| |cases |
|:--|:-|
| 语气助词“吧、呢、啊”等 | 吃吧、走吗、去呢、跑啊 |
| 结构助词:“的、地、得”| 我的书、慢慢地走、跑得很快等 |
|有的轻声音节和非轻声音节构成对比区别意义 |买卖:一指生意;二指买和卖。 <br/> 地道:一指纯粹、真正;二指地下通道。<br> 大意:一指没有注意;二指主要的意思。 <br/> 东西:一指各种事物;二指东面与西面。<br> 言语:一指所说的话;二指开口,招呼。<br/>运气:一指一种锻炼的方法。二指幸运。<br> |
|名词的后缀:“们、子、头”|你们、房子、石头 |
|名词或动词的第二个重叠音节 | 奶奶、姐姐、爸爸、试试、看看、说说、问问 |
|名词后面表示方位的:“上、下、里” |桌上、地下、院里 |
| 动态助词:“了、着、过” | 走了、看着、去过|
| 作宾语的人称代词:“我、你、他” | 找我、请你、麻烦他。 |
| 约定俗成 | 匀称、盘算、枇杷、篱笆、活泼、玄乎。狐狸、学生、拾掇、麻烦、蛤蟆、石榴。玫瑰、凉快、萝卜、朋友、奴才、云彩。脑袋、老爷、老婆、嘴巴、指头、指甲。委屈、喇叭、讲究、打发、打听、喜欢。点心、伙计、打扮、哑巴、女婿、首饰。自在、吓唬、力气、漂亮、队伍、地方。痛快、念叨、笑语、丈夫、志气、钥匙。月亮、正经、位置、秀气、上司、悟性。告示、动静、热闹、屁股、阔气、意思。等 |
### 2. "一" "不" 变调
#### "一" 变调
| | 是否变调 | cases|
|:--|:-|:-|
| 单独念 | 否 | 第一、一楼|
| 序数 |否 | |
| 用在语句末尾 | 否 | |
| 去声前变阳平(四声前变二声) | | 一栋yí dòng、一段yí duàn、一律yí lǜ、一路yí lù|
| 非去声前变去声(非四声前变四声) | | 阴平(一声)<br>一发yì fā 、一端yì duān、一天yì tiān、一忽yì hū<br>阳平(二声)<br>一叠yì dié 、一同yì tóng 、一头yì tóu 、一条yì tiáo<br>上声(三声)<br>一统yì tǒng、一体yì tǐ、一览yì lǎn、一口yì kǒu|
|轻读,当“一”嵌在重叠式的动词之间 | | 听一听 tīng yi tīng|
#### "不" 变调
| | 是否变调 | cases|
|:--|:-|:-|
|单独念|否 | |
| 用在语句末尾| 否 | 我不|
|去声前变阳平(四声前变成二声) | | 不怕bú pà、不妙bú miào、不犯bú fàn、不忿bú fèn|
| 轻读,不”夹在重叠动词或重叠形容词之间、夹在动词和补语之间 | |懂不懂 dǒng bu dǒng 、看不清 kàn bu qīng |
### 3. 三声变调
| | 子类别| 如何变调|cases|
|:--|:-|:-|:-|
|单独念 | | 否| |
|句末 | | 否| |
|在句中停顿并没被后音节影响 | |否 | |
|三声+三声 | | 二声+三声|保险、保养、党委、尽管、老板、本领、引导、古老、敏感、鼓舞、永远、语法、口语、岛屿、保姆、远景、北海、首长、母语 |
| 三个三声相连| 双音节+单音节(“双单格”结构)| 前两个变二声|演讲稿、跑马场、展览馆、管理组、水彩笔、蒙古语、选取法、古典舞、虎骨酒、洗脸水、草稿纸|
| | 单音节+双音节(“单双格”结构)|第二个变二声|史小姐、党小组、好小伙、跑百米、纸老虎、李厂长、老保姆、冷处理、很友好、小雨伞|
| | 单音节+单音节+单音节(“单三格”结构)| 前两个变二声| 软懒散、稳准狠|
| 更多三声音节相连时| | 按语意与若干二字组成三字组,然后按以上变调规律处理|岂有 / 此理。<br>请你 / 给我 / 打点儿 / 洗脸水。<br>手表厂 / 有五种 /好产品。|
## References
- [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization)
- [声调篇|这些“一、不”变调规律,你不得不知](https://zhuanlan.zhihu.com/p/36156170)
- [TTS前端模块中的普通话变调规则](https://zhuanlan.zhihu.com/p/65091429)
- [轻声和变调](https://wenku.baidu.com/view/ad2016d94693daef5ef73db1.html)
- [必读轻声词语表546条](http://www.chaziwang.com/article-view-504.html)

@@ -0,0 +1,583 @@
Audio Sample
==================
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through ``text frontend`` module.
2. Convert characters/phonemes into acoustic features , such as linear spectrogram, mel spectrogram, LPC features, etc. through ``Acoustic models``.
3. Convert acoustic features into waveforms through ``Vocoders``.
When training ``Tacotron2``, ``TransformerTTS`` and ``WaveFlow``, we use the English single-speaker TTS dataset `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ by default. However, when training ``SpeedySpeech``, ``FastSpeech2`` and ``ParallelWaveGAN``, we use the Chinese single-speaker dataset `CSMSC <https://test.data-baker.com/data/index/source/>`_ by default.
In the future, ``Parakeet`` will mainly use Chinese TTS datasets for default examples.
Here, we will display three types of audio samples:
1. Analysis/synthesis (ground-truth spectrograms + Vocoder)
2. TTS (Acoustic model + Vocoder)
3. Chinese TTS with/without text frontend (mainly tone sandhi)
Analysis/synthesis
--------------------------
Audio samples generated from ground-truth spectrograms with a vocoder.
.. raw:: html
<b>LJSpeech(English)</b>
<br>
</br>
<table>
<tr>
<th align="left"> GT </th>
<th align="left"> WaveFlow </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
<br>
</br>
<b>CSMSC(Chinese)</b>
<br>
</br>
<table>
<tr>
<th align="left"> GT (convert to 24k) </th>
<th align="left"> ParallelWaveGAN </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009901.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009902.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009903.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009904.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009905.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009901.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009902.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009903.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009904.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009905.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
TTS
-------------------
Audio samples generated by a TTS system. Text is first transformed into spectrogram by a text-to-spectrogram model, then the spectrogram is converted into raw audio by a vocoder.
.. raw:: html
<table>
<tr>
<th align="left"> TransformerTTS + WaveFlow </th>
<th align="left"> Tacotron2 + WaveFlow </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
<table>
<tr>
<th align="left"> SpeedySpeech + ParallelWaveGAN </th>
<th align="left"> FastSpeech2 + ParallelWaveGAN </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
Chinese TTS with/without text frontend
--------------------------------------
We provide a complete Chinese text frontend module in ``Parakeet``. ``Text Normalization`` and ``G2P`` are the most important modules in the text frontend. We assume that the texts are already normalized, and mainly compare the ``G2P`` module here.
We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
.. raw:: html
<table>
<tr>
<th align="left"> With Text Frontend </th>
<th align="left"> Without Text Frontend </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
    </table>

@@ -0,0 +1,45 @@
.. parakeet documentation master file, created by
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Parakeet
====================================
``parakeet`` is a deep learning based text-to-speech toolkit built upon ``paddlepaddle`` framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
``parakeet`` mainly consists of components below.
#. Implementation of models and commonly used neural network layers.
#. Dataset abstraction and common data preprocessing pipelines.
#. Ready-to-run experiments.
.. toctree::
:maxdepth: 1
:caption: Introduction
introduction
.. toctree::
:maxdepth: 1
:caption: Getting started
install
basic_usage
advanced_usage
cn_text_frontend
released_models
.. toctree::
:maxdepth: 1
:caption: Demos
demo
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

@@ -0,0 +1,47 @@
# Installation
## Install PaddlePaddle
Parakeet requires PaddlePaddle as its backend. Note that PaddlePaddle 2.1.2 or newer is required.
Since paddlepaddle has multiple packages depending on the device (cpu or gpu) and the dependency libraries, it is recommended to install a proper paddlepaddle package for your device and dependency library versions via `pip`.
Installing paddlepaddle with conda or building paddlepaddle from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
Example instructions to install paddlepaddle via pip are listed below.
### PaddlePaddle with GPU
```bash
# PaddlePaddle for CUDA 10.1
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA 10.2
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
# PaddlePaddle for CUDA 11.0
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA 11.2
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
```
### PaddlePaddle with CPU
```bash
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
```
## Install libsndfile
Experiments in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires an extra C library `libsndfile`, which is not always handled by pip.
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but for Linux users, installing `libsndfile` via the system package manager is required. Example commands for popular distributions are listed below.
```bash
# ubuntu, debian
sudo apt-get install libsndfile1
# centos, fedora
sudo yum install libsndfile
# openSUSE
sudo zypper in libsndfile
```
For any problem with the installation of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
## Install Parakeet
There are two ways to install parakeet according to the purpose of using it.
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from github (Parakeet), and install it in editable mode.
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

@@ -0,0 +1,27 @@
# Parakeet - PAddle PARAllel text-to-speech toolKIT
## What is Parakeet?
Parakeet is a deep learning based text-to-speech toolkit built upon paddlepaddle framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
## What can Parakeet do?
Parakeet mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
Parakeet provides you with a complete TTS pipeline, including:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- FastSpeech2
- SpeedySpeech
- TransformerTTS
- Tacotron2
- Vocoders
- Parallel WaveGAN
- WaveFlow
- Voice Cloning
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Parakeet helps you to train TTS models with simple commands.

@@ -0,0 +1,295 @@
# Released Models
A TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable models.
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through `text frontend` module.
2. Convert characters/phonemes into acoustic features , such as linear spectrogram, mel spectrogram, LPC features, etc. through `Acoustic models`.
3. Convert acoustic features into waveforms through `Vocoders`.
A simple text frontend module can be implemented by rules. Acoustic models and vocoders need to be trained. The models provided by Parakeet are acoustic models and vocoders.
## Acoustic Models
### Modeling Objectives of Acoustic Models
Modeling the mapping relationship between text sequences and speech features
```text
text X = {x1,...,xM}
speech Y = {y1, ..., yN}
```
Modeling Objectives:
```text
Ω* = argmax_Ω p(Y|X, Ω)
```
### Modeling process of Acoustic Models
At present, there are two mainstream acoustic model structures.
- Frame level acoustic model:
- Duration model (M Tokens -> N Frames).
- Acoustic decoder (N Frames -> N Frames).
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/frame_level_am.png" width=500 /> <br>
</div>
- Sequence to sequence acoustic model:
- M Tokens -> N Frames.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/seq2seq_am.png" width=500 /> <br>
</div>
### Tacotron2
[Tacotron](https://arxiv.org/abs/1703.10135) is the first end-to-end acoustic model based on deep learning, and it is also the most widely used acoustic model.
[Tacotron2](https://arxiv.org/abs/1712.05884) is the Improvement of Tacotron.
#### Tacotron
**Features of Tacotron:**
- Encoder.
- CBHG.
- Input: character sequence.
- Decoder.
- Global soft attention.
- Unidirectional RNN.
- Autoregressive teacher-forced training (inputs are real speech features).
- Multi-frame prediction.
- CBHG post-processing.
- Vocoder: Griffin-Lim.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron.png" width=700 /> <br>
</div>
**Advantage of Tacotron:**
- No need for complex text frontend analysis modules.
- No need for additional duration model.
- Greatly simplifies the acoustic model construction process and reduces the dependence of speech synthesis tasks on domain knowledge.
**Disadvantages of Tacotron:**
- The CBHG is complex and the number of parameters is relatively large.
- Global soft attention.
- Poor stability for speech synthesis tasks.
- In training, the fewer speech frames predicted at each step, the more difficult training becomes.
- The phase problem in Griffin-Lim causes speech distortion during waveform reconstruction.
- The autoregressive decoder has no reliable way to stop during generation.
#### Tacotron2
**Features of Tacotron2:**
- Reduction of parameters.
- CBHG -> PostNet (3 Conv layers + BLSTM or 5 Conv layers).
- Remove the attention RNN.
- Speech distortion caused by Griffin-Lim.
- WaveNet.
- Improvements of PostNet.
- CBHG -> 5 Conv layers.
- Both the input and the output of the PostNet compute an `L2` loss against the real mel spectrogram.
- Residual connection.
- Bad stop in autoregressive decoder.
- Predict whether it should stop at each moment of decoding (stop token).
- Set a threshold to determine whether to stop generating when decoding.
- Stability of attention.
- Location-aware attention.
- The alignment matrix of the previous time step is considered at decoder step `t`.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron2.png" width=500 /> <br>
</div>
You can find Parakeet's tacotron2 example at `Parakeet/examples/tacotron2`.
### TransformerTTS
**Disadvantages of the Tacotrons:**
- Encoder and decoder are relatively weak at modeling global information.
- Vanishing gradient of RNN.
- Fixed-length context modeling problem in CNN kernel.
- Training is relatively inefficient.
- The attention is not robust enough and the stability is poor.
Transformer TTS is a combination of Tacotron2 and Transformer.
#### Transformer
[Transformer](https://arxiv.org/abs/1706.03762) is a seq2seq model based entirely on attention mechanism.
**Features of Transformer:**
- Encoder.
- `N` blocks based on self-attention mechanism.
- Positional Encoding.
- Decoder.
- `N` blocks based on self-attention mechanism.
- Add Mask to the self-attention in blocks to cover up the information after `t` step.
- Attentions between encoder and decoder.
- Positional Encoding.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer.png" width=500 /> <br>
</div>
#### Transformer TTS
Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
**Motivations**
- RNNs in Tacotron2 make training inefficient.
- The vanishing gradient of RNNs weakens the model's ability to model long-term contexts.
- Self-attention doesn't contain any recurrent structure, so it can be trained in parallel.
- Self-attention can model global context information well.
**Features of Transformer TTS:**
- Add conv based PreNet in encoder and decoder.
- Stop Token in decoder controls when to stop autoregressive generation.
- Add PostNet after decoder to improve the quality of synthetic speech.
- Scaled position encoding.
- Uniform scale position encoding may have a negative impact on input or output sequences.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer_tts.png" width=500 /> <br>
</div>
**Disadvantages of Transformer TTS:**
- The ability of position encoding for timing information is still relatively weak.
- The ability to perceive local information is weak, and local information is more related to pronunciation.
- Stability is worse than Tacotron2.
You can find Parakeet's Transformer TTS example at `Parakeet/examples/transformer_tts`.
### FastSpeech2
**Disadvantage of seq2seq models:**
- In the seq2seq model based on attention, no matter how to improve the attention mechanism, it's difficult to avoid generation errors in the decoding stage.
Frame level acoustic models use duration models to determine the pronunciation duration of phonemes, and the frame level mapping does not have the uncertainty of sequence generation.
In seq2seq models, the concept of duration models is used as the alignment module between the two sequences to replace attention, which avoids the uncertainty in attention and significantly improves the stability of seq2seq models.
#### FastSpeech
Instead of using the encoder-attention-decoder based architecture as adopted by most seq2seq based autoregressive and non-autoregressive generation, [FastSpeech](https://arxiv.org/abs/1905.09263) is a novel feed-forward structure, which can generate a target mel spectrogram sequence in parallel.
**Features of FastSpeech:**
- Encoder: based on Transformer.
- Change the `FFN` in the self-attention blocks to a `CNN`.
- Model local dependencies.
- Length regulator.
- Use ground-truth phoneme durations to expand the encoder output frames during training (see the sketch after this list).
- Non-autoregressive decoder.
- Improve generation efficiency.
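The length regulator mentioned above can be sketched in a few lines of NumPy; names and shapes are illustrative, not the actual Parakeet implementation:

```python
import numpy as np

def length_regulator(encoder_out: np.ndarray, durations: np.ndarray,
                     alpha: float = 1.0) -> np.ndarray:
    """Expand phoneme-level encoder outputs to frame level.

    encoder_out: (num_phonemes, hidden) phoneme-level hidden states
    durations:   (num_phonemes,) number of mel frames per phoneme
    alpha:       duration coefficient; 1.0 during training, other values
                 change the speaking rate at inference time
    """
    frames = np.round(durations * alpha).astype(np.int64)
    return np.repeat(encoder_out, frames, axis=0)

hidden = np.random.randn(4, 256)            # 4 phonemes
durs = np.array([3, 5, 2, 4])               # frames per phoneme
mel_level = length_regulator(hidden, durs)  # shape (14, 256)
```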
**Length predictor:**
- Pretrain a TransformerTTS model.
- Get alignment matrix of train data.
- Calculate the phoneme durations from the probabilities in the alignment matrix (see the sketch after this list).
- Use the encoder output to predict the phoneme durations and compute an MSE loss.
- Use ground-truth phoneme durations to expand the encoder output frames during training.
- Use phoneme durations predicted by the duration model to expand the frames at inference time.
- Attention cannot control phoneme durations, while explicit duration modeling can control them through a duration coefficient (the duration coefficient is `1` during training).
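A NumPy sketch of extracting phoneme durations from a teacher's alignment matrix, as described in the list above (the orientation of the matrix is an assumption; actual implementations may transpose it):

```python
import numpy as np

def durations_from_alignment(alignment: np.ndarray) -> np.ndarray:
    """Extract phoneme durations from a teacher attention matrix.

    alignment: (num_decoder_frames, num_phonemes) attention weights from a
               pretrained autoregressive teacher (e.g. TransformerTTS).
    Each decoder frame is assigned to the phoneme with the highest attention
    probability; a phoneme's duration is the number of frames assigned to it.
    """
    num_phonemes = alignment.shape[1]
    assigned = alignment.argmax(axis=1)   # most-attended phoneme per frame
    return np.bincount(assigned, minlength=num_phonemes)
```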
**Advantages of non-autoregressive decoder:**
- The built-in duration model of the seq2seq model has already converted the input length `M` into the output length `N`.
- The output length is known, so the `stop token` is no longer needed, avoiding the problem of being unable to stop.
- Generation can be done in parallel (decoding time is less affected by sequence length).
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech.png" width=800 /> <br>
</div>
#### FastPitch
[FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastpitch.png" width=500 /> <br>
</div>
#### FastSpeech2
**Disadvantages of FastSpeech:**
- The teacher-student distillation pipeline is complicated and time-consuming.
- The durations extracted from the teacher model are not accurate enough.
- The target mel spectrograms distilled from the teacher model suffer from information loss due to data simplification.
[FastSpeech2](https://arxiv.org/abs/2006.04558) addresses the issues in FastSpeech and better solves the one-to-many mapping problem in TTS.
**Features of FastSpeech2:**
- Directly train the model on the ground-truth target instead of the simplified output from the teacher.
- Introduce more variation information of speech as conditional inputs: extract `duration`, `pitch`, and `energy` from the speech waveform, use them directly as conditional inputs during training, and use predicted values at inference time.
FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech2.png" width=800 /> <br>
</div>
You can find Parakeet's FastSpeech2/FastPitch example at `Parakeet/examples/fastspeech2`. We use the token-averaged pitch and energy values introduced in FastPitch rather than the frame-level ones used in FastSpeech2 (see the sketch below).
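A NumPy sketch of the token-averaged pitch/energy computation mentioned above (array names are illustrative):

```python
import numpy as np

def token_average(frame_values: np.ndarray, durations: np.ndarray) -> np.ndarray:
    """Average a frame-level pitch (or energy) contour over each phoneme.

    frame_values: (num_frames,) frame-level pitch/energy values
    durations:    (num_phonemes,) frames per phoneme, summing to num_frames
    Returns one averaged value per phoneme, as used by FastPitch.
    """
    averaged = []
    start = 0
    for d in durations:
        seg = frame_values[start:start + d]
        averaged.append(seg.mean() if d > 0 else 0.0)
        start += d
    return np.array(averaged)
```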
### SpeedySpeech
[SpeedySpeech](https://arxiv.org/abs/2008.03802) simplifies the teacher-student architecture of FastSpeech and provides a fast and stable training procedure.
**Features of SpeedySpeech:**
- Use a simpler, smaller and faster-to-train convolutional teacher model ([Deepvoice3](https://arxiv.org/abs/1710.07654) and [DCTTS](https://arxiv.org/abs/1710.08969)) with a single attention layer instead of the Transformer used in FastSpeech.
- Show that self-attention layers in the student network are not needed for high-quality speech synthesis.
- Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/speedyspeech.png" width=500 /> <br>
</div>
You can find Parakeet's SpeedySpeech example at `Parakeet/examples/speedyspeech/baker`.
## Vocoders
In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
Because the waveform changes rapidly on short time scales, the acoustic model usually avoids modeling the speech waveform directly; instead, it first models spectral features extracted from the waveform, and the waveform is then reconstructed by the decoding part of the vocoder.
A vocoder usually consists of an encoder-decoder pair for speech analysis and synthesis: the encoder estimates the parameters, and the decoder restores the speech.
Neural-network-based vocoders usually perform only the synthesis step, learning the mapping from spectral features to waveforms from training data.
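As a toy illustration of this analysis/synthesis split (not what a neural vocoder does internally), one can extract a mel spectrogram with librosa and reconstruct a rough waveform with Griffin-Lim; the file name and parameter values are only examples:

```python
import librosa

wav, sr = librosa.load("sample.wav", sr=22050)      # hypothetical input file

# analysis: waveform -> mel spectrogram (what the acoustic model predicts)
mel = librosa.feature.melspectrogram(
    y=wav, sr=sr, n_fft=1024, hop_length=256, n_mels=80)

# synthesis: mel spectrogram -> waveform (the vocoder's job; Griffin-Lim
# is a classical, lower-quality stand-in for a neural vocoder)
wav_rec = librosa.feature.inverse.mel_to_audio(
    mel, sr=sr, n_fft=1024, hop_length=256)
```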
### Categories of neural vocoders
- Autoregression
- WaveNet
- WaveRNN
- LPCNet
- Flow
- **WaveFlow**
- WaveGlow
- FloWaveNet
- Parallel WaveNet
- GAN
- WaveGAN
- **Parallel WaveGAN**
- MelGAN
- HiFi-GAN
- VAE
- Wave-VAE
- Diffusion
- WaveGrad
- DiffWave
**Motivations of GAN-based vocoders:**
- Modeling speech signal by estimating probability distribution usually has high requirements for the expression ability of the model itself. In addition, specific assumptions need to be made about the distribution of waveforms.
- Although autoregressive neural vocoders can obtain high-quality synthetic speech, such models usually have a **slow generation speed**.
- The training of inverse autoregressive flow vocoders is complex, and they also require the ability to model long-term context information.
- Vocoders based on Bipartite Transformation converge slowly and are complex.
- GAN-based vocoders don't need to make assumptions about the speech distribution, and train through adversarial learning.
Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Parallel WaveGAN.
### WaveFlow
[WaveFlow](https://arxiv.org/abs/1912.01219) is proposed by Baidu Research.
**Features of WaveFlow:**
- It can synthesize 22.05 kHz high-fidelity speech around 40x faster than real-time on an Nvidia V100 GPU without engineered inference kernels, which is faster than [WaveGlow](https://github.com/NVIDIA/waveglow) and several orders of magnitude faster than WaveNet.
- It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smaller than WaveGlow (87.9M).
- It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.
You can find Parakeet's WaveFlow example at `Parakeet/examples/waveflow`.
### Parallel WaveGAN
[Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
**Features of Parallel WaveGAN:**
- Use non-causal convolution instead of causal convolution.
- The input is random Gaussian white noise.
- The model is non-autoregressive in both training and prediction, which makes it fast.
- Multi-resolution STFT loss (see the sketch below).
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/pwg.png" width=600 /> <br>
</div>
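A NumPy/librosa sketch of the multi-resolution STFT loss used by Parallel WaveGAN; a real implementation operates on framework tensors with gradients, and the FFT/hop/window sizes below are assumed, commonly used values:

```python
import numpy as np
import librosa

def stft_loss(x, y, n_fft, hop_length, win_length):
    """Spectral convergence + log STFT magnitude loss at one resolution."""
    X = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    Y = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    sc = np.linalg.norm(Y - X, "fro") / np.linalg.norm(Y, "fro")   # spectral convergence
    mag = np.mean(np.abs(np.log(Y + 1e-7) - np.log(X + 1e-7)))     # log-magnitude L1
    return sc + mag

def multi_resolution_stft_loss(x, y,
                               resolutions=((1024, 256, 1024),
                                            (2048, 512, 2048),
                                            (512, 128, 512))):
    """Average the single-resolution STFT loss over several FFT/hop/window settings."""
    return np.mean([stft_loss(x, y, *r) for r in resolutions])
```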
You can find Parakeet's Parallel WaveGAN example at `Parakeet/examples/parallelwave_gan/baker`.

@ -1,4 +1,12 @@
# ASR
* s0 for deepspeech2 offline
* s1 for u2
* s0 for deepspeech2
* s1 for u2/transformer/conformer
## Data
| Data Subset | Duration in Seconds |
| ------------------- | --------------------- |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |

@ -1,11 +1,5 @@
# Aishell-1
## Data
| Data Subset | Duration in Seconds |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |
## Deepspeech2
| Model | Params | Release | Config | Test set | Loss | CER |

@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@ -62,6 +46,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size

@ -11,6 +11,7 @@
## Chunk Conformer
Need to set `decoding.decoding_chunk_size=16` when decoding.
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
@ -18,10 +19,3 @@
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | ---|
| transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - |

@ -19,17 +19,17 @@
{
"type": "specaug",
"params": {
"W": 0,
"warp_mode": "PIL",
"F": 10,
"T": 50,
"n_freq_masks": 2,
"T": 50,
"n_time_masks": 2,
"p": 1.0,
"W": 80,
"adaptive_number_ratio": 0,
"adaptive_size_ratio": 0,
"max_n_time_masks": 20,
"replace_with_zero": true,
"warp_mode": "PIL"
"replace_with_zero": true
},
"prob": 1.0
}

@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@ -63,6 +47,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size

@ -0,0 +1,47 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
audio_file=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0

@ -25,5 +25,5 @@ export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
. $KALDI_ROOT/tools/config/common_path.sh || true
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@ -1,6 +1,6 @@
#!/bin/bash
set -e
source path.sh
set -e
stage=0
stop_stage=100
@ -13,6 +13,8 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
audio_file="data/tmp.wav"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
@ -48,3 +50,8 @@ fi
# train lm and build TLG
./local/tlg.sh --corpus aishell --lmtype srilm
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@ -0,0 +1,4 @@
# Aishell3
* tts0 - fastspeech2
* vc0 - tactron2 voice clone

@ -0,0 +1,112 @@
## Training a Voice Cloning Model with Tacotron2 on the AISHELL-3 Dataset
This experiment performs voice cloning with the AISHELL-3 dataset and a Tacotron 2 model. The overall model architecture follows the paper [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The main steps are:
1. Speaker Encoder: we train a speaker encoder with a speaker verification task. The dataset used for this task differs from the one used to train Tacotron 2; since no transcriptions are needed, we can use a larger amount of training data. See the implementation in [ge2e](../ge2e).
2. Synthesizer: the trained speaker encoder is then used to generate an utterance embedding for each sentence in the AISHELL-3 dataset. This embedding is concatenated with the encoder outputs as an additional input to the Tacotron model.
3. Vocoder: we use WaveFlow as the vocoder; see the [waveflow](../waveflow) example.
## Data Processing
### Generating utterance embeddings
Use the trained speaker encoder to generate an utterance embedding for each sentence in the AISHELL-3 dataset. The embeddings are stored as `.npy` files with the same directory structure as the audio folder.
First `cd` into the [ge2e](../ge2e) folder, download the pretrained [model](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and run the script to generate an utterance embedding for each sentence.
```bash
python inference.py --input=<input> --output=<output> --device="gpu" --checkpoint_path=<pretrained checkpoint>
```
Here `input` is a directory that contains only the audio folders, e.g. `~/datasets/aishell3/train/wav`, and `output` is the folder used to store the utterance embeddings, e.g. `~/datasets/aishell3/train/embed`. The utterance embeddings are stored with the same file structure as the audio folder, in `.npy` format.
Computing the utterance embeddings may take several hours; please be patient.
### Audio Processing
The AISHELL-3 recordings contain leading and trailing silence, and the speech amplitude is quite small, so we need to trim the silence and normalize the volume. Silence trimming can simply use volume- or energy-based methods, but they do not work very well because it is hard to find a threshold that is consistent across sentences. Instead, we first align text and speech with a forced aligner and then trim the silence according to the alignment.
The tool we use is Montreal Forced Aligner 1.0. Because the AISHELL labels include pinyin annotations, we feed Montreal Forced Aligner pinyin transcriptions rather than Chinese-character transcriptions. The prosody marks (`$` and `%`) must be removed, and the transcriptions converted into the file format Montreal Forced Aligner expects: a text file with the same name as the audio file and the extension `.lab`.
In addition, a lexicon file is needed that contains the mapping from pinyin syllables to phone sequences. Here we only split syllables into initials and finals, and the tone is treated as part of the final. The [lexicon file](./lexicon.txt) we use can be downloaded.
Once everything is ready, run training and alignment. First download [Montreal Forced Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1), extract it, `cd` into its bin folder, and run the command below to train and align. The first three command-line arguments are the path of the audio folder, the path of the lexicon, and the output path for the alignment files. The path where the trained model is saved can be passed with `-o`.
```bash
./mfa_train_and_align \
~/datasets/aishell3/train/wav \
lexicon.txt \
~/datasets/aishell3/train/alignment \
-o aishell3_model \
-v
```
Since training and alignment take quite a long time, we provide the resulting [alignment files](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), where each sentence corresponds to a text file in `.TextGrid` format.
Once the alignment files are available, run the `process_wav.py` script to process the audio.
```bash
python process_wav.py --input=<input> --output=<output> --alignment=<alignment>
```
By default, `input`, `output`, and `alignment` are `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, and `~/datasets/aishell3/train/alignment`, respectively.
After processing, the processed audio is saved in the `<output>` folder.
### Transcription Processing
Convert the text into phones and tones and store the result. Note that the processing here differs from the one used for Montreal Forced Aligner: here the tones are separated out. This is just one possible scheme; splitting only into initials and finals would also work.
Run the script to process the transcriptions.
```bash
python preprocess_transcription.py --input=<input> --output=<output>
```
The default `input` is `~/datasets/aishell3/train`, which contains the `label_train-set.txt` file. The processed results are saved as `metadata.yaml` and `metadata.pickle`; the former is a text format that is easy to inspect, while the latter is a binary format that is easy to load directly.
### Mel Spectrogram Extraction
Extract mel spectrograms from the processed audio and store them as `.npy` files with the same directory structure as the audio folder.
```bash
python extract_mel.py --input=<input> --output=<output>
```
`input` is the folder containing the processed audio, and `output` is the folder for the output spectrograms.
## Training
Run the training script.
```bash
python train.py --data=<data> --output=<output> --device="gpu"
```
Our model removes the stop token prediction from the Tacotron 2 model. In practice, stop token prediction is an extremely imbalanced classification problem: each sentence may have several hundred frames of negative examples and only a single positive frame, and the prediction is very sensitive to how silence is trimmed. Instead, we stop decoding when the peak of the attention reaches the last symbol on the encoder side.
In addition, to speed up convergence, we add a guided attention loss, which encourages the encoder-decoder alignment to become diagonal more quickly (a sketch is given below).
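A minimal NumPy sketch of the guided attention weight described above (sigma matches the `guided_attention_loss_sigma` default in the config; names are illustrative):

```python
import numpy as np

def guided_attention_weight(text_len: int, mel_len: int, sigma: float = 0.2) -> np.ndarray:
    """W[n, t] = 1 - exp(-((n/N - t/T)^2) / (2 * sigma^2))."""
    n = np.arange(text_len)[:, None] / text_len
    t = np.arange(mel_len)[None, :] / mel_len
    return 1.0 - np.exp(-((n - t) ** 2) / (2.0 * sigma ** 2))

def guided_attention_loss(attention: np.ndarray, sigma: float = 0.2) -> float:
    """attention: (text_len, mel_len) alignment produced by the decoder."""
    W = guided_attention_weight(*attention.shape, sigma=sigma)
    return float(np.mean(attention * W))
```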
You can use visualdl to view the training logs.
```bash
visualdl --logdir=<output> --host=$HOSTNAME
```
Example training loss / validation loss curves are shown below.
![train](./images/train.png)
![valid](./images/valid.png)
<img src="images/alignment-step2000.png" alt="alignment-step2000" style="zoom:50%;" />
From around step 2000 onward, a blurry diagonal can be observed in the alignments produced during validation. As training continues, the diagonal becomes clearer, but because validation is also run with teacher forcing, it takes longer before a diagonal appears in the alignments produced by true autoregressive synthesis.
## Pretrained Model
Pretrained model download link: [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
## Usage
This example includes a simple usage demo: you can replace the reference audio and the text and synthesize speech with the trained model. See the instructions in the [notebook](./voice_cloning.ipynb).

@ -0,0 +1,88 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
import numpy as np
from paddle.io import Dataset
from parakeet.frontend import Vocab
from parakeet.data import batch_text_id, batch_spec
from preprocess_transcription import _phones, _tones
voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
voc_tones = Vocab(sorted(list(_tones)))
print("vocab_tones:\n", voc_tones)
class AiShell3(Dataset):
"""Processed AiShell3 dataset."""
def __init__(self, root):
super().__init__()
self.root = Path(root).expanduser()
self.embed_dir = self.root / "embed"
self.mel_dir = self.root / "mel"
with open(self.root / "metadata.pickle", 'rb') as f:
self.records = pickle.load(f)
def __getitem__(self, index):
metadatum = self.records[index]
sentence_id = metadatum["sentence_id"]
speaker_id = sentence_id[:7]
phones = metadatum["phones"]
tones = metadatum["tones"]
phones = np.array(
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
tones = np.array(
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
embed = np.load(
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
return phones, tones, mel, embed
def __len__(self):
return len(self.records)
def collate_aishell3_examples(examples):
phones, tones, mel, embed = list(zip(*examples))
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
mel = np.transpose(mel, (0, 2, 1))
embed = np.stack(embed)
# 7 fields
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
if __name__ == "__main__":
dataset = AiShell3("~/datasets/aishell3/train")
example = dataset[0]
examples = [dataset[i] for i in range(10)]
batch = collate_aishell3_examples(examples)
for field in batch:
print(field.shape, field.dtype)

@ -0,0 +1,39 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
from pypinyin import lazy_pinyin, Style
from preprocess_transcription import split_syllable
def convert_to_pinyin(text: str) -> List[str]:
"""convert text into list of syllables, other characters that are not chinese, thus
cannot be converted to pinyin are splited.
"""
syllables = lazy_pinyin(
text, style=Style.TONE3, neutral_tone_with_five=True)
return syllables
def convert_sentence(text: str) -> List[Tuple[str]]:
"""convert a sentence into two list: phones and tones"""
syllables = convert_to_pinyin(text)
phones = []
tones = []
for syllable in syllables:
p, t = split_syllable(syllable)
phones.extend(p)
tones.extend(t)
return phones, tones

@ -0,0 +1,82 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
_C.data = CN(
dict(
batch_size=32, # batch size
valid_size=64, # the first N examples are reserved for validation
sample_rate=22050, # Hz, sample rate
n_fft=1024, # fft frame size
win_length=1024, # window size
hop_length=256, # hop size between adjacent frames
fmax=8000, # Hz, max frequency when converting to mel
fmin=0, # Hz, min frequency when converting to mel
d_mels=80, # mel bands
padding_idx=0, # text embedding's padding index
))
_C.model = CN(
dict(
vocab_size=70,
n_tones=10,
reduction_factor=1, # reduction factor
d_encoder=512, # embedding & encoder's internal size
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
d_prenet=256, # hidden size of decoder prenet
# hidden size of the first rnn layer in tacotron2 decoder
d_attention_rnn=1024,
# hidden size of the second rnn layer in tacotron2 decoder
d_decoder_rnn=1024,
d_attention=128, # hidden size of decoder location linear layer
attention_filters=32, # number of filter in decoder location conv layer
attention_kernel_size=31, # kernel size of decoder location conv layer
d_postnet=512, # hidden size of decoder postnet
postnet_kernel_size=5, # kernel size of conv layers in postnet
postnet_conv_layers=5, # number of conv layer in decoder postnet
p_encoder_dropout=0.5, # dropout probability in encoder
p_prenet_dropout=0.5, # dropout probability in decoder prenet
# dropout probability of first rnn layer in decoder
p_attention_dropout=0.1,
# dropout probability of second rnn layer in decoder
p_decoder_dropout=0.1,
p_postnet_dropout=0.5, # dropout probability in decoder postnet
guided_attention_loss_sigma=0.2,
d_global_condition=256,
# whether to use a classifier to predict stop probability
use_stop_token=False,
# whether to use guided attention loss in training
use_guided_attention_loss=True, ))
_C.training = CN(
dict(
lr=1e-3, # learning rate
weight_decay=1e-6, # the coeff of weight decay
grad_clip_thresh=1.0, # the clip norm of grad clip.
valid_interval=1000, # validation
save_interval=1000, # checkpoint
max_iteration=500000, # max iteration to train
))
def get_cfg_defaults():
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
return _C.clone()

@ -0,0 +1,96 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path
import numpy as np
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
import tqdm
from config import get_cfg_defaults
def extract_mel(fname: Path,
input_dir: Path,
output_dir: Path,
p: AudioProcessor,
n: NormalizerBase):
relative_path = fname.relative_to(input_dir)
out_path = (output_dir / relative_path).with_suffix(".npy")
out_path.parent.mkdir(parents=True, exist_ok=True)
wav = p.read_wav(fname)
mel = p.mel_spectrogram(wav)
mel = n.transform(mel)
np.save(out_path, mel)
def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
input_dir = Path(input_dir).expanduser()
fnames = list(input_dir.rglob(f"*{extension}"))
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
config.hop_length, config.n_mels, config.fmin,
config.fmax)
n = LogMagnitude(1e-5)
func = partial(
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
with mp.Pool(16) as pool:
list(
tqdm.tqdm(
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
)
parser.add_argument(
"--config",
type=str,
help="yaml config file to overwrite the default config")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
default_config = get_cfg_defaults()
args = parser.parse_args()
if args.config:
default_config.merge_from_file(args.config)
if args.opts:
default_config.merge_from_list(args.opts)
default_config.freeze()
audio_config = default_config.data
extract_mel_multispeaker(audio_config, args.input, args.output)

Binary image files not shown (three new images added: 221 KiB, 550 KiB, 514 KiB).

File diff suppressed because it is too large.

@ -0,0 +1,258 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import re
import pickle
import yaml
import tqdm
zh_pattern = re.compile("[\u4e00-\u9fa5]")
_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
_pauses = {'%', '$'}
_initials = {
'b',
'p',
'm',
'f',
'd',
't',
'n',
'l',
'g',
'k',
'h',
'j',
'q',
'x',
'zh',
'ch',
'sh',
'r',
'z',
'c',
's',
}
_finals = {
'ii',
'iii',
'a',
'o',
'e',
'ea',
'ai',
'ei',
'ao',
'ou',
'an',
'en',
'ang',
'eng',
'er',
'i',
'ia',
'io',
'ie',
'iai',
'iao',
'iou',
'ian',
'ien',
'iang',
'ieng',
'u',
'ua',
'uo',
'uai',
'uei',
'uan',
'uen',
'uang',
'ueng',
'v',
've',
'van',
'ven',
'veng',
}
_ernized_symbol = {'&r'}
_specials = {'<pad>', '<unk>', '<s>', '</s>'}
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
def is_zh(word):
global zh_pattern
match = zh_pattern.search(word)
return match is not None
def ernized(syllable):
return syllable[:2] != "er" and syllable[-2] == 'r'
def convert(syllable):
# expansion of o -> uo
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
# syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
# expansion for iong, ong
syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
# expansion for ing, in
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace("ui",
"uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
.replace("ri", "riii")
# rule for y preceding i, u
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
# rule for w
syllable = syllable.replace("wu", "u").replace("w", "u")
# rule for v following j, q, x
syllable = syllable.replace("ju", "jv").replace("qu",
"qv").replace("xu", "xv")
return syllable
def split_syllable(syllable: str):
"""Split a syllable in pinyin into a list of phones and a list of tones.
Initials have no tone, represented by '0', while finals have tones from
'1,2,3,4,5'.
e.g.
zhang -> ['zh', 'ang'], ['0', '1']
"""
if syllable in _pauses:
# syllable, tone
return [syllable], ['0']
tone = syllable[-1]
syllable = convert(syllable[:-1])
phones = []
tones = []
global _initials
if syllable[:2] in _initials:
phones.append(syllable[:2])
tones.append('0')
phones.append(syllable[2:])
tones.append(tone)
elif syllable[0] in _initials:
phones.append(syllable[0])
tones.append('0')
phones.append(syllable[1:])
tones.append(tone)
else:
phones.append(syllable)
tones.append(tone)
return phones, tones
def load_aishell3_transcription(line: str):
sentence_id, pinyin, text = line.strip().split("|")
syllables = pinyin.strip().split()
results = []
for syllable in syllables:
if syllable in _pauses:
results.append(syllable)
elif not ernized(syllable):
results.append(syllable)
else:
results.append(syllable[:-2] + syllable[-1])
results.append('&r5')
phones = []
tones = []
for syllable in results:
p, t = split_syllable(syllable)
phones.extend(p)
tones.extend(t)
for p in phones:
assert p in _phones, p
return {
"sentence_id": sentence_id,
"text": text,
"syllables": results,
"phones": phones,
"tones": tones
}
def process_aishell3(dataset_root, output_dir):
dataset_root = Path(dataset_root).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
prosody_label_path = dataset_root / "label_train-set.txt"
with open(prosody_label_path, 'rt') as f:
lines = [line.strip() for line in f]
records = lines[5:]
processed_records = []
for record in tqdm.tqdm(records):
new_record = load_aishell3_transcription(record)
processed_records.append(new_record)
print(new_record)
with open(output_dir / "metadata.pickle", 'wb') as f:
pickle.dump(processed_records, f)
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(
processed_records, f, default_flow_style=None, allow_unicode=True)
print("metadata done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
)
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train",
help="path of the training dataset,(contains a label_train-set.txt).")
parser.add_argument(
"--output",
type=str,
help="the directory to save the processed transcription."
"If not provided, it would be the same as the input.")
args = parser.parse_args()
if args.output is None:
args.output = args.input
process_aishell3(args.input, args.output)

@ -0,0 +1,95 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from multiprocessing import Pool
from functools import partial
import numpy as np
import librosa
import soundfile as sf
from tqdm import tqdm
from praatio import tgio
def get_valid_part(fpath):
f = tgio.openTextgrid(fpath)
start = 0
phone_entry_list = f.tierDict['phones'].entryList
first_entry = phone_entry_list[0]
if first_entry.label == "sil":
start = first_entry.end
last_entry = phone_entry_list[-1]
if last_entry.label == "sp":
end = last_entry.start
else:
end = last_entry.end
return start, end
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
rel_path = fpath.relative_to(source_dir)
opath = target_dir / rel_path
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
opath.parent.mkdir(parents=True, exist_ok=True)
start, end = get_valid_part(apath)
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
normalized_wav = wav / np.max(wav) * 0.999
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
# print(f"{fpath} => {opath}")
def preprocess_aishell3(source_dir, target_dir, alignment_dir):
source_dir = Path(source_dir).expanduser()
target_dir = Path(target_dir).expanduser()
alignment_dir = Path(alignment_dir).expanduser()
wav_paths = list(source_dir.rglob("*.wav"))
print(f"there are {len(wav_paths)} audio files in total")
fx = partial(
process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process audio in AiShell3, trim silence according to the alignment "
"files generated by MFA, and normalize volume by peak.")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/wav",
help="path of the original audio folder in aishell3.")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the folder to save the processed audio files.")
parser.add_argument(
"--alignment",
type=str,
default="~/datasets/aishell3/train/alignment",
help="path of the alignment files.")
args = parser.parse_args()
preprocess_aishell3(args.input, args.output, args.alignment)

@ -0,0 +1,262 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
from collections import defaultdict
import numpy as np
from matplotlib import pyplot as plt
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from parakeet.data import dataset
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from aishell3 import AiShell3, collate_aishell3_examples
class Experiment(ExperimentBase):
def compute_losses(self, inputs, outputs):
texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
mel_outputs = outputs["mel_output"]
mel_outputs_postnet = outputs["mel_outputs_postnet"]
alignments = outputs["alignments"]
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
alignments, output_lens, text_lens)
return losses
def train_batch(self):
start = time.time()
batch = self.read_batch()
data_loader_time = time.time() - start
self.optimizer.clear_grad()
self.model.train()
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
loss = losses["loss"]
loss.backward()
self.optimizer.step()
iteration_time = time.time() - start
losses_np = {k: float(v) for k, v in losses.items()}
# logging
msg = "Rank: {}, ".format(dist.get_rank())
msg += "step: {}, ".format(self.iteration)
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
iteration_time)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
self.logger.info(msg)
if dist.get_rank() == 0:
for key, value in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{key}", value,
self.iteration)
@mp_tools.rank_zero_only
@paddle.no_grad()
def valid(self):
valid_losses = defaultdict(list)
for i, batch in enumerate(self.valid_loader):
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
for key, value in losses.items():
valid_losses[key].append(float(value))
attention_weights = outputs["alignments"]
self.visualizer.add_figure(
f"valid_sentence_{i}_alignments",
display.plot_alignment(attention_weights[0].numpy().T),
self.iteration)
self.visualizer.add_figure(
f"valid_sentence_{i}_target_spectrogram",
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
mel_pred = outputs['mel_outputs_postnet']
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
# logging
msg = "Valid: "
msg += "step: {}, ".format(self.iteration)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_losses.items())
self.logger.info(msg)
for key, value in valid_losses.items():
self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
@mp_tools.rank_zero_only
@paddle.no_grad()
def eval(self):
"""Evaluation of Tacotron2 in autoregressive manner."""
self.model.eval()
mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
mel_dir.mkdir(parents=True, exist_ok=True)
for i, batch in enumerate(self.test_loader):
texts, tones, mels, utterance_embeds, *_ = batch
outputs = self.model.infer(
texts, tones=tones, global_condition=utterance_embeds)
display.plot_alignment(outputs["alignments"][0].numpy().T)
plt.savefig(mel_dir / f"sentence_{i}.png")
plt.close()
np.save(mel_dir / f"sentence_{i}",
outputs["mel_outputs_postnet"][0].numpy().T)
print(f"sentence_{i}")
def setup_model(self):
config = self.config
model = Tacotron2(
vocab_size=config.model.vocab_size,
n_tones=config.model.n_tones,
d_mels=config.data.d_mels,
d_encoder=config.model.d_encoder,
encoder_conv_layers=config.model.encoder_conv_layers,
encoder_kernel_size=config.model.encoder_kernel_size,
d_prenet=config.model.d_prenet,
d_attention_rnn=config.model.d_attention_rnn,
d_decoder_rnn=config.model.d_decoder_rnn,
attention_filters=config.model.attention_filters,
attention_kernel_size=config.model.attention_kernel_size,
d_attention=config.model.d_attention,
d_postnet=config.model.d_postnet,
postnet_kernel_size=config.model.postnet_kernel_size,
postnet_conv_layers=config.model.postnet_conv_layers,
reduction_factor=config.model.reduction_factor,
p_encoder_dropout=config.model.p_encoder_dropout,
p_prenet_dropout=config.model.p_prenet_dropout,
p_attention_dropout=config.model.p_attention_dropout,
p_decoder_dropout=config.model.p_decoder_dropout,
p_postnet_dropout=config.model.p_postnet_dropout,
d_global_condition=config.model.d_global_condition,
use_stop_token=config.model.use_stop_token, )
if self.parallel:
model = paddle.DataParallel(model)
grad_clip = paddle.nn.ClipGradByGlobalNorm(
config.training.grad_clip_thresh)
optimizer = paddle.optimizer.Adam(
learning_rate=config.training.lr,
parameters=model.parameters(),
weight_decay=paddle.regularizer.L2Decay(
config.training.weight_decay),
grad_clip=grad_clip)
criterion = Tacotron2Loss(
use_stop_token_loss=config.model.use_stop_token,
use_guided_attention_loss=config.model.use_guided_attention_loss,
sigma=config.model.guided_attention_loss_sigma)
self.model = model
self.optimizer = optimizer
self.criterion = criterion
def setup_dataloader(self):
args = self.args
config = self.config
ljspeech_dataset = AiShell3(args.data)
valid_set, train_set = dataset.split(ljspeech_dataset,
config.data.valid_size)
batch_fn = collate_aishell3_examples
if not self.parallel:
self.train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True)
self.train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
self.valid_loader = DataLoader(
valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.test_loader = DataLoader(
valid_set,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
def main_sp(config, args):
exp = Experiment(config, args)
exp.setup()
exp.resume_or_load()
if not args.test:
exp.run()
else:
exp.eval()
def main(config, args):
if args.nprocs > 1 and args.device == "gpu":
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
if __name__ == "__main__":
config = get_cfg_defaults()
parser = default_argument_parser()
parser.add_argument("--test", action="store_true")
args = parser.parse_args()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
print(args)
main(config, args)

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff.
