@@ -0,0 +1,30 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/src/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF
formats: []

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - method: pip
      path: .
      extra_requirements:
        - doc
    - requirements: docs/requirements.txt
@@ -0,0 +1,528 @@
"""Beam search module."""

from itertools import chain
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union

import paddle

from .utils import end_detect
from .scorers.scorer_interface import PartialScorerInterface
from .scorers.scorer_interface import ScorerInterface

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()


class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: paddle.Tensor  # (T,)
    score: Union[float, paddle.Tensor] = 0
    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()
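
# An illustrative example (with a hypothetical `sos_id`, not from the original
# source): a fresh hypothesis holding only the start-of-sequence token
# serializes to a plain dict, e.g.
#   hyp = Hypothesis(yseq=paddle.to_tensor([sos_id]), score=0.0)
#   hyp.asdict()  # {"yseq": [sos_id], "score": 0.0, "scores": {}, "states": {}}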


class BeamSearch(paddle.nn.Layer):
    """Beam search implementation."""

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM.
                A scorer is ignored if it is `None`.
            weights (dict[str, float]): Dict of weights for each scorer.
                A scorer is ignored if its weight is 0.
            beam_size (int): The number of hypotheses kept during search.
            vocab_size (int): The size of the vocabulary.
            sos (int): Start of sequence id.
            eos (int): End of sequence id.
            token_list (list[str]): List of tokens for debug logging.
            pre_beam_score_key (str): Key of the scores used to perform pre-beam search.
            pre_beam_ratio (float): The beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`.

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()  # all = full + partial
        self.full_scorers = dict()  # full-token scorers
        self.part_scorers = dict()  # partial-token scorers
        # this module dict is required for the recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, paddle.nn.Layer):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        # pre_beam_size > beam_size
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
        # the `key` of the scorer selected to do pre-beam search
        self.pre_beam_score_key = pre_beam_score_key
        # do pre-beam only when it is requested, valid, and there are partial scorers
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )

    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
        """Get initial hypothesis data.

        Args:
            x (paddle.Tensor): The encoder output feature, (T, D)

        Returns:
            List[Hypothesis]: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                yseq=paddle.to_tensor([self.sos], place=x.place),
                score=0.0,
                scores=init_scores,
                states=init_states,
            )
        ]

    @staticmethod
    def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
        """Append a new token to the prefix tokens.

        Args:
            xs (paddle.Tensor): The prefix tokens, (T,)
            x (int): The new token to append

        Returns:
            paddle.Tensor: (T+1,), a new tensor containing xs + [x]
                with the dtype and place of xs

        """
        x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
        return paddle.concat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score a new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                a score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape `(self.n_vocab,)`,
                and a state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            # scores[k] shape (self.n_vocab,)
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score a new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (paddle.Tensor): 1D tensor of new partial tokens to score,
                len(ids) < n_vocab
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                a score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape `(len(ids),)`,
                and a state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            # scores[k] shape (len(ids),)
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute top-k full-token ids and partial-token ids.

        Args:
            weighted_scores (paddle.Tensor): The weighted sum of scores for each token.
                Its shape is `(self.n_vocab,)`.
            ids (paddle.Tensor): The partial (global) token ids to compute topk over.

        Returns:
            Tuple[paddle.Tensor, paddle.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`,
                i.e. (global ids, local ids relative to `ids`).

        """
        # no pre-beam was performed: `ids` spans the whole vocabulary,
        # so global and local top-k ids coincide
        if weighted_scores.shape[0] == ids.shape[0]:
            top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
            return top_ids, top_ids

        # mask tokens pruned in the pre-beam so they cannot be selected by topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        # top_ids is not equal to local_ids, since their index spaces differ
        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]  # index in len(ids)
        return top_ids, local_ids
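    # For instance (illustrative, not from the original source): with
    # n_vocab=5, beam_size=2 and pre-beam ids=[1, 3], every score outside
    # {1, 3} is masked to -inf, so `top_ids` might be [3, 1] (indices into the
    # full vocabulary) while `local_ids` would be [1, 0] (the same tokens as
    # indices into `ids`).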

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, paddle.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, paddle.Tensor],
        part_idx: int,
    ) -> Dict[str, paddle.Tensor]:
        """Merge scores for a new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, paddle.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, paddle.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for a new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_states`

        Returns:
            Dict[str, Any]: The new state dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: paddle.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and the encoded speech x.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on the beam
            x (paddle.Tensor): Encoded speech feature (T, D)

        Returns:
            List[Hypothesis]: Best sorted hypotheses

        """
        best_hyps = []
        part_ids = paddle.arange(self.n_vocab)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add the previous hyp score
            weighted_scores += hyp.score

            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # `part_j` is the index of token `j` within `part_scores`
                # (best_hyps will hold 2 x beam candidates at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(
                            hyp.scores, scores, j, part_scores, part_j
                        ),
                        states=self.merge_states(states, part_states, part_j),
                    )
                )

        # sort and prune 2 x beam -> beam
        best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
            : min(len(best_hyps), self.beam_size)
        ]
        return best_hyps

    def forward(
        self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (paddle.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio used to obtain the max output length.
                If maxlenratio=0.0 (default), an end-detect function is used
                to find the maximum hypothesis lengths automatically.
                If maxlenratio<0.0, its absolute value is interpreted
                as a constant max output length.
            minlenratio (float): Input length ratio used to obtain the min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        elif maxlenratio < 0:
            maxlen = -1 * int(maxlenratio)
        else:
            maxlen = max(1, int(maxlenratio * x.shape[0]))
        minlen = int(minlenratio * x.shape[0])
        logger.info("decoder input length: " + str(x.shape[0]))
        logger.info("max output length: " + str(maxlen))
        logger.info("min output length: " + str(minlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logger.debug("position " + str(i))
            best = self.search(running_hyps, x)
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                logger.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logger.info("no hypothesis. Finish decoding.")
                break
            else:
                logger.debug(f"remaining hypotheses: {len(running_hyps)}")

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses that reached eos
        if len(nbest_hyps) == 0:
            logger.warning(
                "there are no N-best results, performing recognition "
                "again with a smaller minlenratio."
            )
            return (
                []
                if minlenratio < 0.1
                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logger.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logger.info(f"total log probability: {best.score:.2f}")
        logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            logger.info(
                "best hypo: "
                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
                + "\n"
            )
        return nbest_hyps

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of one beam search iteration.

        Args:
            i (int): The length of the hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (float): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """
        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logger.debug(
                "best hypo: "
                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
            )
        # add eos in the final loop so that at least some hyps end
        if i == maxlen - 1:
            logger.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to the final list, and remove them from the
        # current hypotheses
        # (as a result, the number of running hyps may drop below the beam size)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., a word LM needs to add the final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps


def beam_search(
    x: paddle.Tensor,
    sos: int,
    eos: int,
    beam_size: int,
    vocab_size: int,
    scorers: Dict[str, ScorerInterface],
    weights: Dict[str, float],
    token_list: List[str] = None,
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    pre_beam_ratio: float = 1.5,
    pre_beam_score_key: str = "full",
) -> list:
    """Perform beam search with scorers.

    Args:
        x (paddle.Tensor): Encoded speech feature (T, D)
        sos (int): Start of sequence id
        eos (int): End of sequence id
        beam_size (int): The number of hypotheses kept during search
        vocab_size (int): The size of the vocabulary
        scorers (dict[str, ScorerInterface]): Dict of decoder modules
            e.g., Decoder, CTCPrefixScorer, LM.
            A scorer is ignored if it is `None`.
        weights (dict[str, float]): Dict of weights for each scorer.
            A scorer is ignored if its weight is 0.
        token_list (list[str]): List of tokens for debug logging
        maxlenratio (float): Input length ratio used to obtain the max output length.
            If maxlenratio=0.0 (default), an end-detect function is used
            to find the maximum hypothesis lengths automatically.
        minlenratio (float): Input length ratio used to obtain the min output length.
        pre_beam_score_key (str): Key of the scores used to perform pre-beam search
        pre_beam_ratio (float): The beam size in the pre-beam search
            will be `int(pre_beam_ratio * beam_size)`

    Returns:
        List[Dict]: N-best decoding results

    """
    ret = BeamSearch(
        scorers,
        weights,
        beam_size=beam_size,
        vocab_size=vocab_size,
        pre_beam_ratio=pre_beam_ratio,
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
        token_list=token_list,
    ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]
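

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module above). It assumes
# the default ScorerInterface behavior (`init_state` returning None and
# `final_score` returning 0.0); the module path `deepspeech.decoders` and the
# toy `UniformScorer` class are assumptions made for this example.
import math

import paddle

from deepspeech.decoders.beam_search import beam_search
from deepspeech.decoders.scorers.scorer_interface import ScorerInterface


class UniformScorer(ScorerInterface):
    """Toy full scorer: equal log-probability for every token."""

    def __init__(self, n_vocab):
        self.n_vocab = n_vocab

    def score(self, y, state, x):
        # returns a (n_vocab,) tensor of identical log-probabilities
        return paddle.full([self.n_vocab], -math.log(self.n_vocab)), state


enc_out = paddle.randn([83, 256])  # fake encoder output, shape (T, D)
results = beam_search(
    x=enc_out,
    sos=0,
    eos=1,
    beam_size=5,
    vocab_size=100,
    scorers={"decoder": UniformScorer(100)},
    weights={"decoder": 1.0},
)
print(results[0]["yseq"])  # token ids of the best hypothesis
# ---------------------------------------------------------------------------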
@@ -0,0 +1,187 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
import os
import sys

import paddle
import soundfile

from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.u2 import U2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.training.trainer import Trainer
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig

logger = Log(__name__).getlog()

# TODO(hui zhang): dynamic load


class U2Tester_Hub(Trainer):
    """Tester that decodes a single audio file with a pretrained U2 model."""

    def __init__(self, config, args):
        # super().__init__(config, args)
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type,
            vocab_filepath=None,
            spm_model_prefix=config.collator.spm_model_prefix)

    def setup_model(self):
        config = self.config
        model_conf = config.model

        with UpdateConfig(model_conf):
            model_conf.input_dim = self.collate_fn_test.feature_size
            model_conf.output_dim = self.collate_fn_test.vocab_size

        model = U2Model.from_config(model_conf)

        if self.parallel:
            model = paddle.DataParallel(model)

        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)

        self.model = model
        logger.info("Setup model")

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config.decoding
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript="Hello")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        vocab_list = collate_fn_test.vocab_list

        text_feature = self.collate_fn_test.text_feature
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch,
            ctc_weight=cfg.ctc_weight,
            decoding_chunk_size=cfg.decoding_chunk_size,
            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
            simulate_streaming=cfg.simulate_streaming)
        logger.info("The result_transcripts: " + result_transcripts[0][0])

    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)

    def setup(self):
        """Setup the experiment."""
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        #self.setup_output_dir()
        #self.setup_checkpointer()

        #self.setup_dataloader()
        self.setup_model()

        self.iteration = 0
        self.epoch = 0

    def resume(self):
        """Resume from the checkpoint at checkpoints in the output
        directory or load a specified checkpoint.
        """
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)


def check(audio_file):
    logger.info("checking the audio file format ...")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert sample_rate == 16000
    logger.info("The audio file format is right")


def main_sp(config, args):
    exp = U2Tester_Hub(config, args)
    with exp.eval():
        exp.setup()
        exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()
    print_arguments(args, globals())

    if not os.path.isfile(args.audio_file):
        print("Please input a valid audio file path")
        sys.exit(-1)
    check(args.audio_file)
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    # Settings for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats('test.profile')
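
# Example invocation (all paths are hypothetical; `--config`,
# `--checkpoint_path`, `--nprocs`, `--opts` and `--dump_config` come from
# `default_argument_parser`, the remaining flags are added above, and
# `test_hub.py` stands in for this script's unknown filename):
#
#   python test_hub.py --config conf/transformer.yaml \
#       --checkpoint_path exp/transformer/checkpoints/avg_20 \
#       --audio_file input.wav --nprocs 1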
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -0,0 +1,81 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import recommonmark.parser
import sphinx_rtd_theme

# -- Project information -----------------------------------------------------

project = 'paddle speech'
copyright = '2021, Deepspeech-developers'
author = 'Deepspeech-developers'

# The full version, including alpha/beta/rc tags
release = '2.1'

# -- General configuration ---------------------------------------------------
source_parsers = {
    '.md': recommonmark.parser.CommonMarkParser,
}
source_suffix = ['.rst', '.md']

master_doc = 'index'
pygments_style = 'sphinx'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.viewcode',
    'sphinx_rtd_theme',
    'sphinx.ext.mathjax',
    'sphinx.ext.autosummary',
    'numpydoc',
    'myst_parser',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
    '_build',
]
# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#

html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
smartquotes = False

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# -- Extension configuration -------------------------------------------------
# numpydoc_show_class_members = False
@@ -0,0 +1,47 @@
Welcome to paddle Deepspeech documentation!
===========================================

**Deepspeech** is a speech toolkit implemented with PaddlePaddle.


Contents
--------

.. toctree::
   :maxdepth: 1
   :caption: Introduction

   asr/deepspeech_architecture


.. toctree::
   :maxdepth: 1
   :caption: Getting_started

   asr/install
   asr/getting_started


.. toctree::
   :maxdepth: 1
   :caption: More Information

   asr/data_preparation
   asr/augmentation
   asr/feature_list
   asr/ngram_lm


.. toctree::
   :maxdepth: 1
   :caption: Released_model

   asr/released_model


.. toctree::
   :maxdepth: 1
   :caption: Acknowledgement

   asr/reference
@@ -0,0 +1,130 @@
# Parakeet
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on the PaddlePaddle dynamic graph and includes many influential TTS models.

<div align="center">
  <img src="../../images/logo.png" width=300 /> <br>
</div>


## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

## Overview

To make it easy both to use existing TTS models directly and to develop new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here cover a text frontend, end-to-end acoustic models, and vocoders:

- Text Frontend
  - Rule-based Chinese frontend.

- Acoustic Models
  - [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
  - [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
  - [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
  - [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
- Vocoders
  - [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
  - [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- Voice Cloning
  - [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
  - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)

## Setup
Some of this repo's dependencies are difficult to install on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux` instead.

Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:

```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
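
After installing, you can optionally run a quick sanity check (`paddle.utils.run_check()` is PaddlePaddle's built-in installation verifier; this snippet is an illustrative suggestion, not from the original README):

```python
import paddle
print(paddle.__version__)  # should be >= 2.1.2
paddle.utils.run_check()   # verifies the install, including any GPU setup
```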

### Install Parakeet

```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```

See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.

## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:

- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)

## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio samples.

## Released Model

### Acoustic Model

#### FastSpeech2/FastPitch
1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
3. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)

#### SpeedySpeech
1. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)

#### TransformerTTS

1. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)

#### Tacotron2

1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)

### Vocoder

#### WaveFlow

1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)

#### Parallel WaveGAN

1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
2. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)

### Voice Cloning

#### Tacotron2_AISHELL3

1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

#### GE2E

1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)

## License

Parakeet is provided under the [Apache-2.0 license](LICENSE).
@@ -0,0 +1,583 @@
Audio Sample
==================

The main processes of TTS include (sketched in code after this list):

1. Convert the original text into characters/phonemes, through the ``text frontend`` module.

2. Convert characters/phonemes into acoustic features, such as linear spectrograms, mel spectrograms, LPC features, etc., through ``Acoustic models``.

3. Convert acoustic features into waveforms through ``Vocoders``.
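
A minimal, purely illustrative sketch of these three stages with stub components (none of these classes are real ``Parakeet`` APIs):

.. code-block:: python

    import numpy as np

    class Frontend:                      # 1. text -> phoneme ids
        def text_to_ids(self, text):
            return np.arange(len(text))

    class AcousticModel:                 # 2. phoneme ids -> mel spectrogram
        def infer(self, ids):
            return np.zeros((len(ids) * 10, 80))

    class Vocoder:                       # 3. mel spectrogram -> waveform
        def infer(self, mel):
            return np.zeros(mel.shape[0] * 256)

    wav = Vocoder().infer(AcousticModel().infer(Frontend().text_to_ids("hello")))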
||||
|
||||
When training ``Tacotron2``、``TransformerTTS`` and ``WaveFlow``, we use English single speaker TTS dataset `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ by default. However, when training ``SpeedySpeech``, ``FastSpeech2`` and ``ParallelWaveGAN``, we use Chinese single speaker dataset `CSMSC <https://test.data-baker.com/data/index/source/>`_ by default.
|
||||
|
||||
In the future, ``Parakeet`` will mainly use Chinese TTS datasets for default examples.
|
||||
|
||||
Here, we will display three types of audio samples:
|
||||
|
||||
1. Analysis/synthesis (ground-truth spectrograms + Vocoder)
|
||||
|
||||
2. TTS (Acoustic model + Vocoder)
|
||||
|
||||
3. Chinese TTS with/without text frontend (mainly tone sandhi)
|
||||
|
||||
Analysis/synthesis
|
||||
--------------------------
|
||||
|
||||
Audio samples generated from ground-truth spectrograms with a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<b>LJSpeech(English)</b>
|
||||
<br>
|
||||
</br>
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT </th>
|
||||
<th align="left"> WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<br>
|
||||
</br>
|
||||
<b>CSMSC(Chinese)</b>
|
||||
<br>
|
||||
</br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT (convert to 24k) </th>
|
||||
<th align="left"> ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
|
||||
TTS
|
||||
-------------------
|
||||
|
||||
Audio samples generated by a TTS system. Text is first transformed into spectrogram by a text-to-spectrogram model, then the spectrogram is converted into raw audio by a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> TransformerTTS + WaveFlow </th>
|
||||
<th align="left"> Tacotron2 + WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> SpeedySpeech + ParallelWaveGAN </th>
|
||||
<th align="left"> FastSpeech2 + ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
Chinese TTS with/without text frontend
|
||||
--------------------------------------
|
||||
|
||||
We provide a complete Chinese text frontend module in ``Parakeet``. ``Text Normalization`` and ``G2P`` are the most important modules in a text frontend. We assume that the input texts are already normalized, so we mainly compare the ``G2P`` module here.
|
||||
|
||||
We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
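
For reference, the ``G2P`` module maps normalized Chinese text to pinyin syllables. Below is a minimal sketch using ``pypinyin`` (the library used by the preprocessing scripts in this repository); the example sentence is only an illustration:

.. code-block:: python

    from pypinyin import Style, lazy_pinyin

    # grapheme-to-phoneme: normalized text -> pinyin syllables with tone digits
    syllables = lazy_pinyin(
        "你好世界", style=Style.TONE3, neutral_tone_with_five=True)
    print(syllables)  # e.g. ['ni3', 'hao3', 'shi4', 'jie4']
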
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> With Text Frontend </th>
|
||||
<th align="left"> Without Text Frontend </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
|
||||
</table>
|
@ -0,0 +1,45 @@
|
||||
.. parakeet documentation master file, created by
|
||||
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Parakeet
|
||||
====================================
|
||||
|
||||
``parakeet`` is a deep learning based text-to-speech toolkit built upon the ``paddlepaddle`` framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
|
||||
|
||||
``parakeet`` mainly consists of the components below.
|
||||
|
||||
#. Implementation of models and commonly used neural network layers.
|
||||
#. Dataset abstraction and common data preprocessing pipelines.
|
||||
#. Ready-to-run experiments.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Introduction
|
||||
|
||||
introduction
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Getting started
|
||||
|
||||
install
|
||||
basic_usage
|
||||
advanced_usage
|
||||
cn_text_frontend
|
||||
released_models
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Demos
|
||||
|
||||
demo
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
@ -0,0 +1,47 @@
|
||||
# Installation
|
||||
## Install PaddlePaddle
|
||||
Parakeet requires PaddlePaddle as its backend. Note that PaddlePaddle 2.1.2 or newer is required.
|
||||
|
||||
Since paddlepaddle ships multiple packages depending on the device (CPU or GPU) and on dependency libraries, it is recommended to install the paddlepaddle package matching your device and dependency library versions via `pip`.
|
||||
|
||||
Installing paddlepaddle with conda or building it from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
|
||||
|
||||
Example instructions to install paddlepaddle via pip are listed below.
|
||||
|
||||
### PaddlePaddle with GPU
|
||||
```bash
|
||||
# PaddlePaddle for CUDA 10.1
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 10.2
|
||||
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
|
||||
# PaddlePaddle for CUDA 11.0
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 11.2
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
```
|
||||
### PaddlePaddle with CPU
|
||||
```bash
|
||||
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
|
||||
```
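
To verify the installation (a quick, optional check), you can run paddle's built-in self test:

```python
import paddle

# runs a small computation to confirm paddle is installed and functional
paddle.utils.run_check()
```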
|
||||
## Install libsndfile
|
||||
Experiments in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires an extra C library, `libsndfile`, which is not always handled by pip.
|
||||
|
||||
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but Linux users need to install `libsndfile` via the system package manager. Example commands for popular distributions are listed below.
|
||||
```bash
|
||||
# ubuntu, debian
|
||||
sudo apt-get install libsndfile1
|
||||
# centos, fedora
|
||||
sudo yum install libsndfile
|
||||
# openSUSE
|
||||
sudo zypper in libsndfile
|
||||
```
|
||||
For any problems with the installation of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
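
A quick way to confirm that `soundfile` can locate `libsndfile` (a minimal check, assuming `soundfile` is already installed):

```python
import soundfile

# prints the version of the underlying libsndfile C library
print(soundfile.__libsndfile_version__)
```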
|
||||
## Install Parakeet
|
||||
There are two ways to install parakeet, depending on how you intend to use it.
|
||||
|
||||
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from GitHub (Parakeet) and install it in editable mode.
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/Parakeet
|
||||
cd Parakeet
|
||||
pip install -e .
|
||||
```
|
@ -0,0 +1,27 @@
|
||||
# Parakeet - PAddle PARAllel text-to-speech toolKIT
|
||||
|
||||
## What is Parakeet?
|
||||
Parakeet is a deep learning based text-to-speech toolkit built upon the paddlepaddle framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
|
||||
|
||||
## What can Parakeet do?
|
||||
Parakeet mainly consists of the components below:
|
||||
- Implementation of models and commonly used neural network layers.
|
||||
- Dataset abstraction and common data preprocessing pipelines.
|
||||
- Ready-to-run experiments.
|
||||
|
||||
Parakeet provides you with a complete TTS pipeline, including:
|
||||
- Text FrontEnd
|
||||
- Rule based Chinese frontend.
|
||||
- Acoustic Models
|
||||
- FastSpeech2
|
||||
- SpeedySpeech
|
||||
- TransformerTTS
|
||||
- Tacotron2
|
||||
- Vocoders
|
||||
- Parallel WaveGAN
|
||||
- WaveFlow
|
||||
- Voice Cloning
|
||||
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
|
||||
- GE2E
|
||||
|
||||
Parakeet helps you to train TTS models with simple commands.
|
@ -1,4 +1,12 @@
|
||||
# ASR
|
||||
|
||||
* s0 for deepspeech2
|
||||
* s1 for u2/transformer/conformer
|
||||
|
||||
## Data
|
||||
|
||||
| Data Subset | Duration Range (seconds) |
|
||||
| ------------------- | --------------------- |
|
||||
| data/manifest.train | 1.23 ~ 14.53125 |
|
||||
| data/manifest.dev | 1.645 ~ 12.533 |
|
||||
| data/manifest.test | 1.859125 ~ 14.6999375 |
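
The duration ranges above can be recomputed from the manifests. A minimal sketch, assuming each manifest line is a JSON object with a `duration` field in seconds (adjust the key to the actual manifest schema):

```python
import json

def duration_range(manifest_path):
    """Return (min, max) utterance duration in seconds for one manifest."""
    with open(manifest_path, encoding="utf-8") as f:
        durations = [json.loads(line)["duration"] for line in f if line.strip()]
    return min(durations), max(durations)

print(duration_range("data/manifest.train"))
```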
|
||||
|
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 3 ];then
|
||||
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
echo "using $ngpu gpus..."
|
||||
|
||||
config_path=$1
|
||||
ckpt_prefix=$2
|
||||
audio_file=$3
|
||||
|
||||
chunk_mode=false
|
||||
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
|
||||
chunk_mode=true
|
||||
fi
|
||||
|
||||
# download language model
|
||||
#bash local/download_lm_ch.sh
|
||||
#if [ $? -ne 0 ]; then
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
|
||||
|
||||
for type in attention_rescoring; do
|
||||
echo "decoding ${type}"
|
||||
batch_size=1
|
||||
output_dir=${ckpt_prefix}
|
||||
mkdir -p ${output_dir}
|
||||
python3 -u ${BIN_DIR}/test_hub.py \
|
||||
--nproc ${ngpu} \
|
||||
--config ${config_path} \
|
||||
--result_file ${output_dir}/${type}.rsl \
|
||||
--checkpoint_path ${ckpt_prefix} \
|
||||
--opts decoding.decoding_method ${type} \
|
||||
--opts decoding.batch_size ${batch_size} \
|
||||
--audio_file ${audio_file}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in evaluation!"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
exit 0
|
@ -0,0 +1,4 @@
|
||||
# Aishell3
|
||||
|
||||
* tts0 - fastspeech2
|
||||
* vc0 - tacotron2 voice cloning
|
@ -0,0 +1,88 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
from parakeet.frontend import Vocab
|
||||
from parakeet.data import batch_text_id, batch_spec
|
||||
|
||||
from preprocess_transcription import _phones, _tones
|
||||
|
||||
voc_phones = Vocab(sorted(list(_phones)))
|
||||
print("vocab_phones:\n", voc_phones)
|
||||
voc_tones = Vocab(sorted(list(_tones)))
|
||||
print("vocab_tones:\n", voc_tones)
|
||||
|
||||
|
||||
class AiShell3(Dataset):
|
||||
"""Processed AiShell3 dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.root = Path(root).expanduser()
|
||||
self.embed_dir = self.root / "embed"
|
||||
self.mel_dir = self.root / "mel"
|
||||
|
||||
with open(self.root / "metadata.pickle", 'rb') as f:
|
||||
self.records = pickle.load(f)
|
||||
|
||||
def __getitem__(self, index):
|
||||
metadatum = self.records[index]
|
||||
sentence_id = metadatum["sentence_id"]
|
||||
speaker_id = sentence_id[:7]
|
||||
phones = metadatum["phones"]
|
||||
tones = metadatum["tones"]
|
||||
phones = np.array(
|
||||
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
||||
tones = np.array(
|
||||
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||
embed = np.load(
|
||||
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||
return phones, tones, mel, embed
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
def collate_aishell3_examples(examples):
|
||||
phones, tones, mel, embed = list(zip(*examples))
|
||||
|
||||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||
T_dec = np.max(spec_lengths)
|
||||
stop_tokens = (
|
||||
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
|
||||
phones, _ = batch_text_id(phones)
|
||||
tones, _ = batch_text_id(tones)
|
||||
mel, _ = batch_spec(mel)
|
||||
mel = np.transpose(mel, (0, 2, 1))
|
||||
embed = np.stack(embed)
|
||||
# 7 fields: phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
||||
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dataset = AiShell3("~/datasets/aishell3/train")
|
||||
example = dataset[0]
|
||||
|
||||
examples = [dataset[i] for i in range(10)]
|
||||
batch = collate_aishell3_examples(examples)
|
||||
|
||||
for field in batch:
|
||||
print(field.shape, field.dtype)
|
@ -0,0 +1,39 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Tuple
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from preprocess_transcription import split_syllable
|
||||
|
||||
|
||||
def convert_to_pinyin(text: str) -> List[str]:
|
||||
"""convert text into list of syllables, other characters that are not chinese, thus
|
||||
cannot be converted to pinyin, are split out as-is.
|
||||
"""
|
||||
syllables = lazy_pinyin(
|
||||
text, style=Style.TONE3, neutral_tone_with_five=True)
|
||||
return syllables
|
||||
|
||||
|
||||
def convert_sentence(text: str) -> Tuple[List[str], List[str]]:
|
||||
"""convert a sentence into two list: phones and tones"""
|
||||
syllables = convert_to_pinyin(text)
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in syllables:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
|
||||
return phones, tones
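

# A minimal usage sketch (an illustration, not part of the original module):
# a short sentence is converted into parallel phone and tone lists.
if __name__ == "__main__":
    phones, tones = convert_sentence("你好")
    print(phones)
    print(tones)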
|
@ -0,0 +1,82 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=32, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
hop_length=256,  # hop size between adjacent frames
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
d_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=70,
|
||||
n_tones=10,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3,  # number of conv layers in the tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
# hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_attention_rnn=1024,
|
||||
# hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024,
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32,  # number of filters in the decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
d_postnet=512, # hidden size of decoder postnet
|
||||
postnet_kernel_size=5, # kernel size of conv layers in postnet
|
||||
postnet_conv_layers=5,  # number of conv layers in the decoder postnet
|
||||
p_encoder_dropout=0.5,  # dropout probability in the encoder
|
||||
p_prenet_dropout=0.5,  # dropout probability in the decoder prenet
|
||||
|
||||
# dropout probability of the first rnn layer in the decoder
|
||||
p_attention_dropout=0.1,
|
||||
# dropout probability of the second rnn layer in the decoder
|
||||
p_decoder_dropout=0.1,
|
||||
p_postnet_dropout=0.5,  # dropout probability in the decoder postnet
|
||||
guided_attention_loss_sigma=0.2,
|
||||
d_global_condition=256,
|
||||
|
||||
# whether to use a classifier to predict stop probability
|
||||
use_stop_token=False,
|
||||
# whether to use guided attention loss in training
|
||||
use_guided_attention_loss=True, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-3, # learning rate
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
grad_clip_thresh=1.0, # the clip norm of grad clip.
|
||||
valid_interval=1000,  # validation interval in iterations
|
||||
save_interval=1000,  # checkpointing interval in iterations
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
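

# A minimal usage sketch (the override file path below is hypothetical):
#
#     cfg = get_cfg_defaults()
#     cfg.merge_from_file("conf/override.yaml")  # optional YAML overrides
#     cfg.freeze()
#     print(cfg.data.batch_size)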
|
@ -0,0 +1,96 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from parakeet.audio import AudioProcessor
|
||||
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
|
||||
|
||||
import tqdm
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def extract_mel(fname: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
p: AudioProcessor,
|
||||
n: NormalizerBase):
|
||||
relative_path = fname.relative_to(input_dir)
|
||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
wav = p.read_wav(fname)
|
||||
mel = p.mel_spectrogram(wav)
|
||||
mel = n.transform(mel)
|
||||
np.save(out_path, mel)
|
||||
|
||||
|
||||
def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
||||
input_dir = Path(input_dir).expanduser()
|
||||
fnames = list(input_dir.rglob(f"*{extension}"))
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.d_mels, config.fmin,  # the data config defines d_mels
|
||||
config.fmax)
|
||||
n = LogMagnitude(1e-5)
|
||||
|
||||
func = partial(
|
||||
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="yaml config file to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
default_config = get_cfg_defaults()
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
default_config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
default_config.merge_from_list(args.opts)
|
||||
default_config.freeze()
|
||||
audio_config = default_config.data
|
||||
|
||||
extract_mel_multispeaker(audio_config, args.input, args.output)
|
@ -0,0 +1,258 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pickle
|
||||
|
||||
import yaml
|
||||
import tqdm
|
||||
|
||||
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
||||
|
||||
_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
|
||||
|
||||
_pauses = {'%', '$'}
|
||||
|
||||
_initials = {
|
||||
'b',
|
||||
'p',
|
||||
'm',
|
||||
'f',
|
||||
'd',
|
||||
't',
|
||||
'n',
|
||||
'l',
|
||||
'g',
|
||||
'k',
|
||||
'h',
|
||||
'j',
|
||||
'q',
|
||||
'x',
|
||||
'zh',
|
||||
'ch',
|
||||
'sh',
|
||||
'r',
|
||||
'z',
|
||||
'c',
|
||||
's',
|
||||
}
|
||||
|
||||
_finals = {
|
||||
'ii',
|
||||
'iii',
|
||||
'a',
|
||||
'o',
|
||||
'e',
|
||||
'ea',
|
||||
'ai',
|
||||
'ei',
|
||||
'ao',
|
||||
'ou',
|
||||
'an',
|
||||
'en',
|
||||
'ang',
|
||||
'eng',
|
||||
'er',
|
||||
'i',
|
||||
'ia',
|
||||
'io',
|
||||
'ie',
|
||||
'iai',
|
||||
'iao',
|
||||
'iou',
|
||||
'ian',
|
||||
'ien',
|
||||
'iang',
|
||||
'ieng',
|
||||
'u',
|
||||
'ua',
|
||||
'uo',
|
||||
'uai',
|
||||
'uei',
|
||||
'uan',
|
||||
'uen',
|
||||
'uang',
|
||||
'ueng',
|
||||
'v',
|
||||
've',
|
||||
'van',
|
||||
'ven',
|
||||
'veng',
|
||||
}
|
||||
|
||||
_ernized_symbol = {'&r'}
|
||||
|
||||
_specials = {'<pad>', '<unk>', '<s>', '</s>'}
|
||||
|
||||
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
||||
|
||||
|
||||
def is_zh(word):
|
||||
global zh_pattern
|
||||
match = zh_pattern.search(word)
|
||||
return match is not None
|
||||
|
||||
|
||||
def ernized(syllable):
|
||||
# erhua: the syllable ends with 'r' right before the trailing tone digit, excluding plain "er"
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||
|
||||
|
||||
def convert(syllable):
|
||||
# expansion of o -> uo
|
||||
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
||||
# syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
|
||||
# expansion for iong, ong
|
||||
syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
|
||||
|
||||
# expansion for ing, in
|
||||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un", "uen").replace("ui",
|
||||
"uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
||||
|
||||
# rule for w
|
||||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv").replace("qu",
|
||||
"qv").replace("xu", "xv")
|
||||
|
||||
return syllable
|
||||
|
||||
|
||||
def split_syllable(syllable: str):
|
||||
"""Split a syllable in pinyin into a list of phones and a list of tones.
|
||||
Initials have no tone, represented by '0', while finals have tones from
|
||||
'1,2,3,4,5'.
|
||||
|
||||
e.g.
|
||||
|
||||
'zhang1' -> ['zh', 'ang'], ['0', '1']
|
||||
"""
|
||||
if syllable in _pauses:
|
||||
# syllable, tone
|
||||
return [syllable], ['0']
|
||||
|
||||
tone = syllable[-1]
|
||||
syllable = convert(syllable[:-1])
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
|
||||
global _initials
|
||||
if syllable[:2] in _initials:
|
||||
phones.append(syllable[:2])
|
||||
tones.append('0')
|
||||
phones.append(syllable[2:])
|
||||
tones.append(tone)
|
||||
elif syllable[0] in _initials:
|
||||
phones.append(syllable[0])
|
||||
tones.append('0')
|
||||
phones.append(syllable[1:])
|
||||
tones.append(tone)
|
||||
else:
|
||||
phones.append(syllable)
|
||||
tones.append(tone)
|
||||
return phones, tones
|
||||
|
||||
|
||||
def load_aishell3_transcription(line: str):
|
||||
sentence_id, pinyin, text = line.strip().split("|")
|
||||
syllables = pinyin.strip().split()
|
||||
|
||||
results = []
|
||||
|
||||
for syllable in syllables:
|
||||
if syllable in _pauses:
|
||||
results.append(syllable)
|
||||
elif not ernized(syllable):
|
||||
results.append(syllable)
|
||||
else:
|
||||
results.append(syllable[:-2] + syllable[-1])
|
||||
results.append('&r5')
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in results:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
for p in phones:
|
||||
assert p in _phones, p
|
||||
return {
|
||||
"sentence_id": sentence_id,
|
||||
"text": text,
|
||||
"syllables": results,
|
||||
"phones": phones,
|
||||
"tones": tones
|
||||
}
|
||||
|
||||
|
||||
def process_aishell3(dataset_root, output_dir):
|
||||
dataset_root = Path(dataset_root).expanduser()
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
prosody_label_path = dataset_root / "label_train-set.txt"
|
||||
with open(prosody_label_path, 'rt') as f:
|
||||
lines = [line.strip() for line in f]
|
||||
|
||||
records = lines[5:]
|
||||
|
||||
processed_records = []
|
||||
for record in tqdm.tqdm(records):
|
||||
new_record = load_aishell3_transcription(record)
|
||||
processed_records.append(new_record)
|
||||
print(new_record)
|
||||
|
||||
with open(output_dir / "metadata.pickle", 'wb') as f:
|
||||
pickle.dump(processed_records, f)
|
||||
|
||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||
yaml.safe_dump(
|
||||
processed_records, f, default_flow_style=None, allow_unicode=True)
|
||||
|
||||
print("metadata done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train",
|
||||
help="path of the training dataset,(contains a label_train-set.txt).")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
help="the directory to save the processed transcription."
|
||||
"If not provided, it would be the same as the input.")
|
||||
args = parser.parse_args()
|
||||
if args.output is None:
|
||||
args.output = args.input
|
||||
|
||||
process_aishell3(args.input, args.output)
|
@ -0,0 +1,95 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from tqdm import tqdm
|
||||
from praatio import tgio
|
||||
|
||||
|
||||
def get_valid_part(fpath):
|
||||
f = tgio.openTextgrid(fpath)
|
||||
|
||||
start = 0
|
||||
phone_entry_list = f.tierDict['phones'].entryList
|
||||
first_entry = phone_entry_list[0]
|
||||
if first_entry.label == "sil":
|
||||
start = first_entry.end
|
||||
|
||||
last_entry = phone_entry_list[-1]
|
||||
if last_entry.label == "sp":
|
||||
end = last_entry.start
|
||||
else:
|
||||
end = last_entry.end
|
||||
return start, end
|
||||
|
||||
|
||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||
rel_path = fpath.relative_to(source_dir)
|
||||
opath = target_dir / rel_path
|
||||
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
|
||||
opath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
start, end = get_valid_part(apath)
|
||||
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
|
||||
# normalize by the absolute peak so that negative peaks cannot clip
normalized_wav = wav / np.max(np.abs(wav)) * 0.999
|
||||
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
|
||||
# print(f"{fpath} => {opath}")
|
||||
|
||||
|
||||
def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
||||
source_dir = Path(source_dir).expanduser()
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
alignment_dir = Path(alignment_dir).expanduser()
|
||||
|
||||
wav_paths = list(source_dir.rglob("*.wav"))
|
||||
print(f"there are {len(wav_paths)} audio files in total")
|
||||
fx = partial(
|
||||
process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
with Pool(16) as p:
|
||||
list(
|
||||
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process audio in AiShell3, trim silence according to the alignment "
|
||||
"files generated by MFA, and normalize volume by peak.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the folder to save the processed audio files.")
|
||||
parser.add_argument(
|
||||
"--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocess_aishell3(args.input, args.output, args.alignment)
|
@ -0,0 +1,262 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
|
||||
from parakeet.data import dataset
|
||||
from parakeet.training.cli import default_argument_parser
|
||||
from parakeet.training.experiment import ExperimentBase
|
||||
from parakeet.utils import display, mp_tools
|
||||
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
|
||||
|
||||
from config import get_cfg_defaults
|
||||
from aishell3 import AiShell3, collate_aishell3_examples
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def compute_losses(self, inputs, outputs):
|
||||
texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
|
||||
|
||||
mel_outputs = outputs["mel_output"]
|
||||
mel_outputs_postnet = outputs["mel_outputs_postnet"]
|
||||
alignments = outputs["alignments"]
|
||||
|
||||
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
|
||||
alignments, output_lens, text_lens)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
losses_np = {k: float(v) for k, v in losses.items()}
|
||||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for key, value in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{key}", value,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for key, value in losses.items():
|
||||
valid_losses[key].append(float(value))
|
||||
|
||||
attention_weights = outputs["alignments"]
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_alignments",
|
||||
display.plot_alignment(attention_weights[0].numpy().T),
|
||||
self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_target_spectrogram",
|
||||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
mel_pred = outputs['mel_outputs_postnet']
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
||||
# logging
|
||||
msg = "Valid: "
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_losses.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
for key, value in valid_losses.items():
|
||||
self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def eval(self):
|
||||
"""Evaluation of Tacotron2 in autoregressive manner."""
|
||||
self.model.eval()
|
||||
mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
|
||||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i, batch in enumerate(self.test_loader):
|
||||
texts, tones, mels, utterance_embeds, *_ = batch
|
||||
outputs = self.model.infer(
|
||||
texts, tones=tones, global_condition=utterance_embeds)
|
||||
|
||||
display.plot_alignment(outputs["alignments"][0].numpy().T)
|
||||
plt.savefig(mel_dir / f"sentence_{i}.png")
|
||||
plt.close()
|
||||
np.save(mel_dir / f"sentence_{i}",
|
||||
outputs["mel_outputs_postnet"][0].numpy().T)
|
||||
print(f"sentence_{i}")
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = Tacotron2(
|
||||
vocab_size=config.model.vocab_size,
|
||||
n_tones=config.model.n_tones,
|
||||
d_mels=config.data.d_mels,
|
||||
d_encoder=config.model.d_encoder,
|
||||
encoder_conv_layers=config.model.encoder_conv_layers,
|
||||
encoder_kernel_size=config.model.encoder_kernel_size,
|
||||
d_prenet=config.model.d_prenet,
|
||||
d_attention_rnn=config.model.d_attention_rnn,
|
||||
d_decoder_rnn=config.model.d_decoder_rnn,
|
||||
attention_filters=config.model.attention_filters,
|
||||
attention_kernel_size=config.model.attention_kernel_size,
|
||||
d_attention=config.model.d_attention,
|
||||
d_postnet=config.model.d_postnet,
|
||||
postnet_kernel_size=config.model.postnet_kernel_size,
|
||||
postnet_conv_layers=config.model.postnet_conv_layers,
|
||||
reduction_factor=config.model.reduction_factor,
|
||||
p_encoder_dropout=config.model.p_encoder_dropout,
|
||||
p_prenet_dropout=config.model.p_prenet_dropout,
|
||||
p_attention_dropout=config.model.p_attention_dropout,
|
||||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
d_global_condition=config.model.d_global_condition,
|
||||
use_stop_token=config.model.use_stop_token, )
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
grad_clip = paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.grad_clip_thresh)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
criterion = Tacotron2Loss(
|
||||
use_stop_token_loss=config.model.use_stop_token,
|
||||
use_guided_attention_loss=config.model.use_guided_attention_loss,
|
||||
sigma=config.model.guided_attention_loss_sigma)
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.criterion = criterion
|
||||
|
||||
def setup_dataloader(self):
|
||||
args = self.args
|
||||
config = self.config
|
||||
aishell3_dataset = AiShell3(args.data)
|
||||
|
||||
valid_set, train_set = dataset.split(aishell3_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = collate_aishell3_examples
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
self.test_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
if not args.test:
|
||||
exp.run()
|
||||
else:
|
||||
exp.eval()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
parser.add_argument("--test", action="store_true")
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
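
# Example launch (a sketch; the flags come from parakeet's
# default_argument_parser and the paths are hypothetical):
#
#     python train.py --data=~/datasets/aishell3/train --output=exp/default \
#         --device=gpu --nprocs=1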
|