diff --git a/docs/source/dependencies.md b/docs/source/dependencies.md index 5f641f35..905730e4 100644 --- a/docs/source/dependencies.md +++ b/docs/source/dependencies.md @@ -13,7 +13,7 @@ bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboos ``` build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5 ``` - + ### The dependencies of sox: ``` @@ -25,7 +25,7 @@ libvorbis-dev libmp3lame-dev libmad-ocaml-dev ``` kenlm -sox +sox mfa openblas kaldi diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 67441fae..cf02d7d3 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -554,10 +554,11 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( - self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.align_loader.collate_fn.stride_ms, + self.align_loader.collate_fn.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index e37784aa..0d8508c2 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -527,10 +527,11 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( - self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.align_loader.collate_fn.stride_ms, + self.align_loader.collate_fn.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index f458216e..91390afe 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -543,10 +543,10 @@ class U2STTester(U2STTrainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( - self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.config.collator.stride_ms, self.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py index e005e5d2..886b7203 100644 --- a/paddlespeech/s2t/utils/ctc_utils.py +++ b/paddlespeech/s2t/utils/ctc_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) +from pathlib import Path from typing import List import numpy as np @@ -139,26 +140,27 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, return output_alignment -def ctc_align(model, dataloader, batch_size, stride_ms, token_dict, +def ctc_align(config, model, dataloader, batch_size, stride_ms, token_dict, result_file): """ctc alignment. Args: + config (cfgNode): config model (nn.Layer): U2 Model. dataloader (io.DataLoader): dataloader. batch_size (int): decoding batchsize. stride_ms (int): audio feature stride in ms unit. token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '']. - result_file (str): alignment output file, e.g. xxx.align. + result_file (str): alignment output file, e.g. /path/to/xxx.align. """ if batch_size > 1: logger.fatal('alignment mode must be running with batch_size == 1') sys.exit(1) - assert result_file and result_file.endswith('.align') model.eval() - + # conv subsampling rate + subsample = utility.get_subsample(config) logger.info(f"Align Total Examples: {len(dataloader.dataset)}") with open(result_file, 'w') as fout: @@ -187,13 +189,11 @@ def ctc_align(model, dataloader, batch_size, stride_ms, token_dict, logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] - subsample = utility.get_subsample(self.config) - tierformat = text_grid.align_to_tierformat(align_segs, subsample, token_dict) # write tier - align_output_path = Path(self.args.result_file).parent / "align" + align_output_path = Path(result_file).parent / "align" align_output_path.mkdir(parents=True, exist_ok=True) tier_path = align_output_path / (key[0] + ".tier") with tier_path.open('w') as f: diff --git a/paddlespeech/t2s/exps/tacotron2/ljspeech.py b/paddlespeech/t2s/exps/tacotron2/ljspeech.py index 4facde40..08db2a64 100644 --- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py +++ b/paddlespeech/t2s/exps/tacotron2/ljspeech.py @@ -67,19 +67,16 @@ class LJSpeechCollector(object): # Sort by text_len in descending order texts = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(texts, text_lens), key=lambda x: x[1], reverse=True) ] mels = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(mels, text_lens), key=lambda x: x[1], reverse=True) ] mel_lens = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) ]