Add rhythm tags for MFA, test=tts (#2615)

* Add rhythm tags for MFA, test=tts
pull/2639/head
HuangLiangJie 2 years ago committed by GitHub
parent fd73a184e7
commit f43d026000
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`.
```bash ```bash
./run.sh ./run.sh
``` ```
# Rhythm tags for MFA
To obtain rhythm tags with durations from the MFA tool, add the `--rhy-with-duration` flag to the first two commands in `run.sh`.
Note that only the CSMSC dataset is supported so far, and that `#` is replaced with `sp` in the rhythm tags passed to MFA.

@ -182,12 +182,17 @@ if __name__ == "__main__":
"--with-tone", action="store_true", help="whether to consider tone.") "--with-tone", action="store_true", help="whether to consider tone.")
parser.add_argument( parser.add_argument(
"--with-r", action="store_true", help="whether to consider erhua.") "--with-r", action="store_true", help="whether to consider erhua.")
parser.add_argument(
"--rhy-with-duration",
action="store_true", )
args = parser.parse_args() args = parser.parse_args()
lexicon = generate_lexicon(args.with_tone, args.with_r) lexicon = generate_lexicon(args.with_tone, args.with_r)
symbols = generate_symbols(lexicon) symbols = generate_symbols(lexicon)
with open(args.output + ".lexicon", 'wt') as f: with open(args.output + ".lexicon", 'wt') as f:
if args.rhy_with_duration:
f.write("sp1 sp1\nsp2 sp2\nsp3 sp3\nsp4 sp4\n")
for k, v in lexicon.items(): for k, v in lexicon.items():
f.write(f"{k} {v}\n") f.write(f"{k} {v}\n")

@ -23,6 +23,7 @@ for more details.
""" """
import argparse import argparse
import os import os
import re
import shutil import shutil
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
@ -32,6 +33,22 @@ import librosa
import soundfile as sf import soundfile as sf
from tqdm import tqdm from tqdm import tqdm
# Table of characters to delete from the prosody-labelled text; insert_rhy()
# passes it to str.maketrans() and applies it with str.translate().
# NOTE(review): every key below is an empty string as it appears here, and
# str.maketrans() only accepts string keys of length 1. Presumably the keys
# were punctuation characters lost in transit -- confirm against the
# upstream file before relying on this table.
repalce_dict = {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": ""
}
def get_transcripts(path: Union[str, Path]): def get_transcripts(path: Union[str, Path]):
transcripts = {} transcripts = {}
@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000):
def reorganize_baker(root_dir: Union[str, Path], def reorganize_baker(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None, output_dir: Union[str, Path]=None,
resample_audio=False): resample_audio=False,
rhy_dur=False):
root_dir = Path(root_dir).expanduser() root_dir = Path(root_dir).expanduser()
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" if rhy_dur:
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt"
else:
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
transcriptions = get_transcripts(transcript_path) transcriptions = get_transcripts(transcript_path)
wave_dir = root_dir / "Wave" wave_dir = root_dir / "Wave"
@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path],
print("Done!") print("Done!")
def insert_rhy(sentence_first, sentence_second):
    """Interleave rhythm markers from a tagged sentence into its token list.

    Args:
        sentence_first: Text containing rhythm tags written as ``#``
            followed by a single character (e.g. ``#1`` .. ``#4``).
            Characters listed in the module-level ``repalce_dict`` are
            removed before tag positions are computed.
        sentence_second: Space-separated transcription tokens. The
            alignment assumes one token per remaining text character --
            TODO confirm this holds for every transcript line.

    Returns:
        A list holding the tokens of ``sentence_second`` in order, with
        ``"sp"`` plus the tag's trailing character inserted right after
        the token that precedes each tag.
    """
    # Strip the characters in the translation table so that character
    # positions in the text can be compared with token counts.
    sentence_first = sentence_first.translate(str.maketrans(repalce_dict))
    rhy_idx = [match.start() for match in re.finditer('#', sentence_first)]
    # Every earlier tag occupies two characters ('#' plus its level), so
    # subtracting 2 * i yields the number of plain characters before tag i.
    re_rhy_idx = [pos - 2 * i for i, pos in enumerate(rhy_idx)]

    return_words = []
    i = 0  # number of rhythm markers inserted so far
    for token in sentence_second.split(" "):
        return_words.append(token)
        # len(return_words) - i is the count of real tokens emitted so far.
        if i < len(re_rhy_idx) and len(return_words) - i == re_rhy_idx[i]:
            level = sentence_first[rhy_idx[i] + 1:rhy_idx[i] + 2]
            return_words.append("sp" + level)
            i += 1
    return return_words
def normalize_rhy(root_dir: Union[str, Path]):
    """Write a rhythm-tagged copy of the prosody labelling file.

    Reads ``ProsodyLabeling/000001-010000.txt`` under ``root_dir`` and
    writes ``ProsodyLabeling/000001-010000_rhy.txt`` next to it. The input
    is consumed two lines at a time: the first line of each pair is copied
    unchanged, and the second is replaced by a tab followed by the tokens
    returned from ``insert_rhy`` joined with single spaces.

    Args:
        root_dir: Dataset root directory; ``~`` is expanded.

    Raises:
        IndexError: If the input has an odd number of lines, or the first
            line of a pair contains no tab character.
    """
    prosody_dir = Path(root_dir).expanduser() / "ProsodyLabeling"
    source_path = prosody_dir / "000001-010000.txt"
    target_path = prosody_dir / "000001-010000_rhy.txt"

    with open(source_path) as src:
        lines = src.readlines()

    with open(target_path, 'wt') as dst:
        for idx in range(0, len(lines), 2):
            # The first line of each pair is saved as-is.
            header = lines[idx]
            dst.write(header)
            pinyin = lines[idx + 1].strip()
            # The second tab-separated field of the header is the tagged text.
            tagged = insert_rhy(header.split('\t')[1], pinyin)
            dst.write("\t" + " ".join(tagged) + "\n")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Reorganize Baker dataset for MFA") description="Reorganize Baker dataset for MFA")
@ -104,6 +165,12 @@ if __name__ == "__main__":
"--resample-audio", "--resample-audio",
action="store_true", action="store_true",
help="To resample audio files or just copy them") help="To resample audio files or just copy them")
parser.add_argument(
"--rhy-with-duration",
action="store_true", )
args = parser.parse_args() args = parser.parse_args()
reorganize_baker(args.root_dir, args.output_dir, args.resample_audio) if args.rhy_with_duration:
normalize_rhy(args.root_dir)
reorganize_baker(args.root_dir, args.output_dir, args.resample_audio,
args.rhy_with_duration)

Loading…
Cancel
Save