Parallelize data preparation scripts; add more mask tests; add pybind11 to requirements

pull/618/head
Hui Zhang 4 years ago
parent 0fe80f0fc7
commit 467e823577

2
.gitignore vendored

@ -1,7 +1,7 @@
.DS_Store .DS_Store
*.pyc *.pyc
.vscode .vscode
*.log *log
*.pdmodel *.pdmodel
*.pdiparams* *.pdiparams*
*.zip *.zip

@ -170,7 +170,7 @@ class DeepSpeech2Trainer(Trainer):
train_dataset, train_dataset,
batch_sampler=batch_sampler, batch_sampler=batch_sampler,
collate_fn=collate_fn, collate_fn=collate_fn,
num_workers=config.data.num_workers, ) num_workers=config.data.num_workers)
self.valid_loader = DataLoader( self.valid_loader = DataLoader(
dev_dataset, dev_dataset,
batch_size=config.data.batch_size, batch_size=config.data.batch_size,

@ -66,19 +66,22 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size # format manifest with tokenids, vocab size
for dataset in train dev test; do for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \ --feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "char" \ --unit_type "char" \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \ --manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}" --output_path="data/manifest.${dataset}"
done
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated." echo "Formt mnaifest failed. Terminated."
exit 1 exit 1
fi fi
} &
done
wait
fi fi
echo "Aishell data preparation done." echo "Aishell data preparation done."

@ -14,7 +14,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
python3 ${TARGET_DIR}/aishell/aishell.py \ python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \ --manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell" --target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated." echo "Prepare Aishell failed. Terminated."
exit 1 exit 1
@ -33,7 +33,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--count_threshold=0 \ --count_threshold=0 \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" --manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated." echo "Build vocabulary failed. Terminated."
exit 1 exit 1
@ -56,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--num_samples=-1 \ --num_samples=-1 \
--num_workers=${num_workers} \ --num_workers=${num_workers} \
--output_path="data/mean_std.json" --output_path="data/mean_std.json"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated." echo "Compute mean and stddev failed. Terminated."
exit 1 exit 1
@ -67,19 +67,22 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size # format manifest with tokenids, vocab size
for dataset in train dev test; do for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \ --feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "char" \ --unit_type "char" \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \ --manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}" --output_path="data/manifest.${dataset}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done done
wait
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
fi fi
echo "Aishell data preparation done." echo "Aishell data preparation done."

@ -8,4 +8,5 @@ SoundFile==0.9.0.post1
sox sox
tensorboardX tensorboardX
typeguard typeguard
yacs yacs
pybind11

@ -48,7 +48,9 @@ class TestU2Model(unittest.TestCase):
def test_make_pad_mask(self): def test_make_pad_mask(self):
res = make_pad_mask(self.lengths) res = make_pad_mask(self.lengths)
res1 = make_non_pad_mask(self.lengths).logical_not()
self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist()) self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())
if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save