vocab into data/lang_char

pull/1023/head
Hui Zhang 3 years ago
parent 7cddfd27f7
commit 4f54e36294

@ -14,7 +14,7 @@ collator:
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -14,7 +14,7 @@ collator:
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -3,9 +3,12 @@
stage=-1
stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh
dict_dir=data/lang_char
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -52,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
if [ $? -ne 0 ]; then
@ -68,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"

@ -12,7 +12,7 @@ data:
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

@ -2,10 +2,12 @@
stage=-1
stop_stage=100
dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -53,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
@ -69,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"

@ -12,7 +12,7 @@ data:
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

@ -2,10 +2,12 @@
stage=-1
stop_stage=100
dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
mkdir -p ${dict_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
for dataset in train dev test; do
@ -41,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
@ -57,7 +59,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"

@ -14,7 +14,7 @@ collator:
batch_size: 20
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -14,7 +14,7 @@ collator:
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -4,10 +4,12 @@ stage=-1
stop_stage=100
unit_type=char
dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -67,7 +69,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@ -83,7 +85,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 16

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 16

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 32

@ -2,11 +2,12 @@
stage=-1
stop_stage=100
dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -79,7 +81,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@ -96,7 +98,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/bpe_unigram_8000
spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/bpe_unigram_8000
spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10

@ -4,19 +4,22 @@ set -e
stage=-1
stop_stage=100
dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
data_dir=./TED-En-Zh
source ${MAIN_ROOT}/utils/parse_options.sh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
mkdir -p ${dict_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@ -73,11 +76,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
@ -92,7 +94,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/bpe_unigram_8000
spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10

@ -11,7 +11,7 @@ data:
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
mean_std_filepath: ""

@ -4,11 +4,12 @@ set -e
stage=-1
stop_stage=100
dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh
@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
mkdir -p ${dict_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@ -73,7 +75,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths="data/manifest.train.raw"
@ -93,7 +95,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"

@ -11,7 +11,7 @@ data:
max_output_input_ratio: 1000.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: "word"
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml

@ -3,15 +3,19 @@
stage=-1
stop_stage=100
dict_dir=data/lang_char
unit_type=word
TIMIT_path=
source ${MAIN_ROOT}/utils/parse_options.sh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/timit/timit_kaldi_standard_split.py \
@ -52,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@ -68,7 +72,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"

@ -14,7 +14,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -14,7 +14,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:

@ -4,10 +4,12 @@ stage=-1
stop_stage=100
unit_type=char
dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -51,7 +53,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.tiny.raw"
if [ $? -ne 0 ]; then
@ -65,7 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.tiny.raw" \
--output_path="data/manifest.tiny"

@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml

@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml

@ -3,14 +3,17 @@
stage=-1
stop_stage=100
dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=200
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
source ${MAIN_ROOT}/utils/parse_options.sh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@ -57,7 +60,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.tiny.raw"
if [ $? -ne 0 ]; then
@ -72,7 +75,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.tiny.raw" \
--output_path="data/manifest.tiny"

@ -51,7 +51,7 @@ data:
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml

Loading…
Cancel
Save