From 233247d41e8aca381ef50f542719832b478f5417 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 20 Apr 2022 12:00:54 +0000
Subject: [PATCH] parallel sort

---
 speechx/examples/ngram/zh/local/aishell_train_lms.sh | 6 +++---
 speechx/examples/ngram/zh/local/split_data.sh        | 0
 2 files changed, 3 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 speechx/examples/ngram/zh/local/split_data.sh

diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
index 9e6e7e7ba..544a1f59a 100755
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -44,15 +44,15 @@ cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext
 
 # compute word counts, sort in descending order
 # line: count word
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
-  sort -nr > $dir/word.counts || exit 1;
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \
+  sort --parallel=`nproc` -nr > $dir/word.counts || exit 1;
 
 # Get counts from acoustic training transcripts, and add one-count
 # for each word in the lexicon (but not silence, we don't want it
 # in the LM-- we'll add it optionally later).
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
   cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
-   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+   sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1;
 
 # word with <s> </s>
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
diff --git a/speechx/examples/ngram/zh/local/split_data.sh b/speechx/examples/ngram/zh/local/split_data.sh
old mode 100644
new mode 100755