#!/usr/bin/env bash

# 2020 Author Jiayu DU
# Apache 2.0

# This script uses KenLM to estimate an arpa model from plain text.
# It is a fallback for when you hit memory limits dealing with a large corpus:
# KenLM estimates the arpa using on-disk structures, so as long as you have a
# big enough hard disk, memory shouldn't be a problem.
# By default, KenLM uses up to 50% of your physical memory;
# you can control this through the -S option.

[ -f path.sh ] && . ./path.sh;

kenlm_opts="" # e.g. "-o 4 -S 50% --prune 0 5 7 7"

if [ $# != 4 ]; then
  echo "$0 <text> <kaldi_symbol_table> <working_dir> <arpa_name>"
  echo "e.g. $0 train.txt words.txt wdir 4gram"
  exit 1
fi

text=$1
symbol_table=$2
dir=$3
arpa_name=$4

mkdir -p $dir

if ! which lmplz >& /dev/null ; then
  echo "$0: cannot find training tool *lmplz*."
  echo "tools/extras/install_kenlm_query_only.sh installs kenlm at tools/kenlm"
  echo "it only supports runtime mode, to actually train an arpa using KenLM,"
  echo "you need a complete KenLM installation(depends on EIGEN and BOOST),"
  echo "follow KenLM's building instructions at (https://github.com/kpu/kenlm)"
  exit 1
fi

# The text should be properly pre-processed, e.g.:
#   cleaned, normalized and possibly word-segmented
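# A minimal pre-processing sketch (assumes a plain-text corpus.txt; adapt the
# cleanup to your language and lexicon):
#   tr 'A-Z' 'a-z' < corpus.txt | sed 's/[[:punct:]]/ /g' | tr -s ' ' > train.txt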

# get rid of irrelevant symbols
grep -v '<eps>' $symbol_table \
  | grep -v '#0' \
  | grep -v '<unk>' | grep -v '<UNK>' \
  | grep -v '<s>' | grep -v '</s>' \
  | awk '{print $1}' \
  > $dir/ngram.vocab
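# A typical Kaldi symbol-table line is "<word> <integer-id>" (e.g. "hello 42"),
# so after the filtering above, $dir/ngram.vocab holds one bare word per line.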

# To make sure that KenLM & Kaldi have strictly the same vocabulary:
# 1. feed the vocabulary into KenLM via --limit_vocab_file
# 2. prepend the vocabulary to the training text, so each word appears at least once
#
# TL;DR reason:
# Unlike SRILM's -limit-vocab, KenLM's --limit_vocab_file option
# specifies a *valid* set of vocabulary, whereas *valid but unseen*
# words are discarded from the final arpa.
# So the trick is:
# we explicitly add Kaldi's vocab (one word per line) to the training text,
# making each word appear at least once.
# KenLM never prunes unigrams,
# so this always yields a KenLM vocabulary consistent with Kaldi's.
# The effect is like add-one smoothing on the unigram counts,
# which shouldn't have a significant impact in practice.
cat $dir/ngram.vocab $text \
  | lmplz $kenlm_opts --limit_vocab_file $dir/ngram.vocab \
  > $dir/${arpa_name}.arpa

echo "$0: Done training arpa to: $dir/${arpa_name}.arpa"