mv ctc_beam_search_decoder into deep_speech_2/

9 years ago · c943ca79ac
parent e016778e20
commit c943ca79ac
2 changed files with 231 additions and 0 deletions
--- a/ctc_beam_search_decoder.py
+++ b/ctc_beam_search_decoder.py
@ -0,0 +1,162 @@
 ## This is a prototype of ctc beam search decoder
 import copy
 import random
 import numpy as np
 # vocab = blank + space + English characters
 #vocab = ['-', ' '] + [chr(i) for i in range(97, 123)]
 vocab = ['-', '_', 'a']
 def ids_str2list(ids_str):
    ids_str = ids_str.split(' ')
    ids_list = [int(elem) for elem in ids_str]
    return ids_list
 def ids_list2str(ids_list):
    ids_str = [str(elem) for elem in ids_list]
    ids_str = ' '.join(ids_str)
    return ids_str
 def ids_id2token(ids_list):
    ids_str = ''
    for ids in ids_list:
        ids_str += vocab[ids]
    return ids_str
 def ctc_beam_search_decoder(input_probs_matrix,
                            beam_size,
                            max_time_steps=None,
                            lang_model=None,
                            alpha=1.0,
                            beta=1.0,
                            blank_id=0,
                            space_id=1,
                            num_results_per_sample=None):
    '''
    beam search decoder for CTC-trained network, called outside of the recurrent group. 
    adapted from Algorithm 1 in https://arxiv.org/abs/1408.2873.
    param input_probs_matrix: probs matrix for input sequence, row major
    type input_probs_matrix: 2D matrix. 
    param beam_size: width for beam search
    type beam_size: int
    max_time_steps: maximum steps' number for input sequence, <=len(input_probs_matrix)
    type max_time_steps: int
    lang_model: language model for scoring
    type lang_model: function
    ......
    '''
    if num_results_per_sample is None:
        num_results_per_sample = beam_size
    assert num_results_per_sample <= beam_size
    if max_time_steps is None:
        max_time_steps = len(input_probs_matrix)
    else:
        max_time_steps = min(max_time_steps, len(input_probs_matrix))
    assert max_time_steps > 0
    vocab_dim = len(input_probs_matrix[0])
    assert blank_id < vocab_dim
    assert space_id < vocab_dim
    ## initialize 
    start_id = -1
    # the set containing selected prefixes 
    prefix_set_prev = {str(start_id): 1.0}
    probs_b, probs_nb = {str(start_id): 1.0}, {str(start_id): 0.0}
    ## extend prefix in loop 
    for time_step in range(max_time_steps):
        # the set containing candidate prefixes
        prefix_set_next = {}
        probs_b_cur, probs_nb_cur = {}, {}
        for l in prefix_set_prev:
            prob = input_probs_matrix[time_step]
            # convert ids in string to list
            ids_list = ids_str2list(l)
            end_id = ids_list[-1]
            if not prefix_set_next.has_key(l):
                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0
            # extend prefix by travering vocabulary
            for c in range(0, vocab_dim):
                if c == blank_id:
                    probs_b_cur[l] += prob[c] * (probs_b[l] + probs_nb[l])
                else:
                    l_plus = l + ' ' + str(c)
                    if not prefix_set_next.has_key(l_plus):
                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0
                    if c == end_id:
                        probs_nb_cur[l_plus] += prob[c] * probs_b[l]
                        probs_nb_cur[l] += prob[c] * probs_nb[l]
                    elif c == space_id:
                        lm = 1.0 if lang_model is None \
                               else np.power(lang_model(ids_list), alpha)
                        probs_nb_cur[l_plus] += lm * prob[c] * (
                            probs_b[l] + probs_nb[l])
                    else:
                        probs_nb_cur[l_plus] += prob[c] * (
                            probs_b[l] + probs_nb[l])
                # add l_plus into prefix_set_next
                    prefix_set_next[l_plus] = probs_nb_cur[
                        l_plus] + probs_b_cur[l_plus]
            # add l into prefix_set_next
            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
        # update probs
        probs_b, probs_nb = copy.deepcopy(probs_b_cur), copy.deepcopy(
            probs_nb_cur)
        ## store top beam_size prefixes 
        prefix_set_prev = sorted(
            prefix_set_next.iteritems(), key=lambda asd: asd[1], reverse=True)
        if beam_size < len(prefix_set_prev):
            prefix_set_prev = prefix_set_prev[:beam_size]
        prefix_set_prev = dict(prefix_set_prev)
    beam_result = []
    for (seq, prob) in prefix_set_prev.items():
        if prob > 0.0:
            ids_list = ids_str2list(seq)
            log_prob = np.log(prob)
            beam_result.append([log_prob, ids_list[1:]])
    ## output top beam_size decoding results
    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
    if num_results_per_sample < beam_size:
        beam_result = beam_result[:num_results_per_sample]
    return beam_result
 def language_model(input):
    # TODO
    return random.uniform(0, 1)
 def simple_test():
    input_probs_matrix = [[0.1, 0.3, 0.6], [0.2, 0.1, 0.7], [0.5, 0.2, 0.3]]
    beam_result = ctc_beam_search_decoder(
        input_probs_matrix=input_probs_matrix,
        beam_size=20,
        blank_id=0,
        space_id=1, )
    print "\nbeam search output:"
    for result in beam_result:
        print("%6f\t%s" % (result[0], ids_id2token(result[1])))
 if __name__ == '__main__':
    simple_test()
--- a/test_ctc_beam_search_decoder.py
+++ b/test_ctc_beam_search_decoder.py
@ -0,0 +1,69 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 import ctc_beam_search_decoder as tested_decoder
 def test_beam_search_decoder():
    max_time_steps = 6
    beam_size = 20
    num_results_per_sample = 20
    input_prob_matrix_0 = np.asarray(
        [
            [0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
            [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
            [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
            [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
            [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
            # Random entry added in at time=5
            [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]
        ],
        dtype=np.float32)
    # Add arbitrary offset - this is fine
    input_log_prob_matrix_0 = np.log(input_prob_matrix_0)  #+ 2.0
    # len max_time_steps array of batch_size x depth matrices
    inputs = ([
        input_log_prob_matrix_0[t, :][np.newaxis, :]
        for t in range(max_time_steps)
    ])
    inputs_t = [ops.convert_to_tensor(x) for x in inputs]
    inputs_t = array_ops.stack(inputs_t)
    # run CTC beam search decoder in tensorflow
    with tf.Session() as sess:
        decoded, log_probabilities = tf.nn.ctc_beam_search_decoder(
            inputs_t, [max_time_steps],
            beam_width=beam_size,
            top_paths=num_results_per_sample,
            merge_repeated=False)
        tf_decoded = sess.run(decoded)
        tf_log_probs = sess.run(log_probabilities)
    # run tested CTC beam search decoder     
    beam_result = tested_decoder.ctc_beam_search_decoder(
        input_probs_matrix=input_prob_matrix_0,
        beam_size=beam_size,
        blank_id=5,  # default blank_id in tensorflow decoder is (num classes-1)
        space_id=4,  # doesn't matter
        max_time_steps=max_time_steps,
        num_results_per_sample=num_results_per_sample)
    # compare decoding result
    print(
        "{tf_decoder log probs} \t {tested_decoder log probs}:  {tf_decoder result}  {tested_decoder result}"
    )
    for index in range(len(beam_result)):
        print(('%6f\t%6f: ') % (tf_log_probs[0][index], beam_result[index][0]),
              tf_decoded[index].values, '  ', beam_result[index][1])
 if __name__ == '__main__':
    test_beam_search_decoder()