Add detailed error messages for out-of-vocabulary symbols

8 years ago · f52ad5e5a8
parent dad1c2727e
commit f52ad5e5a8
1 changed files with 7 additions and 2 deletions
--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 """Contains the text featurizer class."""
 from __future__ import absolute_import
 from __future__ import division
@@ -32,8 +33,12 @@ class TextFeaturizer(object):
        :return: List of char-level token indices.
        :rtype: list
        """
-        tokens = self._char_tokenize(text)
+        result = []
-        return [self._vocab_dict[token] for token in tokens]
+        try:
            result = [self._vocab_dict[token] for token in text]
        except KeyError, e:
            print('Incorrect symbol "%s" found in string: ' % str(e).encode('utf-8'), text.encode('utf-8'))
        return result
    @property
    def vocab_size(self):