From f52ad5e5a8d752370867f737ced9bb04b1e63001 Mon Sep 17 00:00:00 2001
From: Rai220
Date: Thu, 22 Mar 2018 13:27:13 +0300
Subject: [PATCH] Add detailed errors about symbols out of vocab

---
Review note: the added handler is spelled `except KeyError as e:` rather than
`except KeyError, e:`. The comma form is Python-2-only syntax; the `as` form
is accepted by Python 2.6+ and Python 3. Hunk line counts are unchanged.

 data_utils/featurizer/text_featurizer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py
index 89202163c..31a72c576 100644
--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """Contains the text featurizer class."""
 from __future__ import absolute_import
 from __future__ import division
@@ -32,8 +33,12 @@ class TextFeaturizer(object):
         :return: List of char-level token indices.
         :rtype: list
         """
-        tokens = self._char_tokenize(text)
-        return [self._vocab_dict[token] for token in tokens]
+        result = []
+        try:
+            result = [self._vocab_dict[token] for token in text]
+        except KeyError as e:
+            print('Incorrect symbol "%s" found in string: ' % str(e).encode('utf-8'), text.encode('utf-8'))
+        return result
 
     @property
     def vocab_size(self):