diff --git a/data_utils/data.py b/data_utils/data.py
index 159bf69d..14b02f99 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -91,7 +91,7 @@ class DataGenerator(object):
         :param transcript: Transcription text.
         :type transcript: basestring
         :return: Tuple of audio feature tensor and list of token ids for
-                 transcription. 
+                 transcription.
         :rtype: tuple of (2darray, list)
         """
         speech_segment = SpeechSegment.from_file(filename, transcript)
@@ -111,7 +111,7 @@ class DataGenerator(object):
         """
         Batch data reader creator for audio data. Return a callable generator
         function to produce batches of data.
-        
+
         Audio features within one batch will be padded with zeros to have the
         same shape, or a user-defined shape.
 
@@ -191,9 +191,9 @@ class DataGenerator(object):
     @property
     def feeding(self):
         """Returns data reader's feeding dict.
-        
+
         :return: Data feeding dict.
-        :rtype: dict 
+        :rtype: dict
         """
         return {"audio_spectrogram": 0, "transcript_text": 1}
 
diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py
index 4f9a49b5..89202163 100644
--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import codecs
 
 
 class TextFeaturizer(object):
@@ -59,7 +60,7 @@ class TextFeaturizer(object):
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
-        with open(vocab_filepath, 'r') as file:
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
         vocab_dict = dict(
diff --git a/data_utils/utils.py b/data_utils/utils.py
index 3f116571..f970ff55 100644
--- a/data_utils/utils.py
+++ b/data_utils/utils.py
@@ -4,15 +4,16 @@ from __future__ import division
 from __future__ import print_function
 
 import json
+import codecs
 
 
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     """Load and parse manifest file.
-    
+
     Instances with durations outside [min_duration, max_duration] will be
     filtered out.
 
-    :param manifest_path: Manifest file to load and parse. 
+    :param manifest_path: Manifest file to load and parse.
     :type manifest_path: basestring
     :param max_duration: Maximal duration in seconds for instance filter.
     :type max_duration: float
@@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     :raises IOError: If failed to parse the manifest.
     """
     manifest = []
-    for json_line in open(manifest_path):
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
         try:
             json_data = json.loads(json_line)
         except Exception as e:
diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py
index 7e941f0e..d963a7d5 100644
--- a/datasets/librispeech/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -16,6 +16,7 @@ import tarfile
 import argparse
 import soundfile
 import json
+import codecs
 from paddle.v2.dataset.common import md5file
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path):
                         'duration': duration,
                         'text': text
                     }))
-    with open(manifest_path, 'w') as out_file:
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
 
diff --git a/error_rate.py b/error_rate.py
index 0cf17921..ea829f47 100644
--- a/error_rate.py
+++ b/error_rate.py
@@ -10,47 +10,54 @@ import numpy as np
 
 
 def _levenshtein_distance(ref, hyp):
-    """Levenshtein distance is a string metric for measuring the difference between
-    two sequences. Informally, the levenshtein disctance is defined as the minimum
-    number of single-character edits (substitutions, insertions or deletions) 
-    required to change one word into the other. We can naturally extend the edits to 
-    word level when calculate levenshtein disctance for two sentences.
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the levenshtein distance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other. We can naturally
+    extend the edits to word level when calculating levenshtein distance for
+    two sentences.
     """
-    ref_len = len(ref)
-    hyp_len = len(hyp)
+    m = len(ref)
+    n = len(hyp)
 
     # special case
     if ref == hyp:
         return 0
-    if ref_len == 0:
-        return hyp_len
-    if hyp_len == 0:
-        return ref_len
+    if m == 0:
+        return n
+    if n == 0:
+        return m
 
-    distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
+    if m < n:
+        ref, hyp = hyp, ref
+        m, n = n, m
+
+    # use O(min(m, n)) space
+    distance = np.zeros((2, n + 1), dtype=np.int32)
 
     # initialize distance matrix
-    for j in xrange(hyp_len + 1):
+    for j in xrange(n + 1):
         distance[0][j] = j
-    for i in xrange(ref_len + 1):
-        distance[i][0] = i
 
     # calculate levenshtein distance
-    for i in xrange(1, ref_len + 1):
-        for j in xrange(1, hyp_len + 1):
+    for i in xrange(1, m + 1):
+        prev_row_idx = (i - 1) % 2
+        cur_row_idx = i % 2
+        distance[cur_row_idx][0] = i
+        for j in xrange(1, n + 1):
             if ref[i - 1] == hyp[j - 1]:
-                distance[i][j] = distance[i - 1][j - 1]
+                distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
             else:
-                s_num = distance[i - 1][j - 1] + 1
-                i_num = distance[i][j - 1] + 1
-                d_num = distance[i - 1][j] + 1
-                distance[i][j] = min(s_num, i_num, d_num)
+                s_num = distance[prev_row_idx][j - 1] + 1
+                i_num = distance[cur_row_idx][j - 1] + 1
+                d_num = distance[prev_row_idx][j] + 1
+                distance[cur_row_idx][j] = min(s_num, i_num, d_num)
 
-    return distance[ref_len][hyp_len]
+    return distance[m % 2][n]
 
 
 def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
-    """Calculate word error rate (WER). WER compares reference text and 
+    """Calculate word error rate (WER). WER compares reference text and
     hypothesis text in word-level. WER is defined as:
 
     .. math::
@@ -65,8 +72,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
         Iw is the number of words inserted,
         Nw is the number of words in the reference
 
-    We can use levenshtein distance to calculate WER. Please draw an attention that 
-    empty items will be removed when splitting sentences by delimiter.
+    We can use levenshtein distance to calculate WER. Please note that empty
+    items will be removed when splitting sentences by delimiter.
 
     :param reference: The reference sentence.
     :type reference: basestring
@@ -95,7 +102,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     return wer
 
 
-def cer(reference, hypothesis, ignore_case=False):
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     """Calculate charactor error rate (CER). CER compares reference text and
     hypothesis text in char-level. CER is defined as:
 
@@ -111,10 +118,10 @@ def cer(reference, hypothesis, ignore_case=False):
         Ic is the number of characters inserted
         Nc is the number of characters in the reference
 
-    We can use levenshtein distance to calculate CER. Chinese input should be 
-    encoded to unicode. Please draw an attention that the leading and tailing 
-    white space characters will be truncated and multiple consecutive white 
-    space characters in a sentence will be replaced by one white space character.
+    We can use levenshtein distance to calculate CER. Chinese input should be
+    encoded to unicode. Please note that leading and trailing space
+    characters will be truncated and multiple consecutive space characters
+    in a sentence will be replaced by one space character.
 
     :param reference: The reference sentence.
     :type reference: basestring
@@ -122,6 +129,8 @@ def cer(reference, hypothesis, ignore_case=False):
     :type hypothesis: basestring
     :param ignore_case: Whether case-sensitive or not.
     :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
     :return: Character error rate.
     :rtype: float
     :raises ValueError: If the reference length is zero.
@@ -130,8 +139,12 @@ def cer(reference, hypothesis, ignore_case=False):
         reference = reference.lower()
         hypothesis = hypothesis.lower()
 
-    reference = ' '.join(filter(None, reference.split(' ')))
-    hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
+    join_char = ' '
+    if remove_space == True:
+        join_char = ''
+
+    reference = join_char.join(filter(None, reference.split(' ')))
+    hypothesis = join_char.join(filter(None, hypothesis.split(' ')))
 
     if len(reference) == 0:
         raise ValueError("Length of reference should be greater than 0.")
diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py
index be7313f3..99e137a9 100644
--- a/tests/test_error_rate.py
+++ b/tests/test_error_rate.py
@@ -11,16 +11,54 @@ import error_rate
 class TestParse(unittest.TestCase):
     def test_wer_1(self):
         ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
-        hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night'
+        hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last '\
+                'night'
         word_error_rate = error_rate.wer(ref, hyp)
         self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6)
 
     def test_wer_2(self):
+        ref = 'as any in england i would say said gamewell proudly that is '\
+                'in his day'
+        hyp = 'as any in england i would say said came well proudly that is '\
+                'in his day'
+        word_error_rate = error_rate.wer(ref, hyp)
+        self.assertTrue(abs(word_error_rate - 0.1333333) < 1e-6)
+
+    def test_wer_3(self):
+        ref = 'the lieutenant governor lilburn w boggs afterward governor '\
+                'was a pronounced mormon hater and throughout the period of '\
+                'the troubles he manifested sympathy with the persecutors'
+        hyp = 'the lieutenant governor little bit how bags afterward '\
+                'governor was a pronounced warman hater and throughout the '\
+                'period of th troubles he manifests sympathy with the '\
+                'persecutors'
+        word_error_rate = error_rate.wer(ref, hyp)
+        self.assertTrue(abs(word_error_rate - 0.2692307692) < 1e-6)
+
+    def test_wer_4(self):
+        ref = 'the wood flamed up splendidly under the large brewing copper '\
+                'and it sighed so deeply'
+        hyp = 'the wood flame do splendidly under the large brewing copper '\
+                'and its side so deeply'
+        word_error_rate = error_rate.wer(ref, hyp)
+        self.assertTrue(abs(word_error_rate - 0.2666666667) < 1e-6)
+
+    def test_wer_5(self):
+        ref = 'all the morning they trudged up the mountain path and at noon '\
+                'unc and ojo sat on a fallen tree trunk and ate the last of '\
+                'the bread which the old munchkin had placed in his pocket'
+        hyp = 'all the morning they trudged up the mountain path and at noon '\
+                'unc in ojo sat on a fallen tree trunk and ate the last of '\
+                'the bread which the old munchkin had placed in his pocket'
+        word_error_rate = error_rate.wer(ref, hyp)
+        self.assertTrue(abs(word_error_rate - 0.027027027) < 1e-6)
+
+    def test_wer_6(self):
         ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
         word_error_rate = error_rate.wer(ref, ref)
         self.assertEqual(word_error_rate, 0.0)
 
-    def test_wer_3(self):
+    def test_wer_7(self):
         ref = ' '
         hyp = 'Hypothesis sentence'
         with self.assertRaises(ValueError):
@@ -33,22 +71,40 @@ class TestParse(unittest.TestCase):
         self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
 
     def test_cer_2(self):
+        ref = 'werewolf'
+        hyp = 'weae  wolf'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
+
+    def test_cer_3(self):
+        ref = 'were wolf'
+        hyp = 'were  wolf'
+        char_error_rate = error_rate.cer(ref, hyp)
+        self.assertTrue(abs(char_error_rate - 0.0) < 1e-6)
+
+    def test_cer_4(self):
         ref = 'werewolf'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertEqual(char_error_rate, 0.0)
 
-    def test_cer_3(self):
+    def test_cer_5(self):
         ref = u'我是中国人'
         hyp = u'我是 美洲人'
         char_error_rate = error_rate.cer(ref, hyp)
         self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)
 
-    def test_cer_4(self):
+    def test_cer_6(self):
+        ref = u'我 是 中 国 人'
+        hyp = u'我 是 美 洲 人'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.4) < 1e-6)
+
+    def test_cer_7(self):
         ref = u'我是中国人'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertFalse(char_error_rate, 0.0)
 
-    def test_cer_5(self):
+    def test_cer_8(self):
         ref = ''
         hyp = 'Hypothesis'
         with self.assertRaises(ValueError):