From 39a6b428f5c6e1f558eca791f8bac8995e18bb4a Mon Sep 17 00:00:00 2001 From: M66B Date: Sun, 13 Feb 2022 19:04:40 +0100 Subject: [PATCH] Use ref charset and language for charset detection --- .../java/eu/faircode/email/ActivityEML.java | 2 +- .../java/eu/faircode/email/CharsetHelper.java | 13 +++++++--- .../java/eu/faircode/email/MessageHelper.java | 8 +++--- app/src/main/jni/fairemail.cc | 26 +++++++++++++++---- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/app/src/main/java/eu/faircode/email/ActivityEML.java b/app/src/main/java/eu/faircode/email/ActivityEML.java index 86381edabc..3c821ccd4b 100644 --- a/app/src/main/java/eu/faircode/email/ActivityEML.java +++ b/app/src/main/java/eu/faircode/email/ActivityEML.java @@ -437,7 +437,6 @@ public class ActivityEML extends ActivityBase { Object content = part.getContent(); if (content instanceof String) { String text = (String) content; - Charset detected = CharsetHelper.detect(text); String charset; try { @@ -450,6 +449,7 @@ public class ActivityEML extends ActivityBase { charset = StandardCharsets.ISO_8859_1.name(); Charset cs = Charset.forName(charset); + Charset detected = CharsetHelper.detect(text, cs); boolean isUtf8 = CharsetHelper.isUTF8(text.getBytes(cs)); boolean isW1252 = !Objects.equals(text, CharsetHelper.utf8toW1252(text)); diff --git a/app/src/main/java/eu/faircode/email/CharsetHelper.java b/app/src/main/java/eu/faircode/email/CharsetHelper.java index a5907fec9d..1b8311c116 100644 --- a/app/src/main/java/eu/faircode/email/CharsetHelper.java +++ b/app/src/main/java/eu/faircode/email/CharsetHelper.java @@ -57,7 +57,7 @@ public class CharsetHelper { } } - private static native DetectResult jni_detect_charset(byte[] octets); + private static native DetectResult jni_detect_charset(byte[] octets, String ref, String lang); static boolean isUTF8(String text) { // Get extended ASCII characters @@ -159,7 +159,10 @@ public class CharsetHelper { } } - public static Charset detect(String text) { + public static Charset detect(String text, Charset ref) { + if (text == null) + return null; + try { byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1); @@ -172,7 +175,9 @@ public class CharsetHelper { } Log.i("compact_enc_det sample=" + sample.length); - DetectResult detected = jni_detect_charset(sample); + DetectResult detected = jni_detect_charset(sample, + ref == null ? null : ref.name(), + Locale.getDefault().getLanguage()); if (TextUtils.isEmpty(detected.charset)) { Log.e("compact_enc_det result=" + detected); @@ -185,7 +190,7 @@ public class CharsetHelper { Log.e("compact_enc_det result=" + detected + " chinese=" + chinese); if (!chinese) return null; - } else // GBK, Big5, ISO-2022-JP, HZ-GB-2312, Shift_JIS + } else // GBK, Big5, ISO-2022-JP, HZ-GB-2312, Shift_JIS, x-binaryenc Log.e("compact_enc_det result=" + detected); return Charset.forName(detected.charset); diff --git a/app/src/main/java/eu/faircode/email/MessageHelper.java b/app/src/main/java/eu/faircode/email/MessageHelper.java index 70a5c25552..6c6e62850c 100644 --- a/app/src/main/java/eu/faircode/email/MessageHelper.java +++ b/app/src/main/java/eu/faircode/email/MessageHelper.java @@ -1840,7 +1840,7 @@ public class MessageHelper { if (header.trim().startsWith("=?")) return header; - Charset detected = CharsetHelper.detect(header); + Charset detected = CharsetHelper.detect(header, StandardCharsets.ISO_8859_1); if (detected == null && CharsetHelper.isUTF8(header)) detected = StandardCharsets.UTF_8; if (detected == null || @@ -2928,7 +2928,7 @@ public class MessageHelper { Log.i("Charset upgrade=UTF8"); result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); } else { - Charset detected = CharsetHelper.detect(result); + Charset detected = CharsetHelper.detect(result, StandardCharsets.ISO_8859_1); if (detected == null) { if (CharsetHelper.isUTF8(result)) { Log.i("Charset plain=UTF8"); @@ -2984,7 +2984,7 @@ public class MessageHelper { // Fix incorrect UTF16 try { if (CHARSET16.contains(cs)) { - Charset detected = CharsetHelper.detect(result); + Charset detected = CharsetHelper.detect(result, cs); if (!CHARSET16.contains(detected)) Log.w(new Throwable("Charset=" + cs + " detected=" + detected)); if (StandardCharsets.US_ASCII.equals(detected) || @@ -3034,7 +3034,7 @@ public class MessageHelper { break; } - Charset detected = CharsetHelper.detect(result); + Charset detected = CharsetHelper.detect(result, c); if (c.equals(detected)) break; diff --git a/app/src/main/jni/fairemail.cc b/app/src/main/jni/fairemail.cc index e90e17706d..0685f122d3 100644 --- a/app/src/main/jni/fairemail.cc +++ b/app/src/main/jni/fairemail.cc @@ -24,11 +24,21 @@ void log_android(int prio, const char *fmt, ...) { extern "C" JNIEXPORT jobject JNICALL + Java_eu_faircode_email_CharsetHelper_jni_1detect_1charset( JNIEnv *env, jclass type, - jbyteArray _octets) { + jbyteArray _octets, jstring _ref, jstring _lang) { int len = env->GetArrayLength(_octets); jbyte *octets = env->GetByteArrayElements(_octets, nullptr); + const char *ref = env->GetStringUTFChars(_ref, 0); + const char *lang = env->GetStringUTFChars(_lang, 0); + + // ISO-8859-1 is unknown + Encoding encoding_hint; + EncodingFromName(ref, &encoding_hint); + + Language language_hint; + LanguageFromCode(lang, &language_hint); // https://github.com/google/compact_enc_det @@ -38,19 +48,25 @@ Java_eu_faircode_email_CharsetHelper_jni_1detect_1charset( Encoding encoding = CompactEncDet::DetectEncoding( (const char *) octets, len, nullptr, nullptr, nullptr, - UNKNOWN_ENCODING, - UNKNOWN_LANGUAGE, + encoding_hint, + language_hint, CompactEncDet::EMAIL_CORPUS, false, &bytes_consumed, &is_reliable); + // TODO: PreferredWebOutputEncoding? const char *name = MimeEncodingName(encoding); - log_android(ANDROID_LOG_DEBUG, "detect=%d/%s bytes=%d reliable=%d", - encoding, name, bytes_consumed, is_reliable); + log_android(ANDROID_LOG_DEBUG, + "MMM detect=%d/%s bytes=%d reliable=%d" + " ref=%s/%s lang=%s/%s", + encoding, name, bytes_consumed, is_reliable, + EncodingName(encoding_hint), ref, LanguageCode(language_hint), lang); // https://developer.android.com/training/articles/perf-jni#primitive-arrays env->ReleaseByteArrayElements(_octets, octets, JNI_ABORT); + env->ReleaseStringUTFChars(_ref, ref); + env->ReleaseStringUTFChars(_lang, lang); jclass cls = env->FindClass("eu/faircode/email/CharsetHelper$DetectResult"); jmethodID ctor = env->GetMethodID(cls, "", "(Ljava/lang/String;IIZ)V");