Use ref charset and language for charset detection

pull/194/merge
M66B 4 years ago
parent 34af1b4e72
commit 39a6b428f5

@ -437,7 +437,6 @@ public class ActivityEML extends ActivityBase {
Object content = part.getContent(); Object content = part.getContent();
if (content instanceof String) { if (content instanceof String) {
String text = (String) content; String text = (String) content;
Charset detected = CharsetHelper.detect(text);
String charset; String charset;
try { try {
@ -450,6 +449,7 @@ public class ActivityEML extends ActivityBase {
charset = StandardCharsets.ISO_8859_1.name(); charset = StandardCharsets.ISO_8859_1.name();
Charset cs = Charset.forName(charset); Charset cs = Charset.forName(charset);
Charset detected = CharsetHelper.detect(text, cs);
boolean isUtf8 = CharsetHelper.isUTF8(text.getBytes(cs)); boolean isUtf8 = CharsetHelper.isUTF8(text.getBytes(cs));
boolean isW1252 = !Objects.equals(text, CharsetHelper.utf8toW1252(text)); boolean isW1252 = !Objects.equals(text, CharsetHelper.utf8toW1252(text));

@ -57,7 +57,7 @@ public class CharsetHelper {
} }
} }
private static native DetectResult jni_detect_charset(byte[] octets); private static native DetectResult jni_detect_charset(byte[] octets, String ref, String lang);
static boolean isUTF8(String text) { static boolean isUTF8(String text) {
// Get extended ASCII characters // Get extended ASCII characters
@ -159,7 +159,10 @@ public class CharsetHelper {
} }
} }
public static Charset detect(String text) { public static Charset detect(String text, Charset ref) {
if (text == null)
return null;
try { try {
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1); byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
@ -172,7 +175,9 @@ public class CharsetHelper {
} }
Log.i("compact_enc_det sample=" + sample.length); Log.i("compact_enc_det sample=" + sample.length);
DetectResult detected = jni_detect_charset(sample); DetectResult detected = jni_detect_charset(sample,
ref == null ? null : ref.name(),
Locale.getDefault().getLanguage());
if (TextUtils.isEmpty(detected.charset)) { if (TextUtils.isEmpty(detected.charset)) {
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
@ -185,7 +190,7 @@ public class CharsetHelper {
Log.e("compact_enc_det result=" + detected + " chinese=" + chinese); Log.e("compact_enc_det result=" + detected + " chinese=" + chinese);
if (!chinese) if (!chinese)
return null; return null;
} else // GBK, Big5, ISO-2022-JP, HZ-GB-2312, Shift_JIS } else // GBK, Big5, ISO-2022-JP, HZ-GB-2312, Shift_JIS, x-binaryenc
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
return Charset.forName(detected.charset); return Charset.forName(detected.charset);

@ -1840,7 +1840,7 @@ public class MessageHelper {
if (header.trim().startsWith("=?")) if (header.trim().startsWith("=?"))
return header; return header;
Charset detected = CharsetHelper.detect(header); Charset detected = CharsetHelper.detect(header, StandardCharsets.ISO_8859_1);
if (detected == null && CharsetHelper.isUTF8(header)) if (detected == null && CharsetHelper.isUTF8(header))
detected = StandardCharsets.UTF_8; detected = StandardCharsets.UTF_8;
if (detected == null || if (detected == null ||
@ -2928,7 +2928,7 @@ public class MessageHelper {
Log.i("Charset upgrade=UTF8"); Log.i("Charset upgrade=UTF8");
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
} else { } else {
Charset detected = CharsetHelper.detect(result); Charset detected = CharsetHelper.detect(result, StandardCharsets.ISO_8859_1);
if (detected == null) { if (detected == null) {
if (CharsetHelper.isUTF8(result)) { if (CharsetHelper.isUTF8(result)) {
Log.i("Charset plain=UTF8"); Log.i("Charset plain=UTF8");
@ -2984,7 +2984,7 @@ public class MessageHelper {
// Fix incorrect UTF16 // Fix incorrect UTF16
try { try {
if (CHARSET16.contains(cs)) { if (CHARSET16.contains(cs)) {
Charset detected = CharsetHelper.detect(result); Charset detected = CharsetHelper.detect(result, cs);
if (!CHARSET16.contains(detected)) if (!CHARSET16.contains(detected))
Log.w(new Throwable("Charset=" + cs + " detected=" + detected)); Log.w(new Throwable("Charset=" + cs + " detected=" + detected));
if (StandardCharsets.US_ASCII.equals(detected) || if (StandardCharsets.US_ASCII.equals(detected) ||
@ -3034,7 +3034,7 @@ public class MessageHelper {
break; break;
} }
Charset detected = CharsetHelper.detect(result); Charset detected = CharsetHelper.detect(result, c);
if (c.equals(detected)) if (c.equals(detected))
break; break;

@ -24,11 +24,21 @@ void log_android(int prio, const char *fmt, ...) {
extern "C" extern "C"
JNIEXPORT jobject JNICALL JNIEXPORT jobject JNICALL
Java_eu_faircode_email_CharsetHelper_jni_1detect_1charset( Java_eu_faircode_email_CharsetHelper_jni_1detect_1charset(
JNIEnv *env, jclass type, JNIEnv *env, jclass type,
jbyteArray _octets) { jbyteArray _octets, jstring _ref, jstring _lang) {
int len = env->GetArrayLength(_octets); int len = env->GetArrayLength(_octets);
jbyte *octets = env->GetByteArrayElements(_octets, nullptr); jbyte *octets = env->GetByteArrayElements(_octets, nullptr);
const char *ref = env->GetStringUTFChars(_ref, 0);
const char *lang = env->GetStringUTFChars(_lang, 0);
// ISO-8859-1 is unknown
Encoding encoding_hint;
EncodingFromName(ref, &encoding_hint);
Language language_hint;
LanguageFromCode(lang, &language_hint);
// https://github.com/google/compact_enc_det // https://github.com/google/compact_enc_det
@ -38,19 +48,25 @@ Java_eu_faircode_email_CharsetHelper_jni_1detect_1charset(
Encoding encoding = CompactEncDet::DetectEncoding( Encoding encoding = CompactEncDet::DetectEncoding(
(const char *) octets, len, (const char *) octets, len,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
UNKNOWN_ENCODING, encoding_hint,
UNKNOWN_LANGUAGE, language_hint,
CompactEncDet::EMAIL_CORPUS, CompactEncDet::EMAIL_CORPUS,
false, false,
&bytes_consumed, &bytes_consumed,
&is_reliable); &is_reliable);
// TODO: PreferredWebOutputEncoding?
const char *name = MimeEncodingName(encoding); const char *name = MimeEncodingName(encoding);
log_android(ANDROID_LOG_DEBUG, "detect=%d/%s bytes=%d reliable=%d", log_android(ANDROID_LOG_DEBUG,
encoding, name, bytes_consumed, is_reliable); "MMM detect=%d/%s bytes=%d reliable=%d"
" ref=%s/%s lang=%s/%s",
encoding, name, bytes_consumed, is_reliable,
EncodingName(encoding_hint), ref, LanguageCode(language_hint), lang);
// https://developer.android.com/training/articles/perf-jni#primitive-arrays // https://developer.android.com/training/articles/perf-jni#primitive-arrays
env->ReleaseByteArrayElements(_octets, octets, JNI_ABORT); env->ReleaseByteArrayElements(_octets, octets, JNI_ABORT);
env->ReleaseStringUTFChars(_ref, ref);
env->ReleaseStringUTFChars(_lang, lang);
jclass cls = env->FindClass("eu/faircode/email/CharsetHelper$DetectResult"); jclass cls = env->FindClass("eu/faircode/email/CharsetHelper$DetectResult");
jmethodID ctor = env->GetMethodID(cls, "<init>", "(Ljava/lang/String;IIZ)V"); jmethodID ctor = env->GetMethodID(cls, "<init>", "(Ljava/lang/String;IIZ)V");

Loading…
Cancel
Save