Return charset detect results

pull/187/head
M66B 5 years ago
parent 0d854d9877
commit a453beff63

@ -23,17 +23,23 @@ import android.text.TextUtils;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale; import java.util.Locale;
class CharsetHelper { class CharsetHelper {
private static String CHINESE = new Locale("zh").getLanguage();
private static final int MAX_SAMPLE_SIZE = 8192; private static final int MAX_SAMPLE_SIZE = 8192;
private static String CHINESE = new Locale("zh").getLanguage();
// Charset names that the detection code below checks with COMMON.contains(...):
// a detected charset in this list is logged at warning level in builds where
// BuildConfig.PLAY_STORE_RELEASE is false. The list is wrapped as unmodifiable.
private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList(
"US-ASCII", "ISO-8859-1", "ISO-8859-2", "windows-1250", "windows-1252", "windows-1257", "UTF-8"
));
static { static {
System.loadLibrary("compact_enc_det"); System.loadLibrary("compact_enc_det");
} }
private static native String jni_detect(byte[] chars); private static native DetectResult jni_detect(byte[] octets);
static boolean isUTF8(String text) { static boolean isUTF8(String text) {
// Get extended ASCII characters // Get extended ASCII characters
@ -80,30 +86,43 @@ class CharsetHelper {
} }
Log.i("compact_enc_det sample=" + sample.length); Log.i("compact_enc_det sample=" + sample.length);
String detected = jni_detect(sample); DetectResult detected = jni_detect(sample);
if ("US-ASCII".equals(detected) ||
"ISO-8859-1".equals(detected) || if (TextUtils.isEmpty(detected.charset)) {
"ISO-8859-2".equals(detected) || Log.e("compact_enc_det result=" + detected);
"windows-1250".equals(detected) || return null;
"windows-1252".equals(detected) || } else if (!BuildConfig.PLAY_STORE_RELEASE &&
"windows-1257".equals(detected) || COMMON.contains(detected.charset))
"UTF-8".equals(detected))
Log.w("compact_enc_det result=" + detected); Log.w("compact_enc_det result=" + detected);
else if ("GB18030".equals(detected) && else if ("GB18030".equals(detected.charset) &&
!Locale.getDefault().getLanguage().equals(CHINESE)) { !Locale.getDefault().getLanguage().equals(CHINESE)) {
// https://github.com/google/compact_enc_det/issues/8 // https://github.com/google/compact_enc_det/issues/8
Log.w("compact_enc_det result=" + detected);
return null;
} else // ISO-2022-JP, etc
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
if (TextUtils.isEmpty(detected))
return null; return null;
} else
Log.e("compact_enc_det result=" + detected);
return Charset.forName(detected); return Charset.forName(detected.charset);
} catch (Throwable ex) { } catch (Throwable ex) {
Log.w(ex); Log.w(ex);
return null; return null;
} }
} }
/**
 * Result holder returned by the native jni_detect call.
 *
 * Instances are created from native code, which looks this class up as
 * "eu/faircode/email/CharsetHelper$DetectResult" and invokes the constructor
 * with signature (Ljava/lang/String;IZ)V. The class name and the constructor
 * parameter order/types must therefore stay in sync with the JNI side.
 */
private static class DetectResult {
// Name of the detected encoding as passed in by native code; callers check it
// with TextUtils.isEmpty before handing it to Charset.forName.
String charset;
// bytes_consumed value passed in by native code.
// NOTE(review): presumably the number of input bytes the detector examined - confirm against compact_enc_det.
int bytes_consumed;
// is_reliable value passed in by native code.
// NOTE(review): presumably the detector's own confidence flag - confirm against compact_enc_det.
boolean is_reliable;
/**
 * Stores the three values as-is; no validation is performed.
 *
 * @param charset name of the detected encoding
 * @param bytes_consumed value reported by the native detector
 * @param is_reliable value reported by the native detector
 */
DetectResult(String charset, int bytes_consumed, boolean is_reliable) {
this.charset = charset;
this.bytes_consumed = bytes_consumed;
this.is_reliable = is_reliable;
}
/**
 * Compact one-line form, "<charset> c=<bytes_consumed> r=<is_reliable>",
 * used when the result is concatenated into log messages.
 */
@Override
public String toString() {
return charset + " c=" + bytes_consumed + " r=" + is_reliable;
}
}
} }

@ -14,10 +14,10 @@ void log_android(int prio, const char *fmt, ...) {
} }
} }
extern "C" JNIEXPORT jstring JNICALL extern "C" JNIEXPORT jobject JNICALL
Java_eu_faircode_email_CharsetHelper_jni_1detect(JNIEnv *env, jclass type, jbyteArray _bytes) { Java_eu_faircode_email_CharsetHelper_jni_1detect(JNIEnv *env, jclass type, jbyteArray _octets) {
int len = env->GetArrayLength(_bytes); int len = env->GetArrayLength(_octets);
jbyte *bytes = env->GetByteArrayElements(_bytes, nullptr); jbyte *octets = env->GetByteArrayElements(_octets, nullptr);
// https://github.com/google/compact_enc_det // https://github.com/google/compact_enc_det
@ -25,7 +25,7 @@ Java_eu_faircode_email_CharsetHelper_jni_1detect(JNIEnv *env, jclass type, jbyte
int bytes_consumed; int bytes_consumed;
Encoding encoding = CompactEncDet::DetectEncoding( Encoding encoding = CompactEncDet::DetectEncoding(
(const char *) bytes, len, (const char *) octets, len,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
UNKNOWN_ENCODING, UNKNOWN_ENCODING,
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE,
@ -39,7 +39,10 @@ Java_eu_faircode_email_CharsetHelper_jni_1detect(JNIEnv *env, jclass type, jbyte
encoding, name, bytes_consumed, is_reliable); encoding, name, bytes_consumed, is_reliable);
// https://developer.android.com/training/articles/perf-jni#primitive-arrays // https://developer.android.com/training/articles/perf-jni#primitive-arrays
env->ReleaseByteArrayElements(_bytes, bytes, JNI_ABORT); env->ReleaseByteArrayElements(_octets, octets, JNI_ABORT);
return env->NewStringUTF(name); jclass cls = env->FindClass("eu/faircode/email/CharsetHelper$DetectResult");
jmethodID ctor = env->GetMethodID(cls, "<init>", "(Ljava/lang/String;IZ)V");
jstring jname = env->NewStringUTF(name);
return env->NewObject(cls, ctor, jname, (jint) bytes_consumed, (jboolean) is_reliable);
} }

Loading…
Cancel
Save