Less common charsets

pull/208/head
M66B 2 years ago
parent 4dd3b606a7
commit 66293e7a74

@ -38,10 +38,18 @@ public class CharsetHelper {
private static String CHINESE = new Locale("zh").getLanguage(); private static String CHINESE = new Locale("zh").getLanguage();
private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList( private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList(
"US-ASCII", "US-ASCII",
"ISO-8859-1", "ISO-8859-2", "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-7",
"windows-1250", "windows-1252", "windows-1257", "windows-1250", "windows-1251", "windows-1252", "windows-1257",
"UTF-7", "UTF-8" "UTF-7", "UTF-8"
)); ));
// Charset names that the detection code accepts in addition to COMMON.
// When detected.charset is in COMMON or in this list, the result is logged
// with Log.w; names found in neither list (other than the separately handled
// "GB18030") are logged with Log.e instead.
// The list is wrapped in Collections.unmodifiableList, so it cannot be
// modified through this reference.
// NOTE(review): some entries look like detector-specific labels (for example
// "x-binaryenc", "EUC", "BIG5-CP950") - confirm that Charset.forName accepts
// them on the target runtime, or that the caller handles the resulting exception.
private static final List<String> LESS_COMMON = Collections.unmodifiableList(Arrays.asList(
"GBK", "GB2312", "HZ-GB-2312",
"EUC", "EUC-KR",
"Big5", "BIG5-CP950",
"ISO-2022-JP", "Shift_JIS",
"cp852",
"x-binaryenc"
));
private static final int MIN_W1252 = 10; private static final int MIN_W1252 = 10;
private static final Pair<byte[], byte[]>[] sUtf8W1252 = new Pair[128]; private static final Pair<byte[], byte[]>[] sUtf8W1252 = new Pair[128];
@ -182,7 +190,7 @@ public class CharsetHelper {
if (TextUtils.isEmpty(detected.charset)) { if (TextUtils.isEmpty(detected.charset)) {
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
return null; return null;
} else if (COMMON.contains(detected.charset)) } else if (COMMON.contains(detected.charset) || LESS_COMMON.contains(detected.charset))
Log.w("compact_enc_det result=" + detected); Log.w("compact_enc_det result=" + detected);
else if ("GB18030".equals(detected.charset)) { else if ("GB18030".equals(detected.charset)) {
boolean chinese = Locale.getDefault().getLanguage().equals(CHINESE); boolean chinese = Locale.getDefault().getLanguage().equals(CHINESE);
@ -190,7 +198,7 @@ public class CharsetHelper {
Log.e("compact_enc_det result=" + detected + " chinese=" + chinese); Log.e("compact_enc_det result=" + detected + " chinese=" + chinese);
if (!chinese) if (!chinese)
return null; return null;
} else // GBK, Big5, ISO-2022-JP, HZ-GB-2312, GB2312, Shift_JIS, x-binaryenc, EUC-KR } else
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
return Charset.forName(detected.charset); return Charset.forName(detected.charset);

Loading…
Cancel
Save