Correct UTF-8 encoded Windows-1252 text

pull/194/merge
M66B 3 years ago
parent c5fa333903
commit b3dac145db

@ -67,6 +67,7 @@ import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import javax.mail.Flags;
@ -438,6 +439,7 @@ public class ActivityEML extends ActivityBase {
Charset cs = Charset.forName(charset);
boolean isUtf8 = CharsetHelper.isUTF8(text.getBytes(cs));
boolean isW1252 = !Objects.equals(text, CharsetHelper.utf8toW1252(text));
for (int i = 0; i < level; i++)
ssb.append(" ");
@ -445,6 +447,7 @@ public class ActivityEML extends ActivityBase {
ssb.append("Detected: ")
.append(detected == null ? "?" : detected.toString())
.append(" isUTF8=").append(Boolean.toString(isUtf8))
.append(" isW1252=").append(Boolean.toString(isW1252))
.append('\n');
}
}

@ -20,8 +20,10 @@ package eu.faircode.email;
*/
import android.text.TextUtils;
import android.util.Pair;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
@ -38,9 +40,19 @@ public class CharsetHelper {
private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList(
"US-ASCII", "ISO-8859-1", "ISO-8859-2", "windows-1250", "windows-1252", "windows-1257", "UTF-8"
));
private static final int MIN_W1252 = 10;
private static final Pair<byte[], byte[]>[] sUtf8W1252 = new Pair[128];
static {
System.loadLibrary("fairemail");
// https://www.i18nqa.com/debug/utf8-debug.html
Charset w1252 = Charset.forName("windows-1252");
for (int c = 128; c < 256; c++) {
String y = new String(new byte[]{(byte) c}, w1252);
String x = new String(y.getBytes(), w1252);
sUtf8W1252[c - 128] = new Pair<>(x.getBytes(), y.getBytes());
}
}
private static native DetectResult jni_detect_charset(byte[] octets);
@ -64,6 +76,57 @@ public class CharsetHelper {
}
}
static String utf8toW1252(String text) {
try {
Charset w1252 = Charset.forName("windows-1252");
//String result = new String(text.getBytes(StandardCharsets.ISO_8859_1), w1252);
//for (int c = 0; c < 128; c++) {
// String y = new String(sUtf8W1252[c].second);
// String x = new String(sUtf8W1252[c].first);
// result = result.replace(x, y);
//}
//return result;
byte[] t = new String(text.getBytes(StandardCharsets.ISO_8859_1), w1252).getBytes();
byte[] result = new byte[t.length];
int i = 0;
int len = 0;
int count = 0;
while (i < t.length && (i < MAX_SAMPLE_SIZE || count >= MIN_W1252)) {
boolean found = false;
for (int c = 0; c < 128; c++) {
int sl = sUtf8W1252[c].first.length;
if (i + sl < t.length) {
found = true;
for (int a = 0; a < sl; a++)
if (t[i + a] != sUtf8W1252[c].first[a]) {
found = false;
break;
}
if (found) {
count++;
int tl = sUtf8W1252[c].second.length;
System.arraycopy(sUtf8W1252[c].second, 0, result, len, tl);
len += tl;
i += sl;
break;
}
}
if (found)
break;
}
if (!found)
result[len++] = t[i++];
}
return (count < MIN_W1252 ? text : new String(result, 0, len));
} catch (Throwable ex) {
Log.w(ex);
return text;
}
}
public static Charset detect(String text) {
try {
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);

@ -2461,7 +2461,7 @@ public class MessageHelper {
}
if (h.isPlainText()) {
if (charset == null || StandardCharsets.ISO_8859_1.equals(cs))
if (charset == null || StandardCharsets.ISO_8859_1.equals(cs)) {
if (StandardCharsets.ISO_8859_1.equals(cs) && CharsetHelper.isUTF8(result)) {
Log.i("Charset upgrade=UTF8");
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
@ -2477,6 +2477,8 @@ public class MessageHelper {
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), detected);
}
}
} else if (StandardCharsets.UTF_8.equals(cs))
result = CharsetHelper.utf8toW1252(result);
if ("flowed".equalsIgnoreCase(h.contentType.getParameter("format")))
result = HtmlHelper.flow(result);
@ -2512,6 +2514,9 @@ public class MessageHelper {
CharsetHelper.isUTF8(result))
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
//if (StandardCharsets.UTF_8.equals(cs))
// result = CharsetHelper.utf8w1252(result);
// Fix incorrect UTF16
try {
if (CHARSET16.contains(cs)) {

Loading…
Cancel
Save