From b063fb6503d5c85bfca91303061e6f4cf9923a72 Mon Sep 17 00:00:00 2001 From: M66B Date: Sat, 2 Jan 2021 14:33:53 +0100 Subject: [PATCH] Added experimental message classifier --- .../eu/faircode/email/MessageClassifier.java | 214 ++++++++++++++++++ .../java/eu/faircode/email/WorkerFts.java | 5 +- 2 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 app/src/main/java/eu/faircode/email/MessageClassifier.java diff --git a/app/src/main/java/eu/faircode/email/MessageClassifier.java b/app/src/main/java/eu/faircode/email/MessageClassifier.java new file mode 100644 index 0000000000..2756189b15 --- /dev/null +++ b/app/src/main/java/eu/faircode/email/MessageClassifier.java @@ -0,0 +1,214 @@ +package eu.faircode.email; + +/* + This file is part of FairEmail. + + FairEmail is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + FairEmail is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with FairEmail. If not, see <http://www.gnu.org/licenses/>. 
+
+    Copyright 2018-2021 by Marcel Bokhorst (M66B)
+*/
+
+import android.content.Context;
+import android.text.TextUtils;
+
+import org.jetbrains.annotations.NotNull;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Experimental word frequency based message classifier.
+ * <p>
+ * A class is a folder name. For each word the number of learned messages per
+ * class containing that word is kept in static maps, which are held in memory
+ * only and which are accessed without synchronization.
+ * <p>
+ * NOTE(review): classes are keyed by folder name only, so equally named folders
+ * seem to share their statistics - confirm this is intended.
+ */
+public class MessageClassifier {
+    // Class -> number of learned messages
+    private static final Map<String, Integer> classMessages = new HashMap<>();
+    // Word -> class -> number of learned messages of that class containing the word
+    private static final Map<String, Map<String, Integer>> wordClassFrequency = new HashMap<>();
+
+    // A class is ignored for a word when another class uses the word
+    // about equally often (ratio of the usage shares above this factor)
+    private static final double COMMON_WORD_FACTOR = 0.75;
+    // The best class needs to be at least this many times more likely than the second best
+    private static final double CHANCE_THRESHOLD = 2.0;
+
+    // Compiled once, instead of once for each word
+    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*\\d.*");
+
+    /**
+     * Classifies the text of a message and adds the message to,
+     * or removes it from, the statistics of the folder the message is in.
+     * <p>
+     * Only inbox, junk and user folders and the archive folder
+     * of non Gmail accounts are considered.
+     *
+     * @param message the message to classify
+     * @param added   true to learn the message, false to unlearn the message
+     * @param context used to get the database and the message file
+     * @return the name of the most likely class, or null if the message was unlearned,
+     * could not be processed or if no class is clearly more likely than the others
+     * @throws IllegalArgumentException if the message has no content
+     */
+    static String classify(EntityMessage message, boolean added, Context context) {
+        DB db = DB.getInstance(context);
+
+        if (!message.content)
+            throw new IllegalArgumentException("Message without content");
+
+        EntityFolder folder = db.folder().getFolder(message.folder);
+        if (folder == null)
+            return null;
+
+        EntityAccount account = db.account().getAccount(folder.account);
+        if (account == null)
+            return null;
+
+        if (!EntityFolder.INBOX.equals(folder.type) &&
+                !EntityFolder.JUNK.equals(folder.type) &&
+                !EntityFolder.USER.equals(folder.type) &&
+                !(EntityFolder.ARCHIVE.equals(folder.type) && !account.isGmail()))
+            return null;
+
+        File file = message.getFile(context);
+        String text;
+        try {
+            text = HtmlHelper.getFullText(file);
+        } catch (IOException ex) {
+            // Best effort: an unreadable message is not classified
+            Log.w(ex);
+            text = null;
+        }
+
+        if (TextUtils.isEmpty(text))
+            return null;
+
+        String classified = classify(folder.name, text, added);
+
+        // The message is counted after it has been classified
+        Integer m = classMessages.get(folder.name);
+        if (added)
+            classMessages.put(folder.name, m == null ? 1 : m + 1);
+        else if (m != null) {
+            // The count is used as a divisor, so never keep zero or negative counts
+            if (m > 1)
+                classMessages.put(folder.name, m - 1);
+            else
+                classMessages.remove(folder.name);
+        }
+
+        return classified;
+    }
+
+    /**
+     * Classifies a text and adds its words to, or removes its words from,
+     * the word statistics of the given class.
+     * <p>
+     * Words are lower cased, counted once per text and need to be longer
+     * than one character and may not contain digits.
+     * <p>
+     * NOTE(review): the word iterator yields the text between all boundaries,
+     * so runs of whitespace or punctuation of two or more characters
+     * are presumably counted as words too - confirm this is acceptable.
+     *
+     * @param classify the class the text belongs to
+     * @param text     the text to classify
+     * @param added    true to learn the text, false to unlearn the text
+     * @return the most likely class, or null if the text was unlearned, if there are
+     * less than two candidate classes or if the best class is not at least
+     * {@code CHANCE_THRESHOLD} times more likely than the second best class
+     */
+    static String classify(String classify, String text, boolean added) {
+        int maxFrequency = 0;
+        int maxMatchedWords = 0;
+        Set<String> words = new HashSet<>();
+        Map<String, Stat> classStats = new HashMap<>();
+
+        BreakIterator boundary = BreakIterator.getWordInstance(); // TODO ICU
+        boundary.setText(text);
+        int start = boundary.first();
+        // The start is advanced in the loop update, so it is advanced on 'continue' too.
+        // Previously unlearning skipped advancing the start,
+        // which resulted in processing ever growing substrings instead of words.
+        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
+            String word = text.substring(start, end).toLowerCase(Locale.ROOT);
+            if (word.length() <= 1 ||
+                    CONTAINS_DIGIT.matcher(word).matches() ||
+                    !words.add(word)) // false if the word was seen before
+                continue;
+
+            Map<String, Integer> classFrequency = wordClassFrequency.get(word);
+
+            if (!added) {
+                unlearn(word, classify, classFrequency);
+                continue;
+            }
+
+            if (classFrequency == null) {
+                classFrequency = new HashMap<>();
+                wordClassFrequency.put(word, classFrequency);
+            }
+
+            for (String clazz : getDistinctiveClasses(word, classFrequency)) {
+                int frequency = classFrequency.get(clazz);
+                if (frequency > maxFrequency)
+                    maxFrequency = frequency;
+
+                Stat stat = classStats.get(clazz);
+                if (stat == null) {
+                    stat = new Stat();
+                    classStats.put(clazz, stat);
+                }
+
+                stat.matchedWords++;
+                stat.totalFrequency += frequency;
+
+                if (stat.matchedWords > maxMatchedWords)
+                    maxMatchedWords = stat.matchedWords;
+            }
+
+            // Learn the word after it was used for classification
+            Integer c = classFrequency.get(classify);
+            classFrequency.put(classify, c == null ? 1 : c + 1);
+        }
+
+        if (!added)
+            return null;
+
+        // Prevent dividing by zero below
+        if (maxFrequency <= 0 || maxMatchedWords <= 0)
+            return null;
+
+        List<Chance> chances = new ArrayList<>();
+        for (Map.Entry<String, Stat> entry : classStats.entrySet()) {
+            Stat stat = entry.getValue();
+            double chance = ((double) stat.totalFrequency / maxFrequency / maxMatchedWords);
+            Chance c = new Chance(entry.getKey(), chance);
+            Log.i("Classifier " + c +
+                    " frequency=" + stat.totalFrequency + "/" + maxFrequency +
+                    " matched=" + stat.matchedWords + "/" + maxMatchedWords);
+            chances.add(c);
+        }
+
+        if (chances.size() <= 1)
+            return null;
+
+        // Highest chance first
+        Collections.sort(chances, new Comparator<Chance>() {
+            @Override
+            public int compare(Chance c1, Chance c2) {
+                return Double.compare(c2.chance, c1.chance);
+            }
+        });
+
+        String classification = null;
+        if (chances.get(0).chance / chances.get(1).chance >= CHANCE_THRESHOLD)
+            classification = chances.get(0).clazz;
+
+        Log.i("Classifier classify=" + classify + " classified=" + classification);
+
+        return classification;
+    }
+
+    // Removes one occurrence of a word from a class, dropping counts that reach zero
+    private static void unlearn(String word, String clazz, Map<String, Integer> classFrequency) {
+        if (classFrequency == null)
+            return;
+
+        Integer c = classFrequency.get(clazz);
+        if (c == null)
+            return;
+
+        if (c > 1)
+            classFrequency.put(clazz, c - 1);
+        else {
+            classFrequency.remove(clazz);
+            if (classFrequency.isEmpty())
+                wordClassFrequency.remove(word);
+        }
+    }
+
+    // Returns the classes of a word, except classes which use the word about as often as another class
+    private static List<String> getDistinctiveClasses(String word, Map<String, Integer> classFrequency) {
+        // Class -> share of the learned messages of the class containing the word
+        // Classes without a positive message count or word count cannot be compared and are skipped
+        Map<String, Double> share = new HashMap<>();
+        for (Map.Entry<String, Integer> entry : classFrequency.entrySet()) {
+            Integer messages = classMessages.get(entry.getKey());
+            if (messages != null && messages > 0 && entry.getValue() > 0)
+                share.put(entry.getKey(), (double) entry.getValue() / messages);
+        }
+
+        List<String> applyClasses = new ArrayList<>(classFrequency.keySet());
+        for (String class1 : share.keySet())
+            for (String class2 : share.keySet())
+                if (!class1.equals(class2)) {
+                    double factor = share.get(class1) / share.get(class2);
+                    if (factor > 1)
+                        factor = 1 / factor;
+                    if (factor > COMMON_WORD_FACTOR) {
+                        Log.i("Classifier skip class=" + class1 + " word=" + word);
+                        applyClasses.remove(class1);
+                        break;
+                    }
+                }
+
+        return applyClasses;
+    }
+
+    // Per class totals for the words of the text being classified
+    private static class Stat {
+        int matchedWords = 0;
+        int totalFrequency = 0;
+    }
+
+    // Relative likelihood of a class
+    private static class Chance {
+        final String clazz;
+        final double chance;
+
+        Chance(String clazz, double chance) {
+            this.clazz = clazz;
+            this.chance = chance;
+        }
+
+        @NotNull
+        @Override
+        public String toString() {
+            return clazz + "=" + chance;
+        }
+    }
+}
diff --git a/app/src/main/java/eu/faircode/email/WorkerFts.java b/app/src/main/java/eu/faircode/email/WorkerFts.java
index 9535670510..b3eb2c5240 100644
--- a/app/src/main/java/eu/faircode/email/WorkerFts.java
+++ b/app/src/main/java/eu/faircode/email/WorkerFts.java
@@ -42,7 +42,7 @@ import io.requery.android.database.sqlite.SQLiteDatabase;
 import static android.os.Process.THREAD_PRIORITY_BACKGROUND;
 
 public class WorkerFts extends Worker {
-    private static final int 
INDEX_DELAY = 30; // seconds + private static final int INDEX_DELAY = BuildConfig.DEBUG ? 3 : 30; // seconds private static final int INDEX_BATCH_SIZE = 100; public WorkerFts(@NonNull Context context, @NonNull WorkerParameters workerParams) { @@ -78,6 +78,9 @@ public class WorkerFts extends Worker { continue; } + if (BuildConfig.DEBUG) + MessageClassifier.classify(message, true, context); + File file = message.getFile(context); String text = HtmlHelper.getFullText(file); if (TextUtils.isEmpty(text)) {