From c26482ff766107b8ab20c2908739c27986993ed0 Mon Sep 17 00:00:00 2001 From: M66B Date: Fri, 9 Sep 2022 19:46:27 +0200 Subject: [PATCH] Reduce classifier data size --- .../eu/faircode/email/MessageClassifier.java | 47 +++++++++++++++---- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/app/src/main/java/eu/faircode/email/MessageClassifier.java b/app/src/main/java/eu/faircode/email/MessageClassifier.java index ec71751b02..278a9b4f73 100644 --- a/app/src/main/java/eu/faircode/email/MessageClassifier.java +++ b/app/src/main/java/eu/faircode/email/MessageClassifier.java @@ -504,7 +504,7 @@ public class MessageClassifier { if (backup.exists()) file = backup; try { - _load(context, file); + _load(file); } catch (Throwable ex) { Log.e(ex); file.delete(); @@ -512,7 +512,7 @@ public class MessageClassifier { } } - private static synchronized void _load(Context context, File file) throws IOException { + private static synchronized void _load(File file) throws IOException { Log.i("Classifier read " + file); long start = new Date().getTime(); if (file.exists()) @@ -650,13 +650,42 @@ public class MessageClassifier { dirty = false; long elapsed = new Date().getTime() - start; - EntityLog.log(context, "Classifier data loaded elapsed=" + elapsed); - for (long account : classMessages.keySet()) - EntityLog.log(context, "Messages account=" + account + " classes=" + classMessages.get(account).size()); - for (long account : wordClassFrequency.keySet()) - EntityLog.log(context, "Words account=" + account + " words=" + wordClassFrequency.get(account).size()); - for (long account : accountMsgIds.keySet()) - EntityLog.log(context, "Classified account=" + account + " ids=" + accountMsgIds.get(account).size()); + Log.i("Classifier data loaded elapsed=" + elapsed); + + for (long account : wordClassFrequency.keySet()) { + Map total = new HashMap<>(); + Map count = new HashMap<>(); + + for (String word : wordClassFrequency.get(account).keySet()) + for (String clazz : wordClassFrequency.get(account).get(word).keySet()) { + int f = wordClassFrequency.get(account).get(word).get(clazz).count; + + if (!total.containsKey(clazz)) + total.put(clazz, 0L); + total.put(clazz, total.get(clazz) + f); + + if (!count.containsKey(clazz)) + count.put(clazz, 0); + count.put(clazz, count.get(clazz) + 1); + } + + for (String word : wordClassFrequency.get(account).keySet()) + for (String clazz : new ArrayList<>(wordClassFrequency.get(account).get(word).keySet())) { + int freq = wordClassFrequency.get(account).get(word).get(clazz).count; + long avg = total.get(clazz) / count.get(clazz); + if (freq < avg / 2) { + Log.i("Classifier dropping account=" + account + + " word=" + word + " class=" + clazz + " freq=" + freq + " avg=" + avg); + wordClassFrequency.get(account).get(word).remove(clazz); + } + } + + // Source 47 MB + // avg/1 = 21.3 + // avg/2 = 25.5 + // avg/3 = 29.0 + // avg/5 = 34.6 + } } static synchronized void cleanup(@NonNull Context context) {