Text classification experiment

pull/190/head
M66B 4 years ago
parent 22da1ad4ac
commit 75a1691c1b

@ -29,3 +29,4 @@ FairEmail uses:
* [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE). * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE).
* [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE). * [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE).
* [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing). * [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing).
* [Java Naive Bayes Classifier](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier). Copyright (c) 2012-2017 Philipp Nolte. [MIT License](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier#the-mit-license-mit).

@ -278,6 +278,7 @@ dependencies {
def appauth_version = "0.7.1" def appauth_version = "0.7.1"
def jcharset_version = "2.1" def jcharset_version = "2.1"
def apache_poi = "3.17" def apache_poi = "3.17"
def bayes_version = "1.0.7"
// https://developer.android.com/jetpack/androidx/releases/ // https://developer.android.com/jetpack/androidx/releases/
@ -442,4 +443,7 @@ dependencies {
// https://poi.apache.org/components/hmef/index.html // https://poi.apache.org/components/hmef/index.html
// https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad // https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad
implementation "org.apache.poi:poi-scratchpad:$apache_poi" implementation "org.apache.poi:poi-scratchpad:$apache_poi"
// https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
implementation "com.github.ptnplanet:Java-Naive-Bayes-Classifier:$bayes_version"
} }

@ -29,3 +29,4 @@ FairEmail uses:
* [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE). * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE).
* [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE). * [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE).
* [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing). * [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing).
* [Java Naive Bayes Classifier](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier). Copyright (c) 2012-2017 Philipp Nolte. [MIT License](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier#the-mit-license-mit).

@ -22,6 +22,7 @@ package eu.faircode.email;
import android.content.Context; import android.content.Context;
import android.content.SharedPreferences; import android.content.SharedPreferences;
import android.database.Cursor; import android.database.Cursor;
import android.text.TextUtils;
import androidx.annotation.NonNull; import androidx.annotation.NonNull;
import androidx.preference.PreferenceManager; import androidx.preference.PreferenceManager;
@ -34,17 +35,22 @@ import androidx.work.WorkerParameters;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import de.daslaboratorium.machinelearning.classifier.Classification;
import de.daslaboratorium.machinelearning.classifier.bayes.BayesClassifier;
import io.requery.android.database.sqlite.SQLiteDatabase; import io.requery.android.database.sqlite.SQLiteDatabase;
import static android.os.Process.THREAD_PRIORITY_BACKGROUND; import static android.os.Process.THREAD_PRIORITY_BACKGROUND;
public class WorkerFts extends Worker { public class WorkerFts extends Worker {
private static final int INDEX_DELAY = 30; // seconds private static final int INDEX_DELAY = BuildConfig.DEBUG ? 3 : 30; // seconds
private static final int INDEX_BATCH_SIZE = 100; private static final int INDEX_BATCH_SIZE = 100;
private static BayesClassifier<String, String> classifier = new BayesClassifier<>();
public WorkerFts(@NonNull Context context, @NonNull WorkerParameters workerParams) { public WorkerFts(@NonNull Context context, @NonNull WorkerParameters workerParams) {
super(context, workerParams); super(context, workerParams);
Log.i("Instance " + getName()); Log.i("Instance " + getName());
@ -78,6 +84,30 @@ public class WorkerFts extends Worker {
File file = message.getFile(getApplicationContext()); File file = message.getFile(getApplicationContext());
String text = HtmlHelper.getFullText(file); String text = HtmlHelper.getFullText(file);
if (BuildConfig.DEBUG) {
EntityFolder folder = db.folder().getFolder(message.folder);
if (folder != null) {
// \\P{L}+
List<String> features = new ArrayList<>();
for (String word : text.trim().toLowerCase().split("\\W+")) {
if (word.matches(".*\\d.*"))
continue;
if (word.endsWith("."))
word = word.substring(0, word.length() - 1);
features.add(word);
}
Collection<Classification<String, String>> classifications = classifier.classifyDetailed(features);
for (Classification<String, String> classification : classifications)
Log.i("MMM folder=" + folder.name +
" classified=" + classification.getCategory() +
" probability=" + classification.getProbability() +
" features=" + TextUtils.join(", ", features.subList(0, Math.min(features.size(), 20))));
classifier.learn(EntityFolder.JUNK.equals(folder.type) ? "spam" : "ham", features);
}
}
try { try {
sdb.beginTransaction(); sdb.beginTransaction();
FtsDbHelper.insert(sdb, message, text); FtsDbHelper.insert(sdb, message, text);

Loading…
Cancel
Save