From 5f9928313b8106295423d209f48750dc825e76e7 Mon Sep 17 00:00:00 2001 From: M66B Date: Sun, 29 Oct 2023 20:28:42 +0100 Subject: [PATCH] Updated cld3 --- .../main/jni/cld_3/src/embedding_network.cc | 1 + .../main/jni/cld_3/src/lang_id_nn_params.h | 0 .../src/language_identifier_features_test.cc | 1 + .../main/jni/cld_3/src/nnet_lang_id_test.cc | 28 +++++++++++++++++++ .../jni/cld_3/src/nnet_language_identifier.cc | 16 +++++++---- .../jni/cld_3/src/nnet_language_identifier.h | 16 +++++++++++ .../cld_3/src/script_span/getonescriptspan.cc | 2 -- .../cld_3/src/script_span/getonescriptspan.h | 6 ++-- .../cld_3/src/script_span/utf8statetable.cc | 26 +++++++++++++---- .../main/jni/cld_3/src/sentence_features.cc | 8 +++--- .../main/jni/cld_3/src/sentence_features.h | 16 +++++++++-- 11 files changed, 97 insertions(+), 23 deletions(-) mode change 100644 => 100755 app/src/main/jni/cld_3/src/lang_id_nn_params.h diff --git a/app/src/main/jni/cld_3/src/embedding_network.cc b/app/src/main/jni/cld_3/src/embedding_network.cc index f2e0801c26..2296ea3f4f 100644 --- a/app/src/main/jni/cld_3/src/embedding_network.cc +++ b/app/src/main/jni/cld_3/src/embedding_network.cc @@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model) for (int i = 0; i < model_->embedding_dim_size(); ++i) { CLD3_DCHECK(offset_sum == model_->concat_offset(i)); offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i); + (void)offset_sum; // Avoid compiler warning for "unused" variable. 
embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i)); } diff --git a/app/src/main/jni/cld_3/src/lang_id_nn_params.h b/app/src/main/jni/cld_3/src/lang_id_nn_params.h old mode 100644 new mode 100755 diff --git a/app/src/main/jni/cld_3/src/language_identifier_features_test.cc b/app/src/main/jni/cld_3/src/language_identifier_features_test.cc index 5835c8615d..05fb86c82b 100644 --- a/app/src/main/jni/cld_3/src/language_identifier_features_test.cc +++ b/app/src/main/jni/cld_3/src/language_identifier_features_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "base.h" #include "feature_extractor.h" diff --git a/app/src/main/jni/cld_3/src/nnet_lang_id_test.cc b/app/src/main/jni/cld_3/src/nnet_lang_id_test.cc index 358fe1b8ff..a7a252076d 100644 --- a/app/src/main/jni/cld_3/src/nnet_lang_id_test.cc +++ b/app/src/main/jni/cld_3/src/nnet_lang_id_test.cc @@ -209,6 +209,34 @@ bool TestMultipleLanguagesInInput() { << result.proportion << std::endl; return false; } + + // Skip over undefined language. + if (result.language == "und") + continue; + if (result.byte_ranges.size() != 1) { + std::cout << " Should only detect one span containing " << result.language + << std::endl; + return false; + } + // Check that specified byte ranges for language are correct. + int start_index = result.byte_ranges[0].start_index; + int end_index = result.byte_ranges[0].end_index; + std::string byte_ranges_text = text.substr(start_index, end_index - start_index); + if (result.language == "bg") { + if (byte_ranges_text.compare("Този текст е на Български.") != 0) { + std::cout << " Incorrect byte ranges returned for Bulgarian " << std::endl; + return false; + } + } else if (result.language == "en") { + if (byte_ranges_text.compare("This piece of text is in English. 
") != 0) { + std::cout << " Incorrect byte ranges returned for English " << std::endl; + return false; + } + } else { + std::cout << " Got language other than English or Bulgarian " + << std::endl; + return false; + } } std::cout << " Success!" << std::endl; return true; diff --git a/app/src/main/jni/cld_3/src/nnet_language_identifier.cc b/app/src/main/jni/cld_3/src/nnet_language_identifier.cc index abc395012b..a878cf582b 100644 --- a/app/src/main/jni/cld_3/src/nnet_language_identifier.cc +++ b/app/src/main/jni/cld_3/src/nnet_language_identifier.cc @@ -47,6 +47,9 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; + + // Specifies the byte ranges that language applies to. + std::vector byte_ranges; }; // Compares two pairs based on their values. @@ -281,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, CLD2::LangSpan script_span; std::unordered_map lang_stats; int total_num_bytes = 0; - Result result; - string language; int chunk_size = 0; // Use the default. while (ss.GetOneScriptSpanLower(&script_span)) { const int num_original_span_bytes = script_span.text_bytes; @@ -298,12 +299,16 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); - result = FindLanguageOfValidUTF8(selected_text); - language = result.language; + + Result result = FindLanguageOfValidUTF8(selected_text); + string language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; + // Add SpanInfo. Start and end indices are relative to original input. + lang_stats[language].byte_ranges.push_back(SpanInfo( + ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability)); } // Sort the languages based on the number of bytes associated with them. 
@@ -329,6 +334,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability = stats.prob_sum / stats.byte_sum; result.proportion = stats.byte_sum / byte_sum; result.is_reliable = ResultIsReliable(language, result.probability); + result.byte_ranges = stats.byte_ranges; results.push_back(result); } @@ -348,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize( const char *text_begin, int text_size) { string output_text; - // If the size of the input is greater than the maxium number of bytes needed + // If the size of the input is greater than the maximum number of bytes needed // for a prediction, then concatenate snippets that are equally spread out // throughout the input. if (text_size > max_num_bytes_) { diff --git a/app/src/main/jni/cld_3/src/nnet_language_identifier.h b/app/src/main/jni/cld_3/src/nnet_language_identifier.h index 820aba6ba8..e5eb8627ef 100644 --- a/app/src/main/jni/cld_3/src/nnet_language_identifier.h +++ b/app/src/main/jni/cld_3/src/nnet_language_identifier.h @@ -44,6 +44,19 @@ class LanguageIdEmbeddingFeatureExtractor // Class for detecting the language of a document. class NNetLanguageIdentifier { public: + // Holds probability that Span, specified by start/end indices, is a given + // language. The language is not stored here; it can be found in Result, which + // holds a vector of SpanInfo. + struct SpanInfo { + SpanInfo(int start_index_val, int end_index_val, float probability_val) + : start_index(start_index_val), + end_index(end_index_val), + probability(probability_val) {} + int start_index = -1; + int end_index = -1; + float probability = 0.0; + }; + // Information about a predicted language. struct Result { string language = kUnknown; @@ -53,6 +66,9 @@ class NNetLanguageIdentifier { // Proportion of bytes associated with the language. If FindLanguage is // called, this variable is set to 1. float proportion = 0.0; + + // Specifies the byte ranges that |language| applies to. 
+ std::vector byte_ranges; }; NNetLanguageIdentifier(); diff --git a/app/src/main/jni/cld_3/src/script_span/getonescriptspan.cc b/app/src/main/jni/cld_3/src/script_span/getonescriptspan.cc index 4607bd8fc0..715616d8df 100644 --- a/app/src/main/jni/cld_3/src/script_span/getonescriptspan.cc +++ b/app/src/main/jni/cld_3/src/script_span/getonescriptspan.cc @@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { // copying letters to buffer with single spaces for each run of non-letters while (take < byte_length_) { // Copy run of letters in same script (&LS | LS)* - int letter_count = 0; // Keep track of word length bool need_break = false; while (take < byte_length_) { @@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { map2original_.Delete(tlen - plen); } - ++letter_count; if (put >= kMaxScriptBytes) { // Buffer is full span->truncated = true; diff --git a/app/src/main/jni/cld_3/src/script_span/getonescriptspan.h b/app/src/main/jni/cld_3/src/script_span/getonescriptspan.h index 1fa60adad4..004f903ea8 100644 --- a/app/src/main/jni/cld_3/src/script_span/getonescriptspan.h +++ b/app/src/main/jni/cld_3/src/script_span/getonescriptspan.h @@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room static const int kWithinScriptTail = 32; // Stop at word space in last // N bytes of script buffer -typedef struct { +struct LangSpan { char* text = nullptr; // Pointer to the span, somewhere int text_bytes = 0; // Number of bytes of text in the span int offset = 0; // Offset of start of span in original input buffer ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span bool truncated = false; // true if buffer filled up before a // different script or EOF was found -} LangSpan; +}; static inline bool IsContinuationByte(char c) { return static_cast(c) < -64; @@ -93,7 +93,7 @@ class ScriptScanner { // again with the first byte of the following range. 
int MapBack(int text_offset); - const char* GetBufferStart() {return start_byte_;}; + const char* GetBufferStart() {return start_byte_;} private: // Skip over tags and non-letters diff --git a/app/src/main/jni/cld_3/src/script_span/utf8statetable.cc b/app/src/main/jni/cld_3/src/script_span/utf8statetable.cc index 0b3d349357..8fcfb1e8f9 100644 --- a/app/src/main/jni/cld_3/src/script_span/utf8statetable.cc +++ b/app/src/main/jni/cld_3/src/script_span/utf8statetable.cc @@ -158,6 +158,20 @@ static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish * **/ +// The intentional fallthroughs below use CLD_FALLTHROUGH, so define +// this macro locally. +// If you ever move this to a .h file, make sure it's defined in a +// private header file: clang suggests the first macro expanding to +// [[clang::fallthrough]] in its diagnostics, so if CLD_FALLTHROUGH +// is visible in code depending on cld3, clang would suggest +// CLD_FALLTHROUGH for code depending on cld3, instead of the +// client code's own fallthrough macro. 
+#if defined(__clang__) +#define CLD_FALLTHROUGH [[clang::fallthrough]] +#else +#define CLD_FALLTHROUGH +#endif + // Return true if current Tbl pointer is within state0 range // Note that unsigned compare checks both ends of range simultaneously static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { @@ -715,10 +729,10 @@ static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, goto Do_state_table; case kExitReplace3: // update 3 bytes to change dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; - // Fall into next case + CLD_FALLTHROUGH; case kExitReplace2: // update 2 bytes to change dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; - // Fall into next case + CLD_FALLTHROUGH; case kExitReplace1: // update 1 byte to change dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; total_changed++; @@ -736,7 +750,7 @@ static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, } else { offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); } - // Fall into next case + CLD_FALLTHROUGH; case kExitSpecial: // Apply special fixups [read: hacks] case kExitReplaceOffset1: if ((nEntries != 256) && InStateZero(st, Tbl)) { @@ -986,10 +1000,10 @@ static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, goto Do_state_table_2; case kExitReplace3_2: // update 3 bytes to change dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff); - // Fall into next case + CLD_FALLTHROUGH; case kExitReplace2_2: // update 2 bytes to change dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); - // Fall into next case + CLD_FALLTHROUGH; case kExitReplace1_2: // update 1 byte to change dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); total_changed++; @@ -1007,7 +1021,7 @@ static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, } else { offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8); } - // Fall into next case + CLD_FALLTHROUGH; case kExitReplaceOffset1_2: if ((nEntries != 256) && InStateZero_2(st, 
Tbl)) { // For space-optimized table, we need multiples of 256 bytes diff --git a/app/src/main/jni/cld_3/src/sentence_features.cc b/app/src/main/jni/cld_3/src/sentence_features.cc index cae6e368c8..70d64f40cc 100644 --- a/app/src/main/jni/cld_3/src/sentence_features.cc +++ b/app/src/main/jni/cld_3/src/sentence_features.cc @@ -19,11 +19,11 @@ limitations under the License. namespace chrome_lang_id { -// Declare registry for the whole Sentence feature functions. NOTE: this is not +// Define registry for the whole Sentence feature functions. NOTE: this is not // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier // constructor, *before* we use any feature. template <> -WholeSentenceFeature::Registry - *RegisterableClass::registry_ = nullptr; +WholeSentenceFeature::Registry* + RegisterableClass::registry_ = nullptr; -} // namespace chrome_lang_id +} // namespace chrome_lang_id \ No newline at end of file diff --git a/app/src/main/jni/cld_3/src/sentence_features.h b/app/src/main/jni/cld_3/src/sentence_features.h index 165bfd5f1a..cc0be88330 100644 --- a/app/src/main/jni/cld_3/src/sentence_features.h +++ b/app/src/main/jni/cld_3/src/sentence_features.h @@ -26,9 +26,19 @@ limitations under the License. namespace chrome_lang_id { // Feature function that extracts features for the full Sentence. -typedef FeatureFunction WholeSentenceFeature; - -typedef FeatureExtractor WholeSentenceExtractor; +using WholeSentenceFeature = FeatureFunction; + +using WholeSentenceExtractor = FeatureExtractor; + +// Declare registry for the whole Sentence feature functions. This is required +// for clang's -Wundefined-var-template. However, MSVC has a bug which treats +// this declaration as a definition, leading to multiple definition errors, so +// omit this on MSVC. +#if !defined(COMPILER_MSVC) +template <> +WholeSentenceFeature::Registry + *RegisterableClass::registry_; +#endif } // namespace chrome_lang_id