From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
From: joe <219651+AdolfVonKleist@users.noreply.github.com>
Date: Mon, 22 Mar 2021 17:26:20 +0000
Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
 unigram trainer.

Dramatically speeds up training time.
---
 src/unigram_model_trainer.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
index 5f26771..94c7adb 100644
--- a/src/unigram_model_trainer.cc
+++ b/src/unigram_model_trainer.cc
@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
         all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
       }
     }
+    array.push_back(kSentenceBoundary);  // sentence boundary marker.
   }
 
   const node_int_type n = array.size();
-- 
2.18.0.huawei.25