26 lines
863 B
Diff
26 lines
863 B
Diff
From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
|
|
From: joe <219651+AdolfVonKleist@users.noreply.github.com>
|
|
Date: Mon, 22 Mar 2021 17:26:20 +0000
|
|
Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
|
|
unigram trainer. Dramatically speeds up training time.
|
|
|
|
---
|
|
src/unigram_model_trainer.cc | 1 +
|
|
1 file changed, 1 insertion(+)
|
|
|
|
diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
|
|
index 5f26771..94c7adb 100644
|
|
--- a/src/unigram_model_trainer.cc
|
|
+++ b/src/unigram_model_trainer.cc
|
|
@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
|
|
all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
|
|
}
|
|
}
|
|
+ array.push_back(kSentenceBoundary); // sentence boundary marker.
|
|
}
|
|
|
|
const node_int_type n = array.size();
|
|
--
|
|
2.18.0.huawei.25
|
|
|