Compare commits: 7c594b864f ... c25e44a078
11 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | c25e44a078 |  |
|  | 28d259de7b |  |
|  | 5363370455 |  |
|  | 22c62f617e |  |
|  | bf542b1c4c |  |
|  | 4a430e108e |  |
|  | fe52606635 |  |
|  | e743a48e03 |  |
|  | 95279e8ba7 |  |
|  | 2142c654db |  |
|  | 74e585ecaf |  |
Add-missing-include-for-BYTE_ORDER.patch (Normal file, 30 lines)
@@ -0,0 +1,30 @@
From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001
From: Malcolm Smith <smith@chaquo.com>
Date: Tue, 12 Jan 2021 13:43:28 +0000
Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER

---
src/util.h | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/src/util.h b/src/util.h
index bf8a758..1680f4b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -36,6 +36,13 @@
#include <pthread.h>
#endif

+#if !defined(__APPLE__) && !defined(_WIN32)
+#include <endian.h>
+#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif
+#endif
+
namespace sentencepiece {

template <typename T>
--
2.18.0.huawei.25
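For context on how a guard like IS_BIG_ENDIAN is typically consumed, here is an illustrative sketch; Swap32 and HostToLE32 are hypothetical helpers, not part of the patch. Big-endian hosts swap multi-byte values so serialized data keeps one fixed layout.

```cpp
#include <cstdint>

// Hypothetical helper: byte-swap a 32-bit value.
inline uint32_t Swap32(uint32_t v) {
  return ((v & 0x000000FFu) << 24) | ((v & 0x0000FF00u) << 8) |
         ((v & 0x00FF0000u) >> 8)  | ((v & 0xFF000000u) >> 24);
}

// Convert a host-order value to a fixed little-endian layout.
inline uint32_t HostToLE32(uint32_t v) {
#ifdef IS_BIG_ENDIAN
  return Swap32(v);  // big-endian host: swap before serializing
#else
  return v;          // little-endian host: already in target order
#endif
}
```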
Added-split_digits-to-SentencePieceTrainer.patch (Normal file, 24 lines)
@@ -0,0 +1,24 @@
From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001
From: mingruimingrui <mingruimingrui@hotmail.com>
Date: Sat, 27 Jun 2020 02:56:03 +0800
Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer

---
src/spec_parser.h | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/spec_parser.h b/src/spec_parser.h
index 729e036..6dd054b 100644
--- a/src/spec_parser.h
+++ b/src/spec_parser.h
@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name,
PARSE_BOOL(split_by_unicode_script);
PARSE_BOOL(split_by_number);
PARSE_BOOL(split_by_whitespace);
+  PARSE_BOOL(split_digits);
PARSE_BOOL(treat_whitespace_as_suffix);
PARSE_REPEATED_STRING(control_symbols);
PARSE_REPEATED_STRING(user_defined_symbols);
--
2.18.0.huawei.25
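With this parser change, split_digits can be passed like any other boolean training flag. A minimal sketch of programmatic training through the C++ trainer API (corpus.txt and the flag values are placeholders):

```cpp
#include <iostream>
#include <sentencepiece_trainer.h>

int main() {
  // With the patch applied, split_digits is parsed like the other boolean flags.
  const auto status = sentencepiece::SentencePieceTrainer::Train(
      "--input=corpus.txt --model_prefix=m --vocab_size=8000 "
      "--split_digits=true");
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  return 0;
}
```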
Create-options.md.patch (Normal file, 70 lines)
@@ -0,0 +1,70 @@
From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001
From: stephantul <stephantul@gmail.com>
Date: Thu, 1 Oct 2020 12:49:13 +0200
Subject: [PATCH 3/7] Create options.md

---
doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
create mode 100644 doc/options.md

diff --git a/doc/options.md b/doc/options.md
new file mode 100644
index 0000000..7861fdc
--- /dev/null
+++ b/doc/options.md
@@ -0,0 +1,51 @@
+# Training options
+
+The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here.
+
+```
+--help (show help) type: bool default: false
+--version (show version) type: bool default: false
+--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0
+--input (comma separated list of input sentences) type: std::string default: ""
+--input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: ""
+--model_prefix (output model prefix) type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram"
+--vocab_size (vocabulary size) type: int32 default: 8000
+--accept_language (comma-separated list of languages this model can accept) type: std::string default: ""
+--self_test_sample_size (the size of self test samples) type: int32 default: 0
+--character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995
+--input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0
+--shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true
+--seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000
+--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75
+--num_threads (number of threads for training) type: int32 default: 16
+--num_sub_iterations (number of EM sub-iterations) type: int32 default: 2
+--max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16
+--max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192
+--split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true
+--split_by_number (split tokens by numbers (0-9)) type: bool default: true
+--split_by_whitespace (use a white space to split sentence pieces) type: bool default: true
+--split_digits (split all digits (0-9) into separate pieces) type: bool default: false
+--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false
+--control_symbols (comma separated list of control symbols) type: std::string default: ""
+--user_defined_symbols (comma separated list of user defined symbols) type: std::string default: ""
+--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: ""
+--byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false
+--vocabulary_output_piece_score (Define score in vocab file) type: bool default: true
+--normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc"
+--normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: ""
+--denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: ""
+--add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true
+--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true
+--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
+--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false
+--unk_id (Override UNK (<unk>) id.) type: int32 default: 0
+--bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1
+--eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2
+--pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1
+--unk_piece (Override UNK (<unk>) piece.) type: std::string default: "<unk>"
+--bos_piece (Override BOS (<s>) piece.) type: std::string default: "<s>"
+--eos_piece (Override EOS (</s>) piece.) type: std::string default: "</s>"
+--pad_piece (Override PAD (<pad>) piece.) type: std::string default: "<pad>"
+--unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: std::string default: " ⁇ "
+--train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false
+```
--
2.18.0.huawei.25
Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch (Normal file, 30 lines)
@@ -0,0 +1,30 @@
From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001
From: Kentaro Hayashi <hayashi@clear-code.com>
Date: Sat, 17 Oct 2020 16:54:20 +0900
Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4

---
src/CMakeLists.txt | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 511b2ec..87765e5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static
if (SPM_ENABLE_SHARED)
target_link_libraries(sentencepiece ${SPM_LIBS})
target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece)
+  if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR
+      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR
+      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR
+      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR
+      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4"))
+    list(APPEND SPM_LIBS "atomic")
+  endif()
set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static)
set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0)
set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES)
--
2.18.0.huawei.25
README.en.md (171 lines)
@@ -8,15 +8,174 @@ Software architecture description

#### Installation

1. xxxx
2. xxxx
3. xxxx
1. Python module

SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation. You can install the Python binary package of SentencePiece with:

% pip install sentencepiece

2. Build and install SentencePiece command line tools from C++ source

The following tools and libraries are required to build SentencePiece:

* cmake

* C++11 compiler

* gperftools library (optional; a 10-40% performance improvement can be obtained.)

On Ubuntu, the build tools can be installed with apt-get:

% sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

Then, you can build and install the command line tools as follows.

% git clone https://github.com/google/sentencepiece.git

% cd sentencepiece

% mkdir build

% cd build

% cmake ..

% make -j $(nproc)

% sudo make install

% sudo ldconfig -v

On OSX/macOS, replace the last command with sudo update_dyld_shared_cache.

3. Build and install using vcpkg

You can download and install sentencepiece using the vcpkg dependency manager:

git clone https://github.com/Microsoft/vcpkg.git

cd vcpkg

./bootstrap-vcpkg.sh

./vcpkg integrate install

./vcpkg install sentencepiece

The sentencepiece port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please create an issue or pull request on the vcpkg repository.

#### Instructions

1. xxxx
2. xxxx
3. xxxx
1. Train SentencePiece Model

% spm_train --input=<input> --model_prefix=<model_name> --vocab_size=8000 --character_coverage=1.0 --model_type=<type>

* --input: one-sentence-per-line raw corpus file. No need to run the tokenizer, normalizer, or preprocessor. By default, SentencePiece normalizes the input with Unicode NFKC. You can pass a comma-separated list of files.

* --model_prefix: output model name prefix. <model_name>.model and <model_name>.vocab are generated.

* --vocab_size: vocabulary size, e.g., 8000, 16000, or 32000

* --character_coverage: amount of characters covered by the model. Good defaults are 0.9995 for languages with a rich character set like Japanese or Chinese, and 1.0 for other languages with a small character set.

* --model_type: model type. Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using the word type.

2. Encode raw text into sentence pieces/ids

% spm_encode --model=<model_file> --output_format=piece < input > output

% spm_encode --model=<model_file> --output_format=id < input > output

Use the --extra_options flag to insert the BOS/EOS markers or reverse the input sequence.

% spm_encode --extra_options=eos (add </s> only)

% spm_encode --extra_options=bos:eos (add <s> and </s>)

% spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>)

SentencePiece supports nbest segmentation and segmentation sampling with the --output_format=(nbest|sample)_(piece|id) flags.

% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output

% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output

3. Decode sentence pieces/ids into raw text

% spm_decode --model=<model_file> --input_format=piece < input > output

% spm_decode --model=<model_file> --input_format=id < input > output

Use the --extra_options flag to decode the text in reverse order.

% spm_decode --extra_options=reverse < input > output
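The same encode/decode round trip can be driven from C++ through the installed headers; a minimal sketch, assuming a model m.model produced by spm_train:

```cpp
#include <iostream>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

int main() {
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load("m.model").ok()) return 1;       // model produced by spm_train

  std::vector<std::string> pieces;
  sp.Encode("I saw a girl with a telescope.", &pieces);  // like --output_format=piece

  std::vector<int> ids;
  sp.Encode("I saw a girl with a telescope.", &ids);     // like --output_format=id

  std::string text;
  sp.Decode(ids, &text);                        // like spm_decode --input_format=id
  std::cout << text << std::endl;
  return 0;
}
```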
4. End-to-End Example

% spm_train --input=data/botchan.txt --model_prefix=m --vocab_size=1000

unigram_model_trainer.cc(494) LOG(INFO) Starts training with :

input: "../data/botchan.txt"

... <snip>

unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091

trainer_interface.cc(272) LOG(INFO) Saving model: m.model

trainer_interface.cc(281) LOG(INFO) Saving vocabs: m.vocab

% echo "I saw a girl with a telescope." | spm_encode --model=m.model

▁I ▁saw ▁a ▁girl ▁with ▁a ▁ te le s c o pe .

% echo "I saw a girl with a telescope." | spm_encode --model=m.model --output_format=id

9 459 11 939 44 11 4 142 82 8 28 21 132 6

% echo "9 459 11 939 44 11 4 142 82 8 28 21 132 6" | spm_decode --model=m.model --input_format=id

I saw a girl with a telescope.

You can see that the original input sentence is restored from the vocabulary id sequence.

5. Export vocabulary list

% spm_export_vocab --model=<model_file> --output=<output file>

<output file> stores a list of vocabulary and emission log probabilities. The vocabulary id corresponds to the line number in this file.

6. Redefine special meta tokens

By default, SentencePiece uses Unknown (<unk>), BOS (<s>) and EOS (</s>) tokens which have the ids of 0, 1, and 2 respectively. We can redefine this mapping in the training phase as follows.

% spm_train --bos_id=0 --eos_id=1 --unk_id=5 --input=... --model_prefix=... --character_coverage=...

When setting an id to -1, e.g., bos_id=-1, that special token is disabled. Note that the unknown id cannot be disabled. We can define an id for padding (<pad>) with --pad_id=3.
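After training with remapped ids, the mapping can be checked programmatically. A small sketch (custom.model is a placeholder name for a model trained with the flags above):

```cpp
#include <iostream>
#include <sentencepiece_processor.h>

int main() {
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load("custom.model").ok()) return 1;

  // The ids reflect whatever --unk_id/--bos_id/--eos_id/--pad_id were used at
  // training time; a token disabled with -1 is reported as -1 here.
  std::cout << "unk=" << sp.unk_id() << " bos=" << sp.bos_id()
            << " eos=" << sp.eos_id() << " pad=" << sp.pad_id() << std::endl;
  return 0;
}
```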
7. Vocabulary restriction

spm_encode accepts --vocabulary and --vocabulary_threshold options so that spm_encode will only produce symbols which also appear in the vocabulary (with at least some frequency).

The usage is basically the same as that of subword-nmt. Assuming that L1 and L2 are the two languages (source/target languages), train the shared spm model and get the resulting vocabulary for each:

% cat {train_file}.L1 {train_file}.L2 | shuffle > train

% spm_train --input=train --model_prefix=spm --vocab_size=8000 --character_coverage=0.9995

% spm_encode --model=spm.model --generate_vocabulary < {train_file}.L1 > {vocab_file}.L1

% spm_encode --model=spm.model --generate_vocabulary < {train_file}.L2 > {vocab_file}.L2

The shuffle command is used just in case, because spm_train loads the first 10M lines of the corpus by default.

Then segment the train/test corpus with the --vocabulary option:

% spm_encode --model=spm.model --vocabulary={vocab_file}.L1 --vocabulary_threshold=50 < {test_file}.L1 > {test_file}.seg.L1

% spm_encode --model=spm.model --vocabulary={vocab_file}.L2 --vocabulary_threshold=50 < {test_file}.L2 > {test_file}.seg.L2
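The same restriction is available to C++ callers through the processor API; a sketch assuming the LoadVocabulary(file, threshold) method declared in sentencepiece_processor.h and a vocabulary file produced with --generate_vocabulary:

```cpp
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

int main() {
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load("spm.model").ok()) return 1;

  // Restrict segmentation to pieces seen at least 50 times in vocab.L1,
  // mirroring: spm_encode --vocabulary=vocab.L1 --vocabulary_threshold=50
  if (!sp.LoadVocabulary("vocab.L1", 50).ok()) return 1;

  std::vector<std::string> pieces;
  sp.Encode("a test sentence", &pieces);  // out-of-vocabulary pieces fall back to smaller units
  return 0;
}
```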
#### Contribution

README.md (170 lines)
@@ -9,15 +9,173 @@ An unsupervised text tokenizer and detokenizer.

#### Installation

1. xxxx
2. xxxx
3. xxxx
1. Python module

SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation. You can install the SentencePiece Python binary package:

% pip install sentencepiece

2. Build and install the SentencePiece command line tools from C++ source

The following tools and libraries are required to build SentencePiece:

* cmake

* C++11 compiler

* gperftools library (optional; a 10-40% performance improvement can be obtained.)

On Ubuntu, the build tools can be installed with apt-get:

% sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

Then, you can build and install the command line tools as follows.

% git clone https://github.com/google/sentencepiece.git

% cd sentencepiece

% mkdir build

% cd build

% cmake ..

% make -j $(nproc)

% sudo make install

% sudo ldconfig -v

On OSX/macOS, replace the last command with sudo update_dyld_shared_cache

3. Build and install using vcpkg

You can download and install sentencepiece using the vcpkg dependency manager:

git clone https://github.com/Microsoft/vcpkg.git

cd vcpkg

./bootstrap-vcpkg.sh

./vcpkg integrate install

./vcpkg install sentencepiece

The sentencepiece port in vcpkg is kept up to date by Microsoft team members and community contributors.

#### Instructions

1. xxxx
2. xxxx
3. xxxx
1. Train a SentencePiece model

% spm_train --input=<input> --model_prefix=<model_name> --vocab_size=8000 --character_coverage=1.0 --model_type=<type>

* --input: one-sentence-per-line raw corpus file. No need to run the tokenizer, normalizer, or preprocessor. By default, SentencePiece normalizes the input with Unicode NFKC. You can pass a comma-separated list of files.

* --model_prefix: output model name prefix. <model_name>.model and <model_name>.vocab are generated.

* --vocab_size: vocabulary size, e.g., 8000, 16000, or 32000

* --character_coverage: amount of characters covered by the model. Good defaults are 0.9995 for languages with a rich character set such as Japanese or Chinese, and 1.0 for other languages with a small character set.

* --model_type: model type. Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using the word type.

2. Encode raw text into sentence pieces/ids

% spm_encode --model=<model_file> --output_format=piece < input > output

% spm_encode --model=<model_file> --output_format=id < input > output

Use the --extra_options flag to insert the BOS/EOS markers or reverse the input order.

% spm_encode --extra_options=eos (add </s> only)

% spm_encode --extra_options=bos:eos (add <s> and </s>)

% spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>)

SentencePiece supports nbest segmentation and segmentation sampling with the --output_format=(nbest|sample)_(piece|id) flags.

% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output

% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output

3. Decode sentence pieces/ids into raw text

% spm_decode --model=<model_file> --input_format=piece < input > output

% spm_decode --model=<model_file> --input_format=id < input > output

Use the --extra_options flag to decode the text in reverse order.

% spm_decode --extra_options=reverse < input > output

4. End-to-End Example

% spm_train --input=data/botchan.txt --model_prefix=m --vocab_size=1000

unigram_model_trainer.cc(494) LOG(INFO) Starts training with :

input: "../data/botchan.txt"

... <snip>

unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091

trainer_interface.cc(272) LOG(INFO) Saving model: m.model

trainer_interface.cc(281) LOG(INFO) Saving vocabs: m.vocab

% echo "I saw a girl with a telescope." | spm_encode --model=m.model

▁I ▁saw ▁a ▁girl ▁with ▁a ▁ te le s c o pe .

% echo "I saw a girl with a telescope." | spm_encode --model=m.model --output_format=id

9 459 11 939 44 11 4 142 82 8 28 21 132 6

% echo "9 459 11 939 44 11 4 142 82 8 28 21 132 6" | spm_decode --model=m.model --input_format=id

I saw a girl with a telescope.

You can see that the original input sentence is restored from the vocabulary id sequence.

5. Export vocabulary list

% spm_export_vocab --model=<model_file> --output=<output file>

<output file> stores a list of vocabulary and emission log probabilities. The vocabulary id corresponds to the line number in this file.

6. Redefine special meta tokens

By default, SentencePiece uses Unknown (<unk>), BOS (<s>) and EOS (</s>) tokens which have the ids of 0, 1, and 2 respectively. We can redefine this mapping in the training phase as follows.

% spm_train --bos_id=0 --eos_id=1 --unk_id=5 --input=... --model_prefix=... --character_coverage=...

When setting an id to -1, e.g., bos_id=-1, that special token is disabled. Note that the unknown id cannot be disabled. We can define an id for padding (<pad>) with --pad_id=3.

7. Vocabulary restriction

spm_encode accepts --vocabulary and --vocabulary_threshold options so that spm_encode will only produce symbols which also appear in the vocabulary (with at least some frequency).

The usage is basically the same as that of subword-nmt. Assuming that L1 and L2 are the two languages (source/target languages), train the shared spm model and get the resulting vocabulary for each:

% cat {train_file}.L1 {train_file}.L2 | shuffle > train

% spm_train --input=train --model_prefix=spm --vocab_size=8000 --character_coverage=0.9995

% spm_encode --model=spm.model --generate_vocabulary < {train_file}.L1 > {vocab_file}.L1

% spm_encode --model=spm.model --generate_vocabulary < {train_file}.L2 > {vocab_file}.L2

The shuffle command is used just in case, because spm_train loads the first 10M lines of the corpus by default.

Then segment the train/test corpus with the --vocabulary option:

% spm_encode --model=spm.model --vocabulary={vocab_file}.L1 --vocabulary_threshold=50 < {test_file}.L1 > {test_file}.seg.L1

% spm_encode --model=spm.model --vocabulary={vocab_file}.L2 --vocabulary_threshold=50 < {test_file}.L2 > {test_file}.seg.L2

#### Contribution

Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch (Normal file, 54 lines)
@@ -0,0 +1,54 @@
From 82b8b6f61403fcfcef673ee49ed2dfe475ba4cf2 Mon Sep 17 00:00:00 2001
From: Sarubi <stsarut@gmail.com>
Date: Tue, 23 Feb 2021 20:47:25 +0530
Subject: [PATCH] Removed codes where Zero Width Joiner replaced with
whitespace.

---
data/nmt_nfkc.tsv | 3 +--
data/nmt_nfkc_cf.tsv | 3 +--
src/builder.cc | 1 -
3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/data/nmt_nfkc.tsv b/data/nmt_nfkc.tsv
index 1ce2b71..5c8b48b 100644
--- a/data/nmt_nfkc.tsv
+++ b/data/nmt_nfkc.tsv
@@ -57263,8 +57263,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
200A 20 # =>
200B 20 # =>
200C 20 # =>
-200D 20 # =>
-200E 20 # =>
+200E 20 # =>
200F 20 # =>
2011 2010 # ‑ => ‐
2017 20 333 # ‗ => ̳
diff --git a/data/nmt_nfkc_cf.tsv b/data/nmt_nfkc_cf.tsv
index 2178882..0d0e708 100644
--- a/data/nmt_nfkc_cf.tsv
+++ b/data/nmt_nfkc_cf.tsv
@@ -57980,8 +57980,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
200A 20 # =>
200B 20 # =>
200C 20 # =>
-200D 20 # =>
-200E 20 # =>
+200E 20 # =>
200F 20 # =>
2011 2010 # ‑ => ‐
2017 20 333 # ‗ => ̳
diff --git a/src/builder.cc b/src/builder.cc
index d9442d3..9f47aac 100644
--- a/src/builder.cc
+++ b/src/builder.cc
@@ -366,7 +366,6 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK
nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER
nfkc_map[{0x200C}] = {0x20}; // ZERO WIDTH NON-JOINER
- nfkc_map[{0x200D}] = {0x20}; // ZERO WIDTH JOINER

// Ascii Control characters
nfkc_map[{0x0001}] = {};
--
Restore-the-sentence-boundary-marker-insertion-for-t.patch (Normal file, 25 lines)
@@ -0,0 +1,25 @@
From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
From: joe <219651+AdolfVonKleist@users.noreply.github.com>
Date: Mon, 22 Mar 2021 17:26:20 +0000
Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
unigram trainer. Dramatically speeds up training time.

---
src/unigram_model_trainer.cc | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
index 5f26771..94c7adb 100644
--- a/src/unigram_model_trainer.cc
+++ b/src/unigram_model_trainer.cc
@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
}
}
+    array.push_back(kSentenceBoundary);  // sentence boundary marker.
}

const node_int_type n = array.size();
--
2.18.0.huawei.25
fix_of_an_unattainable_condition.patch (Normal file, 22 lines)
@@ -0,0 +1,22 @@
diff --git a/third_party/esaxx/sais.hxx b/third_party/esaxx/sais.hxx
index f1702f8..b9071c8 100644
--- a/third_party/esaxx/sais.hxx
+++ b/third_party/esaxx/sais.hxx
@@ -179,7 +179,7 @@ typedef typename std::iterator_traits<string_type>::value_type char_type;
sort all the S-substrings */
if(fs < (maxthreads * k)) {
index_type *C, *B;
-    if((C = new index_type[maxthreads * k]) == 0) { return -2; }
+    C = new index_type[maxthreads * k];
B = (1 < maxthreads) ? C + k : C;
getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */
#ifdef _OPENMP
@@ -271,7 +271,7 @@ typedef typename std::iterator_traits<string_type>::value_type char_type;
/* stage 3: induce the result for the original problem */
if(fs < (maxthreads * k)) {
index_type *B, *C;
-    if((C = new index_type[maxthreads * k]) == 0) { return -2; }
+    C = new index_type[maxthreads * k];
B = (1 < maxthreads) ? C + k : C;
/* put all left-most S characters into their buckets */
getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */
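The removed check could never fire: a plain new-expression reports allocation failure by throwing std::bad_alloc rather than returning a null pointer, so comparing the result against 0 is an unattainable condition. Only the nothrow form makes such a check meaningful, as the short illustration below shows.

```cpp
#include <iostream>
#include <new>

int main() {
  // Plain new: on failure it throws std::bad_alloc, so comparing the
  // result against 0 (as the removed code did) is an unreachable branch.
  try {
    int *p = new int[16];
    delete[] p;
  } catch (const std::bad_alloc &) {
    std::cerr << "allocation failed\n";
  }

  // Only the nothrow form returns a null pointer on failure,
  // which is when a check like `== 0` actually makes sense.
  int *q = new (std::nothrow) int[16];
  if (q == nullptr) {
    std::cerr << "allocation failed\n";
  }
  delete[] q;
  return 0;
}
```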
only-install-proto-headers-if-not-using-builtin-prot.patch (Normal file, 29 lines)
@@ -0,0 +1,29 @@
From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001
From: mark <erasaur@gmail.com>
Date: Wed, 10 Feb 2021 10:59:56 -0800
Subject: [PATCH 6/7] only install proto headers if not using builtin proto

---
src/CMakeLists.txt | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 87765e5..3d31259 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
-install(FILES sentencepiece_trainer.h sentencepiece_processor.h
+install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS}
DESTINATION ${CMAKE_INSTALL_INCDIR})
+if (NOT SPM_USE_BUILTIN_PROTOBUF)
+  install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR})
+endif()

file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir)

--
2.18.0.huawei.25
sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch (Normal file, 27 lines)
@@ -0,0 +1,27 @@
From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001
From: Aaron Burke <aaburke@microsoft.com>
Date: Fri, 21 Aug 2020 10:15:42 -0700
Subject: [PATCH 2/7] sentencepiece.pc should be installed from
CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included
(and installed) from other projects

---
CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6481dfd..9124f9e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h")
configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY)

if (NOT MSVC)
-  install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR})
--
2.18.0.huawei.25
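Once sentencepiece.pc is installed into the pkgconfig directory, downstream C++ code can be compiled against the library without hard-coded paths. A minimal consumer sketch (demo.cc and m.model are placeholder names):

```cpp
// Build (assuming pkg-config can locate the installed sentencepiece.pc):
//   g++ demo.cc $(pkg-config --cflags --libs sentencepiece) -o demo
#include <iostream>
#include <sentencepiece_processor.h>

int main() {
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load("m.model").ok()) return 1;  // placeholder model path
  std::cout << "vocab size: " << sp.GetPieceSize() << std::endl;
  return 0;
}
```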
@@ -1,13 +1,22 @@
Name: sentencepiece
Version: 0.1.92
Release: 1
Release: 6
Summary: An unsupervised text tokenizer and detokenizer
License: Apache-2.0
URL: https://github.com/google/sentencepiece
Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz
BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf protobuf-devel
Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
Patch1: fix_of_an_unattainable_condition.patch
Patch2: Added-split_digits-to-SentencePieceTrainer.patch
Patch3: sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
Patch4: Create-options.md.patch
Patch5: Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
Patch6: Add-missing-include-for-BYTE_ORDER.patch
Patch7: only-install-proto-headers-if-not-using-builtin-prot.patch
Patch8: Restore-the-sentence-boundary-marker-insertion-for-t.patch
BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf
BuildRequires: cmake >= 3.14.0
Requires: protobuf-devel protobuf protobuf-compiler
Requires: protobuf protobuf-compiler

%description
SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation
@@ -26,11 +35,13 @@ cmake ../../ -DCMAKE_CXX_FLAGS="-D_FORTIFY_SOURCE=2 -O2 -Wno-unused-result -Wno-
-DCMAKE_BUILD_TYPE=Release\
-DSPM_USE_BUILTIN_PROTOBUF=ON\
-DSPM_ENABLE_SHARED=OFF\
-DSPM_BUILD_TEST=ON\
-DCMAKE_INSTALL_LIBDIR=%{buildroot}%{_libdir}\
-DCMAKE_INSTALL_BINDIR=%{buildroot}%{_bindir}\
-DCMAKE_INSTALL_INCDIR=%{buildroot}%{_includedir}

make -j24 V=1
make CTEST_OUTPUT_ON_FAILURE=1 test
cd ../../

%install
@@ -42,8 +53,24 @@ make install
%{_bindir}/spm_*
%{_libdir}/*.a
%{_libdir}/pkgconfig/*
%{_includedir}/sentencepiece_*.h
%{_includedir}/sentencepiece*.h

%changelog
* Fri Nov 27 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92.6
- Fix split_digits support to SentencepieceTrainer spec parser
- Add sentencepiece.pc install
- Add spm_train --help option
- Fix FTBFS problem on armel/mips/powerpc/m68k/sh4
- Fix endian problem on android platform
- Fix pb protobuf header file can't find problem
- Restore the sentence boundary
* Tue Nov 16 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92.5
- add README.md/README.en.md
* Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-4
- fix of an unattainable condition
* Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-3
- Prevent Zero Width Joiner replaced with whitespace
* Wed Sep 29 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-2
- add test cases
* Wed Sep 23 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-1
- package init