!6 上游社区PR回合
From: @xiefangqi Reviewed-by: @ouwenchang Signed-off-by: @ouwenchang
This commit is contained in:
commit
c25e44a078
30
Add-missing-include-for-BYTE_ORDER.patch
Normal file
30
Add-missing-include-for-BYTE_ORDER.patch
Normal file
@ -0,0 +1,30 @@
|
||||
From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001
|
||||
From: Malcolm Smith <smith@chaquo.com>
|
||||
Date: Tue, 12 Jan 2021 13:43:28 +0000
|
||||
Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER
|
||||
|
||||
---
|
||||
src/util.h | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
diff --git a/src/util.h b/src/util.h
|
||||
index bf8a758..1680f4b 100644
|
||||
--- a/src/util.h
|
||||
+++ b/src/util.h
|
||||
@@ -36,6 +36,13 @@
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
+#if !defined(__APPLE__) && !defined(_WIN32)
|
||||
+#include <endian.h>
|
||||
+#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN
|
||||
+#define IS_BIG_ENDIAN
|
||||
+#endif
|
||||
+#endif
|
||||
+
|
||||
namespace sentencepiece {
|
||||
|
||||
template <typename T>
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
24
Added-split_digits-to-SentencePieceTrainer.patch
Normal file
24
Added-split_digits-to-SentencePieceTrainer.patch
Normal file
@ -0,0 +1,24 @@
|
||||
From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001
|
||||
From: mingruimingrui <mingruimingrui@hotmail.com>
|
||||
Date: Sat, 27 Jun 2020 02:56:03 +0800
|
||||
Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer
|
||||
|
||||
---
|
||||
src/spec_parser.h | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/src/spec_parser.h b/src/spec_parser.h
|
||||
index 729e036..6dd054b 100644
|
||||
--- a/src/spec_parser.h
|
||||
+++ b/src/spec_parser.h
|
||||
@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name,
|
||||
PARSE_BOOL(split_by_unicode_script);
|
||||
PARSE_BOOL(split_by_number);
|
||||
PARSE_BOOL(split_by_whitespace);
|
||||
+ PARSE_BOOL(split_digits);
|
||||
PARSE_BOOL(treat_whitespace_as_suffix);
|
||||
PARSE_REPEATED_STRING(control_symbols);
|
||||
PARSE_REPEATED_STRING(user_defined_symbols);
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
70
Create-options.md.patch
Normal file
70
Create-options.md.patch
Normal file
@ -0,0 +1,70 @@
|
||||
From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001
|
||||
From: stephantul <stephantul@gmail.com>
|
||||
Date: Thu, 1 Oct 2020 12:49:13 +0200
|
||||
Subject: [PATCH 3/7] Create options.md
|
||||
|
||||
---
|
||||
doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 51 insertions(+)
|
||||
create mode 100644 doc/options.md
|
||||
|
||||
diff --git a/doc/options.md b/doc/options.md
|
||||
new file mode 100644
|
||||
index 0000000..7861fdc
|
||||
--- /dev/null
|
||||
+++ b/doc/options.md
|
||||
@@ -0,0 +1,51 @@
|
||||
+# Training options
|
||||
+
|
||||
+The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here.
|
||||
+
|
||||
+```
|
||||
+--help (show help) type: bool default: false
|
||||
+--version (show version) type: bool default: false
|
||||
+--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0
|
||||
+--input (comma separated list of input sentences) type: std::string default: ""
|
||||
+--input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: ""
|
||||
+--model_prefix (output model prefix) type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram"
|
||||
+--vocab_size (vocabulary size) type: int32 default: 8000
|
||||
+--accept_language (comma-separated list of languages this model can accept) type: std::string default: ""
|
||||
+--self_test_sample_size (the size of self test samples) type: int32 default: 0
|
||||
+--character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995
|
||||
+--input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0
|
||||
+--shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true
|
||||
+--seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000
|
||||
+--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75
|
||||
+--num_threads (number of threads for training) type: int32 default: 16
|
||||
+--num_sub_iterations (number of EM sub-iterations) type: int32 default: 2
|
||||
+--max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16
|
||||
+--max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192
|
||||
+--split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true
|
||||
+--split_by_number (split tokens by numbers (0-9)) type: bool default: true
|
||||
+--split_by_whitespace (use a white space to split sentence pieces) type: bool default: true
|
||||
+--split_digits (split all digits (0-9) into separate pieces) type: bool default: false
|
||||
+--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false
|
||||
+--control_symbols (comma separated list of control symbols) type: std::string default: ""
|
||||
+--user_defined_symbols (comma separated list of user defined symbols) type: std::string default: ""
|
||||
+--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: ""
|
||||
+--byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false
|
||||
+--vocabulary_output_piece_score (Define score in vocab file) type: bool default: true
|
||||
+--normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc"
|
||||
+--normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: ""
|
||||
+--denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: ""
|
||||
+--add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true
|
||||
+--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true
|
||||
+--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
|
||||
+--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false
|
||||
+--unk_id (Override UNK (<unk>) id.) type: int32 default: 0
|
||||
+--bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1
|
||||
+--eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2
|
||||
+--pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1
|
||||
+--unk_piece (Override UNK (<unk>) piece.) type: std::string default: "<unk>"
|
||||
+--bos_piece (Override BOS (<s>) piece.) type: std::string default: "<s>"
|
||||
+--eos_piece (Override EOS (</s>) piece.) type: std::string default: "</s>"
|
||||
+--pad_piece (Override PAD (<pad>) piece.) type: std::string default: "<pad>"
|
||||
+--unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: std::string default: " ⁇ "
|
||||
+--train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false
|
||||
+```
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
30
Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
Normal file
30
Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
Normal file
@ -0,0 +1,30 @@
|
||||
From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001
|
||||
From: Kentaro Hayashi <hayashi@clear-code.com>
|
||||
Date: Sat, 17 Oct 2020 16:54:20 +0900
|
||||
Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4
|
||||
|
||||
---
|
||||
src/CMakeLists.txt | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 511b2ec..87765e5 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static
|
||||
if (SPM_ENABLE_SHARED)
|
||||
target_link_libraries(sentencepiece ${SPM_LIBS})
|
||||
target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece)
|
||||
+ if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR
|
||||
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR
|
||||
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR
|
||||
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR
|
||||
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4"))
|
||||
+ list(APPEND SPM_LIBS "atomic")
|
||||
+ endif()
|
||||
set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static)
|
||||
set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0)
|
||||
set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES)
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
25
Restore-the-sentence-boundary-marker-insertion-for-t.patch
Normal file
25
Restore-the-sentence-boundary-marker-insertion-for-t.patch
Normal file
@ -0,0 +1,25 @@
|
||||
From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
|
||||
From: joe <219651+AdolfVonKleist@users.noreply.github.com>
|
||||
Date: Mon, 22 Mar 2021 17:26:20 +0000
|
||||
Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
|
||||
unigram trainer. Dramatically speeds up training time.
|
||||
|
||||
---
|
||||
src/unigram_model_trainer.cc | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
|
||||
index 5f26771..94c7adb 100644
|
||||
--- a/src/unigram_model_trainer.cc
|
||||
+++ b/src/unigram_model_trainer.cc
|
||||
@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
|
||||
all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
|
||||
}
|
||||
}
|
||||
+ array.push_back(kSentenceBoundary); // sentence boundary marker.
|
||||
}
|
||||
|
||||
const node_int_type n = array.size();
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
29
only-install-proto-headers-if-not-using-builtin-prot.patch
Normal file
29
only-install-proto-headers-if-not-using-builtin-prot.patch
Normal file
@ -0,0 +1,29 @@
|
||||
From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001
|
||||
From: mark <erasaur@gmail.com>
|
||||
Date: Wed, 10 Feb 2021 10:59:56 -0800
|
||||
Subject: [PATCH 6/7] only install proto headers if not using builtin proto
|
||||
|
||||
---
|
||||
src/CMakeLists.txt | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 87765e5..3d31259 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
-install(FILES sentencepiece_trainer.h sentencepiece_processor.h
|
||||
+install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS}
|
||||
DESTINATION ${CMAKE_INSTALL_INCDIR})
|
||||
+if (NOT SPM_USE_BUILTIN_PROTOBUF)
|
||||
+ install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR})
|
||||
+endif()
|
||||
|
||||
file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir)
|
||||
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
27
sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
Normal file
27
sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
Normal file
@ -0,0 +1,27 @@
|
||||
From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001
|
||||
From: Aaron Burke <aaburke@microsoft.com>
|
||||
Date: Fri, 21 Aug 2020 10:15:42 -0700
|
||||
Subject: [PATCH 2/7] sentencepiece.pc should be installed from
|
||||
CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included
|
||||
(and installed) from other projects
|
||||
|
||||
---
|
||||
CMakeLists.txt | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 6481dfd..9124f9e 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h")
|
||||
configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY)
|
||||
|
||||
if (NOT MSVC)
|
||||
- install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
endif()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR})
|
||||
--
|
||||
2.18.0.huawei.25
|
||||
|
||||
@ -1,12 +1,19 @@
|
||||
Name: sentencepiece
|
||||
Version: 0.1.92
|
||||
Release: 5
|
||||
Release: 6
|
||||
Summary: An unsupervised text tokenizer and detokenizer
|
||||
License: Apache-2.0
|
||||
URL: https://github.com/google/sentencepiece
|
||||
Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz
|
||||
Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
|
||||
Patch1: fix_of_an_unattainable_condition.patch
|
||||
Patch2: Added-split_digits-to-SentencePieceTrainer.patch
|
||||
Patch3: sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
|
||||
Patch4: Create-options.md.patch
|
||||
Patch5: Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
|
||||
Patch6: Add-missing-include-for-BYTE_ORDER.patch
|
||||
Patch7: only-install-proto-headers-if-not-using-builtin-prot.patch
|
||||
Patch8: Restore-the-sentence-boundary-marker-insertion-for-t.patch
|
||||
BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf
|
||||
BuildRequires: cmake >= 3.14.0
|
||||
Requires: protobuf protobuf-compiler
|
||||
@ -46,9 +53,17 @@ make install
|
||||
%{_bindir}/spm_*
|
||||
%{_libdir}/*.a
|
||||
%{_libdir}/pkgconfig/*
|
||||
%{_includedir}/sentencepiece_*.h
|
||||
%{_includedir}/sentencepiece*.h
|
||||
|
||||
%changelog
|
||||
* Sat Nov 27 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-6
|
||||
- Add split_digits support to the SentencePieceTrainer spec parser
|
||||
- Install sentencepiece.pc from CMAKE_CURRENT_BINARY_DIR instead of CMAKE_BINARY_DIR
|
||||
- Add doc/options.md listing the spm_train training options
|
||||
- Fix FTBFS problem on armel/mips/powerpc/m68k/sh4
|
||||
- Fix endian problem on Android platform
|
||||
- Fix protobuf (pb) header files not being found
|
||||
- Restore the sentence boundary marker insertion for the unigram trainer
|
||||
* Tue Nov 16 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-5
|
||||
- add README.md/README.en.md
|
||||
* Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-4
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user