add patch to version v0.1.92-6

This commit is contained in:
xiefangqi 2021-11-27 19:30:16 +08:00
parent 5363370455
commit 28d259de7b
8 changed files with 252 additions and 2 deletions

View File

@ -0,0 +1,30 @@
From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001
From: Malcolm Smith <smith@chaquo.com>
Date: Tue, 12 Jan 2021 13:43:28 +0000
Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER
---
src/util.h | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/src/util.h b/src/util.h
index bf8a758..1680f4b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -36,6 +36,13 @@
#include <pthread.h>
#endif
+#if !defined(__APPLE__) && !defined(_WIN32)
+#include <endian.h>
+#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif
+#endif
+
namespace sentencepiece {
template <typename T>
--
2.18.0.huawei.25

View File

@ -0,0 +1,24 @@
From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001
From: mingruimingrui <mingruimingrui@hotmail.com>
Date: Sat, 27 Jun 2020 02:56:03 +0800
Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer
---
src/spec_parser.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/spec_parser.h b/src/spec_parser.h
index 729e036..6dd054b 100644
--- a/src/spec_parser.h
+++ b/src/spec_parser.h
@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name,
PARSE_BOOL(split_by_unicode_script);
PARSE_BOOL(split_by_number);
PARSE_BOOL(split_by_whitespace);
+ PARSE_BOOL(split_digits);
PARSE_BOOL(treat_whitespace_as_suffix);
PARSE_REPEATED_STRING(control_symbols);
PARSE_REPEATED_STRING(user_defined_symbols);
--
2.18.0.huawei.25

70
Create-options.md.patch Normal file
View File

@ -0,0 +1,70 @@
From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001
From: stephantul <stephantul@gmail.com>
Date: Thu, 1 Oct 2020 12:49:13 +0200
Subject: [PATCH 3/7] Create options.md
---
doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
create mode 100644 doc/options.md
diff --git a/doc/options.md b/doc/options.md
new file mode 100644
index 0000000..7861fdc
--- /dev/null
+++ b/doc/options.md
@@ -0,0 +1,51 @@
+# Training options
+
+The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here.
+
+```
+--help (show help) type: bool default: false
+--version (show version) type: bool default: false
+--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0
+--input (comma separated list of input sentences) type: std::string default: ""
+--input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: ""
+--model_prefix (output model prefix) type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram"
+--vocab_size (vocabulary size) type: int32 default: 8000
+--accept_language (comma-separated list of languages this model can accept) type: std::string default: ""
+--self_test_sample_size (the size of self test samples) type: int32 default: 0
+--character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995
+--input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0
+--shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true
+--seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000
+--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75
+--num_threads (number of threads for training) type: int32 default: 16
+--num_sub_iterations (number of EM sub-iterations) type: int32 default: 2
+--max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16
+--max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192
+--split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true
+--split_by_number (split tokens by numbers (0-9)) type: bool default: true
+--split_by_whitespace (use a white space to split sentence pieces) type: bool default: true
+--split_digits (split all digits (0-9) into separate pieces) type: bool default: false
+--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false
+--control_symbols (comma separated list of control symbols) type: std::string default: ""
+--user_defined_symbols (comma separated list of user defined symbols) type: std::string default: ""
+--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: ""
+--byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false
+--vocabulary_output_piece_score (Define score in vocab file) type: bool default: true
+--normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc"
+--normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: ""
+--denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: ""
+--add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true
+--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true
+--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
+--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false
+--unk_id (Override UNK (<unk>) id.) type: int32 default: 0
+--bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1
+--eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2
+--pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1
+--unk_piece (Override UNK (<unk>) piece.) type: std::string default: "<unk>"
+--bos_piece (Override BOS (<s>) piece.) type: std::string default: "<s>"
+--eos_piece (Override EOS (</s>) piece.) type: std::string default: "</s>"
+--pad_piece (Override PAD (<pad>) piece.) type: std::string default: "<pad>"
+--unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: std::string default: " ⁇ "
+--train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false
+```
--
2.18.0.huawei.25

View File

@ -0,0 +1,30 @@
From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001
From: Kentaro Hayashi <hayashi@clear-code.com>
Date: Sat, 17 Oct 2020 16:54:20 +0900
Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4
---
src/CMakeLists.txt | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 511b2ec..87765e5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static
if (SPM_ENABLE_SHARED)
target_link_libraries(sentencepiece ${SPM_LIBS})
target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece)
+ if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR
+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4"))
+ list(APPEND SPM_LIBS "atomic")
+ endif()
set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static)
set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0)
set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES)
--
2.18.0.huawei.25

View File

@ -0,0 +1,25 @@
From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
From: joe <219651+AdolfVonKleist@users.noreply.github.com>
Date: Mon, 22 Mar 2021 17:26:20 +0000
Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
unigram trainer. Dramatically speeds up training time.
---
src/unigram_model_trainer.cc | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
index 5f26771..94c7adb 100644
--- a/src/unigram_model_trainer.cc
+++ b/src/unigram_model_trainer.cc
@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
}
}
+ array.push_back(kSentenceBoundary); // sentence boundary marker.
}
const node_int_type n = array.size();
--
2.18.0.huawei.25

View File

@ -0,0 +1,29 @@
From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001
From: mark <erasaur@gmail.com>
Date: Wed, 10 Feb 2021 10:59:56 -0800
Subject: [PATCH 6/7] only install proto headers if not using builtin proto
---
src/CMakeLists.txt | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 87765e5..3d31259 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
-install(FILES sentencepiece_trainer.h sentencepiece_processor.h
+install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS}
DESTINATION ${CMAKE_INSTALL_INCDIR})
+if (NOT SPM_USE_BUILTIN_PROTOBUF)
+ install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR})
+endif()
file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir)
--
2.18.0.huawei.25

View File

@ -0,0 +1,27 @@
From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001
From: Aaron Burke <aaburke@microsoft.com>
Date: Fri, 21 Aug 2020 10:15:42 -0700
Subject: [PATCH 2/7] sentencepiece.pc should be installed from
CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included
(and installed) from other projects
---
CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6481dfd..9124f9e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h")
configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY)
if (NOT MSVC)
- install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR})
--
2.18.0.huawei.25

View File

@ -1,12 +1,19 @@
Name: sentencepiece
Version: 0.1.92
Release: 5
Release: 6
Summary: An unsupervised text tokenizer and detokenizer
License: Apache-2.0
URL: https://github.com/google/sentencepiece
Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz
Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
Patch1: fix_of_an_unattainable_condition.patch
Patch2: Added-split_digits-to-SentencePieceTrainer.patch
Patch3: sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
Patch4: Create-options.md.patch
Patch5: Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
Patch6: Add-missing-include-for-BYTE_ORDER.patch
Patch7: only-install-proto-headers-if-not-using-builtin-prot.patch
Patch8: Restore-the-sentence-boundary-marker-insertion-for-t.patch
BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf
BuildRequires: cmake >= 3.14.0
Requires: protobuf protobuf-compiler
@ -46,9 +53,17 @@ make install
%{_bindir}/spm_*
%{_libdir}/*.a
%{_libdir}/pkgconfig/*
%{_includedir}/sentencepiece_*.h
%{_includedir}/sentencepiece*.h
%changelog
* Sat Nov 27 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-6
- Add split_digits support to the SentencePieceTrainer spec parser
- Install sentencepiece.pc from CMAKE_CURRENT_BINARY_DIR
- Add doc/options.md documenting the spm_train options
- Fix FTBFS problem on armel/mips/powerpc/m68k/sh4
- Fix endian problem on Android platform
- Fix protobuf (pb) header files not being found
- Restore the sentence boundary marker insertion for the unigram trainer
* Tue Nov 16 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92.5
- add README.md/README.en.md
* Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-4