!6 上游社区PR回合

From: @xiefangqi Reviewed-by: @ouwenchang Signed-off-by: @ouwenchang
2021-11-27 11:50:28 +00:00 · 2021-11-27 11:50:28 +00:00 · c25e44a078
commit c25e44a078
parent 5363370455 28d259de7b
8 changed files with 252 additions and 2 deletions
--- a/Add-missing-include-for-BYTE_ORDER.patch
+++ b/Add-missing-include-for-BYTE_ORDER.patch
@@ -0,0 +1,30 @@
+From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001
+From: Malcolm Smith <smith@chaquo.com>
+Date: Tue, 12 Jan 2021 13:43:28 +0000
+Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER
+
+---
+ src/util.h | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/src/util.h b/src/util.h
+index bf8a758..1680f4b 100644
+--- a/src/util.h
+++ b/src/util.h
+@@ -36,6 +36,13 @@
+ #include <pthread.h>
+ #endif
+ 
++#if !defined(__APPLE__) && !defined(_WIN32)
++#include <endian.h>
++#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN
++#define IS_BIG_ENDIAN
++#endif
++#endif
++
+ namespace sentencepiece {
+ 
+ template <typename T>
+-- 
+2.18.0.huawei.25
+
--- a/Added-split_digits-to-SentencePieceTrainer.patch
+++ b/Added-split_digits-to-SentencePieceTrainer.patch
@@ -0,0 +1,24 @@
+From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001
+From: mingruimingrui <mingruimingrui@hotmail.com>
+Date: Sat, 27 Jun 2020 02:56:03 +0800
+Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer
+
+---
+ src/spec_parser.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/src/spec_parser.h b/src/spec_parser.h
+index 729e036..6dd054b 100644
+--- a/src/spec_parser.h
+++ b/src/spec_parser.h
+@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name,
+   PARSE_BOOL(split_by_unicode_script);
+   PARSE_BOOL(split_by_number);
+   PARSE_BOOL(split_by_whitespace);
++  PARSE_BOOL(split_digits);
+   PARSE_BOOL(treat_whitespace_as_suffix);
+   PARSE_REPEATED_STRING(control_symbols);
+   PARSE_REPEATED_STRING(user_defined_symbols);
+-- 
+2.18.0.huawei.25
+
--- a/Create-options.md.patch
+++ b/Create-options.md.patch
@@ -0,0 +1,70 @@
+From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001
+From: stephantul <stephantul@gmail.com>
+Date: Thu, 1 Oct 2020 12:49:13 +0200
+Subject: [PATCH 3/7] Create options.md
+
+---
+ doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 51 insertions(+)
+ create mode 100644 doc/options.md
+
+diff --git a/doc/options.md b/doc/options.md
+new file mode 100644
+index 0000000..7861fdc
+--- /dev/null
+++ b/doc/options.md
+@@ -0,0 +1,51 @@
++# Training options
++
++The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here.
++
++```
++--help (show help)  type: bool default: false
++--version (show version)  type: bool default: false
++--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere)  type: int default: 0
++--input (comma separated list of input sentences)  type: std::string default: ""
++--input_format (Input format. Supported format is `text` or `tsv`.)  type: std::string default: ""
++--model_prefix (output model prefix)  type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char)  type: std::string default: "unigram"
++--vocab_size (vocabulary size)  type: int32 default: 8000
++--accept_language (comma-separated list of languages this model can accept)  type: std::string default: ""
++--self_test_sample_size (the size of self test samples)  type: int32 default: 0
++--character_coverage (character coverage to determine the minimum symbols)  type: double default: 0.9995
++--input_sentence_size (maximum size of sentences the trainer loads)  type: int32 default: 0
++--shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0)  type: bool default: true
++--seed_sentencepiece_size (the size of seed sentencepieces)  type: int32 default: 1000000
++--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss)  type: double default: 0.75
++--num_threads (number of threads for training)  type: int32 default: 16
++--num_sub_iterations (number of EM sub-iterations)  type: int32 default: 2
++--max_sentencepiece_length (maximum length of sentence piece)  type: int32 default: 16
++--max_sentence_length (maximum length of sentence in byte)  type: int32 default: 4192
++--split_by_unicode_script (use Unicode script to split sentence pieces)  type: bool default: true
++--split_by_number (split tokens by numbers (0-9))  type: bool default: true
++--split_by_whitespace (use a white space to split sentence pieces)  type: bool default: true
++--split_digits (split all digits (0-9) into separate pieces)  type: bool default: false
++--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.)  type: bool default: false
++--control_symbols (comma separated list of control symbols)  type: std::string default: ""
++--user_defined_symbols (comma separated list of user defined symbols)  type: std::string default: ""
++--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage)  type: std::string default: ""
++--byte_fallback (decompose unknown pieces into UTF-8 byte pieces)  type: bool default: false
++--vocabulary_output_piece_score (Define score in vocab file)  type: bool default: true
++--normalization_rule_name (Normalization rule name. Choose from nfkc or identity)  type: std::string default: "nmt_nfkc"
++--normalization_rule_tsv (Normalization rule TSV file. )  type: std::string default: ""
++--denormalization_rule_tsv (Denormalization rule TSV file.)  type: std::string default: ""
++--add_dummy_prefix (Add dummy whitespace at the beginning of text)  type: bool default: true
++--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace)  type: bool default: true
++--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.)  type: bool default: true
++--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.)  type: bool default: false
++--unk_id (Override UNK (<unk>) id.)  type: int32 default: 0
++--bos_id (Override BOS (<s>) id. Set -1 to disable BOS.)  type: int32 default: 1
++--eos_id (Override EOS (</s>) id. Set -1 to disable EOS.)  type: int32 default: 2
++--pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.)  type: int32 default: -1
++--unk_piece (Override UNK (<unk>) piece.)  type: std::string default: "<unk>"
++--bos_piece (Override BOS (<s>) piece.)  type: std::string default: "<s>"
++--eos_piece (Override EOS (</s>) piece.)  type: std::string default: "</s>"
++--pad_piece (Override PAD (<pad>) piece.)  type: std::string default: "<pad>"
++--unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.)  type: std::string default: " ⁇ "
++--train_extremely_large_corpus (Increase bit depth for unigram tokenization.)  type: bool default: false
++```
+-- 
+2.18.0.huawei.25
+
--- a/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
+++ b/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
@@ -0,0 +1,30 @@
+From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001
+From: Kentaro Hayashi <hayashi@clear-code.com>
+Date: Sat, 17 Oct 2020 16:54:20 +0900
+Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4
+
+---
+ src/CMakeLists.txt | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index 511b2ec..87765e5 100644
+--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
+@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static
+ if (SPM_ENABLE_SHARED)
+   target_link_libraries(sentencepiece ${SPM_LIBS})
+   target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece)
++  if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR
++      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR
++      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR
++      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR
++      (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4"))
++    list(APPEND SPM_LIBS "atomic")
++  endif()
+   set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static)
+   set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0)
+   set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES)
+-- 
+2.18.0.huawei.25
+
--- a/Restore-the-sentence-boundary-marker-insertion-for-t.patch
+++ b/Restore-the-sentence-boundary-marker-insertion-for-t.patch
@@ -0,0 +1,25 @@
+From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001
+From: joe <219651+AdolfVonKleist@users.noreply.github.com>
+Date: Mon, 22 Mar 2021 17:26:20 +0000
+Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the
+ unigram trainer.  Dramatically speeds up training time.
+
+---
+ src/unigram_model_trainer.cc | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
+index 5f26771..94c7adb 100644
+--- a/src/unigram_model_trainer.cc
+++ b/src/unigram_model_trainer.cc
+@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
+         all_chars[string_util::UnicodeCharToUTF8(c)] += w.second;
+       }
+     }
++    array.push_back(kSentenceBoundary);  // sentence boundary marker.
+   }
+ 
+   const node_int_type n = array.size();
+-- 
+2.18.0.huawei.25
+
--- a/only-install-proto-headers-if-not-using-builtin-prot.patch
+++ b/only-install-proto-headers-if-not-using-builtin-prot.patch
@@ -0,0 +1,29 @@
+From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001
+From: mark <erasaur@gmail.com>
+Date: Wed, 10 Feb 2021 10:59:56 -0800
+Subject: [PATCH 6/7] only install proto headers if not using builtin proto
+
+---
+ src/CMakeLists.txt | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index 87765e5..3d31259 100644
+--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
+@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS}
+   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+-install(FILES sentencepiece_trainer.h sentencepiece_processor.h
++install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS}
+   DESTINATION ${CMAKE_INSTALL_INCDIR})
++if (NOT SPM_USE_BUILTIN_PROTOBUF)
++  install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR})
++endif()
+ 
+ file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir)
+ 
+-- 
+2.18.0.huawei.25
+
--- a/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
+++ b/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
@@ -0,0 +1,27 @@
+From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001
+From: Aaron Burke <aaburke@microsoft.com>
+Date: Fri, 21 Aug 2020 10:15:42 -0700
+Subject: [PATCH 2/7] sentencepiece.pc should be installed from
+ CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included
+ (and installed) from other projects
+
+---
+ CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 6481dfd..9124f9e 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h")
+ configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY)
+ 
+ if (NOT MSVC)
+-  install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
++  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+ endif()
+ 
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR})
+-- 
+2.18.0.huawei.25
+
--- a/sentencepiece.spec
+++ b/sentencepiece.spec
@@ -1,12 +1,19 @@
 Name:		sentencepiece
 Version:	0.1.92
-Release:	5
+Release:	6
 Summary:	An unsupervised text tokenizer and detokenizer
 License:	Apache-2.0
 URL:		https://github.com/google/sentencepiece
 Source0:	https://github.com/google/sentencepiece/archive/v%{version}.tar.gz
 Patch0:         Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
 Patch1:         fix_of_an_unattainable_condition.patch
+Patch2:         Added-split_digits-to-SentencePieceTrainer.patch
+Patch3:         sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch
+Patch4:         Create-options.md.patch
+Patch5:         Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch
+Patch6:         Add-missing-include-for-BYTE_ORDER.patch
+Patch7:         only-install-proto-headers-if-not-using-builtin-prot.patch
+Patch8:         Restore-the-sentence-boundary-marker-insertion-for-t.patch
 BuildRequires:	gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf
 BuildRequires:  cmake >= 3.14.0
 Requires:       protobuf protobuf-compiler
@@ -46,9 +53,17 @@ make install
 %{_bindir}/spm_*
 %{_libdir}/*.a
 %{_libdir}/pkgconfig/*
-%{_includedir}/sentencepiece_*.h
+%{_includedir}/sentencepiece*.h

 %changelog
+* Sat Nov 27 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-6
+- Add split_digits support to the SentencePieceTrainer spec parser
+- Install sentencepiece.pc from CMAKE_CURRENT_BINARY_DIR instead of CMAKE_BINARY_DIR
+- Add doc/options.md listing the spm_train training options
+- Fix FTBFS problem on armel/mips/powerpc/m68k/sh4
+- Fix endian problem on android platform
+- Fix protobuf (pb) header files not being found by installing them
+- Restore the sentence boundary marker insertion for the unigram trainer
 * Tue Nov 16 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92.5
 - add README.md/README.en.md
 * Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-4