diff --git a/Add-missing-include-for-BYTE_ORDER.patch b/Add-missing-include-for-BYTE_ORDER.patch new file mode 100644 index 0000000..3bc5b75 --- /dev/null +++ b/Add-missing-include-for-BYTE_ORDER.patch @@ -0,0 +1,30 @@ +From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001 +From: Malcolm Smith +Date: Tue, 12 Jan 2021 13:43:28 +0000 +Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER + +--- + src/util.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/src/util.h b/src/util.h +index bf8a758..1680f4b 100644 +--- a/src/util.h ++++ b/src/util.h +@@ -36,6 +36,13 @@ + #include + #endif + ++#if !defined(__APPLE__) && !defined(_WIN32) ++#include ++#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN ++#define IS_BIG_ENDIAN ++#endif ++#endif ++ + namespace sentencepiece { + + template +-- +2.18.0.huawei.25 + diff --git a/Added-split_digits-to-SentencePieceTrainer.patch b/Added-split_digits-to-SentencePieceTrainer.patch new file mode 100644 index 0000000..7695587 --- /dev/null +++ b/Added-split_digits-to-SentencePieceTrainer.patch @@ -0,0 +1,24 @@ +From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001 +From: mingruimingrui +Date: Sat, 27 Jun 2020 02:56:03 +0800 +Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer + +--- + src/spec_parser.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/spec_parser.h b/src/spec_parser.h +index 729e036..6dd054b 100644 +--- a/src/spec_parser.h ++++ b/src/spec_parser.h +@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name, + PARSE_BOOL(split_by_unicode_script); + PARSE_BOOL(split_by_number); + PARSE_BOOL(split_by_whitespace); ++ PARSE_BOOL(split_digits); + PARSE_BOOL(treat_whitespace_as_suffix); + PARSE_REPEATED_STRING(control_symbols); + PARSE_REPEATED_STRING(user_defined_symbols); +-- +2.18.0.huawei.25 + diff --git a/Create-options.md.patch b/Create-options.md.patch new file mode 100644 index 
0000000..6f8eb50 --- /dev/null +++ b/Create-options.md.patch @@ -0,0 +1,70 @@ +From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001 +From: stephantul +Date: Thu, 1 Oct 2020 12:49:13 +0200 +Subject: [PATCH 3/7] Create options.md + +--- + doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 51 insertions(+) + create mode 100644 doc/options.md + +diff --git a/doc/options.md b/doc/options.md +new file mode 100644 +index 0000000..7861fdc +--- /dev/null ++++ b/doc/options.md +@@ -0,0 +1,51 @@ ++# Training options ++ ++The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here. ++ ++``` ++--help (show help) type: bool default: false ++--version (show version) type: bool default: false ++--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 ++--input (comma separated list of input sentences) type: std::string default: "" ++--input_format (Input format. Supported format is `text` or `tsv`.) type: std::string default: "" ++--model_prefix (output model prefix) type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" ++--vocab_size (vocabulary size) type: int32 default: 8000 ++--accept_language (comma-separated list of languages this model can accept) type: std::string default: "" ++--self_test_sample_size (the size of self test samples) type: int32 default: 0 ++--character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 ++--input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 ++--shuffle_input_sentence (Randomly sample input sentences in advance. 
Valid when --input_sentence_size > 0) type: bool default: true ++--seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 ++--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 ++--num_threads (number of threads for training) type: int32 default: 16 ++--num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 ++--max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 ++--max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 ++--split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true ++--split_by_number (split tokens by numbers (0-9)) type: bool default: true ++--split_by_whitespace (use a white space to split sentence pieces) type: bool default: true ++--split_digits (split all digits (0-9) into separate pieces) type: bool default: false ++--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) type: bool default: false ++--control_symbols (comma separated list of control symbols) type: std::string default: "" ++--user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" ++--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" ++--byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false ++--vocabulary_output_piece_score (Define score in vocab file) type: bool default: true ++--normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" ++--normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" ++--denormalization_rule_tsv (Denormalization rule TSV file.) 
type: std::string default: "" ++--add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true ++--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true ++--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true ++--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false ++--unk_id (Override UNK (<unk>) id.) type: int32 default: 0 ++--bos_id (Override BOS (<s>) id. Set -1 to disable BOS.) type: int32 default: 1 ++--eos_id (Override EOS (</s>) id. Set -1 to disable EOS.) type: int32 default: 2 ++--pad_id (Override PAD (<pad>) id. Set -1 to disable PAD.) type: int32 default: -1 ++--unk_piece (Override UNK (<unk>) piece.) type: std::string default: "<unk>" ++--bos_piece (Override BOS (<s>) piece.) type: std::string default: "<s>" ++--eos_piece (Override EOS (</s>) piece.) type: std::string default: "</s>" ++--pad_piece (Override PAD (<pad>) piece.) type: std::string default: "<pad>" ++--unk_surface (Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`.) type: std::string default: " ⁇ " ++--train_extremely_large_corpus (Increase bit depth for unigram tokenization.) 
type: bool default: false ++``` +-- +2.18.0.huawei.25 + diff --git a/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch b/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch new file mode 100644 index 0000000..1353431 --- /dev/null +++ b/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch @@ -0,0 +1,30 @@ +From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001 +From: Kentaro Hayashi +Date: Sat, 17 Oct 2020 16:54:20 +0900 +Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4 + +--- + src/CMakeLists.txt | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 511b2ec..87765e5 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static + if (SPM_ENABLE_SHARED) + target_link_libraries(sentencepiece ${SPM_LIBS}) + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) ++ if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR ++ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR ++ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR ++ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR ++ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4")) ++ list(APPEND SPM_LIBS "atomic") ++ endif() + set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) + set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) + set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) +-- +2.18.0.huawei.25 + diff --git a/Restore-the-sentence-boundary-marker-insertion-for-t.patch b/Restore-the-sentence-boundary-marker-insertion-for-t.patch new file mode 100644 index 0000000..8d53ad6 --- /dev/null +++ b/Restore-the-sentence-boundary-marker-insertion-for-t.patch @@ -0,0 +1,25 @@ +From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001 +From: joe <219651+AdolfVonKleist@users.noreply.github.com> +Date: Mon, 22 
Mar 2021 17:26:20 +0000 +Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the + unigram trainer. Dramatically speeds up training time. + +--- + src/unigram_model_trainer.cc | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc +index 5f26771..94c7adb 100644 +--- a/src/unigram_model_trainer.cc ++++ b/src/unigram_model_trainer.cc +@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const { + all_chars[string_util::UnicodeCharToUTF8(c)] += w.second; + } + } ++ array.push_back(kSentenceBoundary); // sentence boundary marker. + } + + const node_int_type n = array.size(); +-- +2.18.0.huawei.25 + diff --git a/only-install-proto-headers-if-not-using-builtin-prot.patch b/only-install-proto-headers-if-not-using-builtin-prot.patch new file mode 100644 index 0000000..9a72915 --- /dev/null +++ b/only-install-proto-headers-if-not-using-builtin-prot.patch @@ -0,0 +1,29 @@ +From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001 +From: mark +Date: Wed, 10 Feb 2021 10:59:56 -0800 +Subject: [PATCH 6/7] only install proto headers if not using builtin proto + +--- + src/CMakeLists.txt | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 87765e5..3d31259 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) +-install(FILES sentencepiece_trainer.h sentencepiece_processor.h ++install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS} + DESTINATION ${CMAKE_INSTALL_INCDIR}) ++if (NOT SPM_USE_BUILTIN_PROTOBUF) ++ install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR}) ++endif() + + file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir) + +-- 
+2.18.0.huawei.25 + diff --git a/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch b/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch new file mode 100644 index 0000000..ec371fe --- /dev/null +++ b/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch @@ -0,0 +1,27 @@ +From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001 +From: Aaron Burke +Date: Fri, 21 Aug 2020 10:15:42 -0700 +Subject: [PATCH 2/7] sentencepiece.pc should be installed from + CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included + (and installed) from other projects + +--- + CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 6481dfd..9124f9e 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") + configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY) + + if (NOT MSVC) +- install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) ++ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + endif() + + include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR}) +-- +2.18.0.huawei.25 + diff --git a/sentencepiece.spec b/sentencepiece.spec index 599a3d8..4bf4dfd 100644 --- a/sentencepiece.spec +++ b/sentencepiece.spec @@ -1,12 +1,19 @@ Name: sentencepiece Version: 0.1.92 -Release: 5 +Release: 6 Summary: An unsupervised text tokenizer and detokenizer License: Apache-2.0 URL: https://github.com/google/sentencepiece Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch Patch1: fix_of_an_unattainable_condition.patch +Patch2: Added-split_digits-to-SentencePieceTrainer.patch +Patch3: sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch +Patch4: Create-options.md.patch 
+Patch5: Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch +Patch6: Add-missing-include-for-BYTE_ORDER.patch +Patch7: only-install-proto-headers-if-not-using-builtin-prot.patch +Patch8: Restore-the-sentence-boundary-marker-insertion-for-t.patch BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf BuildRequires: cmake >= 3.14.0 Requires: protobuf protobuf-compiler @@ -46,9 +53,17 @@ make install %{_bindir}/spm_* %{_libdir}/*.a %{_libdir}/pkgconfig/* -%{_includedir}/sentencepiece_*.h +%{_includedir}/sentencepiece*.h %changelog +* Sat Nov 27 2021 xiefangqi - 0.1.92.6 +- Fix split_digits support to SentencepieceTrainer spec parser +- Add sentencepiece.pc install +- Add spm_train --help option +- Fix FTBFS problem on armel/mips/powerpc/m68k/sh4 +- Fix endian problem on android platform +- Fix protobuf header files not being found +- Restore the sentence boundary * Tue Nov 16 2021 xiefangqi - 0.1.92.5 - add README.md/README.en.md * Tue Nov 2 2021 xiefangqi - 0.1.92-4