diff --git a/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch b/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch new file mode 100644 index 0000000..7691daa --- /dev/null +++ b/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch @@ -0,0 +1,54 @@ +From 82b8b6f61403fcfcef673ee49ed2dfe475ba4cf2 Mon Sep 17 00:00:00 2001 +From: Sarubi +Date: Tue, 23 Feb 2021 20:47:25 +0530 +Subject: [PATCH] Removed codes where Zero Width Joiner replaced with + whitespace. + +--- + data/nmt_nfkc.tsv | 3 +-- + data/nmt_nfkc_cf.tsv | 3 +-- + src/builder.cc | 1 - + 3 files changed, 2 insertions(+), 5 deletions(-) + +diff --git a/data/nmt_nfkc.tsv b/data/nmt_nfkc.tsv +index 1ce2b71..5c8b48b 100644 +--- a/data/nmt_nfkc.tsv ++++ b/data/nmt_nfkc.tsv +@@ -57263,8 +57263,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ + 200A 20 #   => + 200B 20 # ​ => + 200C 20 # ‌ => +-200D 20 # ‍ => +-200E 20 # ‎ => ++200E 20 # ‎ => + 200F 20 # ‏ => + 2011 2010 # ‑ => ‐ + 2017 20 333 # ‗ => ̳ +diff --git a/data/nmt_nfkc_cf.tsv b/data/nmt_nfkc_cf.tsv +index 2178882..0d0e708 100644 +--- a/data/nmt_nfkc_cf.tsv ++++ b/data/nmt_nfkc_cf.tsv +@@ -57980,8 +57980,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ + 200A 20 #   => + 200B 20 # ​ => + 200C 20 # ‌ => +-200D 20 # ‍ => +-200E 20 # ‎ => ++200E 20 # ‎ => + 200F 20 # ‏ => + 2011 2010 # ‑ => ‐ + 2017 20 333 # ‗ => ̳ +diff --git a/src/builder.cc b/src/builder.cc +index d9442d3..9f47aac 100644 +--- a/src/builder.cc ++++ b/src/builder.cc +@@ -366,7 +366,6 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) { + nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK + nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER + nfkc_map[{0x200C}] = {0x20}; // ZERO WIDTH NON-JOINER +- nfkc_map[{0x200D}] = {0x20}; // ZERO WIDTH JOINER + + // Ascii Control characters + nfkc_map[{0x0001}] = {}; +-- + diff --git a/sentencepiece.spec b/sentencepiece.spec index 5f8de81..6f1120a 100644 --- a/sentencepiece.spec +++ b/sentencepiece.spec @@ -1,10 +1,11 @@ Name: sentencepiece Version: 0.1.92 
-Release: 2 +Release: 3 Summary: An unsupervised text tokenizer and detokenizer License: Apache-2.0 URL: https://github.com/google/sentencepiece Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz +Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf BuildRequires: cmake >= 3.14.0 Requires: protobuf protobuf-compiler @@ -47,6 +48,8 @@ make install %{_includedir}/sentencepiece_*.h %changelog +* Tue Nov 2 2021 xiefangqi - 0.1.92-3 +- Prevent Zero Width Joiner from being replaced with whitespace * Wed Sep 29 2021 xiefangqi - 0.1.92-2 - add test cases * Wed Sep 23 2021 xiefangqi - 0.1.92-1