add patch0 to sentencepiece
This commit is contained in:
parent
95279e8ba7
commit
e743a48e03
54
Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
Normal file
54
Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
Normal file
@ -0,0 +1,54 @@
|
||||
From 82b8b6f61403fcfcef673ee49ed2dfe475ba4cf2 Mon Sep 17 00:00:00 2001
|
||||
From: Sarubi <stsarut@gmail.com>
|
||||
Date: Tue, 23 Feb 2021 20:47:25 +0530
|
||||
Subject: [PATCH] Removed codes where Zero Width Joiner replaced with
|
||||
whitespace.
|
||||
|
||||
---
|
||||
data/nmt_nfkc.tsv | 3 +--
|
||||
data/nmt_nfkc_cf.tsv | 3 +--
|
||||
src/builder.cc | 1 -
|
||||
3 files changed, 2 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/data/nmt_nfkc.tsv b/data/nmt_nfkc.tsv
|
||||
index 1ce2b71..5c8b48b 100644
|
||||
--- a/data/nmt_nfkc.tsv
|
||||
+++ b/data/nmt_nfkc.tsv
|
||||
@@ -57263,8 +57263,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
|
||||
200A 20 # =>
|
||||
200B 20 # =>
|
||||
200C 20 # =>
|
||||
-200D 20 # =>
|
||||
-200E 20 # =>
|
||||
+200E 20 # =>
|
||||
200F 20 # =>
|
||||
2011 2010 # ‑ => ‐
|
||||
2017 20 333 # ‗ => ̳
|
||||
diff --git a/data/nmt_nfkc_cf.tsv b/data/nmt_nfkc_cf.tsv
|
||||
index 2178882..0d0e708 100644
|
||||
--- a/data/nmt_nfkc_cf.tsv
|
||||
+++ b/data/nmt_nfkc_cf.tsv
|
||||
@@ -57980,8 +57980,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
|
||||
200A 20 # =>
|
||||
200B 20 # =>
|
||||
200C 20 # =>
|
||||
-200D 20 # =>
|
||||
-200E 20 # =>
|
||||
+200E 20 # =>
|
||||
200F 20 # =>
|
||||
2011 2010 # ‑ => ‐
|
||||
2017 20 333 # ‗ => ̳
|
||||
diff --git a/src/builder.cc b/src/builder.cc
|
||||
index d9442d3..9f47aac 100644
|
||||
--- a/src/builder.cc
|
||||
+++ b/src/builder.cc
|
||||
@@ -366,7 +366,6 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
|
||||
nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK
|
||||
nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER
|
||||
nfkc_map[{0x200C}] = {0x20}; // ZERO WIDTH NON-JOINER
|
||||
- nfkc_map[{0x200D}] = {0x20}; // ZERO WIDTH JOINER
|
||||
|
||||
// Ascii Control characters
|
||||
nfkc_map[{0x0001}] = {};
|
||||
--
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
Name: sentencepiece
|
||||
Version: 0.1.92
|
||||
Release: 2
|
||||
Release: 3
|
||||
Summary: An unsupervised text tokenizer and detokenizer
|
||||
License: Apache-2.0
|
||||
URL: https://github.com/google/sentencepiece
|
||||
Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz
|
||||
Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch
|
||||
BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf
|
||||
BuildRequires: cmake >= 3.14.0
|
||||
Requires: protobuf protobuf-compiler
|
||||
@ -47,6 +48,8 @@ make install
|
||||
%{_includedir}/sentencepiece_*.h
|
||||
|
||||
%changelog
|
||||
* Tue Nov 2 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-3
|
||||
- Prevent Zero Width Joiner from being replaced with whitespace
|
||||
* Wed Sep 29 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-2
|
||||
- add test cases
|
||||
* Thu Sep 23 2021 xiefangqi <xiefangqi2@huawei.com> - 0.1.92-1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user