Compare commits

..

10 Commits

Author SHA1 Message Date
openeuler-ci-bot
1fbe368c5c
!42 Fix CVE-2024-5206
From: @starlet-dx 
Reviewed-by: @cherry530 
Signed-off-by: @cherry530
2024-06-17 06:50:03 +00:00
starlet-dx
2d672cc801 Fix CVE-2024-5206 2024-06-17 10:45:08 +08:00
openeuler-ci-bot
6dc1ddbea9
!28 upgrade openEuler-22.03-LTS-Next python-scikit-learn 1.1.1
From: @cherry530 
Reviewed-by: @caodongxia 
Signed-off-by: @caodongxia
2022-10-25 12:44:47 +00:00
cherry530
81384c143e Upgrade 1.1.1
Signed-off-by: cherry530 <xuping33@huawei.com>
2022-10-25 17:30:44 +08:00
openeuler-ci-bot
d337a99fdc
!27 [Lightweight PR] Fix the incorrect date in the changelog
From: @konglidong 
Reviewed-by: @shinwell_hu 
Signed-off-by: @shinwell_hu
2022-07-11 08:35:07 +00:00
konglidong
a4d04f163d modify bogus date in changelog 2022-06-20 10:09:50 +08:00
openeuler-ci-bot
213a736cbb !17 python-scikit-learn delete -Sgit from %autosetup, and delete BuildRequires git
From: @chenyanpanHW
Reviewed-by: @shinwell_hu
Signed-off-by: @shinwell_hu
2021-08-11 09:33:03 +00:00
chenyanpanHW
ea72cb8d6b
delete -Sgit from %autosetup, and delete BuildRequires git 2021-07-30 23:30:31 +08:00
openeuler-ci-bot
99fc7c7611 !15 Add g++ build dependency
From: @run-is-pig
Reviewed-by: @shinwell_hu
Signed-off-by: @shinwell_hu
2021-06-25 06:51:24 +00:00
yangl777
1448d4912b Add g++ build dependcy 2021-06-25 09:11:35 +08:00
4 changed files with 226 additions and 4 deletions

backport-CVE-2024-5206.patch (new file)

@@ -0,0 +1,207 @@
From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Mon, 22 Apr 2024 15:10:46 +0200
Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
vectorizer (#28823)
Origin:
https://github.com/scikit-learn/scikit-learn/commit/70ca21f106b603b611da73012c9ade7cd8e438b8
---
sklearn/feature_extraction/tests/test_text.py | 42 -------------------
sklearn/feature_extraction/text.py | 36 +---------------
2 files changed, 2 insertions(+), 76 deletions(-)
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index b46958c..ac55021 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -764,21 +764,11 @@ def test_feature_names(get_names):
@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
def test_vectorizer_max_features(Vectorizer):
expected_vocabulary = {"burger", "beer", "salad", "pizza"}
- expected_stop_words = {
- "celeri",
- "tomato",
- "copyright",
- "coke",
- "sparkling",
- "water",
- "the",
- }
# test bounded number of extracted features
vectorizer = Vectorizer(max_df=0.6, max_features=4)
vectorizer.fit(ALL_FOOD_DOCS)
assert set(vectorizer.vocabulary_) == expected_vocabulary
- assert vectorizer.stop_words_ == expected_stop_words
# TODO: Remove in 1.2 when get_feature_names is removed.
@@ -816,21 +806,16 @@ def test_vectorizer_max_df():
vect.fit(test_data)
assert "a" in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
- assert len(vect.stop_words_) == 0
vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
vect.fit(test_data)
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
- assert "a" in vect.stop_words_
- assert len(vect.stop_words_) == 2
vect.max_df = 1
vect.fit(test_data)
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
- assert "a" in vect.stop_words_
- assert len(vect.stop_words_) == 2
def test_vectorizer_min_df():
@@ -839,21 +824,16 @@ def test_vectorizer_min_df():
vect.fit(test_data)
assert "a" in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
- assert len(vect.stop_words_) == 0
vect.min_df = 2
vect.fit(test_data)
assert "c" not in vect.vocabulary_.keys() # {bcdt} ignored
assert len(vect.vocabulary_.keys()) == 2 # {ae} remain
- assert "c" in vect.stop_words_
- assert len(vect.stop_words_) == 4
vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
vect.fit(test_data)
assert "c" not in vect.vocabulary_.keys() # {bcdet} ignored
assert len(vect.vocabulary_.keys()) == 1 # {a} remains
- assert "c" in vect.stop_words_
- assert len(vect.stop_words_) == 5
@pytest.mark.parametrize(
@@ -1195,28 +1175,6 @@ def test_countvectorizer_vocab_dicts_when_pickling(get_names):
assert_array_equal(getattr(cv, get_names)(), getattr(unpickled_cv, get_names)())
-def test_stop_words_removal():
- # Ensure that deleting the stop_words_ attribute doesn't affect transform
-
- fitted_vectorizers = (
- TfidfVectorizer().fit(JUNK_FOOD_DOCS),
- CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
- CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
- )
-
- for vect in fitted_vectorizers:
- vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- vect.stop_words_ = None
- stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- delattr(vect, "stop_words_")
- stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- assert_array_equal(stop_None_transform, vect_transform)
- assert_array_equal(stop_del_transform, vect_transform)
-
-
def test_pickling_transformer():
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
orig = TfidfTransformer().fit(X)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b565aea..2735ded 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1040,15 +1040,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
True if a fixed vocabulary of term to indices mapping
is provided by the user.
- stop_words_ : set
- Terms that were ignored because they either:
-
- - occurred in too many documents (`max_df`)
- - occurred in too few documents (`min_df`)
- - were cut off by feature selection (`max_features`).
-
- This is only available if no vocabulary was given.
-
See Also
--------
HashingVectorizer : Convert a collection of text documents to a
@@ -1057,12 +1048,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
TfidfVectorizer : Convert a collection of raw documents to a matrix
of TF-IDF features.
- Notes
- -----
- The ``stop_words_`` attribute can get large and increase the model size
- when pickling. This attribute is provided only for introspection and can
- be safely removed using delattr or set to None before pickling.
-
Examples
--------
>>> from sklearn.feature_extraction.text import CountVectorizer
@@ -1175,19 +1160,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
mask = new_mask
new_indices = np.cumsum(mask) - 1 # maps old indices to new
- removed_terms = set()
for term, old_index in list(vocabulary.items()):
if mask[old_index]:
vocabulary[term] = new_indices[old_index]
else:
del vocabulary[term]
- removed_terms.add(term)
kept_indices = np.where(mask)[0]
if len(kept_indices) == 0:
raise ValueError(
"After pruning, no terms remain. Try a lower min_df or a higher max_df."
)
- return X[:, kept_indices], removed_terms
+ return X[:, kept_indices]
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
@@ -1352,7 +1335,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
raise ValueError("max_df corresponds to < documents than min_df")
if max_features is not None:
X = self._sort_features(X, vocabulary)
- X, self.stop_words_ = self._limit_features(
+ X = self._limit_features(
X, vocabulary, max_doc_count, min_doc_count, max_features
)
if max_features is None:
@@ -1882,15 +1865,6 @@ class TfidfVectorizer(CountVectorizer):
The inverse document frequency (IDF) vector; only defined
if ``use_idf`` is True.
- stop_words_ : set
- Terms that were ignored because they either:
-
- - occurred in too many documents (`max_df`)
- - occurred in too few documents (`min_df`)
- - were cut off by feature selection (`max_features`).
-
- This is only available if no vocabulary was given.
-
See Also
--------
CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
@@ -1898,12 +1872,6 @@ class TfidfVectorizer(CountVectorizer):
TfidfTransformer : Performs the TF-IDF transformation from a provided
matrix of counts.
- Notes
- -----
- The ``stop_words_`` attribute can get large and increase the model size
- when pickling. This attribute is provided only for introspection and can
- be safely removed using delattr or set to None before pickling.
-
Examples
--------
>>> from sklearn.feature_extraction.text import TfidfVectorizer
--
2.33.0
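In short, the backported fix stops the vectorizers from recording pruned terms on the fitted estimator: terms dropped by max_df, min_df, or max_features are simply discarded instead of being stored in a stop_words_ attribute. A minimal sketch of the post-patch behaviour (the corpus and parameter values below are illustrative, not taken from the package tests):

# Post-patch behaviour sketch: pruned terms are dropped outright instead of
# being kept on the estimator in ``stop_words_``.
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "the pizza burger beer",
    "the pizza salad coke",
    "the beer salad water",
]

vect = CountVectorizer(max_df=0.6, max_features=4)
X = vect.fit_transform(docs)

print(sorted(vect.vocabulary_))       # only the kept terms survive fitting
print(hasattr(vect, "stop_words_"))   # False on a build carrying this patch
print(X.shape)                        # transform output is unchanged by the fix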

python-scikit-learn.spec

@@ -2,11 +2,12 @@
 Name: python-scikit-learn
 Summary: A Python module for machine learning built on top of SciPy
-Version: 0.24.0
+Version: 1.1.1
 Release: 2
 License: BSD
 URL: https://scikit-learn.org/stable/
 Source0: https://github.com/scikit-learn/scikit-learn/archive/%{version}/scikit-learn-%{version}.tar.gz
+Patch3000: backport-CVE-2024-5206.patch
 %global _description\
 scikit-learn is a Python module for machine learning built on top of SciPy\
@@ -16,15 +17,17 @@ and is distributed under the 3-Clause BSD license.\
 %package -n python3-scikit-learn
 Summary: %summary
-BuildRequires: git python3-devel python3-numpy python3-Cython python3-pytest python3-scipy
+%{?python_provide:%python_provide python3-scikit-learn}
+%{?python_provide:%python_provide python3-sklearn}
+BuildRequires: python3-devel python3-numpy python3-Cython python3-pytest python3-scipy g++
 Requires: python3 >= 3.5 python3-numpy >= 1.11.0
 Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
 %description -n python3-scikit-learn %_description
 %prep
-%autosetup -n scikit-learn-%{version} -p1 -Sgit
+%autosetup -n scikit-learn-%{version} -p1
 %build
 CFLAGS="$RPM_OPT_FLAGS -s"
@@ -40,13 +43,25 @@ CFLAGS="$RPM_OPT_FLAGS -s"
 %{python3_sitearch}/scikit_learn-*.egg-info
 %changelog
+* Mon Jun 17 2024 yaoxin <yao_xin001@hoperun.com> - 1.1.1-2
+- Fix CVE-2024-5206
+* Tue Oct 25 2022 xu_ping <xuping33@h-partners.com> - 1.1.1-1
+- Upgrade to version 1.1.1
+* Fri Jul 30 2021 chenyanpanHW <chenyanpan@huawei.com> - 0.24.0-4
+- DESC: delete -Sgit from %autosetup, and delete BuildRequires git
+* Wed Jun 23 2021 liudabo <liudabo1@huawei.com> - 0.24.0-3
+- Add g++ build dependcy
 * Sat Mar 20 2021 shixuantong <shixuantong@huawei> - 0.24.0-2
 - strip binary files
 * Wed Jan 13 2021 SimpleUpdate Robot <tc@openeuler.org> - 0.24.0-1
 - Upgrade to version 0.24.0
-* Fri Oct 1 2020 Zhipeng Xie <xiezhipeng1@huawei.com> - 0.23.1-2
+* Thu Oct 1 2020 Zhipeng Xie <xiezhipeng1@huawei.com> - 0.23.1-2
 - upgrade to 0.23.2
 * Fri Jul 31 2020 Zhipeng Xie <xiezhipeng1@huawei.com> - 0.23.1-1
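CVE-2024-5206 concerns fitted vectorizers retaining raw training tokens in stop_words_ and leaking them through pickled models, so a quick way to spot-check a rebuilt package is to fit on a corpus containing a rare token and confirm the token does not survive into the pickle. This is only an illustrative check under assumed inputs, not part of the package's test suite:

# Hypothetical spot-check for the backport: a token pruned by min_df should
# no longer be persisted anywhere on the fitted estimator.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "user alice logged in",
    "user bob logged in",
    "hunter2 is a secret token",   # rare token we would not want persisted
]

vect = TfidfVectorizer(min_df=2).fit(corpus)
blob = pickle.dumps(vect)

print(getattr(vect, "stop_words_", None))   # None once the patch is applied
print(b"hunter2" in blob)                   # False: pruned terms are not pickled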

Binary file not shown.

scikit-learn-1.1.1.tar.gz (new file)

Binary file not shown.