From aa2bf56129077c562f274774182574e16e66e80a Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Wed, 4 Aug 2021 19:11:14 +0200 Subject: [PATCH 01/21] fixes typos and wording in ENN script --- .../_edited_nearest_neighbours.py | 62 +++++++++---------- imblearn/utils/_validation.py | 10 +-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index e0eb866a7..a3b0d514c 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -1,4 +1,4 @@ -"""Class to perform under-sampling based on the edited nearest neighbour +"""Classes to perform under-sampling based on the edited nearest neighbour method.""" # Authors: Guillaume Lemaitre @@ -42,18 +42,18 @@ class EditedNearestNeighbours(BaseCleaningSampler): If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbors. + find the nearest-neighbours. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with the samples of - interest to not be excluded. + - If ``'all'``, all neighbours will have to agree with a sample in order + not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in - order to exclude a sample. + order not to exclude a sample. The strategy `"all"` will be less conservative than `'mode'`. Thus, - more samples will be removed when `kind_sel="all"` generally. + more samples will be removed when `kind_sel="all"`, generally. 
{n_jobs} @@ -120,7 +120,7 @@ def __init__( def _validate_estimator(self): """Validate the estimator created in the ENN.""" self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=1 + "n_neighbors", self.n_neighbors, additional_neighbor=0 ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) @@ -174,7 +174,7 @@ def _more_tags(self): class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Undersample based on the repeated edited nearest neighbour method. - This method will repeat several time the ENN algorithm. + This method will repeat several times the ENN algorithm. Read more in the :ref:`User Guide `. @@ -189,19 +189,18 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): find the nearest-neighbors. max_iter : int, default=100 - Maximum number of iterations of the edited nearest neighbours - algorithm for a single run. + Maximum number of iterations of the edited nearest neighbours algorithm. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with the samples of - interest to not be excluded. + - If ``'all'``, all neighbours will have to agree with a sample in order + not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in - order to exclude a sample. + order not to exclude a sample. The strategy `"all"` will be less conservative than `'mode'`. Thus, - more samples will be removed when `kind_sel="all"` generally. + more samples will be removed when `kind_sel="all"`, generally. {n_jobs} @@ -280,7 +279,7 @@ def _validate_estimator(self): ) self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=1 + "n_neighbors", self.n_neighbors, additional_neighbor=0 ) self.enn_ = EditedNearestNeighbours( @@ -303,11 +302,12 @@ def _fit_resample(self, X, y): prev_len = y_.shape[0] X_enn, y_enn = self.enn_.fit_resample(X_, y_) - # Check the stopping criterion - # 1. 
If there is no changes for the vector y - # 2. If the number of samples in the other class become inferior to - # the number of samples in the majority class - # 3. If one of the class is disappearing + # Check the stopping criterion: + # 1. If there are no changes in the vector y + # (that is, if no further observations are removed) + # 2. If the number of samples in any of the other (majority) classes becomes + # smaller than the number of samples in the minority class + # 3. If one of the classes disappears # Case 1 b_conv = prev_len == y_enn.shape[0] @@ -359,8 +359,8 @@ def _more_tags(self): class AllKNN(BaseCleaningSampler): """Undersample based on the AllKNN method. - This method will apply ENN several time and will vary the number of nearest - neighbours. + This method will apply ENN several times increasing the number of nearest + neighbours at each round. Read more in the :ref:`User Guide `. @@ -377,13 +377,13 @@ class AllKNN(BaseCleaningSampler): kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with the samples of - interest to not be excluded. + - If ``'all'``, all neighbours will have to agree with a sample in order + not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in - order to exclude a sample. + order not to exclude a sample. The strategy `"all"` will be less conservative than `'mode'`. Thus, - more samples will be removed when `kind_sel="all"` generally. + more samples will be removed when `kind_sel="all"`, generally. 
allow_minority : bool, default=False If ``True``, it allows the majority classes to become the minority @@ -460,7 +460,7 @@ def _validate_estimator(self): raise NotImplementedError self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=1 + "n_neighbors", self.n_neighbors, additional_neighbor=0 ) self.enn_ = EditedNearestNeighbours( @@ -484,10 +484,10 @@ def _fit_resample(self, X, y): X_enn, y_enn = self.enn_.fit_resample(X_, y_) - # Check the stopping criterion - # 1. If the number of samples in the other class become inferior to - # the number of samples in the majority class - # 2. If one of the class is disappearing + # Stopping criterion: + # 1. If the number of samples in any of the majority classes ends up + # smaller than the number of samples in the minority class + # 2. If one of the classes disappears # Case 1else: stats_enn = Counter(y_enn) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 7eb4099ea..533ed7c22 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -68,12 +68,12 @@ def _transfrom_one(self, array, props): def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): - """Check the objects is consistent to be a NN. + """Check the object is consistent with a NN. - Several methods in imblearn relies on NN. Until version 0.4, these + Several methods in imblearn rely on NN. Until version 0.4, these objects can be passed at initialisation as an integer or a - KNeighborsMixin. After only KNeighborsMixin will be accepted. This - utility allows for type checking and raise if the type is wrong. + KNeighborsMixin. After, only KNeighborsMixin will be accepted. This + utility allows for type checking and raises an error if the type is wrong. Parameters ---------- @@ -84,7 +84,7 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): The object to be checked. 
additional_neighbor : int, default=0 - Sometimes, some algorithm need an additional neighbors. + Some algorithms need an additional neighbor. Returns ------- From 651a3602131f3021eaf9b8b883169e03eb17f59e Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Wed, 4 Aug 2021 19:24:18 +0200 Subject: [PATCH 02/21] fixes tests to match bug fix --- .../tests/test_edited_nearest_neighbours.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 44999ddb5..7b56bd120 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -49,7 +49,7 @@ def test_enn_init(): def test_enn_fit_resample(): - enn = EditedNearestNeighbours() + enn = EditedNearestNeighbours(n_neighbors=4) X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( @@ -63,13 +63,14 @@ def test_enn_fit_resample(): [0.52726792, -0.38735648], ] ) + assert enn.nn_.n_neighbors == 4 y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_fit_resample_mode(): - enn = EditedNearestNeighbours(kind_sel="mode") + enn = EditedNearestNeighbours(n_neighbors=4, kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( @@ -90,6 +91,7 @@ def test_enn_fit_resample_mode(): [0.2821046, -0.07862747], ] ) + assert enn.nn_.n_neighbors == 4 y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -138,8 +140,8 @@ def test_enn_check_kind_selection(): n_samples=1000, n_classes=2, weights=[0.3, 0.7], random_state=0, ) - enn_all = EditedNearestNeighbours(kind_sel="all") - enn_mode = EditedNearestNeighbours(kind_sel="mode") + 
enn_all = EditedNearestNeighbours(kind_sel="all", n_neighbors=4) + enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=4) enn_all.fit_resample(X, y) enn_mode.fit_resample(X, y) From 248f11f66a37e75be31af415173a82e52b4eadee Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Wed, 4 Aug 2021 19:32:51 +0200 Subject: [PATCH 03/21] fixes test RENN --- .../tests/test_repeated_edited_nearest_neighbours.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index a206ec2b3..237724cb1 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -117,7 +117,7 @@ def test_renn_iter_wrong(): def test_renn_fit_resample(): - renn = RepeatedEditedNearestNeighbours() + renn = RepeatedEditedNearestNeighbours(n_neighbors=4) X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( @@ -153,13 +153,15 @@ def test_renn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] ) + assert renn.nn_.n_neighbors == 4 + assert renn.enn_.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter def test_renn_fit_resample_mode_object(): - renn = RepeatedEditedNearestNeighbours(kind_sel="mode") + renn = RepeatedEditedNearestNeighbours(kind_sel="mode", n_neighbors=4) X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( @@ -238,6 +240,8 @@ def test_renn_fit_resample_mode_object(): 2, ] ) + assert renn.nn_.n_neighbors == 4 + assert renn.enn_.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter 
From 5c0cb4a268247c11624d4d24dfe3fb890fbe5da6 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Wed, 4 Aug 2021 20:48:02 +0200 Subject: [PATCH 04/21] [wip] updating allknn and tests --- .../_edited_nearest_neighbours.py | 18 +- .../_prototype_selection/tests/test_allknn.py | 159 ++---------------- 2 files changed, 31 insertions(+), 146 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index a3b0d514c..4cd94afce 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -374,6 +374,9 @@ class AllKNN(BaseCleaningSampler): :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. By default, it will be a 3-NN. + max_iter : int, default=100 + Maximum number of iterations of the edited nearest neighbours algorithm. + kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. @@ -400,6 +403,11 @@ class without early stopping. .. versionadded:: 0.4 + n_iter_ : int + Number of iterations run. + + .. versionadded:: 0.9 + See Also -------- CondensedNearestNeighbour: Under-sampling by condensing samples. 
@@ -444,6 +452,7 @@ def __init__( *, sampling_strategy="auto", n_neighbors=3, + max_iter=100, kind_sel="all", allow_minority=False, n_jobs=None, @@ -453,6 +462,7 @@ def __init__( self.kind_sel = kind_sel self.allow_minority = allow_minority self.n_jobs = n_jobs + self.max_iter = max_iter def _validate_estimator(self): """Create objects required by AllKNN""" @@ -479,11 +489,17 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.arange(X.shape[0], dtype=int) - for curr_size_ngh in range(1, self.nn_.n_neighbors): + curr_size_ngh = self.n_neighbors + + for n_iter in range(self.max_iter): + self.enn_.n_neighbors = curr_size_ngh X_enn, y_enn = self.enn_.fit_resample(X_, y_) + # add a neighbour for the next round + curr_size_ngh = curr_size_ngh + 1 + # Stopping criterion: # 1. If the number of samples in any of the majority classes ends up # smaller than the number of samples in the minority class diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index e4e91b91a..aab6782c5 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -115,59 +115,16 @@ def test_allknn_fit_resample(): [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], - [0.04296502, -0.37981873], - [0.28294738, -1.00125525], - [0.34218094, -0.58781961], - [0.2096964, -0.61814058], - [1.59068979, -0.96622933], - [0.73418199, -0.02222847], - [0.79270821, -0.41386668], - [1.16606871, -0.25641059], - [1.0304995, -0.16955962], - [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], - [0.80541964, -0.34465185], - [0.1732627, -1.61323172], ] ) y_gt = np.array( - [ - 0, - 0, - 0, - 
0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - ] + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) @@ -208,72 +165,28 @@ def test_allknn_fit_resample_mode(): [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [-1.10146139, 0.91782682], + [1.32326943, 0.28393874], + [2.94290565, -0.13986434], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], + [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], + [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], - [1.32319756, -0.13181616], - [0.04296502, -0.37981873], - [0.28294738, -1.00125525], - [0.34218094, -0.58781961], - [0.2096964, -0.61814058], - [1.59068979, -0.96622933], - [0.73418199, -0.02222847], - [0.79270821, -0.41386668], - [1.16606871, -0.25641059], - [1.0304995, -0.16955962], - [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], - [0.80541964, -0.34465185], - [0.1732627, -1.61323172], + [0.69804044, 0.44810796] ] ) y_gt = np.array( - [ - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - ] + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_allknn_fit_resample_with_nn_object(): - nn = NearestNeighbors(n_neighbors=4) + nn = NearestNeighbors(n_neighbors=3) allknn = AllKNN(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) @@ -287,65 +200,21 @@ def test_allknn_fit_resample_with_nn_object(): [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [-1.10146139, 0.91782682], + [1.32326943, 0.28393874], + [2.94290565, -0.13986434], [0.73489726, 0.43915195], 
[-0.28479268, 0.70459548], + [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], + [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], - [1.32319756, -0.13181616], - [0.04296502, -0.37981873], - [0.28294738, -1.00125525], - [0.34218094, -0.58781961], - [0.2096964, -0.61814058], - [1.59068979, -0.96622933], - [0.73418199, -0.02222847], - [0.79270821, -0.41386668], - [1.16606871, -0.25641059], - [1.0304995, -0.16955962], - [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], - [0.80541964, -0.34465185], - [0.1732627, -1.61323172], + [0.69804044, 0.44810796] ] ) y_gt = np.array( - [ - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - ] + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) From fc73cb385b3ee14931ab5c0ccda5a91afdaf8fd0 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 11:40:29 +0200 Subject: [PATCH 05/21] final edits to old version of docstrings --- .../_edited_nearest_neighbours.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 4cd94afce..2916968a1 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -29,7 +29,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): """Undersample based on the edited nearest neighbour method. - This method will clean the database by removing samples close to the + This method will clean the data set by removing samples close to the decision boundary. Read more in the :ref:`User Guide `. 
@@ -40,7 +40,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): n_neighbors : int or object, default=3 If ``int``, size of the neighbourhood to consider to compute the - nearest neighbors. If object, an estimator that inherits from + nearest neighbours. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbours. @@ -48,7 +48,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with a sample in order - not be excluded. + not to be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in order not to exclude a sample. @@ -70,7 +70,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm. - AllKNN : Undersample using ENN and various number of neighbours. + AllKNN : Undersample using ENN and varying number of neighbours. Notes ----- @@ -81,8 +81,8 @@ class EditedNearestNeighbours(BaseCleaningSampler): References ---------- - .. [1] D. Wilson, Asymptotic" Properties of Nearest Neighbor Rules Using - Edited Data," In IEEE Transactions on Systems, Man, and Cybernetrics, + .. [1] D. Wilson, "Asymptotic Properties of Nearest Neighbor Rules Using + Edited Data", in IEEE Transactions on Systems, Man, and Cybernetics, vol. 2 (3), pp. 408-421, 1972. Examples @@ -184,9 +184,9 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): n_neighbors : int or object, default=3 If ``int``, size of the neighbourhood to consider to compute the - nearest neighbors. If object, an estimator that inherits from + nearest neighbours. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbors. + find the nearest-neighbours. max_iter : int, default=100 Maximum number of iterations of the edited nearest neighbours algorithm. 
@@ -195,7 +195,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with a sample in order - not be excluded. + not to be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in order not to exclude a sample. @@ -212,7 +212,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): .. versionadded:: 0.4 n_iter_ : int - Number of iterations run. + Number of iterations that were actually run. .. versionadded:: 0.6 @@ -222,14 +222,14 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): EditedNearestNeighbours : Undersample by editing samples. - AllKNN : Undersample using ENN and various number of neighbours. + AllKNN : Undersample using ENN and varying number of neighbours. Notes ----- - The method is based on [1]_. A one-vs.-rest scheme is used when - sampling a class as proposed in [1]_. + The method is based on [1]_. - Supports multi-class resampling. + Supports multi-class resampling. A one-vs.-rest scheme is used when + sampling a class as proposed in [1]_. References ---------- @@ -370,9 +370,9 @@ class AllKNN(BaseCleaningSampler): n_neighbors : int or estimator object, default=3 If ``int``, size of the neighbourhood to consider to compute the - nearest neighbors. If object, an estimator that inherits from + nearest neighbours. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbors. By default, it will be a 3-NN. + find the nearest-neighbours. By default, it will be a 3-NN. max_iter : int, default=100 Maximum number of iterations of the edited nearest neighbours algorithm. @@ -381,7 +381,7 @@ class AllKNN(BaseCleaningSampler): Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with a sample in order - not be excluded. + not to be excluded. 
- If ``'mode'``, the majority vote of the neighbours will be used in order not to exclude a sample. @@ -403,8 +403,8 @@ class without early stopping. .. versionadded:: 0.4 - n_iter_ : int - Number of iterations run. + n_iter_ : int + Number of iterations that were actually run. .. versionadded:: 0.9 From b8d22928981adbe7a243d3150e8eb62896b01e8b Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 11:46:47 +0200 Subject: [PATCH 06/21] adds stopping criteria of RENN and AllKNN to docstrings --- .../_edited_nearest_neighbours.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 2916968a1..a795542d2 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -174,7 +174,11 @@ def _more_tags(self): class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Undersample based on the repeated edited nearest neighbour method. - This method will repeat several times the ENN algorithm. + This method will repeat the ENN algorithm several times. The repetitions + will stop when i) the maximum number of iterations is reached, or ii) no + more observations are being removed, or iii) one of the majority classes + becomes a minority class or iv) one of the majority classes disappears + from the target after undersampling. Read more in the :ref:`User Guide `. @@ -359,8 +363,13 @@ def _more_tags(self): class AllKNN(BaseCleaningSampler): """Undersample based on the AllKNN method. - This method will apply ENN several times increasing the number of nearest - neighbours at each round. + This method will apply ENN several times, increasing the number + of nearest neighbours by 1 at each round. 
+ + The repetitions will stop when i) the maximum number of iterations + is reached, or ii) one of the majority classes becomes a minority + class or iii) one of the majority classes disappears from the target + after undersampling. Read more in the :ref:`User Guide `. From cc24e4191a6f6645d313df4dbe57e537eff28f11 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 12:14:03 +0200 Subject: [PATCH 07/21] tidies tests ENN --- .../tests/test_edited_nearest_neighbours.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 7b56bd120..da06c3156 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -63,8 +63,9 @@ def test_enn_fit_resample(): [0.52726792, -0.38735648], ] ) - assert enn.nn_.n_neighbors == 4 y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) + + assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -91,8 +92,9 @@ def test_enn_fit_resample_mode(): [0.2821046, -0.07862747], ] ) - assert enn.nn_.n_neighbors == 4 y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) + + assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -121,6 +123,8 @@ def test_enn_fit_resample_with_nn_object(): ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) + + assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) From 749566fd2062d057d35d5855f0caca15bef31599 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 12:23:16 +0200 Subject: [PATCH 08/21] tidies up tests RENN --- .../tests/test_repeated_edited_nearest_neighbours.py | 6 +++++- 1 file changed, 5 
insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index 237724cb1..4b4d1fa6a 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -153,11 +153,13 @@ def test_renn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] ) + assert renn.nn_.n_neighbors == 4 assert renn.enn_.nn_.n_neighbors == 4 + assert renn.n_iter_ == 3 + assert 0 < renn.n_iter_ <= renn.max_iter assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - assert 0 < renn.n_iter_ <= renn.max_iter def test_renn_fit_resample_mode_object(): @@ -328,6 +330,8 @@ def test_renn_fit_resample_mode(): 2, ] ) + assert renn.nn_.n_neighbors == 4 + assert renn.enn_.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter From 4d441f94b8a4dc9319c8eeba0ca19eabb69b6e16 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 12:27:05 +0200 Subject: [PATCH 09/21] add max_iter to test_init_params --- .../tests/test_repeated_edited_nearest_neighbours.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index 4b4d1fa6a..f2b94a731 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -107,6 +107,7 @@ def test_renn_init(): assert renn.n_neighbors == 3 assert renn.kind_sel == "all" assert 
renn.n_jobs is None + assert renn.max_iter == 100 def test_renn_iter_wrong(): From 7cc977da17f93c7afe137234b9f7b0b2a6f09b4d Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 13:02:46 +0200 Subject: [PATCH 10/21] final update allknn sampler and its tests --- .../_edited_nearest_neighbours.py | 4 +++- .../_prototype_selection/tests/test_allknn.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index a795542d2..69f607636 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -498,7 +498,8 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.arange(X.shape[0], dtype=int) - curr_size_ngh = self.n_neighbors + # find current number of neighbours + curr_size_ngh = self.nn_.n_neighbors for n_iter in range(self.max_iter): @@ -540,6 +541,7 @@ def _fit_resample(self, X, y): if b_min_bec_maj or b_remove_maj_class: break + self.n_iter_ = n_iter + 1 X_resampled, y_resampled = X_, y_ return X_resampled, y_resampled diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index aab6782c5..bfef1fb23 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -103,6 +103,15 @@ R_TOL = 1e-4 +def test_allknn_init(): + allknn = AllKNN() + + assert allknn.n_neighbors == 3 + assert allknn.kind_sel == "all" + assert allknn.n_jobs is None + assert allknn.max_iter == 100 + + def test_allknn_fit_resample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_resample(X, Y) @@ -126,6 +135,9 @@ def test_allknn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ) + 
assert allknn.nn_.n_neighbors == 3 + assert allknn.enn_.nn_.n_neighbors == 8 + assert allknn.n_iter_ == 6 assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) @@ -181,6 +193,9 @@ def test_allknn_fit_resample_mode(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) + assert allknn.nn_.n_neighbors == 3 + assert allknn.enn_.nn_.n_neighbors == 17 + assert allknn.n_iter_ == 15 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -216,6 +231,9 @@ def test_allknn_fit_resample_with_nn_object(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) + assert allknn.nn_.n_neighbors == 3 + assert allknn.enn_.nn_.n_neighbors == 17 + assert allknn.n_iter_ == 15 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) From 7eb85e8d10ed3106d8a6067423f292c2de1fdefe Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 14:00:22 +0200 Subject: [PATCH 11/21] fixes test smote_enn --- imblearn/combine/tests/test_smote_enn.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 0162c79b4..7b0e6a2e2 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -48,7 +48,9 @@ def test_sample_regular(): X_gt = np.array( [ + [0.53366841, -0.30312976], [1.52091956, -0.49283504], + [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -57,7 +59,7 @@ def test_sample_regular(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -72,7 +74,9 @@ def test_sample_regular_pass_smote_enn(): X_gt = np.array( [ + [0.53366841, -0.30312976], [1.52091956, -0.49283504], + [1.70580611, 
-0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -81,7 +85,7 @@ def test_sample_regular_pass_smote_enn(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -94,12 +98,13 @@ def test_sample_regular_half(): X_gt = np.array( [ [1.52091956, -0.49283504], + [1.70580611, -0.11219234], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 1, 1, 1]) + y_gt = np.array([0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -109,9 +114,12 @@ def test_validate_estimator_init(): enn = EditedNearestNeighbours(sampling_strategy="all") smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) + X_gt = np.array( [ + [0.53366841, -0.30312976], [1.52091956, -0.49283504], + [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -120,7 +128,7 @@ def test_validate_estimator_init(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -128,9 +136,12 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) + X_gt = np.array( [ + [0.53366841, -0.30312976], [1.52091956, -0.49283504], + [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -139,7 +150,7 @@ def test_validate_estimator_default(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) 
assert_array_equal(y_resampled, y_gt) From a6310550d5e76d04463914a54ad1cad9b51e8a37 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 15:31:40 +0200 Subject: [PATCH 12/21] reverts tests to original format --- imblearn/combine/tests/test_smote_enn.py | 21 +++++-------------- .../tests/test_edited_nearest_neighbours.py | 14 ++++--------- ...test_repeated_edited_nearest_neighbours.py | 15 +++---------- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 7b0e6a2e2..0162c79b4 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -48,9 +48,7 @@ def test_sample_regular(): X_gt = np.array( [ - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -59,7 +57,7 @@ def test_sample_regular(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -74,9 +72,7 @@ def test_sample_regular_pass_smote_enn(): X_gt = np.array( [ - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -85,7 +81,7 @@ def test_sample_regular_pass_smote_enn(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -98,13 +94,12 @@ def test_sample_regular_half(): X_gt = np.array( [ [1.52091956, -0.49283504], - [1.70580611, -0.11219234], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 1, 1, 1]) + y_gt = np.array([0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) 
assert_array_equal(y_resampled, y_gt) @@ -114,12 +109,9 @@ def test_validate_estimator_init(): enn = EditedNearestNeighbours(sampling_strategy="all") smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array( [ - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -128,7 +120,7 @@ def test_validate_estimator_init(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -136,12 +128,9 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array( [ - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [1.70580611, -0.11219234], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], @@ -150,7 +139,7 @@ def test_validate_estimator_default(): [0.08711622, 0.93259929], ] ) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index da06c3156..44999ddb5 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -49,7 +49,7 @@ def test_enn_init(): def test_enn_fit_resample(): - enn = EditedNearestNeighbours(n_neighbors=4) + enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( @@ -64,14 +64,12 @@ def 
test_enn_fit_resample(): ] ) y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) - - assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_fit_resample_mode(): - enn = EditedNearestNeighbours(n_neighbors=4, kind_sel="mode") + enn = EditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( @@ -93,8 +91,6 @@ def test_enn_fit_resample_mode(): ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) - - assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -123,8 +119,6 @@ def test_enn_fit_resample_with_nn_object(): ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) - - assert enn.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -144,8 +138,8 @@ def test_enn_check_kind_selection(): n_samples=1000, n_classes=2, weights=[0.3, 0.7], random_state=0, ) - enn_all = EditedNearestNeighbours(kind_sel="all", n_neighbors=4) - enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=4) + enn_all = EditedNearestNeighbours(kind_sel="all") + enn_mode = EditedNearestNeighbours(kind_sel="mode") enn_all.fit_resample(X, y) enn_mode.fit_resample(X, y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index f2b94a731..a206ec2b3 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -107,7 +107,6 @@ def test_renn_init(): assert renn.n_neighbors == 3 assert renn.kind_sel == "all" assert renn.n_jobs is None - assert renn.max_iter == 100 def test_renn_iter_wrong(): @@ -118,7 +117,7 @@ def test_renn_iter_wrong(): def test_renn_fit_resample(): - renn = 
RepeatedEditedNearestNeighbours(n_neighbors=4) + renn = RepeatedEditedNearestNeighbours() X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( @@ -154,17 +153,13 @@ def test_renn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] ) - - assert renn.nn_.n_neighbors == 4 - assert renn.enn_.nn_.n_neighbors == 4 - assert renn.n_iter_ == 3 - assert 0 < renn.n_iter_ <= renn.max_iter assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) + assert 0 < renn.n_iter_ <= renn.max_iter def test_renn_fit_resample_mode_object(): - renn = RepeatedEditedNearestNeighbours(kind_sel="mode", n_neighbors=4) + renn = RepeatedEditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( @@ -243,8 +238,6 @@ def test_renn_fit_resample_mode_object(): 2, ] ) - assert renn.nn_.n_neighbors == 4 - assert renn.enn_.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter @@ -331,8 +324,6 @@ def test_renn_fit_resample_mode(): 2, ] ) - assert renn.nn_.n_neighbors == 4 - assert renn.enn_.nn_.n_neighbors == 4 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter From 7bde8661dfdcc6ae97a2225024614ab0dc7d93b1 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 15:50:35 +0200 Subject: [PATCH 13/21] reverts back to original form of enn and renn --- .../_edited_nearest_neighbours.py | 4 ++-- .../_prototype_selection/tests/test_allknn.py | 22 ++++++------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 69f607636..6f5fbc11f 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ 
b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -120,7 +120,7 @@ def __init__( def _validate_estimator(self): """Validate the estimator created in the ENN.""" self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=0 + "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) @@ -283,7 +283,7 @@ def _validate_estimator(self): ) self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=0 + "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.enn_ = EditedNearestNeighbours( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index bfef1fb23..6901882fb 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -135,9 +135,7 @@ def test_allknn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ) - assert allknn.nn_.n_neighbors == 3 - assert allknn.enn_.nn_.n_neighbors == 8 - assert allknn.n_iter_ == 6 + assert allknn.n_iter_ == 5 assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) @@ -177,7 +175,6 @@ def test_allknn_fit_resample_mode(): [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [1.32326943, 0.28393874], [2.94290565, -0.13986434], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], @@ -187,15 +184,13 @@ def test_allknn_fit_resample_mode(): [0.62649535, 0.46600596], [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796] + [0.69804044, 0.44810796], ] ) y_gt = np.array( - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) - assert allknn.nn_.n_neighbors == 3 - assert allknn.enn_.nn_.n_neighbors == 17 - assert allknn.n_iter_ == 15 + assert 
allknn.n_iter_ == 14 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -215,7 +210,6 @@ def test_allknn_fit_resample_with_nn_object(): [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [1.32326943, 0.28393874], [2.94290565, -0.13986434], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], @@ -225,15 +219,13 @@ def test_allknn_fit_resample_with_nn_object(): [0.62649535, 0.46600596], [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796] + [0.69804044, 0.44810796], ] ) y_gt = np.array( - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ) - assert allknn.nn_.n_neighbors == 3 - assert allknn.enn_.nn_.n_neighbors == 17 - assert allknn.n_iter_ == 15 + assert allknn.n_iter_ == 14 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) From cea8d7b7773eeb402c8e59ad0011972695621385 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 16:07:33 +0200 Subject: [PATCH 14/21] intermediate changes --- .../_prototype_selection/_edited_nearest_neighbours.py | 2 +- .../under_sampling/_prototype_selection/tests/test_allknn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 6f5fbc11f..a60d0e8f8 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -479,7 +479,7 @@ def _validate_estimator(self): raise NotImplementedError self.nn_ = check_neighbors_object( - "n_neighbors", self.n_neighbors, additional_neighbor=0 + "n_neighbors", self.n_neighbors, additional_neighbor=1 ) self.enn_ = EditedNearestNeighbours( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py 
b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index 6901882fb..b21124fee 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -135,7 +135,7 @@ def test_allknn_fit_resample(): y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ) - assert allknn.n_iter_ == 5 + assert allknn.n_iter_ == 2 assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) From 4fb3339c941dd141a5753ab16e48d264e40f5414 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 16:27:33 +0200 Subject: [PATCH 15/21] revert back to original allknn --- .../_edited_nearest_neighbours.py | 18 +- .../_prototype_selection/tests/test_allknn.py | 165 +++++++++++++++--- 2 files changed, 144 insertions(+), 39 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index a60d0e8f8..14d2cdaa7 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -383,9 +383,6 @@ class or iii) one of the majority classes disappears from the target :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbours. By default, it will be a 3-NN. - max_iter : int, default=100 - Maximum number of iterations of the edited nearest neighbours algorithm. - kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. @@ -412,11 +409,6 @@ class without early stopping. .. versionadded:: 0.4 - n_iter_ : int - Number of iterations that were actually run. - - .. versionadded:: 0.9 - See Also -------- CondensedNearestNeighbour: Under-sampling by condensing samples. 
@@ -498,18 +490,11 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.arange(X.shape[0], dtype=int) - # find current number of neighbours - curr_size_ngh = self.nn_.n_neighbors - - for n_iter in range(self.max_iter): - + for curr_size_ngh in range(1, self.nn_.n_neighbors): self.enn_.n_neighbors = curr_size_ngh X_enn, y_enn = self.enn_.fit_resample(X_, y_) - # add a neighbour for the next round - curr_size_ngh = curr_size_ngh + 1 - # Stopping criterion: # 1. If the number of samples in any of the majority classes ends up # smaller than the number of samples in the minority class @@ -541,7 +526,6 @@ def _fit_resample(self, X, y): if b_min_bec_maj or b_remove_maj_class: break - self.n_iter_ = n_iter + 1 X_resampled, y_resampled = X_, y_ return X_resampled, y_resampled diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index b21124fee..e4e91b91a 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -103,15 +103,6 @@ R_TOL = 1e-4 -def test_allknn_init(): - allknn = AllKNN() - - assert allknn.n_neighbors == 3 - assert allknn.kind_sel == "all" - assert allknn.n_jobs is None - assert allknn.max_iter == 100 - - def test_allknn_fit_resample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_resample(X, Y) @@ -124,18 +115,60 @@ def test_allknn_fit_resample(): [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [1.12202806, 0.33811558], + [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, 
-0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], ] ) y_gt = np.array( - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] ) - assert allknn.n_iter_ == 2 assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) @@ -175,28 +208,72 @@ def test_allknn_fit_resample_mode(): [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [2.94290565, -0.13986434], + [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], - [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], ] ) y_gt = np.array( - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] ) - assert allknn.n_iter_ == 14 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_allknn_fit_resample_with_nn_object(): - nn = NearestNeighbors(n_neighbors=3) + nn = NearestNeighbors(n_neighbors=4) allknn = AllKNN(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) @@ -210,22 +287,66 @@ def test_allknn_fit_resample_with_nn_object(): 
[1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [2.94290565, -0.13986434], + [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], - [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], ] ) y_gt = np.array( - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] ) - assert allknn.n_iter_ == 14 assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) From 05a253e3db1b704de0650cdfe9b77d6d03e8770b Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 16:36:20 +0200 Subject: [PATCH 16/21] removes max iter from allknn --- .../_edited_nearest_neighbours.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 14d2cdaa7..7613d41f9 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -70,7 +70,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm. 
- AllKNN : Undersample using ENN and varying number of neighbours. + AllKNN : Undersample using ENN and various number of neighbours. Notes ----- @@ -226,7 +226,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): EditedNearestNeighbours : Undersample by editing samples. - AllKNN : Undersample using ENN and varying number of neighbours. + AllKNN : Undersample using ENN and various number of neighbours. Notes ----- @@ -363,13 +363,12 @@ def _more_tags(self): class AllKNN(BaseCleaningSampler): """Undersample based on the AllKNN method. - This method will apply ENN several times, increasing the number - of nearest neighbours by 1 at each round. + This method will apply ENN several times, starting by 1-KNN and + increasing the number of nearest neighbours by 1 at each round. - The repetitions will stop when i) the maximum number of iterations - is reached, or ii) one of the majority classes becomes a minority - class or iii) one of the majority classes disappears from the target - after undersampling. + The repetitions will stop when i) one of the majority classes + becomes a minority class or ii) one of the majority classes + disappears from the target after undersampling. Read more in the :ref:`User Guide `. 
@@ -453,7 +452,6 @@ def __init__( *, sampling_strategy="auto", n_neighbors=3, - max_iter=100, kind_sel="all", allow_minority=False, n_jobs=None, @@ -463,7 +461,6 @@ def __init__( self.kind_sel = kind_sel self.allow_minority = allow_minority self.n_jobs = n_jobs - self.max_iter = max_iter def _validate_estimator(self): """Create objects required by AllKNN""" From 4e30c9b9a3122f4788097feccf659905160a6474 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 17:02:32 +0200 Subject: [PATCH 17/21] modifies docstring for n_neighbours in allknn --- .../_edited_nearest_neighbours.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 7613d41f9..b1943f610 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -49,8 +49,8 @@ class EditedNearestNeighbours(BaseCleaningSampler): - If ``'all'``, all neighbours will have to agree with a sample in order not to be excluded. - - If ``'mode'``, the majority vote of the neighbours will be used in - order not to exclude a sample. + - If ``'mode'``, the majority of the neighbours will have to agree with + a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, more samples will be removed when `kind_sel="all"`, generally. @@ -193,15 +193,15 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): find the nearest-neighbours. max_iter : int, default=100 - Maximum number of iterations of the edited nearest neighbours algorithm. + Maximum number of repetitions of the edited nearest neighbours algorithm. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. 
- If ``'all'``, all neighbours will have to agree with a sample in order not to be excluded. - - If ``'mode'``, the majority vote of the neighbours will be used in - order not to exclude a sample. + - If ``'mode'``, the majority of the neighbours will have to agree with + a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, more samples will be removed when `kind_sel="all"`, generally. @@ -363,8 +363,10 @@ def _more_tags(self): class AllKNN(BaseCleaningSampler): """Undersample based on the AllKNN method. - This method will apply ENN several times, starting by 1-KNN and - increasing the number of nearest neighbours by 1 at each round. + This method will apply ENN several times, starting by looking at the + 1 closest neighbour, and increasing the number of nearest neighbours + by 1 at each round, up to the number of neighbours specified in + `n_neighbors`. The repetitions will stop when i) one of the majority classes becomes a minority class or ii) one of the majority classes @@ -377,18 +379,21 @@ class AllKNN(BaseCleaningSampler): {sampling_strategy} n_neighbors : int or estimator object, default=3 - If ``int``, size of the neighbourhood to consider to compute the - nearest neighbours. If object, an estimator that inherits from + If ``int``, maximum size of the neighbourhood to consider to compute the + nearest neighbours. The method will start by looking at the 1 closest + neighbour, and then repeat the edited nearest neighbours increasing + the neighbourhood by 1, until examining `n_neighbors`. + If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbours. By default, it will be a 3-NN. + find the nearest-neighbours in the final round. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - If ``'all'``, all neighbours will have to agree with a sample in order not to be excluded. 
- - If ``'mode'``, the majority vote of the neighbours will be used in - order not to exclude a sample. + - If ``'mode'``, the majority of the neighbours will have to agree with + a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, more samples will be removed when `kind_sel="all"`, generally. From 75073311e76c41c88f5202e973587984db249b6b Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 18:45:20 +0200 Subject: [PATCH 18/21] add more detail in param n_neighbor from AllKNN --- .../_edited_nearest_neighbours.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index b1943f610..dfdde54e5 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -379,13 +379,16 @@ class AllKNN(BaseCleaningSampler): {sampling_strategy} n_neighbors : int or estimator object, default=3 - If ``int``, maximum size of the neighbourhood to consider to compute the - nearest neighbours. The method will start by looking at the 1 closest - neighbour, and then repeat the edited nearest neighbours increasing - the neighbourhood by 1, until examining `n_neighbors`. + If ``int``, the maximum size of the neighbourhood to evaluate. + The method will start by looking at the 1 closest neighbour, and + then repeat the edited nearest neighbours increasing + the neighbourhood by 1, until examining a neighbourhood of + `n_neighbors` in the final iteration. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbours in the final round. + find the nearest-neighbours in the final round. 
In this case, + AllKNN will repeat edited nearest neighbours starting from a 2-KNN + up to the specified KNN in the object. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. From 3f0e265d98b8d7eb7ba69f3c64742978a77106ce Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 18:55:42 +0200 Subject: [PATCH 19/21] update dosctrings in validation --- imblearn/utils/_validation.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 533ed7c22..5d2d475d9 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -68,12 +68,13 @@ def _transfrom_one(self, array, props): def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): - """Check the object is consistent with a NN. + """Check that the object is consistent with a NN. Several methods in imblearn rely on NN. Until version 0.4, these objects can be passed at initialisation as an integer or a - KNeighborsMixin. After, only KNeighborsMixin will be accepted. This - utility allows for type checking and raises an error if the type is wrong. + KNeighborsMixin. In later versions, only KNeighborsMixin will be + accepted. This utility allows for type checking and raises an error + if the type is wrong. Parameters ---------- @@ -84,7 +85,9 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): The object to be checked. additional_neighbor : int, default=0 - Some algorithms need an additional neighbor. + Some algorithms need an additional neighbour. This is because to + explore a neighbourhood of 3, we need to train a 4-KNN algorithm + as the sample to examine is a neighbour itself. Returns ------- @@ -105,7 +108,7 @@ def _count_class_sample(y): def check_target_type(y, indicate_one_vs_all=False): - """Check the target types to be conform to the current samplers. 
+ """Check the target types conform to the current samplers. The current samplers should be compatible with ``'binary'``, ``'multilabel-indicator'`` and ``'multiclass'`` targets only. @@ -116,7 +119,7 @@ def check_target_type(y, indicate_one_vs_all=False): The array containing the target. indicate_one_vs_all : bool, default=False - Either to indicate if the targets are encoded in a one-vs-all fashion. + Indicate if the targets are encoded in a one-vs-all fashion. Returns ------- @@ -407,7 +410,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): Checks that ``sampling_strategy`` is of consistent type and return a dictionary containing each targeted class with its corresponding - number of sample. It is used in :class:`~imblearn.base.BaseSampler`. + number of samples. It is used in :class:`~imblearn.base.BaseSampler`. Parameters ---------- @@ -435,7 +438,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): - When ``str``, specify the class targeted by the resampling. For **under- and over-sampling methods**, the number of samples in the - different classes will be equalized. For **cleaning methods**, the + different classes will be equal. For **cleaning methods**, the number of samples will not be equal. Possible choices are: ``'minority'``: resample only the minority class; @@ -461,8 +464,8 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): methods**. An error is raised with **cleaning methods**. Use a ``list`` instead. - - When ``list``, the list contains the targeted classes. It used only - for **cleaning methods**. + - When ``list``, the list contains the targeted classes. It is used + only in **cleaning methods**. .. warning:: ``list`` is available for **cleaning methods**. 
An error is raised From dae3e2e42dae06d0071d32cc651c1220c8529253 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Thu, 5 Aug 2021 19:35:09 +0200 Subject: [PATCH 20/21] updates user guide for enn, renn and allknn --- doc/under_sampling.rst | 62 ++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 13798ad78..2b339446e 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -198,7 +198,7 @@ Cleaning under-sampling techniques ---------------------------------- Cleaning under-sampling techniques do not allow to specify the number of -samples to have in each class. In fact, each algorithm implement an heuristic +samples to have in each class. In fact, each algorithm implements an heuristic which will clean the dataset. .. _tomek_links: @@ -240,11 +240,17 @@ figure illustrates this behaviour. Edited data set using nearest neighbours ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`EditedNearestNeighbours` applies a nearest-neighbors algorithm and -"edit" the dataset by removing samples which do not agree "enough" with their -neighboorhood :cite:`wilson1972asymptotic`. For each sample in the class to be -under-sampled, the nearest-neighbours are computed and if the selection -criterion is not fulfilled, the sample is removed:: +:class:`EditedNearestNeighbours` trains a nearest-neighbors algorithm and +then looks at the closest neighbours of each data point of the class to be +under-sampled, and "edits" the dataset by removing samples which do not agree +"enough" with their neighborhood :cite:`wilson1972asymptotic`. In short, +a KNN algorithm is trained on the data. Then, for each sample in the class +to be under-sampled, the (K-1) nearest-neighbours are identified. Note that +if a 4-KNN algorithm is trained, only 3 neighbours will be examined, because +the sample being inspected is the fourth neighbour returned by the algorithm. 
+Once the neighbours are identified, if all the neighbours or most of the +neighbours agree with the class of the sample being inspected, the sample is +kept, otherwise removed. Check the selection criteria below:: >>> sorted(Counter(y).items()) [(0, 64), (1, 262), (2, 4674)] @@ -256,10 +262,9 @@ criterion is not fulfilled, the sample is removed:: Two selection criteria are currently available: (i) the majority (i.e., ``kind_sel='mode'``) or (ii) all (i.e., ``kind_sel='all'``) the -nearest-neighbors have to belong to the same class than the sample inspected to -keep it in the dataset. Thus, it implies that `kind_sel='all'` will be less -conservative than `kind_sel='mode'`, and more samples will be excluded in -the former strategy than the latest:: +nearest-neighbors must belong to the same class than the sample inspected to +keep it in the dataset. This means that `kind_sel='all'` will be less +conservative than `kind_sel='mode'`, and more samples will be excluded:: >>> enn = EditedNearestNeighbours(kind_sel="all") >>> X_resampled, y_resampled = enn.fit_resample(X, y) @@ -270,14 +275,19 @@ the former strategy than the latest:: >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 234), (2, 4666)] -The parameter ``n_neighbors`` allows to give a classifier subclassed from -``KNeighborsMixin`` from scikit-learn to find the nearest neighbors and make -the decision to keep a given sample or not. +The parameter ``n_neighbors`` can take a classifier subclassed from +``KNeighborsMixin`` from scikit-learn to find the nearest neighbors. +Alternatively, an integer can be passed to indicate the size of the +neighborhood to examine to make a decision. Note that if ``n_neighbors=3`` +this means that the edited nearest neighbors will look at the 3 closest +neighbours of each sample, thus a 4-KNN algorithm will be trained +on the data. 
:class:`RepeatedEditedNearestNeighbours` extends :class:`EditedNearestNeighbours` by repeating the algorithm multiple times :cite:`tomek1976experiment`. Generally, repeating the algorithm will delete -more data:: +more data. The user indicates how many times to repeat the algorithm +through the parameter ``max_iter``:: >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> renn = RepeatedEditedNearestNeighbours() @@ -285,10 +295,16 @@ more data:: >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 208), (2, 4551)] -:class:`AllKNN` differs from the previous -:class:`RepeatedEditedNearestNeighbours` since the number of neighbors of the -internal nearest neighbors algorithm is increased at each iteration -:cite:`tomek1976experiment`:: +:class:`AllKNN` extends :class:`EditedNearestNeighbours` by repeating +the algorithm multiple times, each time with an additional neighbour +:cite:`tomek1976experiment`. In other words, :class:`AllKNN` differs +from :class:`RepeatedEditedNearestNeighbours` in that the number of +neighbors of the internal nearest neighbors algorithm increases at +each iteration. In short, in the first iteration, a 2-KNN algorithm +is trained on the data to examine the 1 closest neighbour of each +sample from the class to be under-sampled. In each subsequent +iteration, the neighbourhood examined is increased by 1, until the +number of neighbours to examine indicated in the parameter ``n_neighbors``:: >>> from imblearn.under_sampling import AllKNN >>> allknn = AllKNN() @@ -296,6 +312,16 @@ internal nearest neighbors algorithm is increased at each iteration >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 220), (2, 4601)] + +The parameter ``n_neighbors`` can take an integer to indicate the size +of the neighborhood to examine to make a decision in the last iteration. 
+Thus, if ``n_neighbors=3``, AlKNN will examine the 1 closest neighbour +in the first iteration, the 2 closest neighbours in the second iteration +and the 3 closest neighbors in the third iteration. The parameter +``n_neighbors`` can also take a classifier subclassed from +``KNeighborsMixin`` from scikit-learn to find the nearest neighbors. +Again, this will be the KNN used in the last iteration. + In the example below, it can be seen that the three algorithms have similar impact by cleaning noisy samples next to the boundaries of the classes. From 1be039ed6f13dc9a01ba89aff21ec9d83f3fe5a1 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Wed, 11 Aug 2021 17:06:42 +0200 Subject: [PATCH 21/21] final edits --- doc/under_sampling.rst | 60 ++++++++++--------- .../_edited_nearest_neighbours.py | 55 ++++++++--------- 2 files changed, 60 insertions(+), 55 deletions(-) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 2b339446e..6f341f712 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -198,7 +198,7 @@ Cleaning under-sampling techniques ---------------------------------- Cleaning under-sampling techniques do not allow to specify the number of -samples to have in each class. In fact, each algorithm implements an heuristic +samples to have in each class. In fact, each algorithm implement an heuristic which will clean the dataset. .. _tomek_links: @@ -237,20 +237,18 @@ figure illustrates this behaviour. .. 
_edited_nearest_neighbors: -Edited data set using nearest neighbours -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Edited data set using nearest neighbors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`EditedNearestNeighbours` trains a nearest-neighbors algorithm and -then looks at the closest neighbours of each data point of the class to be +:class:`EditedNearestNeighbours` trains a nearest neighbors algorithm and +then looks at the closest neighbors of each data point of the class to be under-sampled, and "edits" the dataset by removing samples which do not agree "enough" with their neighborhood :cite:`wilson1972asymptotic`. In short, -a KNN algorithm is trained on the data. Then, for each sample in the class -to be under-sampled, the (K-1) nearest-neighbours are identified. Note that -if a 4-KNN algorithm is trained, only 3 neighbours will be examined, because -the sample being inspected is the fourth neighbour returned by the algorithm. -Once the neighbours are identified, if all the neighbours or most of the -neighbours agree with the class of the sample being inspected, the sample is -kept, otherwise removed. Check the selection criteria below:: +a nearest neighbors algorithm is trained on the data. Then, for each +sample in the class to be under-sampled, the nearest neighbors are identified. +Once the neighbors are identified, if all the neighbors or most of the neighbors +agree with the class of the sample being inspected, the sample is kept, otherwise +removed:: >>> sorted(Counter(y).items()) [(0, 64), (1, 262), (2, 4674)] @@ -261,8 +259,8 @@ kept, otherwise removed. 
Check the selection criteria below:: [(0, 64), (1, 213), (2, 4568)] Two selection criteria are currently available: (i) the majority (i.e., -``kind_sel='mode'``) or (ii) all (i.e., ``kind_sel='all'``) the -nearest-neighbors must belong to the same class than the sample inspected to +``kind_sel='mode'``) or (ii) all (i.e., ``kind_sel='all'``) of the +nearest neighbors must belong to the same class as the sample inspected to keep it in the dataset. This means that `kind_sel='all'` will be less conservative than `kind_sel='mode'`, and more samples will be excluded:: @@ -277,11 +275,12 @@ conservative than `kind_sel='mode'`, and more samples will be excluded:: The parameter ``n_neighbors`` can take a classifier subclassed from ``KNeighborsMixin`` from scikit-learn to find the nearest neighbors. -Alternatively, an integer can be passed to indicate the size of the -neighborhood to examine to make a decision. Note that if ``n_neighbors=3`` -this means that the edited nearest neighbors will look at the 3 closest -neighbours of each sample, thus a 4-KNN algorithm will be trained -on the data. +Note that if a 4-KNN classifier is passed, 3 neighbors will be +examined for the selection criteria, because the sample being inspected +is the fourth neighbor returned by the algorithm. Alternatively, an integer +can be passed to ``n_neighbors`` to indicate the size of the neighborhood +to examine to make a decision. Thus, if ``n_neighbors=3`` the edited nearest +neighbors will look at the 3 closest neighbors of each sample. 
:class:`RepeatedEditedNearestNeighbours` extends :class:`EditedNearestNeighbours` by repeating the algorithm multiple times @@ -295,16 +294,21 @@ through the parameter ``max_iter``:: >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 208), (2, 4551)] +Note that :class:`RepeatedEditedNearestNeighbours` will end before reaching +``max_iter`` if no more samples are removed from the data, or one of the +majority classes ends up disappearing or with less samples than the minority +after being "edited". + :class:`AllKNN` extends :class:`EditedNearestNeighbours` by repeating -the algorithm multiple times, each time with an additional neighbour +the algorithm multiple times, each time with an additional neighbor :cite:`tomek1976experiment`. In other words, :class:`AllKNN` differs from :class:`RepeatedEditedNearestNeighbours` in that the number of neighbors of the internal nearest neighbors algorithm increases at each iteration. In short, in the first iteration, a 2-KNN algorithm -is trained on the data to examine the 1 closest neighbour of each +is trained on the data to examine the 1 closest neighbor of each sample from the class to be under-sampled. In each subsequent -iteration, the neighbourhood examined is increased by 1, until the -number of neighbours to examine indicated in the parameter ``n_neighbors``:: +iteration, the neighborhood examined is increased by 1, until the +number of neighbors indicated in the parameter ``n_neighbors``:: >>> from imblearn.under_sampling import AllKNN >>> allknn = AllKNN() @@ -314,16 +318,16 @@ number of neighbours to examine indicated in the parameter ``n_neighbors``:: The parameter ``n_neighbors`` can take an integer to indicate the size -of the neighborhood to examine to make a decision in the last iteration. -Thus, if ``n_neighbors=3``, AlKNN will examine the 1 closest neighbour -in the first iteration, the 2 closest neighbours in the second iteration +of the neighborhood to examine in the last iteration. 
Thus, if +``n_neighbors=3``, AllKNN will examine the 1 closest neighbor in the +first iteration, the 2 closest neighbors in the second iteration and the 3 closest neighbors in the third iteration. The parameter ``n_neighbors`` can also take a classifier subclassed from ``KNeighborsMixin`` from scikit-learn to find the nearest neighbors. Again, this will be the KNN used in the last iteration. -In the example below, it can be seen that the three algorithms have similar -impact by cleaning noisy samples next to the boundaries of the classes. +In the example below, we can see that the three algorithms have a similar +impact on cleaning noisy samples at the boundaries of the classes. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_004.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index dfdde54e5..0fdfdc925 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -1,4 +1,4 @@ -"""Classes to perform under-sampling based on the edited nearest neighbour +"""Classes to perform under-sampling based on the edited nearest neighbor method.""" # Authors: Guillaume Lemaitre @@ -27,7 +27,7 @@ n_jobs=_n_jobs_docstring, ) class EditedNearestNeighbours(BaseCleaningSampler): - """Undersample based on the edited nearest neighbour method. + """Undersample based on the edited nearest neighbor method. This method will clean the data set by removing samples close to the decision boundary. @@ -39,17 +39,17 @@ class EditedNearestNeighbours(BaseCleaningSampler): {sampling_strategy} n_neighbors : int or object, default=3 - If ``int``, size of the neighbourhood to consider to compute the - nearest neighbours. 
If object, an estimator that inherits from + If ``int``, size of the neighborhood to consider to compute the + nearest neighbors. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbours. + find the nearest-neighbors. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with a sample in order + - If ``'all'``, all neighbors will have to agree with a sample in order not to be excluded. - - If ``'mode'``, the majority of the neighbours will have to agree with + - If ``'mode'``, the majority of the neighbors will have to agree with a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, @@ -70,7 +70,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm. - AllKNN : Undersample using ENN and various number of neighbours. + AllKNN : Undersample using ENN and various number of neighbors. Notes ----- @@ -172,7 +172,7 @@ def _more_tags(self): n_jobs=_n_jobs_docstring, ) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): - """Undersample based on the repeated edited nearest neighbour method. + """Undersample based on the repeated edited nearest neighbor method. This method will repeat the ENN algorithm several times. The repetitions will stop when i) the maximum number of iterations is reached, or ii) no @@ -187,20 +187,20 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): {sampling_strategy} n_neighbors : int or object, default=3 - If ``int``, size of the neighbourhood to consider to compute the - nearest neighbours. If object, an estimator that inherits from + If ``int``, size of the neighborhood to consider to compute the + nearest neighbors. 
If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbours. + find the nearest-neighbors. max_iter : int, default=100 - Maximum number of repetitions of the edited nearest neighbours algorithm. + Maximum number of repetitions of the edited nearest neighbors algorithm. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with a sample in order + - If ``'all'``, all neighbors will have to agree with a sample in order not to be excluded. - - If ``'mode'``, the majority of the neighbours will have to agree with + - If ``'mode'``, the majority of the neighbors will have to agree with a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, @@ -226,7 +226,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): EditedNearestNeighbours : Undersample by editing samples. - AllKNN : Undersample using ENN and various number of neighbours. + AllKNN : Undersample using ENN and various number of neighbors. Notes ----- @@ -364,8 +364,8 @@ class AllKNN(BaseCleaningSampler): """Undersample based on the AllKNN method. This method will apply ENN several times, starting by looking at the - 1 closest neighbour, and increasing the number of nearest neighbours - by 1 at each round, up to the number of neighbours specified in + 1 closest neighbor, and increasing the number of nearest neighbors + by 1 at each round, up to the number of neighbors specified in `n_neighbors`. The repetitions will stop when i) one of the majority classes @@ -379,23 +379,24 @@ class AllKNN(BaseCleaningSampler): {sampling_strategy} n_neighbors : int or estimator object, default=3 - If ``int``, the maximum size of the neighbourhood to evaluate. 
- The method will start by looking at the 1 closest neighbour, and - then repeat the edited nearest neighbours increasing - the neighbourhood by 1, until examining a neighbourhood of + If ``int``, the maximum size of the neighborhood to evaluate. + The method will start by looking at the 1 closest neighbor, and + then repeat the edited nearest neighbors increasing + the neighborhood by 1, until examining a neighborhood of `n_neighbors` in the final iteration. + If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the nearest-neighbours in the final round. In this case, - AllKNN will repeat edited nearest neighbours starting from a 2-KNN + find the nearest-neighbors in the final round. In this case, + AllKNN will repeat edited nearest neighbors starting from a 2-KNN up to the specified KNN in the object. kind_sel : {{'all', 'mode'}}, default='all' Strategy to use in order to exclude samples. - - If ``'all'``, all neighbours will have to agree with a sample in order + - If ``'all'``, all neighbors will have to agree with a sample in order not to be excluded. - - If ``'mode'``, the majority of the neighbours will have to agree with + - If ``'mode'``, the majority of the neighbors will have to agree with a sample in order not to be excluded. The strategy `"all"` will be less conservative than `'mode'`. Thus, @@ -434,7 +435,7 @@ class without early stopping. References ---------- .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor - Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), + Rule", IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, June 1976. Examples