
Commit

Fix scikit-learn#27839: Adjust LocalOutlierFactor for data with duplicated samples

Previously, when a value in the dataset was repeated more times than the algorithm's number of neighbors, the outliers were miscalculated.
Because the distance between duplicated samples is 0, their local reachability density equals 1e10. As a result, samples close to the duplicated value receive a very low negative outlier factor (below -1e7) and are labeled as outliers.
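
As a rough illustration, here is a minimal sketch of the failure mode (the data values are assumptions adapted from the linked issue, not part of this commit):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# One value repeated more often than n_neighbors, nearby inliers, one clear outlier
X = np.r_[[0.1] * 10, np.linspace(0.1, 0.3, 30), [100.0]].reshape(-1, 1)

lof = LocalOutlierFactor(n_neighbors=5, contamination=0.1)
labels = lof.fit_predict(X)

# The samples right next to the repeated 0.1 get scores below -1e7 and are
# labeled -1 (outlier) even though they are ordinary inliers
print(lof.negative_outlier_factor_.min())  # far below -1e7
print(X[labels == -1].ravel())  # includes values just above 0.1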
This fix checks whether the minimum negative outlier factor is below -1e7 and, if so, raises the number of neighbors to the number of occurrences of the most frequent value + 1 and emits a warning.
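
The adjustment amounts to the following manual workaround (a sketch reusing the assumed toy data from above, not the commit's code):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X = np.r_[[0.1] * 10, np.linspace(0.1, 0.3, 30), [100.0]].reshape(-1, 1)

# Number of occurrences of the most frequent value
_, counts = np.unique(X, return_counts=True)
max_duplicates = int(counts.max())

# Refit with n_neighbors raised past the duplicate count
lof = LocalOutlierFactor(n_neighbors=max_duplicates + 1, contamination=0.1)
labels = lof.fit_predict(X)

# The minimum score is now far above -1e7, so the duplicated samples no longer
# dominate the ranking of outliers
print(lof.negative_outlier_factor_.min())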
Notes: Added a handle_duplicates variable that lets developers handle the duplicate values manually if desired. Also added a memory_limit variable, which developers can adjust, to avoid memory errors on very large datasets.
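
For scale, the memory guard behind memory_limit works out as follows for the test data added below (counts are approximate and assumed from that test):

# About 1000 copies of 0.1 among 4500 samples in total
max_duplicates = 1001  # 1000 explicit copies plus the start of the linspace
n_samples = 4500
kneighbors_array_memory = (max_duplicates + 1) * n_samples * 8  # bytes
print(kneighbors_array_memory)  # ~3.6e7 bytes, well below the 1e8-byte default limit

Since this stays under the limit, the automatic n_neighbors increase is allowed to trigger a refit there.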
HenriqueProj committed Apr 3, 2024
1 parent 30f4d9d commit 69f62fb
Showing 2 changed files with 75 additions and 0 deletions.
42 changes: 42 additions & 0 deletions sklearn/neighbors/_lof.py
@@ -317,6 +317,48 @@ def fit(self, X, y=None):
                self.negative_outlier_factor_, 100.0 * self.contamination
            )

        # DEV: Set to False to handle the duplicate values manually
        handle_duplicates = True

        # Check that the negative_outlier_factor_ values are within an
        # acceptable range; novelty must also be False to detect outliers
        if (
            np.min(self.negative_outlier_factor_) < -1e7
            and handle_duplicates
            and not self.novelty
        ):
            # Raises an error in the case of a 1D or sparse array
            check_array(X, accept_sparse=False)

            # Get the number of occurrences of the most frequent element
            unique_elements, duplicates = np.unique(X, return_counts=True)
            max_duplicates = duplicates.max()

            # DEV: Memory usage limit; can be raised at the risk of MemoryErrors
            memory_limit = 1e8  # Bytes

            # Memory needed to recompute kneighbors with the new n_neighbors
            kneighbors_array_memory = (max_duplicates + 1) * n_samples * 8  # Bytes

            # Only adjust when the number of duplicates reaches n_neighbors, and
            # cap memory usage, as the refit can fail for really large datasets
            if (
                max_duplicates >= self.n_neighbors
                and kneighbors_array_memory < memory_limit
            ):
                self.n_neighbors = max_duplicates + 1

                warnings.warn(
                    "Duplicate values are leading to incorrect results. "
                    "Increasing number of neighbors."
                )
                # Refit with the new number of neighbors
                return self.fit(X)

        return self

def _check_novelty_predict(self):
33 changes: 33 additions & 0 deletions sklearn/neighbors/tests/test_lof.py
@@ -359,3 +359,36 @@ def test_lof_dtype_equivalence(algorithm, novelty, contamination):
    y_pred_32 = getattr(lof_32, method)(X_32)
    y_pred_64 = getattr(lof_64, method)(X_64)
    assert_allclose(y_pred_32, y_pred_64, atol=0.0002)


def test_lof_duplicate_samples():
    """
    Check that outliers are detected correctly when the data contains duplicated values.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/27839
    """

    rng = np.random.default_rng(0)

    # 100 times the number of samples of the example shown in the issue
    x = rng.permutation(
        np.hstack(
            [
                [0.1] * 1000,  # constant values
                np.linspace(0.1, 0.3, num=3000),
                rng.random(500) * 100,  # the clear outliers
            ]
        )
    )
    X = x.reshape(-1, 1)

    lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1)
    outliers = lof.fit_predict(X)

    indices = np.where(outliers == -1)
    outliers = X[indices]

    # Check that only values outside of the [0.1, 0.3] range are flagged as outliers
    for outlier in outliers:
        assert outlier < 0.1 or outlier > 0.3
