Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Fix LocalOutlierFactor's output for data with duplicated samples #28773

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
10 changes: 10 additions & 0 deletions sklearn/neighbors/_lof.py
Expand Up @@ -317,6 +317,16 @@ def fit(self, X, y=None):
self.negative_outlier_factor_, 100.0 * self.contamination
)

"""
Verify if negative_outlier_factor_ values are within acceptable range
Novelty must also be false to detect outliers
"""
HenriqueProj marked this conversation as resolved.
Show resolved Hide resolved
if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty:
warnings.warn(
"Duplicate values are leading to incorrect results. "
"Increase the number of neighbors for more accurate results."
)

return self

def _check_novelty_predict(self):
Expand Down
34 changes: 34 additions & 0 deletions sklearn/neighbors/tests/test_lof.py
Expand Up @@ -359,3 +359,37 @@ def test_lof_dtype_equivalence(algorithm, novelty, contamination):
y_pred_32 = getattr(lof_32, method)(X_32)
y_pred_64 = getattr(lof_64, method)(X_64)
assert_allclose(y_pred_32, y_pred_64, atol=0.0002)


def test_lof_duplicate_samples():
    """Check that LocalOutlierFactor warns about duplicated training samples.

    The training set holds 1000 copies of the same value while the estimator
    only looks at 5 neighbors; fitting is expected to emit a ``UserWarning``
    telling the user that the results are inaccurate.

    Test for: https://github.com/scikit-learn/scikit-learn/issues/27839
    """
    rng = np.random.default_rng(0)

    # 100 times the number of elements of the example shown in the issue.
    duplicates = [0.1] * 1000  # constant values
    dense_grid = np.linspace(0.1, 0.3, num=3000)
    far_points = rng.random(500) * 100  # the clear outliers

    # Shuffle so the duplicated values are not contiguous in the input.
    samples = rng.permutation(np.hstack([duplicates, dense_grid, far_points]))
    X = samples.reshape(-1, 1)

    expected_warning = (
        "Duplicate values are leading to incorrect results. "
        "Increase the number of neighbors for more accurate results."
    )
    estimator = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1)

    # The message is escaped because ``match`` is interpreted as a regex.
    with pytest.warns(UserWarning, match=re.escape(expected_warning)):
        estimator.fit_predict(X)